# Caractérisation des fichiers

## Capture 1 

### Statistiques 
![alt text](images/capture_1_statistiques.png "Title")

### Nombre de sessions par protocole
![alt text](images/capture_1_conversations.png "Title")

### Les différents protocoles utilisés
![alt text](images/capture_1_protocoles.png "Title")

## Capture 2

### Statistiques
![alt text](images/capture_2_statistiques.png "Title")

### Nombre de sessions par protocole
![alt text](images/capture_2_conversations.png "Title")

### Les différents protocoles utilisés
![alt text](images/capture_2_protocoles.png "Title")

# Exploration des fichiers

In [4]:
import pandas as pd

In [5]:
files = ["capture1.csv", "capture2.csv"]

for file in files:
    df = pd.read_csv(file)
    print(f"Fichier :{file}")
    df_taille_paquet = df[df["Info"].str.contains("Len")].copy()
    df_taille_paquet["taille_paquet"] = df_taille_paquet["Info"].str.extract(r"Len=(\d+)")
    df_taille_paquet["port_src"] = df_taille_paquet["Info"].str.extract(r"(\d+)")
    df_taille_paquet["port_dest"] = df_taille_paquet["Info"].str.extract(r" (\d+)")
    unique_flows = df.groupby(["Source", "Destination"])

    nombre_flux = len(unique_flows)
    nombre_total_paquets = 0

    moyenne_taille_paquets = 0
    max_packet_length = 0
    min_packet_length = float("inf")

    moyenne_temps_inter_paquets = 0
    max_temps_inter_paquets = 0
    min_temps_inter_paquets = float("inf")

    res = {}

    # Boucle sur chaque flux pour obtenir les informations requises
    for (source, destination), group in unique_flows:
        num_packets = len(group)
        avg_packet_length = group["Length"].mean()
        max_packet_length_cur = group["Length"].max()
        min_packet_length_cur = group["Length"].min()


        time_gaps = group["Time"].diff().mean()
        min_temps_inter_paquets_cur = group["Time"].diff().min()
        max_temps_inter_paquets_cur = group["Time"].diff().max()

        res[source, destination] =  {
            "num_packets": round(num_packets, 3),
            "avg_packet_length": round(avg_packet_length, 3), 
            "max_packet_length": round(max_packet_length_cur, 3),
            "min_packet_length": round(min_packet_length_cur, 3),
            "time_gaps": round(time_gaps, 3),
            "max_time_gaps": round(max_temps_inter_paquets_cur, 3),
            "min_time_gaps": round(min_temps_inter_paquets_cur, 3),
        }

        nombre_total_paquets += num_packets
        moyenne_taille_paquets += avg_packet_length
        if time_gaps > 0:
            moyenne_temps_inter_paquets += time_gaps

        if max_packet_length_cur > max_packet_length:
            max_packet_length = max_packet_length_cur

        if min_packet_length_cur < min_packet_length and min_packet_length_cur > 0:
            min_packet_length = min_packet_length_cur

        if max_temps_inter_paquets_cur > max_temps_inter_paquets:
            max_temps_inter_paquets = max_temps_inter_paquets_cur

        if min_temps_inter_paquets_cur < min_temps_inter_paquets and min_temps_inter_paquets_cur > 0:
            min_temps_inter_paquets = min_temps_inter_paquets_cur

    moyenne_taille_paquets /= len(unique_flows)
    moyenne_temps_inter_paquets /= len(unique_flows)

    print(f"Nombre de flux: {nombre_flux}")
    print(f"Nombre total de paquets: {nombre_total_paquets}")
    print(f"Moyenne des tailles des paquets: {moyenne_taille_paquets}")
    print(f"Taille maximale des paquets: {max_packet_length}")
    print(f"Taille minimale des paquets: {min_packet_length}")
    print(f"Moyenne du temps inter-paquets: {moyenne_temps_inter_paquets} secondes")
    print(f"Temps inter-paquets maximal: {max_temps_inter_paquets} secondes")
    print(f"Temps inter-paquets minimal: {min_temps_inter_paquets} secondes")

    df_calc = pd.DataFrame.from_dict(res, orient="index").reset_index()
    df_calc = df_calc.rename(columns={"level_0": "Source", "level_1": "Destination"})
    df_calc.to_csv(f"res_{file}")

Fichier :capture1.csv
Nombre de flux: 411
Nombre total de paquets: 100930
Moyenne des tailles des paquets: 168.19735277101486
Taille maximale des paquets: 1514
Taille minimale des paquets: 42
Moyenne du temps inter-paquets: 6.590242665894542 secondes
Temps inter-paquets maximal: 62.556986 secondes
Temps inter-paquets minimal: 9.999999974752427e-07 secondes
Fichier :capture2.csv
Nombre de flux: 383
Nombre total de paquets: 317593
Moyenne des tailles des paquets: 124.8878840382985
Taille maximale des paquets: 1514
Taille minimale des paquets: 42
Moyenne du temps inter-paquets: 16.988659524481157 secondes
Temps inter-paquets maximal: 141.084066 secondes
Temps inter-paquets minimal: 9.999999974752427e-07 secondes


# Cluster

In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [7]:
files = ["res_capture1", "res_capture2"]
for file in files:
    df_calc = pd.read_csv(f"{file}.csv", index_col=0)
    label_encoders = {}
    for column in ["Source", "Destination"]:
        label_encoders[column] = LabelEncoder()
        df_calc[column] = label_encoders[column].fit_transform(df_calc[column])
    df_calc.fillna(-1, inplace=True)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_calc)

    kmeans = KMeans(n_clusters=3, random_state=42)
    df_calc['cluster'] = kmeans.fit_predict(scaled_data)
    df_calc.to_csv(f"{file}_cluster.csv")

In [8]:
df_calc.head()

Unnamed: 0,Source,Destination,num_packets,avg_packet_length,max_packet_length,min_packet_length,time_gaps,max_time_gaps,min_time_gaps,cluster
0,0,33,13,60.0,60,60,13.475,41.597,0.615,0
1,0,46,6,342.0,342,342,10.158,13.957,5.08,0
2,1,1,205,144.341,290,102,0.966,8.932,0.0,0
3,1,2,172,124.233,158,106,1.145,4.155,0.0,0
4,1,3,348,133.793,270,98,0.583,4.208,0.0,0


In [9]:
for i in df_calc["cluster"].unique():
    print(f"Cluster {i}")
    df_curr = df_calc[df_calc["cluster"] == i]
    print("Mean num packets:", df_curr["num_packets"].mean())
    print("Mean avg packet length:", df_curr["avg_packet_length"].mean())
    print("Mean max packet length:", df_curr["max_packet_length"].mean())
    print("Mean min packet length:", df_curr["min_packet_length"].mean())
    print("Mean time gaps:", df_curr["time_gaps"].mean())
    print("Mean max time gaps:", df_curr["max_time_gaps"].mean())
    print("Mean min time gaps:", df_curr["min_time_gaps"].mean())
    

Cluster 0
Mean num packets: 1649.0628272251308
Mean avg packet length: 168.27608376963346
Mean max packet length: 370.42931937172773
Mean min packet length: 86.53403141361257
Mean time gaps: 2.94430890052356
Mean max time gaps: 11.59979057591623
Mean min time gaps: 0.6823141361256545
Cluster 2
Mean num packets: 4.162162162162162
Mean avg packet length: 82.0029009009009
Mean max packet length: 92.94594594594595
Mean min packet length: 78.25225225225225
Mean time gaps: 46.67015315315315
Mean max time gaps: 70.78353153153152
Mean min time gaps: 31.51102702702703
Cluster 1
Mean num packets: 26.666666666666668
Mean avg packet length: 81.34567901234568
Mean max packet length: 81.34567901234568
Mean min packet length: 81.34567901234568
Mean time gaps: 8.826061728395063
Mean max time gaps: 25.997876543209877
Mean min time gaps: 2.531925925925926


# Anomalies

In [10]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

## Isolation Forest

In [11]:
df = pd.read_csv("res_capture1_cluster.csv", index_col=0)
df.fillna(-1, inplace=True)
df_test = pd.read_csv("res_capture2_cluster.csv", index_col=0)
df_test.fillna(-1, inplace=True)

label_encoders = {}
for column in ["Source", "Destination"]:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    df_test[column] = label_encoders[column].fit_transform(df_test[column])

model = IsolationForest(contamination=0.1)
model.fit(df)

anomaly_scores = model.decision_function(df_test)
if_anomalies = df_test[anomaly_scores < 0]

print(f"Isolation forest à trouvé {len(if_anomalies)} anomalies dans le fichier capture2.csv")
print(f"La pourcentage d'anomalies est de {len(if_anomalies)/len(df_test)*100:.2f}%")
print("Les 6 premières anomalies sont:")
if_anomalies.head()

Isolation forest à trouvé 158 anomalies dans le fichier capture2.csv
La pourcentage d'anomalies est de 41.25%
Les 6 premières anomalies sont:


Unnamed: 0,Source,Destination,num_packets,avg_packet_length,max_packet_length,min_packet_length,time_gaps,max_time_gaps,min_time_gaps,cluster
1,0,46,6,342.0,342,342,10.158,13.957,5.08,0
6,1,7,11551,216.462,983,60,0.018,0.192,0.0,0
16,1,22,4097,134.824,286,118,0.049,3.601,0.0,0
27,1,52,19,158.526,571,60,10.015,141.036,0.001,2
29,1,54,62306,161.367,482,118,0.003,0.109,0.0,0


## Local Outlier Factor

In [12]:
df = pd.read_csv("res_capture1_cluster.csv", index_col=0)
df.fillna(-1, inplace=True)
df_test = pd.read_csv("res_capture2_cluster.csv", index_col=0)
df_test.fillna(-1, inplace=True)

label_encoders = {}
for column in ["Source", "Destination"]:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    df_test[column] = label_encoders[column].transform(df_test[column])

lof_model = LocalOutlierFactor(contamination=0.05, novelty=True)

lof_model.fit(df.values)
lof_scores = lof_model.predict(df_test)
lof_anomalies = df_test[lof_scores == -1]

print(f"LOF à trouvé {len(lof_anomalies)} anomalies")
print(f"La pourcentage d'anomalies est de {len(lof_anomalies)/len(df_test)*100:.2f}%")
print("Les 5 premières anomalies détecté par LOF:")
lof_anomalies.head()

LOF à trouvé 88 anomalies
La pourcentage d'anomalies est de 22.98%
Les 5 premières anomalies détecté par LOF:


Unnamed: 0,Source,Destination,num_packets,avg_packet_length,max_packet_length,min_packet_length,time_gaps,max_time_gaps,min_time_gaps,cluster
1,0,46,6,342.0,342,342,10.158,13.957,5.08,0
3,1,2,172,124.233,158,106,1.145,4.155,0.0,0
4,1,3,348,133.793,270,98,0.583,4.208,0.0,0
5,1,6,221,145.584,274,102,0.912,7.555,0.0,0
6,1,7,11551,216.462,983,60,0.018,0.192,0.0,0


## One-Class SVM

In [15]:
df = pd.read_csv("res_capture1_cluster.csv", index_col=0)
df.fillna(-1, inplace=True)
df_test = pd.read_csv("res_capture2_cluster.csv", index_col=0)
df_test.fillna(-1, inplace=True)

label_encoders = {}
for column in ["Source", "Destination"]:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    df_test[column] = label_encoders[column].transform(df_test[column])

svm_model = OneClassSVM(nu=0.1)

svm_model.fit(df)
svm_scores = svm_model.predict(df_test)
svm_anomalies = df_test[svm_scores == -1]

print(f"One-Class SVM à trouvé {len(svm_anomalies)} anomalies")
print(f"La pourcentage d'anomalies est de {len(svm_anomalies)/len(df_test)*100:.2f}%")
print("Les 5 premières anomalies détecté par SVM :")


One-Class SVM à trouvé 65 anomalies
La pourcentage d'anomalies est de 16.97%
Les 5 premières anomalies détecté par SVM :


# Mix des 3

In [14]:
df_concat = pd.concat([if_anomalies, lof_anomalies, svm_anomalies], ignore_index=True)

print(f"Nombre total d'anomalies: {len(df_concat)}")
duplicated = df_concat.duplicated()
print(f"nombre de duplicated : {len(duplicated[duplicated == True])}")

df_concat = df_concat.drop_duplicates()

print(f"Nombre total d'anomalies: {len(df_concat)} sur un total de {len(df_test)} flux")

Nombre total d'anomalies: 311
nombre de duplicated : 92
Nombre total d'anomalies: 219 sur un total de 383 flux


On se retrouve avec 224 anomalies sur 382 flux , ce qui est énorme. A voir ce qu'une anomalie veut vraiment dire.