# Demo 1: typical ML pipeline

## Data preprocessing: CICFlowMeter

## Data cleaning

In [1]:
import pandas as pd

In [2]:
df_youtube = pd.read_csv("./capture_youtube.pcap_Flow.csv")
df_vimeo = pd.read_csv("./capture_vimeo.pcap_Flow.csv")

print(df_youtube.columns)   # these are all columns that CICFlowMeter uses

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Len

In [3]:
df_youtube.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,172.18.0.10-142.250.190.150-40062-443-6,172.18.0.10,40062,142.250.190.150,443,6,08/09/2023 10:23:59 AM,77864,7,5,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NeedManualLabel
1,172.18.0.10-142.250.191.246-41358-443-6,172.18.0.10,41358,142.250.191.246,443,6,08/09/2023 10:23:59 AM,102287,7,6,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NeedManualLabel
2,172.18.0.10-142.250.190.142-39258-443-6,172.18.0.10,39258,142.250.190.142,443,6,08/09/2023 10:24:14 AM,67573,7,5,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NeedManualLabel
3,172.18.0.10-142.250.190.142-39248-443-6,172.18.0.10,39248,142.250.190.142,443,6,08/09/2023 10:24:14 AM,81246,7,5,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NeedManualLabel
4,172.18.0.10-172.18.0.6-40562-26512-6,172.18.0.10,40562,172.18.0.6,26512,6,08/09/2023 10:24:18 AM,15813,6,5,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NeedManualLabel


In [4]:
df_youtube['Label'] = 'other'
df_youtube.loc[(df_youtube['Total Fwd Packet'] > 30) | (df_youtube['Total Bwd packets'] > 30), 'Label'] = 'youtube'  
df_youtube = df_youtube.drop(df_youtube[(df_youtube['Protocol'] == 17) & (df_youtube['Label'] != 'youtube')].index)

In [5]:
df_vimeo['Label'] = 'other'
df_vimeo.loc[(df_vimeo['Total Fwd Packet'] > 30) | (df_vimeo['Total Bwd packets'] > 30), 'Label'] = 'vimeo'
df_vimeo = df_vimeo.drop(df_vimeo[(df_vimeo['Protocol'] == 17) & (df_vimeo['Label'] != 'vimeo')].index)

In [6]:
df = pd.concat([df_youtube, df_vimeo], ignore_index=True)
df = df.drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp'], axis=1)
df = df.dropna()  # remove rows with Nones

## Model training

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [8]:
target_variable = 'Label'
train_features = list(set(df.columns) - {target_variable})
X = df[train_features]
y = df[target_variable]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
classifiers = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(),
]

In [11]:
for clf in classifiers:
    clf.fit(X_train, y_train)

In [12]:
for clf in classifiers:
    print(clf)
    y_pred = clf.predict(X_train.values)
    print(metrics.classification_report(y_train, y_pred))
    print()

RandomForestClassifier()
              precision    recall  f1-score   support

       other       1.00      1.00      1.00      2071
       vimeo       1.00      1.00      1.00       187
     youtube       1.00      1.00      1.00       228

    accuracy                           1.00      2486
   macro avg       1.00      1.00      1.00      2486
weighted avg       1.00      1.00      1.00      2486


GradientBoostingClassifier()
              precision    recall  f1-score   support

       other       1.00      1.00      1.00      2071
       vimeo       1.00      1.00      1.00       187
     youtube       1.00      1.00      1.00       228

    accuracy                           1.00      2486
   macro avg       1.00      1.00      1.00      2486
weighted avg       1.00      1.00      1.00      2486


MLPClassifier()
              precision    recall  f1-score   support

       other       0.97      1.00      0.98      2071
       vimeo       0.96      0.70      0.81       187
   



In [13]:
for clf in classifiers:
    print(clf)
    y_pred = clf.predict(X_test.values)
    print(metrics.classification_report(y_test, y_pred))
    print()

RandomForestClassifier()
              precision    recall  f1-score   support

       other       1.00      1.00      1.00       508
       vimeo       1.00      0.98      0.99        57
     youtube       1.00      1.00      1.00        57

    accuracy                           1.00       622
   macro avg       1.00      0.99      1.00       622
weighted avg       1.00      1.00      1.00       622


GradientBoostingClassifier()




              precision    recall  f1-score   support

       other       1.00      1.00      1.00       508
       vimeo       1.00      1.00      1.00        57
     youtube       1.00      1.00      1.00        57

    accuracy                           1.00       622
   macro avg       1.00      1.00      1.00       622
weighted avg       1.00      1.00      1.00       622


MLPClassifier()
              precision    recall  f1-score   support

       other       0.96      1.00      0.98       508
       vimeo       0.95      0.72      0.82        57
     youtube       1.00      0.89      0.94        57

    accuracy                           0.96       622
   macro avg       0.97      0.87      0.92       622
weighted avg       0.96      0.96      0.96       622




