In [1]:
# install packages
!pip install numpy pandas pyarrow scikit-learn fastcore kagglehub[pandas-datasets] jinja2 boto

fish: Unknown command: pip
fish: 
pip install numpy pandas pyarrow scikit-learn fastcore kagglehub[pandas-datasets] jinja2 boto
^~^


# Utility methods

In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score, get_scorer, get_scorer_names
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
drop_columns = [
    "id",
    "Flow ID",        
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
    "Attempted Category",
]

def xs_y(df, targ): 
    if not isinstance(targ, list):
        xs = df[df.columns.difference([targ])].copy()
    else:
        xs = df[df.columns.difference(targ)].copy()
    y = df[targ].copy()
    return xs, y

def get_cic_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "cic/" + file)    
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

def get_concap_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "concap/" + file)
    # clean the dataset
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df["Label"] = 1

    return df

def train_verify_one_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, feature: str):
    root = DecisionTreeClassifier(max_depth=1, criterion='gini')
    train_x, train_y = xs_y(train_df, "Label")
    test_x, test_y = xs_y(test_df, "Label")
    root.fit(train_x[feature].array.reshape(-1,1), train_y)

    predictions = root.predict(test_x[feature].array.reshape(-1,1))
    return feature, roc_auc_score(test_y, predictions), accuracy_score(test_y, predictions), precision_score(test_y, predictions), recall_score(test_y, predictions), f1_score(test_y, predictions)

def concap_cic_experiment(benign_df: pd.DataFrame, cic: pd.DataFrame, concap: pd.DataFrame):
    # Train on CIC => Predict ConCap
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)

    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    columns = ["Feature", "ROC AUC Score", "Accuracy", "Precision", "Recall", "F1 Score"]

    measurements_cic_concap = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"], 
                                           columns=columns
                                          )

    # Train on ConCap => Predict CIC
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    
    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    measurements_concap_cic = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                          columns=columns
                                          )
    
    
    return measurements_cic_concap, measurements_concap_cic

# Benign
Benign flows are used for balancing the training and testing sets

In [5]:
benign = get_cic_dataset("monday.csv")
benign["Label"] = 0
benign

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


# Tuesday

In [6]:
tuesday_cic = get_cic_dataset("tuesday.csv")

  df = kagglehub.load_dataset(


## Bruteforce FTP

In [7]:
cic_ftp = tuesday_cic[tuesday_cic["Label"] == "FTP-Patator"].copy()
cic_ftp["Label"] = 1
cic_ftp

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
4630,6,4008190,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4008190,1
4631,6,4018946,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4018946,1
4632,6,4067119,6,6,34,76,20,0,5.666667,8.981462,...,0,0,0.0,0.0,0,0,-1,-1,4067119,1
4633,6,4015015,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4015015,1
4634,6,4099543,6,6,31,76,17,0,5.166667,8.060190,...,0,0,0.0,0.0,0,0,-1,-1,4099543,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321766,6,9567017,11,17,135,188,34,0,12.272727,11.384998,...,0,0,0.0,0.0,0,0,-1,-1,9567017,1
321824,6,8627604,11,17,125,188,25,0,11.363636,9.871907,...,0,0,0.0,0.0,0,0,-1,-1,8627604,1
321857,6,8438103,11,17,116,188,20,0,10.545455,8.721968,...,0,0,0.0,0.0,0,0,-1,-1,8438103,1
321880,6,8958671,11,17,124,188,24,0,11.272727,9.654956,...,0,0,0.0,0.0,0,0,-1,-1,8958671,1


In [8]:
concap_ftp = get_concap_dataset("concap_ftp.csv")
concap_ftp.drop(columns=["label", "category", "subcategory"], inplace=True, errors='ignore')
concap_ftp["Label"] = 1
concap_ftp

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=concap/concap_ftp.csv...


100%|██████████████████████████████████████████████████████████████████████████| 3.96M/3.96M [00:00<00:00, 4.71MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,3029650,11,17,88.0,188.0,15.0,0.0,8.000000,6.496153,...,0,0,0,0,0,0,-1,-1,3029650,1
1,6,3034178,11,17,85.0,188.0,14.0,0.0,7.727273,6.198240,...,0,0,0,0,0,0,-1,-1,3034178,1
2,6,3032346,11,17,85.0,188.0,14.0,0.0,7.727273,6.214353,...,0,0,0,0,0,0,-1,-1,3032346,1
3,6,3032433,11,18,88.0,188.0,16.0,0.0,8.000000,6.511528,...,0,0,0,0,0,0,-1,-1,3032433,1
4,6,3027024,11,18,84.0,188.0,14.0,0.0,7.636364,6.152605,...,0,0,0,0,0,0,-1,-1,3027024,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,6,3235936,14,18,86.0,188.0,16.0,0.0,6.142857,6.502747,...,0,0,0,0,0,0,-1,-1,3235936,1
5696,6,3032999,11,17,88.0,188.0,17.0,0.0,8.000000,6.618157,...,0,0,0,0,0,0,-1,-1,3032999,1
5697,6,3034066,11,16,79.0,188.0,16.0,0.0,7.181818,5.980271,...,0,0,0,0,0,0,-1,-1,3034066,1
5698,6,3031003,11,17,87.0,188.0,15.0,0.0,7.909091,6.425800,...,0,0,0,0,0,0,-1,-1,3031003,1


In [9]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_ftp, concap_ftp)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
41,Packet Length Max,0.981621,0.984905,0.975026,1.0,0.987355
42,Packet Length Mean,0.981282,0.984595,0.974688,0.999825,0.987096
54,Average Packet Size,0.981282,0.984595,0.974688,0.999825,0.987096
55,Fwd Segment Size Avg,0.97923,0.98294,0.971867,1.0,0.985733
8,Fwd Packet Length Mean,0.97923,0.98294,0.971867,1.0,0.985733
64,Subflow Fwd Bytes,0.978978,0.982734,0.971536,1.0,0.985562
56,Bwd Segment Size Avg,0.976334,0.980562,0.968071,1.0,0.983776
12,Bwd Packet Length Mean,0.976334,0.980562,0.968071,1.0,0.983776
10,Bwd Packet Length Max,0.975579,0.979942,0.967085,1.0,0.983267
35,Bwd RST Flags,0.972814,0.968362,0.998337,0.947895,0.972462


In [11]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
53,Down/Up Ratio,0.992825,0.992825,0.989253,0.996475,0.992851
35,Bwd RST Flags,0.984768,0.984768,0.97508,0.994965,0.984922
42,Packet Length Mean,0.980488,0.980488,0.962442,1.0,0.980862
54,Average Packet Size,0.980488,0.980488,0.962442,1.0,0.980862
64,Subflow Fwd Bytes,0.976334,0.976334,0.959446,0.994713,0.976761
56,Bwd Segment Size Avg,0.976083,0.976083,0.954349,1.0,0.976641
12,Bwd Packet Length Mean,0.976083,0.976083,0.954349,1.0,0.976641
41,Packet Length Max,0.975453,0.975453,0.959825,0.992447,0.975863
10,Bwd Packet Length Max,0.974194,0.974194,0.950922,1.0,0.974844
8,Fwd Packet Length Mean,0.970796,0.970796,0.95766,0.985146,0.971209


## Bruteforce SSH

In [12]:
cic_ssh = tuesday_cic[tuesday_cic["Label"] == "SSH-Patator"].copy()
cic_ssh["Label"] = 1
cic_ssh

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
79299,6,4755497,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4755497,1
79300,6,4742052,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4742052,1
79301,6,1688206,10,10,1128,2009,640,0,112.800000,203.802083,...,0,0,0.0,0.0,0,0,-1,-1,1688206,1
79302,6,1884425,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1884425,1
79303,6,1937542,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1937542,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321777,6,11725520,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11725520,1
321784,6,11641016,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11641016,1
321785,6,12117686,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,12117686,1
321944,6,13640748,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,13640748,1


In [13]:
concap_ssh = get_concap_dataset("concap_ssh.csv")
concap_ssh.drop(columns=["label", "category", "subcategory"], inplace=True, errors='ignore')
concap_ssh["Label"] = 1
concap_ssh

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4660,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,4660,1
1,6,3557,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,3557,1
2,6,19511,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,19511,1
3,6,11027649,22,34,1928.0,2746.0,640.0,0.0,87.636364,137.780552,...,0,0,0,0,0,0,-1,-1,11027649,1
4,6,9870,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0,0,0,0,-1,-1,9870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7633,6,4487608,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4487608,1
7634,6,3367832,14,16,1272.0,2154.0,640.0,0.0,90.857143,174.415444,...,0,0,0,0,0,0,-1,-1,3367832,1
7635,6,4025378,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4025378,1
7636,6,3951,4,4,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,3951,1


In [14]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_ssh, concap_ssh)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
70,Fwd Seg Size Min,0.925101,0.957921,0.94527,0.999476,0.971618
67,FWD Init Win Bytes,0.890915,0.939051,0.922018,1.0,0.959427
24,Fwd IAT Min,0.861612,0.858666,0.943642,0.854936,0.897101
46,SYN Flag Count,0.813408,0.895745,0.873613,1.0,0.932544
7,Fwd Packet Length Min,0.795677,0.885838,0.863246,1.0,0.926604
40,Packet Length Min,0.794495,0.885178,0.862564,1.0,0.926211
56,Bwd Segment Size Avg,0.79409,0.838098,0.882954,0.89382,0.888354
12,Bwd Packet Length Mean,0.79409,0.838098,0.882954,0.89382,0.888354
0,Protocol,0.793144,0.884423,0.861785,1.0,0.925762
11,Bwd Packet Length Min,0.792131,0.883857,0.861202,1.0,0.925426


In [16]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
70,Fwd Seg Size Min,0.920128,0.920128,0.862682,0.999325,0.92599
24,Fwd IAT Min,0.845829,0.845829,0.800293,0.921648,0.856694
12,Bwd Packet Length Mean,0.841776,0.841776,0.766176,0.983789,0.861452
56,Bwd Segment Size Avg,0.841776,0.841776,0.766176,0.983789,0.861452
45,FIN Flag Count,0.815603,0.815603,0.731713,0.996623,0.843866
46,SYN Flag Count,0.813745,0.813745,0.728705,0.999662,0.842945
37,Bwd Header Length,0.813576,0.813576,0.728413,1.0,0.842869
30,Fwd PSH Flags,0.811212,0.811212,0.725913,1.0,0.841193
9,Fwd Packet Length Std,0.808342,0.808342,0.7229,1.0,0.839167
48,PSH Flag Count,0.807835,0.807835,0.722371,1.0,0.83881


# Wednesday

In [17]:
wednesday_cic = get_cic_dataset("wednesday.csv")

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=cic/wednesday.csv...


100%|████████████████████████████████████████████████████████████████████████████| 278M/278M [00:28<00:00, 10.3MB/s]


In [18]:
wednesday_cic["Label"].unique()

array(['BENIGN', 'DoS Slowloris', 'DoS Slowloris - Attempted',
       'DoS Slowhttptest', 'DoS Slowhttptest - Attempted', 'DoS Hulk',
       'DoS Hulk - Attempted', 'DoS GoldenEye', 'Heartbleed',
       'DoS GoldenEye - Attempted'], dtype=object)

## Slowloris

In [19]:
cic_slowloris = wednesday_cic[wednesday_cic["Label"] == "DoS Slowloris"].copy()
cic_slowloris["Label"] = 1
cic_slowloris

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
9593,6,17072865,4,3,239,0,231,0,59.75,114.228937,...,1415,1415,17071390.0,0.0,17071390,17071390,-1,-1,17072865,1
9594,6,17069963,4,3,239,0,231,0,59.75,114.228937,...,856,856,17069049.0,0.0,17069049,17069049,-1,-1,17069963,1
9595,6,17071303,4,3,239,0,231,0,59.75,114.228937,...,1002,1002,17070196.0,0.0,17070196,17070196,-1,-1,17071303,1
9596,6,17070583,4,3,239,0,231,0,59.75,114.228937,...,740,740,17069779.0,0.0,17069779,17069779,-1,-1,17070583,1
9597,6,17069147,4,3,239,0,231,0,59.75,114.228937,...,942,942,17068165.0,0.0,17068165,17068165,-1,-1,17069147,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31140,6,4603499,11,3,1155,483,231,0,105.00,120.635816,...,0,0,0.0,0.0,0,0,-1,-1,4603499,1
31172,6,4204862,11,3,1386,483,231,0,126.00,120.635816,...,0,0,0.0,0.0,0,0,-1,-1,4204862,1
31188,6,3202193,10,3,1386,483,231,0,138.60,119.287887,...,0,0,0.0,0.0,0,0,-1,-1,3202193,1
31204,6,2599709,10,3,1155,483,231,0,115.50,121.747690,...,0,0,0.0,0.0,0,0,-1,-1,2599709,1


In [20]:
concap_slowloris = get_concap_dataset("concap_slowloris.csv")
concap_slowloris

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=concap/concap_slowloris.csv...


100%|██████████████████████████████████████████████████████████████████████████| 1.32M/1.32M [00:00<00:00, 2.33MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,118926264,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118926264,1
1,6,118849273,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118849273,1
2,6,118886942,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118886942,1
3,6,118823677,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118823677,1
4,6,118644621,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118644621,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115,6,64981331,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7124966.0,7124966.0,1.928546e+07,1.296348e+07,33536459.0,8193397.0,-1,-1,64981331,1
2116,6,108873650,60,59,685.0,406.0,253.0,0.0,11.416667,31.795049,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,108873650,1
2117,6,64054899,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7221325.0,7221325.0,1.894452e+07,1.240389e+07,32514391.0,8190908.0,-1,-1,64054899,1
2118,6,107089854,59,58,677.0,406.0,253.0,0.0,11.474576,32.064781,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,107089854,1


In [21]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_slowloris, concap_slowloris)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [22]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
54,Average Packet Size,0.936153,0.932263,0.871051,0.949528,0.908599
42,Packet Length Mean,0.936153,0.932263,0.871051,0.949528,0.908599
14,Flow Bytes/s,0.922289,0.914367,0.832506,0.949528,0.887175
81,Total TCP Flow Time,0.918333,0.896638,0.777326,0.992925,0.871997
24,Fwd IAT Min,0.902284,0.891286,0.79213,0.940094,0.859793
19,Flow IAT Min,0.819038,0.768189,0.60546,0.993868,0.7525
67,FWD Init Win Bytes,0.796284,0.737582,0.574844,0.998113,0.729529
40,Packet Length Min,0.796191,0.736913,0.574059,1.0,0.7294
7,Fwd Packet Length Min,0.796191,0.736913,0.574059,1.0,0.7294
6,Fwd Packet Length Max,0.795204,0.841278,0.88293,0.636792,0.739929


In [23]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
10,Bwd Packet Length Max,0.939363,0.939363,0.950811,0.926665,0.938583
5,Total Length of Bwd Packet,0.939363,0.939363,0.950811,0.926665,0.938583
12,Bwd Packet Length Mean,0.938196,0.938196,0.948541,0.926665,0.937475
56,Bwd Segment Size Avg,0.938196,0.938196,0.948541,0.926665,0.937475
66,Subflow Bwd Bytes,0.937289,0.937289,0.946783,0.926665,0.936616
81,Total TCP Flow Time,0.908396,0.908396,0.880676,0.944804,0.911614
77,Idle Max,0.870821,0.870821,0.922872,0.809277,0.86235
75,Idle Mean,0.870044,0.870044,0.915842,0.814978,0.862471
38,Fwd Packets/s,0.842705,0.842705,0.834556,0.854885,0.844598
15,Flow Packets/s,0.837264,0.837264,0.824807,0.856439,0.840325


## Slowhttptest

In [24]:
cic_slowhttptest = wednesday_cic[wednesday_cic["Label"] == "DoS Slowhttptest"].copy()
cic_slowhttptest["Label"] = 1
cic_slowhttptest

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
17978,6,83400021,18,2,5200,0,520,0,288.888889,265.881196,...,7008529,377,1.164118e+07,6.049227e+06,20776227,5399131,-1,-1,83400021,1
17979,6,83385834,18,2,5200,0,520,0,288.888889,265.881196,...,7010520,364,1.163850e+07,6.086458e+06,20840094,5303271,-1,-1,83385834,1
17995,6,83385227,18,2,5200,0,520,0,288.888889,265.881196,...,7010191,549,1.163841e+07,6.086421e+06,20839833,5303148,-1,-1,83385227,1
17996,6,83430584,18,2,5200,0,520,0,288.888889,265.881196,...,7015028,549,1.164516e+07,6.050565e+06,20776068,5399130,-1,-1,83430584,1
18004,6,83426649,18,2,5200,0,520,0,288.888889,265.881196,...,7011426,724,1.164504e+07,6.050473e+06,20775800,5399051,-1,-1,83426649,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330256,6,31589183,13,2,1683,0,187,0,129.461538,89.831894,...,6533504,6533504,8.351745e+06,4.154461e+06,13103822,5407489,-1,-1,31589183,1
360465,6,31521274,13,2,1683,0,187,0,129.461538,89.831894,...,6541345,6541345,8.326465e+06,4.166749e+06,13087961,5347461,-1,-1,31521274,1
392236,6,31405313,13,2,1683,0,187,0,129.461538,89.831894,...,6541322,6541322,8.287807e+06,4.223231e+06,13103684,5215656,-1,-1,31405313,1
434805,6,31389292,7,2,935,0,187,0,133.571429,91.246657,...,1629605,1629605,2.975969e+07,0.000000e+00,29759687,29759687,-1,-1,31389292,1


In [25]:
concap_slowhttptest = get_concap_dataset("concap_slowhttptest.csv")
concap_slowhttptest

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=concap/concap_slowhttptest.csv...


100%|██████████████████████████████████████████████████████████████████████████| 1.31M/1.31M [00:00<00:00, 2.31MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,118697166,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118697166,1
1,6,118535863,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118535863,1
2,6,118513518,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118513518,1
3,6,118493155,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118493155,1
4,6,118374171,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118374171,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109,6,1038645,4,3,4.0,406.0,4.0,0.0,1.000000,2.000000,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,121389140,1
2110,6,118579235,64,62,697.0,406.0,461.0,0.0,10.890625,57.165068,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118579235,1
2111,6,64665907,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7069661.0,7069661.0,1.919875e+07,1.282431e+07,33279304.0,8187769.0,-1,-1,64665907,1
2112,6,113178113,63,61,689.0,406.0,461.0,0.0,10.936508,57.627523,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,113178113,1


In [26]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_slowhttptest, concap_slowhttptest)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
81,Total TCP Flow Time,0.919152,0.926291,0.886354,0.992901,0.936607
24,Fwd IAT Min,0.901236,0.903971,0.898856,0.929484,0.913913
14,Flow Bytes/s,0.896452,0.901895,0.878656,0.952674,0.914169
20,Fwd IAT Total,0.890688,0.896185,0.873854,0.947468,0.909173
1,Flow Duration,0.890401,0.895925,0.873473,0.947468,0.908967
10,Bwd Packet Length Max,0.794519,0.779133,0.943118,0.635589,0.759401
56,Bwd Segment Size Avg,0.794519,0.779133,0.943118,0.635589,0.759401
5,Total Length of Bwd Packet,0.794519,0.779133,0.943118,0.635589,0.759401
12,Bwd Packet Length Mean,0.794519,0.779133,0.943118,0.635589,0.759401
66,Subflow Bwd Bytes,0.793369,0.778095,0.940476,0.635589,0.758543


In [28]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
38,Fwd Packets/s,0.89569,0.89569,0.852174,0.957471,0.901759
15,Flow Packets/s,0.887644,0.887644,0.840141,0.957471,0.894977
39,Bwd Packets/s,0.885345,0.885345,0.833416,0.963218,0.893628
16,Flow IAT Mean,0.884483,0.884483,0.8335,0.96092,0.892686
21,Fwd IAT Mean,0.884195,0.884195,0.837797,0.952874,0.891638
81,Total TCP Flow Time,0.881034,0.881034,0.872891,0.891954,0.882319
24,Fwd IAT Min,0.865517,0.865517,0.863844,0.867816,0.865826
75,Idle Mean,0.863506,0.863506,0.920825,0.795402,0.853531
22,Fwd IAT Std,0.847701,0.847701,0.781657,0.964943,0.863683
5,Total Length of Bwd Packet,0.836782,0.836782,0.932792,0.725862,0.816419


## Hulk

In [29]:
cic_hulk = wednesday_cic[wednesday_cic["Label"] == "DoS Hulk"].copy()
cic_hulk["Label"] = 1
cic_hulk

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
33452,6,28504,9,9,348,11595,348,0,38.666667,116.000000,...,0,0,0.0,0.0,0,0,-1,-1,28504,1
33453,6,3466,6,6,382,11595,382,0,63.666667,155.950847,...,0,0,0.0,0.0,0,0,-1,-1,3466,1
33454,6,4212,6,6,372,11595,372,0,62.000000,151.868364,...,0,0,0.0,0.0,0,0,-1,-1,4212,1
33455,6,12080,7,7,349,11595,349,0,49.857143,131.909601,...,0,0,0.0,0.0,0,0,-1,-1,12080,1
33456,6,12005,6,5,371,11595,371,0,61.833333,151.460116,...,0,0,0.0,0.0,0,0,-1,-1,12005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496517,6,173006,7,7,343,11595,343,0,49.000000,129.641814,...,0,0,0.0,0.0,0,0,-1,-1,173006,1
496524,6,179441,8,4,346,11595,346,0,43.250000,122.329473,...,0,0,0.0,0.0,0,0,-1,-1,179441,1
496561,6,189968,8,8,353,11595,353,0,44.125000,124.804347,...,0,0,0.0,0.0,0,0,-1,-1,189968,1
496569,6,656477,11,5,1041,11595,347,0,94.636364,162.083480,...,0,0,0.0,0.0,0,0,-1,-1,656477,1


In [30]:
concap_hulk = get_concap_dataset("concap_hulk.csv")
concap_hulk

  df = kagglehub.load_dataset(


KaggleApiHTTPError: 404 Client Error.

Resource not found at URL: https://www.kaggle.com/datasets/jozefjankaj/thesis-files/versions/14
The server reported the following issues: Data not found
Please make sure you specified the correct resource identifiers.

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_hulk, concap_hulk)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

## GoldenEye

In [None]:
cic_goldeneye = wednesday_cic[wednesday_cic["Label"] == "DoS GoldenEye"].copy()
cic_goldeneye["Label"] = 1
cic_goldeneye

In [None]:
concap_goldeneye = get_concap_dataset("concap_goldeneye.csv")
concap_goldeneye

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_goldeneye, concap_goldeneye)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

## Heartbleed

In [None]:
cic_heartbleed = wednesday_cic[wednesday_cic["Label"] == "Heartbleed"].copy()
cic_heartbleed["Label"] = 1
cic_heartbleed

In [None]:
concap_heartbleed = get_concap_dataset("concap_heartbleed.csv")
concap_heartbleed

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_heartbleed, concap_heartbleed)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

# Thursday

In [None]:
thursday_cic = get_cic_dataset("thursday.csv")
thursday_cic["Label"].unique()

In [None]:
thursday_cic[thursday_cic["Label"] == "Web Attack - XSS"]

## Web Attack - Bruteforce

In [None]:
cic_bruteforce = thursday_cic[thursday_cic["Label"] == "Web Attack - Brute Force"].copy()
cic_bruteforce["Label"] = 1
cic_bruteforce

In [None]:
concap_bruteforce = get_concap_dataset("concap_web_bruteforce.csv")
concap_bruteforce

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_bruteforce, concap_bruteforce)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

## Web Attack - SQL Injection

In [None]:
cic_sqli = thursday_cic[thursday_cic["Label"] == "Web Attack - SQL Injection"].copy()
cic_sqli["Label"] = 1
cic_sqli

## Web Attack - Cross-Site Scripting

In [None]:
cic_xss = thursday_cic[thursday_cic["Label"] == "Web Attack - XSS"].copy()
cic_xss["Label"] = 1
cic_xss

# Friday

In [None]:
friday_cic = get_cic_dataset("friday.csv")

In [None]:
friday_cic["Label"].unique()

## LOIC

In [None]:
cic_loic = friday_cic[friday_cic["Label"] == "DDoS"].copy()
cic_loic["Label"] = 1
cic_loic

In [None]:
concap_loic = get_concap_dataset("concap_loic.csv")
concap_loic.drop(columns=["target", "port", "service"], inplace=True, errors='ignore')
concap_loic

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_bruteforce, concap_bruteforce)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

## Portscan

In [None]:
cic_portscan = friday_cic[friday_cic["Label"] == "Portscan"].copy()
cic_portscan["Label"] = 1
cic_portscan

In [None]:
concap_portscan = get_concap_dataset("concap_portscan.csv")
concap_portscan

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_portscan, concap_portscan)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)