In [1]:
# install packages
!pip install numpy pandas pyarrow scikit-learn fastcore kagglehub[pandas-datasets] jinja2 boto


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [49]:
drop_columns = [
    "id",
    "Flow ID",        
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
    "Attempted Category",
]

def xs_y(df, targ): 
    if not isinstance(targ, list):
        xs = df[df.columns.difference([targ])].copy()
    else:
        xs = df[df.columns.difference(targ)].copy()
    y = df[targ].copy()
    return xs, y


def get_concap_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "concap/" + file)
    # clean the dataset
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df["Label"] = 1

    return df

def get_cic_2018_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "cic_2018/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

def get_cic_2017_dataset(file):
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "jozefjankaj/thesis-files",
        "cic/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df


def train_verify_one_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, feature: str):
    root = DecisionTreeClassifier(max_depth=1, criterion='gini')
    train_x, train_y = xs_y(train_df, "Label")
    test_x, test_y = xs_y(test_df, "Label")
    root.fit(train_x[feature].array.reshape(-1,1), train_y)

    predictions = root.predict(test_x[feature].array.reshape(-1,1))
    return feature, roc_auc_score(test_y, predictions), accuracy_score(test_y, predictions), precision_score(test_y, predictions), recall_score(test_y, predictions)

def concap_cic_experiment(benign_df: pd.DataFrame, cic: pd.DataFrame, concap: pd.DataFrame):
    # Train on CIC => Predict ConCap
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)

    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    columns = ["Feature", "ROC AUC Score", "Accuracy", "Precision", "Recall"]

    measurements_cic_concap = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                           columns=columns
                                          )


    # Train on ConCap => Predict CIC
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    
    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    measurements_concap_cic = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                          columns=columns
                                          )
    return measurements_cic_concap, measurements_concap_cic

# SSH Bruteforce

In [3]:
bruteforce = get_cic_2018_dataset("Wednesday-14-02-2018.csv")

  df = kagglehub.load_dataset(


In [4]:
bruteforce

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,115320241,17,15,901,1942,445,0,53.000000,147.544061,...,5344243,10175,9.987594e+06,5.816100e+04,10005167,9812232,-1,-1,115320241,BENIGN
1,6,116784235,27,21,3498,6304,436,0,129.555556,202.635773,...,8385364,11242,9.838241e+06,4.771012e+05,10004111,8408741,-1,-1,116784235,BENIGN
2,6,115867108,23,19,2626,4569,436,0,114.173913,195.490677,...,7780718,11225,9.808635e+06,5.990057e+05,10004152,8004832,-1,-1,115867108,BENIGN
3,6,119894757,17,16,597,768,293,0,35.117647,97.064336,...,115967,32245,9.944064e+06,2.342149e+05,10046602,9203252,-1,-1,119894757,BENIGN
4,6,112960508,57,90,9101,27113,1047,0,159.666667,274.657666,...,4316517,235349,2.694478e+07,2.276214e+07,58084562,9758551,-1,-1,112960508,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5898301,17,655,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,BENIGN
5898306,6,3,1,1,0,0,0,0,0.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,3,BENIGN
5898328,17,720,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,BENIGN
5898335,6,358224,5,6,2752,1408,1168,0,550.400000,547.120462,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,28135919901,BENIGN


In [5]:
benign = bruteforce[bruteforce["Label"] == "BENIGN"].copy()
benign["Label"] = 0
benign

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,115320241,17,15,901,1942,445,0,53.000000,147.544061,...,5344243,10175,9.987594e+06,5.816100e+04,10005167,9812232,-1,-1,115320241,0
1,6,116784235,27,21,3498,6304,436,0,129.555556,202.635773,...,8385364,11242,9.838241e+06,4.771012e+05,10004111,8408741,-1,-1,116784235,0
2,6,115867108,23,19,2626,4569,436,0,114.173913,195.490677,...,7780718,11225,9.808635e+06,5.990057e+05,10004152,8004832,-1,-1,115867108,0
3,6,119894757,17,16,597,768,293,0,35.117647,97.064336,...,115967,32245,9.944064e+06,2.342149e+05,10046602,9203252,-1,-1,119894757,0
4,6,112960508,57,90,9101,27113,1047,0,159.666667,274.657666,...,4316517,235349,2.694478e+07,2.276214e+07,58084562,9758551,-1,-1,112960508,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5898301,17,655,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,0
5898306,6,3,1,1,0,0,0,0,0.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,3,0
5898328,17,720,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,0
5898335,6,358224,5,6,2752,1408,1168,0,550.400000,547.120462,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,28135919901,0


In [6]:
bruteforce["Label"].unique()

array(['BENIGN', 'FTP-BruteForce - Attempted', 'SSH-BruteForce'],
      dtype=object)

In [7]:
cic_ssh = bruteforce[bruteforce["Label"] == "SSH-BruteForce"].copy()
cic_ssh["Label"] = 1
cic_ssh

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
5759575,6,364702,23,22,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,364702,1
5759577,6,393085,23,21,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,393085,1
5759582,6,397310,23,23,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,397310,1
5759583,6,372143,23,23,1944,2665,640,0,84.521739,135.953491,...,0,0,0.0,0.0,0,0,-1,-1,372143,1
5759584,6,369389,22,23,1928,2665,640,0,87.636364,137.780552,...,0,0,0.0,0.0,0,0,-1,-1,369389,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5894491,6,379886,23,21,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,379886,1
5894492,6,388358,23,23,1928,2665,640,0,83.826087,135.847392,...,0,0,0.0,0.0,0,0,-1,-1,388358,1
5894493,6,374336,25,21,1912,2665,640,0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,374336,1
5894494,6,371640,25,21,1912,2665,640,0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,371640,1


In [8]:
concap_ssh = get_concap_dataset("concap_ssh.csv")
concap_ssh

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4660,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,4660,1
1,6,3557,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,3557,1
2,6,19511,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,19511,1
3,6,11027649,22,34,1928.0,2746.0,640.0,0.0,87.636364,137.780552,...,0,0,0,0,0,0,-1,-1,11027649,1
4,6,9870,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0,0,0,0,-1,-1,9870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7633,6,4487608,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4487608,1
7634,6,3367832,14,16,1272.0,2154.0,640.0,0.0,90.857143,174.415444,...,0,0,0,0,0,0,-1,-1,3367832,1
7635,6,4025378,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4025378,1
7636,6,3951,4,4,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,3951,1


In [9]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_ssh, concap_ssh)

In [10]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
70,Fwd Seg Size Min,0.994674
67,FWD Init Win Bytes,0.987924
24,Fwd IAT Min,0.783406
29,Bwd IAT Min,0.735475
12,Bwd Packet Length Mean,0.723388
56,Bwd Segment Size Avg,0.723388
66,Subflow Bwd Bytes,0.697357
46,SYN Flag Count,0.692856
16,Flow IAT Mean,0.674432
45,FIN Flag Count,0.667415


In [11]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
30,Fwd PSH Flags,0.996242
70,Fwd Seg Size Min,0.99517
48,PSH Flag Count,0.99509
31,Bwd PSH Flags,0.992988
69,Fwd Act Data Pkts,0.991268
36,Fwd Header Length,0.986035
2,Total Fwd Packet,0.5
4,Total Length of Fwd Packet,0.5
9,Fwd Packet Length Std,0.5
10,Bwd Packet Length Max,0.5


# CIC-IDS-2017 + ConCap vs CSE-CIC-2018

In [6]:
benign = get_cic_2017_dataset("monday.csv")
benign["Label"] = 0
benign

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


In [7]:
tuesday = get_cic_2017_dataset("tuesday.csv")

  df = kagglehub.load_dataset(


In [8]:
tuesday["Label"].unique()

array(['BENIGN', 'FTP-Patator - Attempted', 'FTP-Patator', 'SSH-Patator',
       'SSH-Patator - Attempted'], dtype=object)

In [9]:
cic_2017_ssh = tuesday[tuesday["Label"] == "SSH-Patator"].copy()
cic_2017_ssh["Label"] = 1
cic_2017_ssh

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
79299,6,4755497,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4755497,1
79300,6,4742052,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4742052,1
79301,6,1688206,10,10,1128,2009,640,0,112.800000,203.802083,...,0,0,0.0,0.0,0,0,-1,-1,1688206,1
79302,6,1884425,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1884425,1
79303,6,1937542,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1937542,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321777,6,11725520,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11725520,1
321784,6,11641016,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11641016,1
321785,6,12117686,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,12117686,1
321944,6,13640748,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,13640748,1


In [10]:
concap_ssh = get_concap_dataset("concap_ssh.csv")
concap_ssh

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4660,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,4660,1
1,6,3557,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,3557,1
2,6,19511,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,19511,1
3,6,11027649,22,34,1928.0,2746.0,640.0,0.0,87.636364,137.780552,...,0,0,0,0,0,0,-1,-1,11027649,1
4,6,9870,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0,0,0,0,-1,-1,9870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7633,6,4487608,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4487608,1
7634,6,3367832,14,16,1272.0,2154.0,640.0,0.0,90.857143,174.415444,...,0,0,0,0,0,0,-1,-1,3367832,1
7635,6,4025378,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4025378,1
7636,6,3951,4,4,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,3951,1


Put the two datasets together and mix them

In [11]:
concap_cic_2017 = pd.concat([cic_2017_ssh, concap_ssh]).sample(frac=1).reset_index(drop=True)
concap_cic_2017

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,12676227,23,34,1912.0,2746.0,640.0,0.0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,12676227,1
1,6,2252,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0.0,0.0,0,0,-1,-1,2252,1
2,6,18814,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0.0,0.0,0,0,-1,-1,18814,1
3,6,15718,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0.0,0.0,0,0,-1,-1,15718,1
4,6,11774056,22,33,2008.0,2745.0,640.0,0.0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,11774056,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10594,6,2633,3,4,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0.0,0.0,0,0,-1,-1,2633,1
10595,6,2363,3,4,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0.0,0.0,0,0,-1,-1,2363,1
10596,6,5173,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0.0,0.0,0,0,-1,-1,5173,1
10597,6,11595415,23,34,2008.0,2745.0,640.0,0.0,87.304348,136.339961,...,0,0,0.0,0.0,0,0,-1,-1,11595415,1


Grab the CIC 2018 SSH attack

In [12]:
# cic_2018 = get_cic_2018_dataset("Wednesday-14-02-2018.csv")
# cic_2018_ssh = cic_2018[cic_2018["Label"] == "SSH-BruteForce"].copy()
cic_2018_ssh = get_cic_2018_dataset("cic_2018_ssh.csv")
cic_2018_ssh.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
cic_2018_ssh["Label"] = 1
cic_2018_ssh

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,364702,23,22,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,364702,1
1,6,393085,23,21,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,393085,1
2,6,397310,23,23,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,397310,1
3,6,372143,23,23,1944,2665,640,0,84.521739,135.953491,...,0,0,0.0,0.0,0,0,-1,-1,372143,1
4,6,369389,22,23,1928,2665,640,0,87.636364,137.780552,...,0,0,0.0,0.0,0,0,-1,-1,369389,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94192,6,379886,23,21,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,379886,1
94193,6,388358,23,23,1928,2665,640,0,83.826087,135.847392,...,0,0,0.0,0.0,0,0,-1,-1,388358,1
94194,6,374336,25,21,1912,2665,640,0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,374336,1
94195,6,371640,25,21,1912,2665,640,0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,371640,1


In [16]:
cic_2018_ssh.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd RST Flags',
       'Bwd RST Flags', 'Fwd Header Length', 'Bwd Header Length',
       'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min',
       'Packet Length Max', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'FIN Flag Count', '

In [13]:
result_separate, _ = concap_cic_experiment(benign, cic_2017_ssh, cic_2018_ssh)

In [14]:
result_separate.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
30,Fwd PSH Flags,0.994259
48,PSH Flag Count,0.992908
69,Fwd Act Data Pkts,0.991895
31,Bwd PSH Flags,0.988855
36,Fwd Header Length,0.970449
2,Total Fwd Packet,0.959473
6,Fwd Packet Length Max,0.937015
70,Fwd Seg Size Min,0.921648
8,Fwd Packet Length Mean,0.890365
55,Fwd Segment Size Avg,0.890365


In [20]:
result_together, _ = concap_cic_experiment(benign, concap_cic_2017, cic_2018_ssh)

In [21]:
result_together.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
69,Fwd Act Data Pkts,0.990235
31,Bwd PSH Flags,0.988584
4,Total Length of Fwd Packet,0.965563
6,Fwd Packet Length Max,0.944051
70,Fwd Seg Size Min,0.9232
67,FWD Init Win Bytes,0.891358
24,Fwd IAT Min,0.885499
37,Bwd Header Length,0.815596
45,FIN Flag Count,0.814935
46,SYN Flag Count,0.814228


In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = ""

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "dhoogla/nfuqnids",
  "NF-UQ-NIDS.parquet",
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)


df

  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/dhoogla/nfuqnids?dataset_version_number=2&file_name=NF-UQ-NIDS.parquet...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78.9M/78.9M [00:11<00:00, 6.95MB/s]

Extracting zip of NF-UQ-NIDS.parquet...





Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack,Dataset
0,62073,56082,6,0.000000,9672,416,11,8,25,15,0,Benign,NF-UNSW-NB15
1,32284,1526,6,0.000000,1776,104,6,2,25,0,0,Benign,NF-UNSW-NB15
2,21,21971,6,1.000000,1842,1236,26,22,25,1111,0,Benign,NF-UNSW-NB15
3,23800,46893,6,0.000000,528,8824,10,12,27,124,0,Benign,NF-UNSW-NB15
4,63062,21,6,1.000000,1786,2340,32,34,25,1459,0,Benign,NF-UNSW-NB15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9156320,80,80,6,7.000000,2330065,0,2523,0,0,4263037,0,Benign,NF-BoT-IoT
9156321,0,0,6,0.000000,1054423,0,1513,0,0,4263062,0,Benign,NF-BoT-IoT
9156322,365,565,17,0.000000,62422,0,1357,0,0,4263062,0,Benign,NF-BoT-IoT
9156323,50850,8883,6,222.177994,11300,1664,32,32,24,4264935,0,Benign,NF-BoT-IoT


In [2]:
df["Attack"].unique()

array(['Benign', 'Exploits', 'Reconnaissance', 'DoS', 'Generic',
       'Shellcode', 'Backdoor', 'Fuzzers', 'Worms', 'Analysis',
       'injection', 'DDoS', 'scanning', 'password', 'mitm', 'xss',
       'ransomware', 'Infilteration', 'Bot', 'Brute Force', 'Theft'],
      dtype=object)

# DoS - GoldenEye

In [50]:
benign = get_cic_2017_dataset("monday.csv")
benign["Label"] = 0
benign

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


In [51]:
thursday_15 = get_cic_2018_dataset("Thursday-15-02-2018.csv")
thursday_15

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,115574907,19,16,890,1152,293,0,46.842105,109.511168,...,5283677,30094,9.996242e+06,14965.083129,10000886,9951125,-1,-1,115574907,BENIGN
1,6,115697563,17,18,873,3930,431,0,51.352941,142.894603,...,5179919,40203,1.000677e+07,1571.878818,10011500,10006162,-1,-1,115697563,BENIGN
2,6,115944900,24,26,5634,6450,1475,0,234.750000,502.323666,...,5383445,48809,1.000215e+07,37077.422347,10013419,9890360,-1,-1,115944900,BENIGN
3,6,116791783,19,19,1304,5895,431,0,68.631579,161.210425,...,6278667,40563,1.000604e+07,206.302911,10006644,10005909,-1,-1,116791783,BENIGN
4,6,62400433,12,11,862,3812,283,0,71.833333,98.071248,...,3969013,3969013,5.833802e+07,0.000000,58338019,58338019,-1,-1,62400433,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5410079,6,1229970,2,2,0,0,0,0,0.000000,0.000000,...,0,0,0.000000e+00,0.000000,0,0,-1,-1,1229970,BENIGN
5410081,6,18,1,1,0,0,0,0,0.000000,0.000000,...,0,0,0.000000e+00,0.000000,0,0,-1,-1,27549377914,BENIGN
5410088,17,719,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000,0,0,-1,-1,0,BENIGN
5410099,17,653,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000,0,0,-1,-1,0,BENIGN


In [52]:
thursday_15["Label"].unique()

array(['BENIGN', 'DoS GoldenEye', 'DoS GoldenEye - Attempted',
       'DoS Slowloris', 'DoS Slowloris - Attempted'], dtype=object)

In [53]:
cic_2018_goldeneye = thursday_15[thursday_15["Label"] == "DoS GoldenEye"].copy()
cic_2018_goldeneye["Label"] = 1
cic_2018_goldeneye

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
5368573,6,11773134,5,4,383,662,383,0,76.600000,171.282807,...,386,283,5864043.0,1.219301e+06,6726219,5001867,-1,-1,11773134,1
5368574,6,11773739,5,4,336,662,336,0,67.200000,150.263768,...,305,287,5864304.5,1.218995e+06,6726264,5002345,-1,-1,11773739,1
5368575,6,11770111,5,4,531,972,531,0,106.200000,237.470419,...,314,292,5864485.0,1.218635e+06,6726190,5002780,-1,-1,11770111,1
5368576,6,11770815,5,4,323,972,323,0,64.600000,144.449991,...,316,291,5864795.5,1.218425e+06,6726352,5003239,-1,-1,11770815,1
5368577,6,12795112,6,4,436,972,436,0,72.666667,177.996255,...,1024023,290,5865021.0,1.218205e+06,6726422,5003620,-1,-1,12795112,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5407564,6,14074149,6,4,493,972,493,0,82.166667,201.266407,...,67524,44012,6981306.5,2.795989e+06,8958369,5004244,-1,-1,14074149,1
5407565,6,13903030,9,4,388,972,388,0,43.111111,129.333333,...,7081965,7081965,5003902.0,0.000000e+00,5003902,5003902,-1,-1,13903030,1
5407566,6,6950443,6,4,323,662,323,0,53.833333,131.864198,...,92102,92102,5004506.0,0.000000e+00,5004506,5004506,-1,-1,6950443,1
5407569,6,57041006,6,4,388,972,388,0,64.666667,158.400337,...,43396,5049,28496280.5,3.322274e+07,51988307,5004254,-1,-1,57041006,1


In [54]:
cic_2017 = get_cic_2017_dataset("wednesday.csv")

  df = kagglehub.load_dataset(


In [55]:
cic_2017["Label"].unique()

array(['BENIGN', 'DoS Slowloris', 'DoS Slowloris - Attempted',
       'DoS Slowhttptest', 'DoS Slowhttptest - Attempted', 'DoS Hulk',
       'DoS Hulk - Attempted', 'DoS GoldenEye', 'Heartbleed',
       'DoS GoldenEye - Attempted'], dtype=object)

In [56]:
cic_2017_goldeneye = cic_2017[cic_2017["Label"] == "DoS GoldenEye"].copy()
cic_2017_goldeneye["Label"] = 1

In [57]:
result_2017, _ = concap_cic_experiment(benign, cic_2017_goldeneye, cic_2018_goldeneye)
result_2017.sort_values("ROC AUC Score", ascending=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall
70,Fwd Seg Size Min,0.920554,0.960069,0.949413,0.999956
27,Bwd IAT Std,0.887912,0.936635,0.933317,0.985816
67,FWD Init Win Bytes,0.885203,0.942311,0.928507,0.999956
22,Fwd IAT Std,0.882824,0.937498,0.928752,0.992686
47,RST Flag Count,0.882408,0.881236,0.957927,0.880053
...,...,...,...,...,...
66,Subflow Bwd Bytes,0.460552,0.231354,0.000000,0.000000
54,Average Packet Size,0.459045,0.232582,0.121622,0.003989
42,Packet Length Mean,0.459045,0.232582,0.121622,0.003989
5,Total Length of Bwd Packet,0.398573,0.200219,0.000000,0.000000


In [58]:
concap_goldeneye = get_concap_dataset("concap_goldeneye.csv")
concap_goldeneye

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,13783803,10,11,1200.0,11610.0,300.0,0.0,120.000000,154.919334,...,853356.0,853356.0,1.289990e+07,0.000000e+00,12899901.0,12899901.0,-1,-1,13783803,1
1,6,56835564,9,6,500.0,11610.0,500.0,0.0,55.555556,166.666667,...,5046952.0,4878.0,2.589187e+07,1.734867e+07,38159227.0,13624507.0,-1,-1,56835564,1
2,6,25057619,9,7,404.0,11610.0,404.0,0.0,44.888889,134.666667,...,5047163.0,1704.0,1.000438e+07,7.025186e+06,14971933.0,5036819.0,-1,-1,25057619,1
3,6,38433193,10,6,512.0,11610.0,512.0,0.0,51.200000,161.908616,...,5045362.0,6431.0,1.669070e+07,4.830814e+06,20106601.0,13274799.0,-1,-1,38433193,1
4,6,5703764,9,8,328.0,11610.0,328.0,0.0,36.444444,109.333333,...,843.0,843.0,5.562874e+06,0.000000e+00,5562874.0,5562874.0,-1,-1,5703764,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,6,35282270,11,7,414.0,11610.0,414.0,0.0,37.636364,124.825697,...,1015225.0,1608.0,1.140679e+07,5.741702e+06,16095574.0,5003009.0,-1,-1,35282270,1
4658,6,75378643,8,6,509.0,11610.0,509.0,0.0,63.625000,179.958676,...,44432.0,2646.0,2.510878e+07,2.733661e+07,56235375.0,5002188.0,-1,-1,75378643,1
4659,6,97824485,15,12,2440.0,11610.0,610.0,0.0,162.666667,279.220002,...,5046710.0,854857.0,4.596146e+07,4.560548e+07,78209401.0,13713517.0,-1,-1,97824485,1
4660,6,88655311,9,6,363.0,11610.0,363.0,0.0,40.333333,121.000000,...,45612.0,3088.0,2.953409e+07,3.548058e+07,70216725.0,5001881.0,-1,-1,88655311,1


In [59]:
mixed = pd.concat([cic_2017_goldeneye, concap_goldeneye]).sample(frac=1).reset_index(drop=True)
mixed

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,11483489,9,4,317.0,11632.0,317.0,0.0,35.222222,105.666667,...,2411.0,892.0,5.720693e+06,1.018237e+06,6440695.0,5000691.0,-1,-1,11483489,1
1,6,5043762,9,6,495.0,11632.0,495.0,0.0,55.000000,165.000000,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,5043762,1
2,6,37754078,14,6,2720.0,11632.0,680.0,0.0,194.285714,318.788917,...,813939.0,4254.0,1.229854e+07,1.200357e+07,26152456.0,5000673.0,-1,-1,37754078,1
3,6,74347193,8,6,424.0,11610.0,424.0,0.0,53.000000,149.906638,...,5045862.0,3538.0,3.464890e+07,2.784725e+07,54339874.0,14957919.0,-1,-1,74347193,1
4,6,71221989,10,7,697.0,11610.0,697.0,0.0,69.700000,220.410753,...,1010165.0,1313.0,2.338884e+07,2.375518e+07,50210014.0,5000572.0,-1,-1,71221989,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12214,6,81413972,9,6,362.0,11610.0,362.0,0.0,40.222222,120.666667,...,44668.0,1724.0,2.712177e+07,3.146775e+07,63146659.0,5001390.0,-1,-1,81413972,1
12215,6,52799801,9,6,367.0,11610.0,367.0,0.0,40.777778,122.333333,...,42538.0,3679.0,1.758315e+07,1.497627e+07,34148073.0,5000557.0,-1,-1,52799801,1
12216,6,12754523,9,5,417.0,11424.0,417.0,0.0,46.333333,139.000000,...,1008367.0,1008367.0,1.171486e+07,0.000000e+00,11714858.0,11714858.0,-1,-1,12754523,1
12217,6,21977590,10,6,446.0,11610.0,446.0,0.0,44.600000,141.037584,...,2703.0,1872.0,9.895838e+06,6.920810e+06,14789590.0,5002087.0,-1,-1,21977590,1


In [60]:
result_mixed, _ = concap_cic_experiment(benign, mixed, cic_2018_goldeneye)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [61]:
result.sort_values("ROC AUC Score", ascending=False).head(20)

Unnamed: 0,Feature,ROC AUC Score
70,Fwd Seg Size Min,0.925013
27,Bwd IAT Std,0.892008
67,FWD Init Win Bytes,0.88933
47,RST Flag Count,0.88724
22,Fwd IAT Std,0.886432
39,Bwd Packets/s,0.878937
26,Bwd IAT Mean,0.878565
34,Fwd RST Flags,0.875866
17,Flow IAT Std,0.874991
81,Total TCP Flow Time,0.874902


In [62]:
result_mixed.sort_values("ROC AUC Score", ascending=False).head(20)


Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall
70,Fwd Seg Size Min,0.921821,0.945053,0.921942,0.999956
67,FWD Init Win Bytes,0.888717,0.921792,0.892436,0.999956
47,RST Flag Count,0.888345,0.885879,0.94019,0.880053
27,Bwd IAT Std,0.887302,0.916501,0.896131,0.985505
22,Fwd IAT Std,0.880417,0.913799,0.887704,0.992686
34,Fwd RST Flags,0.877912,0.866529,0.948713,0.839628
26,Bwd IAT Mean,0.872509,0.910377,0.878696,0.999867
81,Total TCP Flow Time,0.872406,0.894304,0.896689,0.946055
39,Bwd Packets/s,0.872349,0.910291,0.878534,0.999956
53,Down/Up Ratio,0.870466,0.908968,0.876963,0.999956


# DoS - SlowLoris

In [63]:
cic_2017_slowloris = cic_2017[cic_2017["Label"] == "DoS Slowloris"].copy()
cic_2017_slowloris["Label"] = 1

In [64]:
cic_2018_slowloris = thursday_15[thursday_15["Label"] == "DoS Slowloris"].copy()
cic_2018_slowloris["Label"] = 1
cic_2018_slowloris

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
5382494,6,3863707,8,2,920,0,230,0,115.000000,122.940171,...,0,0,0.00,0.000000e+00,0,0,-1,-1,3863707,1
5382495,6,1023719,4,2,230,0,230,0,57.500000,115.000000,...,0,0,0.00,0.000000e+00,0,0,-1,-1,1023719,1
5382496,6,830593,6,2,920,0,230,0,153.333333,118.771489,...,0,0,0.00,0.000000e+00,0,0,-1,-1,830593,1
5382497,6,578,3,2,230,0,230,0,76.666667,132.790562,...,0,0,0.00,0.000000e+00,0,0,-1,-1,578,1
5382498,6,1067,3,2,230,0,230,0,76.666667,132.790562,...,0,0,0.00,0.000000e+00,0,0,-1,-1,1067,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5407539,6,107038159,15,3,2530,0,230,0,168.666667,105.279673,...,7710551,4597706,23682473.75,2.066065e+07,53247766,6656008,-1,-1,107038159,1
5407540,6,107102143,15,3,2530,0,230,0,168.666667,105.279673,...,7775156,4536694,23697572.50,2.065870e+07,53247464,6655895,-1,-1,107102143,1
5407547,6,108101917,15,3,2530,0,230,0,168.666667,105.279673,...,7730203,4339295,24008104.25,2.058462e+07,53240921,6650353,-1,-1,108101917,1
5407556,6,106270312,14,3,2530,0,230,0,180.714286,97.937522,...,6687130,4343104,23810018.50,2.060334e+07,53247523,6667868,-1,-1,106270312,1


In [65]:
concap_slowloris = get_concap_dataset("concap_slowloris.csv")

  df = kagglehub.load_dataset(


In [66]:
result_2017, _ = concap_cic_experiment(benign, cic_2017_slowloris, cic_2018_slowloris)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [67]:
mixed_slowloris = pd.concat([cic_2017_slowloris, concap_slowloris]).sample(frac=1).reset_index(drop=True)
mixed_slowloris

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,55145226,3,3,8.0,483.0,8.0,0.0,2.666667,4.618802,...,73.0,73.0,5.513580e+07,0.000000e+00,55135805.0,55135805.0,-1,-1,1277168684,1
1,6,117014532,2,2,16.0,0.0,8.0,8.0,8.000000,0.000000,...,27.0,27.0,1.170144e+08,0.000000e+00,117014425.0,117014425.0,-1,-1,407126545,1
2,6,64015015,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7178071.0,7178071.0,1.894565e+07,1.240481e+07,32517252.0,8192724.0,-1,-1,64015015,1
3,6,100001930,2,2,16.0,0.0,8.0,8.0,8.000000,0.000000,...,52.0,52.0,1.000018e+08,0.000000e+00,100001800.0,100001800.0,-1,-1,1239070091,1
4,6,102638659,14,3,2541.0,0.0,231.0,0.0,181.500000,98.363337,...,6415576.0,573.0,1.924448e+07,1.878665e+07,51327510.0,5965623.0,-1,-1,102638659,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5974,6,102601204,14,3,2541.0,0.0,231.0,0.0,181.500000,98.363337,...,6410136.0,570.0,1.923809e+07,1.879362e+07,51263632.0,5766300.0,-1,-1,102601204,1
5975,6,108031195,5,4,247.0,0.0,231.0,0.0,49.400000,101.596260,...,1029410.0,1029410.0,1.070017e+08,0.000000e+00,107001712.0,107001712.0,-1,-1,108031195,1
5976,6,64964874,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7107550.0,7107550.0,1.928577e+07,1.296360e+07,33536469.0,8192653.0,-1,-1,64964874,1
5977,6,105668475,16,3,2541.0,0.0,231.0,0.0,158.812500,110.582831,...,9413543.0,543.0,1.925086e+07,1.883127e+07,51327306.0,5710639.0,-1,-1,105668475,1


In [68]:
result_mixed, _ = concap_cic_experiment(benign, mixed_slowloris, cic_2018_slowloris)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [69]:
result_2017.sort_values("ROC AUC Score", ascending=False).head(20)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall
10,Bwd Packet Length Max,0.95338,0.95611,0.975132,0.96066
5,Total Length of Bwd Packet,0.95338,0.95611,0.975132,0.96066
12,Bwd Packet Length Mean,0.95338,0.95611,0.975132,0.96066
56,Bwd Segment Size Avg,0.95338,0.95611,0.975132,0.96066
66,Subflow Bwd Bytes,0.951695,0.955057,0.973618,0.96066
26,Bwd IAT Mean,0.924487,0.910843,0.980367,0.888104
39,Bwd Packets/s,0.916254,0.905741,0.97228,0.888221
15,Flow Packets/s,0.910447,0.897077,0.972757,0.874794
16,Flow IAT Mean,0.909622,0.896429,0.972109,0.874441
28,Bwd IAT Max,0.90795,0.901126,0.963643,0.889753


In [70]:
result_mixed.sort_values("ROC AUC Score", ascending=False).head(20)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall
10,Bwd Packet Length Max,0.954824,0.955837,0.963952,0.96066
5,Total Length of Bwd Packet,0.954824,0.955837,0.963952,0.96066
12,Bwd Packet Length Mean,0.952984,0.954316,0.961452,0.96066
56,Bwd Segment Size Avg,0.952984,0.954316,0.961452,0.96066
66,Subflow Bwd Bytes,0.952231,0.953694,0.960433,0.96066
26,Bwd IAT Mean,0.924818,0.918446,0.970399,0.888104
81,Total TCP Flow Time,0.920972,0.929781,0.914026,0.971731
28,Bwd IAT Max,0.920566,0.909392,0.98777,0.856184
39,Bwd Packets/s,0.917483,0.910913,0.965482,0.879623
15,Flow Packets/s,0.911285,0.902827,0.968395,0.862544
