In [1]:
# install packages
!pip install numpy pandas pyarrow scikit-learn fastcore kagglehub[pandas-datasets] jinja2 boto


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
drop_columns = [
    "id",
    "Flow ID",        
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
    "Attempted Category",
]

def xs_y(df, targ): 
    if not isinstance(targ, list):
        xs = df[df.columns.difference([targ])].copy()
    else:
        xs = df[df.columns.difference(targ)].copy()
    y = df[targ].copy()
    return xs, y


def get_concap_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "concap/" + file)
    # clean the dataset
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df["Label"] = 1

    return df

def get_cic_2018_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "cic_2018/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

def get_cic_2017_dataset(file):
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "jozefjankaj/thesis-files",
        "cic/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df


def train_verify_one_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, feature: str):
    root = DecisionTreeClassifier(max_depth=1, criterion='gini')
    train_x, train_y = xs_y(train_df, "Label")
    test_x, test_y = xs_y(test_df, "Label")
    root.fit(train_x[feature].array.reshape(-1,1), train_y)

    predictions = root.predict(test_x[feature].array.reshape(-1,1))
    return feature, roc_auc_score(test_y, predictions)

def concap_cic_experiment(benign_df: pd.DataFrame, cic: pd.DataFrame, concap: pd.DataFrame):
    # Train on CIC => Predict ConCap
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)

    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    measurements_cic_concap = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"], 
                                           columns=["Feature", "ROC AUC Score"]
                                          )

    # Train on ConCap => Predict CIC
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    
    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    measurements_concap_cic = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                          columns=["Feature", "ROC AUC Score"]
                                          )
    return measurements_cic_concap, measurements_concap_cic

# SSH Bruteforce

In [3]:
bruteforce = get_cic_2018_dataset("Wednesday-14-02-2018.csv")

  df = kagglehub.load_dataset(


In [4]:
bruteforce

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,115320241,17,15,901,1942,445,0,53.000000,147.544061,...,5344243,10175,9.987594e+06,5.816100e+04,10005167,9812232,-1,-1,115320241,BENIGN
1,6,116784235,27,21,3498,6304,436,0,129.555556,202.635773,...,8385364,11242,9.838241e+06,4.771012e+05,10004111,8408741,-1,-1,116784235,BENIGN
2,6,115867108,23,19,2626,4569,436,0,114.173913,195.490677,...,7780718,11225,9.808635e+06,5.990057e+05,10004152,8004832,-1,-1,115867108,BENIGN
3,6,119894757,17,16,597,768,293,0,35.117647,97.064336,...,115967,32245,9.944064e+06,2.342149e+05,10046602,9203252,-1,-1,119894757,BENIGN
4,6,112960508,57,90,9101,27113,1047,0,159.666667,274.657666,...,4316517,235349,2.694478e+07,2.276214e+07,58084562,9758551,-1,-1,112960508,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5898301,17,655,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,BENIGN
5898306,6,3,1,1,0,0,0,0,0.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,3,BENIGN
5898328,17,720,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,BENIGN
5898335,6,358224,5,6,2752,1408,1168,0,550.400000,547.120462,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,28135919901,BENIGN


In [5]:
benign = bruteforce[bruteforce["Label"] == "BENIGN"].copy()
benign["Label"] = 0
benign

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,115320241,17,15,901,1942,445,0,53.000000,147.544061,...,5344243,10175,9.987594e+06,5.816100e+04,10005167,9812232,-1,-1,115320241,0
1,6,116784235,27,21,3498,6304,436,0,129.555556,202.635773,...,8385364,11242,9.838241e+06,4.771012e+05,10004111,8408741,-1,-1,116784235,0
2,6,115867108,23,19,2626,4569,436,0,114.173913,195.490677,...,7780718,11225,9.808635e+06,5.990057e+05,10004152,8004832,-1,-1,115867108,0
3,6,119894757,17,16,597,768,293,0,35.117647,97.064336,...,115967,32245,9.944064e+06,2.342149e+05,10046602,9203252,-1,-1,119894757,0
4,6,112960508,57,90,9101,27113,1047,0,159.666667,274.657666,...,4316517,235349,2.694478e+07,2.276214e+07,58084562,9758551,-1,-1,112960508,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5898301,17,655,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,0
5898306,6,3,1,1,0,0,0,0,0.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,3,0
5898328,17,720,1,1,300,328,300,300,300.000000,0.000000,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,0,0
5898335,6,358224,5,6,2752,1408,1168,0,550.400000,547.120462,...,0,0,0.000000e+00,0.000000e+00,0,0,-1,-1,28135919901,0


In [6]:
bruteforce["Label"].unique()

array(['BENIGN', 'FTP-BruteForce - Attempted', 'SSH-BruteForce'],
      dtype=object)

In [7]:
cic_ssh = bruteforce[bruteforce["Label"] == "SSH-BruteForce"].copy()
cic_ssh["Label"] = 1
cic_ssh

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
5759575,6,364702,23,22,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,364702,1
5759577,6,393085,23,21,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,393085,1
5759582,6,397310,23,23,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,397310,1
5759583,6,372143,23,23,1944,2665,640,0,84.521739,135.953491,...,0,0,0.0,0.0,0,0,-1,-1,372143,1
5759584,6,369389,22,23,1928,2665,640,0,87.636364,137.780552,...,0,0,0.0,0.0,0,0,-1,-1,369389,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5894491,6,379886,23,21,1912,2665,640,0,83.130435,135.737482,...,0,0,0.0,0.0,0,0,-1,-1,379886,1
5894492,6,388358,23,23,1928,2665,640,0,83.826087,135.847392,...,0,0,0.0,0.0,0,0,-1,-1,388358,1
5894493,6,374336,25,21,1912,2665,640,0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,374336,1
5894494,6,371640,25,21,1912,2665,640,0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,371640,1


In [8]:
concap_ssh = get_concap_dataset("concap_ssh.csv")
concap_ssh

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4660,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,4660,1
1,6,3557,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,3557,1
2,6,19511,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,19511,1
3,6,11027649,22,34,1928.0,2746.0,640.0,0.0,87.636364,137.780552,...,0,0,0,0,0,0,-1,-1,11027649,1
4,6,9870,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0,0,0,0,-1,-1,9870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7633,6,4487608,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4487608,1
7634,6,3367832,14,16,1272.0,2154.0,640.0,0.0,90.857143,174.415444,...,0,0,0,0,0,0,-1,-1,3367832,1
7635,6,4025378,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4025378,1
7636,6,3951,4,4,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,3951,1


In [9]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_ssh, concap_ssh)

In [10]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
70,Fwd Seg Size Min,0.994674
67,FWD Init Win Bytes,0.987924
24,Fwd IAT Min,0.783406
29,Bwd IAT Min,0.735475
12,Bwd Packet Length Mean,0.723388
56,Bwd Segment Size Avg,0.723388
66,Subflow Bwd Bytes,0.697357
46,SYN Flag Count,0.692856
16,Flow IAT Mean,0.674432
45,FIN Flag Count,0.667415


In [11]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
30,Fwd PSH Flags,0.996242
70,Fwd Seg Size Min,0.99517
48,PSH Flag Count,0.99509
31,Bwd PSH Flags,0.992988
69,Fwd Act Data Pkts,0.991268
36,Fwd Header Length,0.986035
2,Total Fwd Packet,0.5
4,Total Length of Fwd Packet,0.5
9,Fwd Packet Length Std,0.5
10,Bwd Packet Length Max,0.5


# CIC-IDS-2017 + ConCap vs CSE-CIC-2018

In [12]:
benign = get_cic_2017_dataset("monday.csv")
benign["Label"] = 0
benign

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


In [13]:
tuesday = get_cic_2017_dataset("tuesday.csv")

  df = kagglehub.load_dataset(


In [14]:
tuesday["Label"].unique()

array(['BENIGN', 'FTP-Patator - Attempted', 'FTP-Patator', 'SSH-Patator',
       'SSH-Patator - Attempted'], dtype=object)

In [15]:
cic_2017_ssh = tuesday[tuesday["Label"] == "SSH-Patator"].copy()
cic_2017_ssh["Label"] = 1
cic_2017_ssh

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
79299,6,4755497,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4755497,1
79300,6,4742052,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4742052,1
79301,6,1688206,10,10,1128,2009,640,0,112.800000,203.802083,...,0,0,0.0,0.0,0,0,-1,-1,1688206,1
79302,6,1884425,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1884425,1
79303,6,1937542,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1937542,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321777,6,11725520,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11725520,1
321784,6,11641016,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11641016,1
321785,6,12117686,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,12117686,1
321944,6,13640748,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,13640748,1


In [16]:
concap_ssh = get_concap_dataset("concap_ssh.csv")
concap_ssh

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4660,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,4660,1
1,6,3557,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,3557,1
2,6,19511,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,19511,1
3,6,11027649,22,34,1928.0,2746.0,640.0,0.0,87.636364,137.780552,...,0,0,0,0,0,0,-1,-1,11027649,1
4,6,9870,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0,0,0,0,-1,-1,9870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7633,6,4487608,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4487608,1
7634,6,3367832,14,16,1272.0,2154.0,640.0,0.0,90.857143,174.415444,...,0,0,0,0,0,0,-1,-1,3367832,1
7635,6,4025378,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4025378,1
7636,6,3951,4,4,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,3951,1


Put the two datasets together and mix them

In [17]:
concap_cic_2017 = pd.concat([cic_2017_ssh, concap_ssh]).sample(frac=1).reset_index(drop=True)
concap_cic_2017

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4940874,17,16,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0.0,0.0,0,0,-1,-1,4940874,1
1,6,13232517,22,33,2008.0,2745.0,640.0,0.0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,13232517,1
2,6,10951011,25,35,1912.0,2746.0,640.0,0.0,76.480000,131.981413,...,0,0,0.0,0.0,0,0,-1,-1,10951011,1
3,6,12036529,22,33,2008.0,2745.0,640.0,0.0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,12036529,1
4,6,11977271,22,32,2008.0,2745.0,640.0,0.0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,11977271,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10594,6,9378,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0.0,0.0,0,0,-1,-1,9378,1
10595,6,18902,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0.0,0.0,0,0,-1,-1,18902,1
10596,6,2784,3,4,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0.0,0.0,0,0,-1,-1,2784,1
10597,6,13952452,22,33,2008.0,2745.0,640.0,0.0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,13952452,1


Grab the CIC 2018 SSH attack

In [18]:
# cic_2018 = get_cic_2018_dataset("Wednesday-14-02-2018.csv")
# cic_2018_ssh = cic_2018[cic_2018["Label"] == "SSH-BruteForce"].copy()
cic_2018_ssh = pd.read_csv("cic_2018_ssh.csv")
cic_2018_ssh.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
cic_2018_ssh["Label"] = 1
cic_2018_ssh

FileNotFoundError: [Errno 2] No such file or directory: 'cic_2018_ssh.csv'

In [None]:
cic_2018_ssh.columns

In [None]:
result_separate, _ = concap_cic_experiment(benign, cic_2017_ssh, cic_2018_ssh)

In [None]:
result.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
result_together, _ = concap_cic_experiment(benign, concap_cic_2017, cic_2018_ssh)

In [None]:
result_together.sort_values("ROC AUC Score", ascending=False).head(10)