In [60]:
# install packages
!pip install numpy pandas pyarrow scikit-learn fastcore kagglehub[pandas-datasets] jinja2 boto

Collecting boto
  Downloading boto-2.49.0-py2.py3-none-any.whl.metadata (7.3 kB)
Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: boto
Successfully installed boto-2.49.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Utility methods

In [11]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score, get_scorer, get_scorer_names
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count

In [12]:
get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'd2_absolute_error_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_max_error',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'neg_root_mean_squared_log_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 're

In [31]:
drop_columns = [
    "id",
    "Flow ID",        
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
    "Attempted Category",
]

def xs_y(df, targ): 
    if not isinstance(targ, list):
        xs = df[df.columns.difference([targ])].copy()
    else:
        xs = df[df.columns.difference(targ)].copy()
    y = df[targ].copy()
    return xs, y

def get_cic_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "cic/" + file)    
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

def get_concap_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "concap/" + file)
    # clean the dataset
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df["Label"] = 1

    return df

def train_verify_one_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, feature: str):
    root = DecisionTreeClassifier(max_depth=1, criterion='gini')
    train_x, train_y = xs_y(train_df, "Label")
    test_x, test_y = xs_y(test_df, "Label")
    root.fit(train_x[feature].array.reshape(-1,1), train_y)

    predictions = root.predict(test_x[feature].array.reshape(-1,1))
    return feature, roc_auc_score(test_y, predictions), accuracy_score(test_y, predictions), precision_score(test_y, predictions), recall_score(test_y, predictions), f1_score(test_y, predictions)

def concap_cic_experiment(benign_df: pd.DataFrame, cic: pd.DataFrame, concap: pd.DataFrame):
    # Train on CIC => Predict ConCap
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)

    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    columns = ["Feature", "ROC AUC Score", "Accuracy", "Precision", "Recall", "F1 Score"]

    measurements_cic_concap = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"], 
                                           columns=columns
                                          )

    # Train on ConCap => Predict CIC
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    
    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    measurements_concap_cic = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                          columns=columns
                                          )
    
    
    return measurements_cic_concap, measurements_concap_cic

# Benign
Benign flows are used for balancing the training and testing sets

In [4]:
benign = get_cic_dataset("monday.csv")
benign["Label"] = 0
benign

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=cic/monday.csv...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198M/198M [00:28<00:00, 7.39MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


# Tuesday

In [5]:
tuesday_cic = get_cic_dataset("tuesday.csv")

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=cic/tuesday.csv...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170M/170M [00:22<00:00, 7.89MB/s]


## Bruteforce FTP

In [6]:
cic_ftp = tuesday_cic[tuesday_cic["Label"] == "FTP-Patator"].copy()
cic_ftp["Label"] = 1
cic_ftp

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
4630,6,4008190,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4008190,1
4631,6,4018946,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4018946,1
4632,6,4067119,6,6,34,76,20,0,5.666667,8.981462,...,0,0,0.0,0.0,0,0,-1,-1,4067119,1
4633,6,4015015,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4015015,1
4634,6,4099543,6,6,31,76,17,0,5.166667,8.060190,...,0,0,0.0,0.0,0,0,-1,-1,4099543,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321766,6,9567017,11,17,135,188,34,0,12.272727,11.384998,...,0,0,0.0,0.0,0,0,-1,-1,9567017,1
321824,6,8627604,11,17,125,188,25,0,11.363636,9.871907,...,0,0,0.0,0.0,0,0,-1,-1,8627604,1
321857,6,8438103,11,17,116,188,20,0,10.545455,8.721968,...,0,0,0.0,0.0,0,0,-1,-1,8438103,1
321880,6,8958671,11,17,124,188,24,0,11.272727,9.654956,...,0,0,0.0,0.0,0,0,-1,-1,8958671,1


In [7]:
concap_ftp = get_concap_dataset("concap_ftp.csv")
concap_ftp.drop(columns=["label", "category", "subcategory"], inplace=True, errors='ignore')
concap_ftp["Label"] = 1
concap_ftp

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=14&file_name=concap/concap_ftp.csv...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.96M/3.96M [00:01<00:00, 3.48MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,3029650,11,17,88.0,188.0,15.0,0.0,8.000000,6.496153,...,0,0,0,0,0,0,-1,-1,3029650,1
1,6,3034178,11,17,85.0,188.0,14.0,0.0,7.727273,6.198240,...,0,0,0,0,0,0,-1,-1,3034178,1
2,6,3032346,11,17,85.0,188.0,14.0,0.0,7.727273,6.214353,...,0,0,0,0,0,0,-1,-1,3032346,1
3,6,3032433,11,18,88.0,188.0,16.0,0.0,8.000000,6.511528,...,0,0,0,0,0,0,-1,-1,3032433,1
4,6,3027024,11,18,84.0,188.0,14.0,0.0,7.636364,6.152605,...,0,0,0,0,0,0,-1,-1,3027024,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,6,3235936,14,18,86.0,188.0,16.0,0.0,6.142857,6.502747,...,0,0,0,0,0,0,-1,-1,3235936,1
5696,6,3032999,11,17,88.0,188.0,17.0,0.0,8.000000,6.618157,...,0,0,0,0,0,0,-1,-1,3032999,1
5697,6,3034066,11,16,79.0,188.0,16.0,0.0,7.181818,5.980271,...,0,0,0,0,0,0,-1,-1,3034066,1
5698,6,3031003,11,17,87.0,188.0,15.0,0.0,7.909091,6.425800,...,0,0,0,0,0,0,-1,-1,3031003,1


In [32]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_ftp, concap_ftp)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
54,Average Packet Size,0.97889,0.98263,0.971531,0.999825,0.985475
42,Packet Length Mean,0.97889,0.98263,0.971531,0.999825,0.985475
41,Packet Length Max,0.978474,0.98232,0.970874,1.0,0.985222
55,Fwd Segment Size Avg,0.976586,0.980769,0.9684,1.0,0.983946
8,Fwd Packet Length Mean,0.976586,0.980769,0.9684,1.0,0.983946
64,Subflow Fwd Bytes,0.976208,0.980459,0.967906,1.0,0.983691
56,Bwd Segment Size Avg,0.973603,0.978288,0.964624,0.999825,0.981909
12,Bwd Packet Length Mean,0.973603,0.978288,0.964624,0.999825,0.981909
10,Bwd Packet Length Max,0.971803,0.97684,0.962188,1.0,0.98073
35,Bwd RST Flags,0.971682,0.967432,0.99668,0.947895,0.971675


In [34]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall,F1 Score
53,Down/Up Ratio,0.992825,0.992825,0.989253,0.996475,0.992851
35,Bwd RST Flags,0.981873,0.981873,0.969578,0.994965,0.982107
30,Fwd PSH Flags,0.97218,0.97218,0.951601,0.994965,0.9728
42,Packet Length Mean,0.970544,0.970544,0.945,0.999245,0.971366
54,Average Packet Size,0.970544,0.970544,0.945,0.999245,0.971366
48,PSH Flag Count,0.969789,0.969789,0.947267,0.994965,0.97053
64,Subflow Fwd Bytes,0.96853,0.96853,0.945215,0.994713,0.969333
41,Packet Length Max,0.96853,0.96853,0.947141,0.992447,0.969265
31,Bwd PSH Flags,0.965634,0.965634,0.938582,0.996475,0.966663
56,Bwd Segment Size Avg,0.965634,0.965634,0.935689,1.0,0.966776


## Bruteforce SSH

In [15]:
cic_ssh = tuesday_cic[tuesday_cic["Label"] == "SSH-Patator"].copy()
cic_ssh["Label"] = 1
cic_ssh

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
79299,6,4755497,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4755497,1
79300,6,4742052,14,15,1304,2153,640,0,93.142857,174.584719,...,0,0,0.0,0.0,0,0,-1,-1,4742052,1
79301,6,1688206,10,10,1128,2009,640,0,112.800000,203.802083,...,0,0,0.0,0.0,0,0,-1,-1,1688206,1
79302,6,1884425,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1884425,1
79303,6,1937542,13,11,1128,2009,640,0,86.769231,183.298551,...,0,0,0.0,0.0,0,0,-1,-1,1937542,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321777,6,11725520,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11725520,1
321784,6,11641016,22,33,2024,2745,640,0,92.000000,138.338439,...,0,0,0.0,0.0,0,0,-1,-1,11641016,1
321785,6,12117686,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,12117686,1
321944,6,13640748,22,33,2008,2745,640,0,91.272727,138.182137,...,0,0,0.0,0.0,0,0,-1,-1,13640748,1


In [16]:
concap_ssh = get_concap_dataset("concap_ssh.csv")
concap_ssh.drop(columns=["label", "category", "subcategory"], inplace=True, errors='ignore')
concap_ssh["Label"] = 1
concap_ssh

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=13&file_name=concap/concap_ssh.csv...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.81M/4.81M [00:01<00:00, 4.86MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,4660,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,4660,1
1,6,3557,3,3,24.0,0.0,24.0,0.0,8.000000,13.856406,...,0,0,0,0,0,0,-1,-1,3557,1
2,6,19511,4,3,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,19511,1
3,6,11027649,22,34,1928.0,2746.0,640.0,0.0,87.636364,137.780552,...,0,0,0,0,0,0,-1,-1,11027649,1
4,6,9870,5,4,24.0,0.0,24.0,0.0,4.800000,10.733126,...,0,0,0,0,0,0,-1,-1,9870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7633,6,4487608,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4487608,1
7634,6,3367832,14,16,1272.0,2154.0,640.0,0.0,90.857143,174.415444,...,0,0,0,0,0,0,-1,-1,3367832,1
7635,6,4025378,17,17,1272.0,2154.0,640.0,0.0,74.823529,161.218887,...,0,0,0,0,0,0,-1,-1,4025378,1
7636,6,3951,4,4,24.0,0.0,24.0,0.0,6.000000,12.000000,...,0,0,0,0,0,0,-1,-1,3951,1


In [17]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_ssh, concap_ssh)

In [18]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
70,Fwd Seg Size Min,0.92088
67,FWD Init Win Bytes,0.887707
24,Fwd IAT Min,0.866036
46,SYN Flag Count,0.816109
12,Bwd Packet Length Mean,0.797805
56,Bwd Segment Size Avg,0.797805
7,Fwd Packet Length Min,0.796353
40,Packet Length Min,0.796015
0,Protocol,0.794326
11,Bwd Packet Length Min,0.79382


In [19]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
70,Fwd Seg Size Min,0.92435
24,Fwd IAT Min,0.860689
12,Bwd Packet Length Mean,0.84279
56,Bwd Segment Size Avg,0.84279
45,FIN Flag Count,0.815772
46,SYN Flag Count,0.815265
37,Bwd Header Length,0.815096
30,Fwd PSH Flags,0.81155
48,PSH Flag Count,0.810368
9,Fwd Packet Length Std,0.809355


# Wednesday

In [20]:
wednesday_cic = get_cic_dataset("wednesday.csv")

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=13&file_name=cic/wednesday.csv...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 278M/278M [00:35<00:00, 8.11MB/s]


In [26]:
wednesday_cic["Label"].unique()

array(['BENIGN', 'DoS Slowloris', 'DoS Slowloris - Attempted',
       'DoS Slowhttptest', 'DoS Slowhttptest - Attempted', 'DoS Hulk',
       'DoS Hulk - Attempted', 'DoS GoldenEye', 'Heartbleed',
       'DoS GoldenEye - Attempted'], dtype=object)

## Slowloris

In [21]:
cic_slowloris = wednesday_cic[wednesday_cic["Label"] == "DoS Slowloris"].copy()
cic_slowloris["Label"] = 1
cic_slowloris

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
9593,6,17072865,4,3,239,0,231,0,59.75,114.228937,...,1415,1415,17071390.0,0.0,17071390,17071390,-1,-1,17072865,1
9594,6,17069963,4,3,239,0,231,0,59.75,114.228937,...,856,856,17069049.0,0.0,17069049,17069049,-1,-1,17069963,1
9595,6,17071303,4,3,239,0,231,0,59.75,114.228937,...,1002,1002,17070196.0,0.0,17070196,17070196,-1,-1,17071303,1
9596,6,17070583,4,3,239,0,231,0,59.75,114.228937,...,740,740,17069779.0,0.0,17069779,17069779,-1,-1,17070583,1
9597,6,17069147,4,3,239,0,231,0,59.75,114.228937,...,942,942,17068165.0,0.0,17068165,17068165,-1,-1,17069147,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31140,6,4603499,11,3,1155,483,231,0,105.00,120.635816,...,0,0,0.0,0.0,0,0,-1,-1,4603499,1
31172,6,4204862,11,3,1386,483,231,0,126.00,120.635816,...,0,0,0.0,0.0,0,0,-1,-1,4204862,1
31188,6,3202193,10,3,1386,483,231,0,138.60,119.287887,...,0,0,0.0,0.0,0,0,-1,-1,3202193,1
31204,6,2599709,10,3,1155,483,231,0,115.50,121.747690,...,0,0,0.0,0.0,0,0,-1,-1,2599709,1


In [23]:
concap_slowloris = get_concap_dataset("concap_slowloris.csv")
concap_slowloris

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,118926264,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118926264,1
1,6,118849273,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118849273,1
2,6,118886942,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118886942,1
3,6,118823677,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118823677,1
4,6,118644621,62,61,725.0,0.0,253.0,0.0,11.693548,31.180905,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118644621,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115,6,64981331,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7124966.0,7124966.0,1.928546e+07,1.296348e+07,33536459.0,8193397.0,-1,-1,64981331,1
2116,6,108873650,60,59,685.0,406.0,253.0,0.0,11.416667,31.795049,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,108873650,1
2117,6,64054899,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7221325.0,7221325.0,1.894452e+07,1.240389e+07,32514391.0,8190908.0,-1,-1,64054899,1
2118,6,107089854,59,58,677.0,406.0,253.0,0.0,11.474576,32.064781,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,107089854,1


In [27]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_slowloris, concap_slowloris)

In [28]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
54,Average Packet Size,0.942761
42,Packet Length Mean,0.942761
14,Flow Bytes/s,0.9311
81,Total TCP Flow Time,0.912891
24,Fwd IAT Min,0.908549
4,Total Length of Fwd Packet,0.801552
6,Fwd Packet Length Max,0.801552
8,Fwd Packet Length Mean,0.800127
55,Fwd Segment Size Avg,0.800127
10,Bwd Packet Length Max,0.797665


In [29]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
10,Bwd Packet Length Max,0.936642
5,Total Length of Bwd Packet,0.936642
12,Bwd Packet Length Mean,0.935087
56,Bwd Segment Size Avg,0.935087
66,Subflow Bwd Bytes,0.934569
81,Total TCP Flow Time,0.911376
75,Idle Mean,0.872506
77,Idle Max,0.872506
38,Fwd Packets/s,0.843483
15,Flow Packets/s,0.836745


## Slowhttptest

In [30]:
cic_slowhttptest = wednesday_cic[wednesday_cic["Label"] == "DoS Slowhttptest"].copy()
cic_slowhttptest["Label"] = 1
cic_slowhttptest

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
17978,6,83400021,18,2,5200,0,520,0,288.888889,265.881196,...,7008529,377,1.164118e+07,6.049227e+06,20776227,5399131,-1,-1,83400021,1
17979,6,83385834,18,2,5200,0,520,0,288.888889,265.881196,...,7010520,364,1.163850e+07,6.086458e+06,20840094,5303271,-1,-1,83385834,1
17995,6,83385227,18,2,5200,0,520,0,288.888889,265.881196,...,7010191,549,1.163841e+07,6.086421e+06,20839833,5303148,-1,-1,83385227,1
17996,6,83430584,18,2,5200,0,520,0,288.888889,265.881196,...,7015028,549,1.164516e+07,6.050565e+06,20776068,5399130,-1,-1,83430584,1
18004,6,83426649,18,2,5200,0,520,0,288.888889,265.881196,...,7011426,724,1.164504e+07,6.050473e+06,20775800,5399051,-1,-1,83426649,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330256,6,31589183,13,2,1683,0,187,0,129.461538,89.831894,...,6533504,6533504,8.351745e+06,4.154461e+06,13103822,5407489,-1,-1,31589183,1
360465,6,31521274,13,2,1683,0,187,0,129.461538,89.831894,...,6541345,6541345,8.326465e+06,4.166749e+06,13087961,5347461,-1,-1,31521274,1
392236,6,31405313,13,2,1683,0,187,0,129.461538,89.831894,...,6541322,6541322,8.287807e+06,4.223231e+06,13103684,5215656,-1,-1,31405313,1
434805,6,31389292,7,2,935,0,187,0,133.571429,91.246657,...,1629605,1629605,2.975969e+07,0.000000e+00,29759687,29759687,-1,-1,31389292,1


In [31]:
concap_slowhttptest = get_concap_dataset("concap_slowhttptest.csv")
concap_slowhttptest

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=13&file_name=concap/concap_slowhttptest.csv...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.31M/1.31M [00:00<00:00, 2.27MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,118697166,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118697166,1
1,6,118535863,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118535863,1
2,6,118513518,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118513518,1
3,6,118493155,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118493155,1
4,6,118374171,62,61,697.0,0.0,461.0,0.0,11.241935,58.060083,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118374171,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109,6,1038645,4,3,4.0,406.0,4.0,0.0,1.000000,2.000000,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,121389140,1
2110,6,118579235,64,62,697.0,406.0,461.0,0.0,10.890625,57.165068,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,118579235,1
2111,6,64665907,7,0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,7069661.0,7069661.0,1.919875e+07,1.282431e+07,33279304.0,8187769.0,-1,-1,64665907,1
2112,6,113178113,63,61,689.0,406.0,461.0,0.0,10.936508,57.627523,...,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,-1,-1,113178113,1


In [32]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_slowhttptest, concap_slowhttptest)

In [33]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
81,Total TCP Flow Time,0.920014
24,Fwd IAT Min,0.897788
20,Fwd IAT Total,0.894136
1,Flow Duration,0.893849
14,Flow Bytes/s,0.886918
10,Bwd Packet Length Max,0.79222
56,Bwd Segment Size Avg,0.79222
5,Total Length of Bwd Packet,0.79222
12,Bwd Packet Length Mean,0.79222
66,Subflow Bwd Bytes,0.79222


In [34]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
38,Fwd Packets/s,0.893103
15,Flow Packets/s,0.887644
16,Flow IAT Mean,0.88592
21,Fwd IAT Mean,0.88477
39,Bwd Packets/s,0.884195
81,Total TCP Flow Time,0.879885
75,Idle Mean,0.86408
24,Fwd IAT Min,0.856609
5,Total Length of Bwd Packet,0.843678
10,Bwd Packet Length Max,0.843678


## Hulk

In [35]:
cic_hulk = wednesday_cic[wednesday_cic["Label"] == "DoS Hulk"].copy()
cic_hulk["Label"] = 1
cic_hulk

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
33452,6,28504,9,9,348,11595,348,0,38.666667,116.000000,...,0,0,0.0,0.0,0,0,-1,-1,28504,1
33453,6,3466,6,6,382,11595,382,0,63.666667,155.950847,...,0,0,0.0,0.0,0,0,-1,-1,3466,1
33454,6,4212,6,6,372,11595,372,0,62.000000,151.868364,...,0,0,0.0,0.0,0,0,-1,-1,4212,1
33455,6,12080,7,7,349,11595,349,0,49.857143,131.909601,...,0,0,0.0,0.0,0,0,-1,-1,12080,1
33456,6,12005,6,5,371,11595,371,0,61.833333,151.460116,...,0,0,0.0,0.0,0,0,-1,-1,12005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496517,6,173006,7,7,343,11595,343,0,49.000000,129.641814,...,0,0,0.0,0.0,0,0,-1,-1,173006,1
496524,6,179441,8,4,346,11595,346,0,43.250000,122.329473,...,0,0,0.0,0.0,0,0,-1,-1,179441,1
496561,6,189968,8,8,353,11595,353,0,44.125000,124.804347,...,0,0,0.0,0.0,0,0,-1,-1,189968,1
496569,6,656477,11,5,1041,11595,347,0,94.636364,162.083480,...,0,0,0.0,0.0,0,0,-1,-1,656477,1


In [40]:
concap_hulk = get_concap_dataset("concap_hulk.csv")
concap_hulk

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,3070,7,5,366.0,11611.0,366.0,0.0,52.285714,138.334997,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,3070,1
1,6,6535,7,5,336.0,11611.0,336.0,0.0,48.000000,126.996063,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,6535,1
2,6,7113,7,5,361.0,11611.0,361.0,0.0,51.571429,136.445175,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,7113,1
3,6,6639,7,6,317.0,11611.0,317.0,0.0,45.285714,119.814738,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,6639,1
4,6,7900,7,8,367.0,11611.0,367.0,0.0,52.428571,138.712962,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,7900,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172550,6,1529068,10,11,373.0,11611.0,373.0,0.0,37.300000,117.952957,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,1529068,1
172551,6,309216,7,6,363.0,11611.0,363.0,0.0,51.857143,137.201104,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,309216,1
172552,6,378797,9,7,307.0,11611.0,307.0,0.0,34.111111,102.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,378797,1
172553,6,521034,10,7,638.0,11611.0,319.0,0.0,63.800000,134.502210,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,521034,1


In [41]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_hulk, concap_hulk)

In [42]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
13,Bwd Packet Length Std,0.957172
34,Fwd RST Flags,0.954443
47,RST Flag Count,0.945154
44,Packet Length Variance,0.941368
43,Packet Length Std,0.941368
66,Subflow Bwd Bytes,0.928695
42,Packet Length Mean,0.927164
54,Average Packet Size,0.927164
5,Total Length of Bwd Packet,0.923055
12,Bwd Packet Length Mean,0.912133


In [43]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
13,Bwd Packet Length Std,0.999041
12,Bwd Packet Length Mean,0.979226
56,Bwd Segment Size Avg,0.979226
10,Bwd Packet Length Max,0.976828
66,Subflow Bwd Bytes,0.97467
41,Packet Length Max,0.97366
5,Total Length of Bwd Packet,0.972802
42,Packet Length Mean,0.971881
54,Average Packet Size,0.971881
44,Packet Length Variance,0.956029


## GoldenEye

In [44]:
cic_goldeneye = wednesday_cic[wednesday_cic["Label"] == "DoS GoldenEye"].copy()
cic_goldeneye["Label"] = 1
cic_goldeneye

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
188941,6,11454901,9,4,1776,3525,444,0,197.333333,234.008547,...,827250,1474,5293676.0,4.105236e+05,5583960,5003392,-1,-1,11454901,1
188942,6,17437659,12,6,327,11632,327,0,27.250000,94.396769,...,7015689,7015689,5381199.0,0.000000e+00,5381199,5381199,-1,-1,17437659,1
188943,6,10753804,8,6,361,11632,361,0,45.125000,127.632774,...,4287,1108,5355279.5,5.017255e+05,5710053,5000506,-1,-1,10753804,1
188944,6,10752381,8,5,377,11632,377,0,47.125000,133.289628,...,1351,1351,5710766.0,0.000000e+00,5710766,5710766,-1,-1,10752381,1
188945,6,11422331,9,5,487,11632,487,0,54.111111,162.333333,...,1647,712,5690607.5,9.735835e+05,6379035,5002180,-1,-1,11422331,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496543,6,9726755,9,5,300,11632,300,0,33.333333,100.000000,...,4685619,4685619,5001471.0,0.000000e+00,5001471,5001471,-1,-1,201163062,1
496555,6,11509160,6,5,332,3525,332,0,55.333333,135.538432,...,1887,1887,6467018.0,0.000000e+00,6467018,6467018,-1,-1,11509160,1
496603,6,103680007,13,6,1472,11632,368,0,113.230769,176.781482,...,829021,3207,34269690.0,5.022459e+07,92263198,5000791,-1,-1,103680007,1
496606,6,11464350,9,5,588,11632,588,0,65.333333,196.000000,...,749,749,6423318.0,0.000000e+00,6423318,6423318,-1,-1,11464350,1


In [46]:
concap_goldeneye = get_concap_dataset("concap_goldeneye.csv")
concap_goldeneye

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,13783803,10,11,1200.0,11610.0,300.0,0.0,120.000000,154.919334,...,853356.0,853356.0,1.289990e+07,0.000000e+00,12899901.0,12899901.0,-1,-1,13783803,1
1,6,56835564,9,6,500.0,11610.0,500.0,0.0,55.555556,166.666667,...,5046952.0,4878.0,2.589187e+07,1.734867e+07,38159227.0,13624507.0,-1,-1,56835564,1
2,6,25057619,9,7,404.0,11610.0,404.0,0.0,44.888889,134.666667,...,5047163.0,1704.0,1.000438e+07,7.025186e+06,14971933.0,5036819.0,-1,-1,25057619,1
3,6,38433193,10,6,512.0,11610.0,512.0,0.0,51.200000,161.908616,...,5045362.0,6431.0,1.669070e+07,4.830814e+06,20106601.0,13274799.0,-1,-1,38433193,1
4,6,5703764,9,8,328.0,11610.0,328.0,0.0,36.444444,109.333333,...,843.0,843.0,5.562874e+06,0.000000e+00,5562874.0,5562874.0,-1,-1,5703764,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,6,35282270,11,7,414.0,11610.0,414.0,0.0,37.636364,124.825697,...,1015225.0,1608.0,1.140679e+07,5.741702e+06,16095574.0,5003009.0,-1,-1,35282270,1
4658,6,75378643,8,6,509.0,11610.0,509.0,0.0,63.625000,179.958676,...,44432.0,2646.0,2.510878e+07,2.733661e+07,56235375.0,5002188.0,-1,-1,75378643,1
4659,6,97824485,15,12,2440.0,11610.0,610.0,0.0,162.666667,279.220002,...,5046710.0,854857.0,4.596146e+07,4.560548e+07,78209401.0,13713517.0,-1,-1,97824485,1
4660,6,88655311,9,6,363.0,11610.0,363.0,0.0,40.333333,121.000000,...,45612.0,3088.0,2.953409e+07,3.548058e+07,70216725.0,5001881.0,-1,-1,88655311,1


In [47]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_goldeneye, concap_goldeneye)

In [48]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
12,Bwd Packet Length Mean,0.971546
56,Bwd Segment Size Avg,0.971546
66,Subflow Bwd Bytes,0.959165
34,Fwd RST Flags,0.957513
42,Packet Length Mean,0.956191
54,Average Packet Size,0.956191
47,RST Flag Count,0.948857
13,Bwd Packet Length Std,0.917094
44,Packet Length Variance,0.904598
43,Packet Length Std,0.904598


In [49]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
13,Bwd Packet Length Std,0.960552
43,Packet Length Std,0.956191
44,Packet Length Variance,0.956191
70,Fwd Seg Size Min,0.921501
81,Total TCP Flow Time,0.86342
27,Bwd IAT Std,0.8582
20,Fwd IAT Total,0.856482
1,Flow Duration,0.855293
29,Bwd IAT Min,0.854764
26,Bwd IAT Mean,0.8545


## Heartbleed

In [50]:
cic_heartbleed = wednesday_cic[wednesday_cic["Label"] == "Heartbleed"].copy()
cic_heartbleed["Label"] = 1
cic_heartbleed

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
244352,6,119302728,2685,1729,8299,7556917,517,0,3.090875,16.858421,...,2217,2217,5024984.0,0.0,5024984,5024984,-1,-1,119302728,1
245206,6,119262215,2791,2111,7920,7883927,66,0,2.837693,13.390275,...,0,0,0.0,0.0,0,0,-1,-1,239559056,1
246401,6,119261118,2793,2131,7920,7883880,66,0,2.835661,13.385693,...,0,0,0.0,0.0,0,0,-1,-1,359814372,1
247344,6,119260295,2790,2115,7920,7883880,66,0,2.83871,13.392567,...,0,0,0.0,0.0,0,0,-1,-1,480068718,1
248442,6,119297996,2781,2090,7920,7883880,66,0,2.847896,13.413253,...,0,0,0.0,0.0,0,0,-1,-1,600361033,1
248766,6,119259886,2781,2092,7920,7883880,66,0,2.847896,13.413253,...,0,0,0.0,0.0,0,0,-1,-1,720578677,1
249515,6,119259012,2800,2070,7920,7883880,66,0,2.828571,13.369695,...,0,0,0.0,0.0,0,0,-1,-1,840832907,1
249897,6,119257653,2801,2068,15066,7818181,1486,0,5.378793,62.757991,...,0,0,0.0,0.0,0,0,-1,-1,961085290,1
250774,6,119299621,2804,2029,7920,7884419,66,0,2.824536,13.360578,...,0,0,0.0,0.0,0,0,-1,-1,1081380819,1
251109,6,119296592,2796,2007,7920,7883880,66,0,2.832618,13.37883,...,0,0,0.0,0.0,0,0,-1,-1,1201633329,1


In [51]:
concap_heartbleed = get_concap_dataset("concap_heartbleed.csv")
concap_heartbleed

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=13&file_name=concap/concap_heartbleed.csv...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83.3k/83.3k [00:00<00:00, 598kB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,17099,7,8,815.0,7759.0,517.0,0.0,116.428571,190.551705,...,0,0,0,0,0,0,-1,-1,17099,1
1,6,1014038,16,12,709.0,67223.0,517.0,0.0,44.312500,130.665843,...,0,0,0,0,0,0,-1,-1,1014038,1
2,6,1012724,16,12,709.0,67223.0,517.0,0.0,44.312500,130.665843,...,0,0,0,0,0,0,-1,-1,1012724,1
3,6,15288,8,9,815.0,7759.0,517.0,0.0,101.875000,181.155369,...,0,0,0,0,0,0,-1,-1,15288,1
4,6,1012355,20,13,709.0,67223.0,517.0,0.0,35.450000,117.515273,...,0,0,0,0,0,0,-1,-1,1012355,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,6,1014955,19,13,709.0,67223.0,517.0,0.0,37.315789,120.430733,...,0,0,0,0,0,0,-1,-1,1014955,1
110,6,1014035,20,13,709.0,67223.0,517.0,0.0,35.450000,117.515273,...,0,0,0,0,0,0,-1,-1,1014035,1
111,6,1010372,15,13,709.0,67223.0,517.0,0.0,47.266667,134.697792,...,0,0,0,0,0,0,-1,-1,1010372,1
112,6,12340,8,9,815.0,7759.0,517.0,0.0,101.875000,181.155369,...,0,0,0,0,0,0,-1,-1,12340,1


In [52]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_heartbleed, concap_heartbleed)

In [53]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
13,Bwd Packet Length Std,1.0
43,Packet Length Std,0.986842
14,Flow Bytes/s,0.954545
9,Fwd Packet Length Std,0.863636
6,Fwd Packet Length Max,0.863636
7,Fwd Packet Length Min,0.818182
11,Bwd Packet Length Min,0.818182
0,Protocol,0.818182
40,Packet Length Min,0.818182
10,Bwd Packet Length Max,0.75


In [54]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
10,Bwd Packet Length Max,1.0
13,Bwd Packet Length Std,1.0
44,Packet Length Variance,1.0
41,Packet Length Max,1.0
43,Packet Length Std,1.0
14,Flow Bytes/s,0.954545
42,Packet Length Mean,0.954545
12,Bwd Packet Length Mean,0.954545
5,Total Length of Bwd Packet,0.954545
54,Average Packet Size,0.954545


# Thursday

In [38]:
thursday_cic = get_cic_dataset("thursday.csv")
thursday_cic["Label"].unique()

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=10&file_name=cic/thursday.csv...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 181M/181M [00:27<00:00, 7.01MB/s]


array(['BENIGN', 'Web Attack - Brute Force - Attempted',
       'Web Attack - Brute Force', 'Infiltration - Attempted',
       'Infiltration', 'Infiltration - Portscan',
       'Web Attack - XSS - Attempted', 'Web Attack - XSS',
       'Web Attack - SQL Injection - Attempted',
       'Web Attack - SQL Injection'], dtype=object)

In [51]:
thursday_cic[thursday_cic["Label"] == "Web Attack - XSS"]

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
111769,6,60170367,208,107,47903,183657,585,0,230.302885,248.36042,...,0,0,0.0,0.0,0,0,-1,-1,60170367,Web Attack - XSS
117429,6,69374569,206,105,48783,183572,585,0,236.81068,252.210265,...,0,0,0.0,0.0,0,0,-1,-1,69374569,Web Attack - XSS
122100,6,69422816,206,110,48783,183586,585,0,236.81068,252.210265,...,0,0,0.0,0.0,0,0,-1,-1,69422816,Web Attack - XSS
123890,6,67553643,205,105,48985,183689,585,0,238.95122,253.242691,...,0,0,0.0,0.0,0,0,-1,-1,67553643,Web Attack - XSS
135899,6,68116521,205,105,48985,183692,585,0,238.95122,253.242691,...,0,0,0.0,0.0,0,0,-1,-1,68116521,Web Attack - XSS
143598,6,66976121,205,106,48783,183586,585,0,237.965854,252.28078,...,0,0,0.0,0.0,0,0,-1,-1,66976121,Web Attack - XSS
157591,6,68151611,205,115,48783,183606,585,0,237.965854,252.28078,...,0,0,0.0,0.0,0,0,-1,-1,68151611,Web Attack - XSS
158229,6,68065237,214,107,48783,183584,585,0,227.957944,251.492415,...,0,0,0.0,0.0,0,0,-1,-1,68065237,Web Attack - XSS
175342,6,67291468,210,105,48985,183697,585,0,233.261905,252.845913,...,0,0,0.0,0.0,0,0,-1,-1,67291468,Web Attack - XSS
177980,6,67053821,206,105,48985,183687,585,0,237.791262,253.172264,...,0,0,0.0,0.0,0,0,-1,-1,67053821,Web Attack - XSS


## Web Attack - Bruteforce

In [39]:
cic_bruteforce = thursday_cic[thursday_cic["Label"] == "Web Attack - Brute Force"].copy()
cic_bruteforce["Label"] = 1
cic_bruteforce

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
23550,6,35253111,204,105,43906,72181,602,0,215.225490,228.432176,...,0,0,0.0,0.0,0,0,-1,-1,35253111,1
115400,6,32925619,204,105,43951,72198,602,0,215.446078,228.590459,...,0,0,0.0,0.0,0,0,-1,-1,32925619,1
121724,6,33028450,204,105,43951,72194,602,0,215.446078,228.590459,...,0,0,0.0,0.0,0,0,-1,-1,33028450,1
121829,6,33347965,204,105,43951,72202,602,0,215.446078,228.590459,...,0,0,0.0,0.0,0,0,-1,-1,33347965,1
132222,6,34212507,204,105,43951,72194,602,0,215.446078,228.590459,...,0,0,0.0,0.0,0,0,-1,-1,34212507,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348864,6,34000093,204,105,43906,72186,602,0,215.225490,228.432176,...,0,0,0.0,0.0,0,0,-1,-1,34000093,1
355070,6,34282481,204,107,43951,72189,602,0,215.446078,228.590459,...,0,0,0.0,0.0,0,0,-1,-1,34282481,1
358600,6,34048558,204,105,43906,72188,602,0,215.225490,228.432176,...,0,0,0.0,0.0,0,0,-1,-1,34048558,1
358656,6,33954481,204,105,43906,72180,602,0,215.225490,228.432176,...,0,0,0.0,0.0,0,0,-1,-1,33954481,1


In [41]:
concap_bruteforce = get_concap_dataset("concap_web_bruteforce.csv")
concap_bruteforce

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,936259,4,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,-1,-1,936259,1
1,6,1053085,4,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,-1,-1,1053085,1
2,6,1029193,4,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,-1,-1,1029193,1
3,6,1013865,4,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,-1,-1,1013865,1
4,6,1071168,17,11,3094.0,14735.0,638.0,0.0,182.0,231.709139,...,0,0,0,0,0,0,-1,-1,1071168,1
5,6,1011581,4,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,-1,-1,1011581,1
6,6,1270632,18,11,3095.0,14739.0,639.0,0.0,171.944444,228.967368,...,0,0,0,0,0,0,-1,-1,1270632,1
7,6,1266643,17,11,3094.0,14736.0,638.0,0.0,182.0,231.709139,...,0,0,0,0,0,0,-1,-1,1266643,1
8,6,1338306,17,12,3093.0,14737.0,637.0,0.0,181.941176,231.586234,...,0,0,0,0,0,0,-1,-1,1338306,1
9,6,1135702,17,12,3092.0,14735.0,636.0,0.0,181.882353,231.463518,...,0,0,0,0,0,0,-1,-1,1135702,1


In [43]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_bruteforce, concap_bruteforce)

In [44]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
70,Fwd Seg Size Min,0.931507
67,FWD Init Win Bytes,0.883562
24,Fwd IAT Min,0.845205
45,FIN Flag Count,0.835616
46,SYN Flag Count,0.815068
40,Packet Length Min,0.80137
0,Protocol,0.80137
7,Fwd Packet Length Min,0.80137
28,Bwd IAT Max,0.799315
11,Bwd Packet Length Min,0.794521


In [45]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

Unnamed: 0,Feature,ROC AUC Score
5,Total Length of Bwd Packet,0.986301
4,Total Length of Fwd Packet,0.972603
8,Fwd Packet Length Mean,0.972603
64,Subflow Fwd Bytes,0.972603
48,PSH Flag Count,0.972603
31,Bwd PSH Flags,0.972603
55,Fwd Segment Size Avg,0.972603
30,Fwd PSH Flags,0.965753
69,Fwd Act Data Pkts,0.952055
53,Down/Up Ratio,0.938356


## Web Attack - SQL Injection

In [47]:
cic_sqli = thursday_cic[thursday_cic["Label"] == "Web Attack - SQL Injection"].copy()
cic_sqli["Label"] = 1
cic_sqli

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
128107,6,5039303,5,5,537,1881,537,0,107.4,240.153701,...,34275,34275,5004343.0,0.0,5004343,5004343,-1,-1,5039303,1
132409,6,5013185,7,5,600,4149,600,0,85.714286,226.778684,...,12319,12319,5000160.0,0.0,5000160,5000160,-1,-1,5013185,1
170331,6,5006123,6,4,599,2021,599,0,99.833333,244.540726,...,5243,5243,5000145.0,0.0,5000145,5000145,-1,-1,5006123,1
178931,6,5087203,6,5,600,4149,600,0,100.0,244.948974,...,82611,82611,5003905.0,0.0,5003905,5003905,-1,-1,5087203,1
222113,6,5017466,6,6,599,4149,599,0,99.833333,244.540726,...,12710,12710,5004049.0,0.0,5004049,5004049,-1,-1,5017466,1
237434,6,5009656,5,5,599,2021,599,0,119.8,267.880944,...,5358,5358,5003805.0,0.0,5003805,5003805,-1,-1,5009656,1
237572,6,5006730,5,5,599,2021,599,0,119.8,267.880944,...,6119,6119,5000049.0,0.0,5000049,5000049,-1,-1,5006730,1
239282,6,5006912,6,5,447,530,447,0,74.5,182.486986,...,5712,5712,5000415.0,0.0,5000415,5000415,-1,-1,5006912,1
286051,6,5009699,5,5,523,530,523,0,104.6,233.89271,...,4355,4355,5004686.0,0.0,5004686,5004686,-1,-1,5009699,1
319013,6,5008202,6,4,537,1881,537,0,89.5,219.229332,...,6787,6787,5000673.0,0.0,5000673,5000673,-1,-1,5008202,1


## Web Attack - Cross-Site Scripting

In [19]:
cic_xss = thursday_cic[thursday_cic["Label"] == "Web Attack - XSS"].copy()
cic_xss["Label"] = 1
cic_xss

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
111769,6,60170367,208,107,47903,183657,585,0,230.302885,248.36042,...,0,0,0.0,0.0,0,0,-1,-1,60170367,1
117429,6,69374569,206,105,48783,183572,585,0,236.81068,252.210265,...,0,0,0.0,0.0,0,0,-1,-1,69374569,1
122100,6,69422816,206,110,48783,183586,585,0,236.81068,252.210265,...,0,0,0.0,0.0,0,0,-1,-1,69422816,1
123890,6,67553643,205,105,48985,183689,585,0,238.95122,253.242691,...,0,0,0.0,0.0,0,0,-1,-1,67553643,1
135899,6,68116521,205,105,48985,183692,585,0,238.95122,253.242691,...,0,0,0.0,0.0,0,0,-1,-1,68116521,1
143598,6,66976121,205,106,48783,183586,585,0,237.965854,252.28078,...,0,0,0.0,0.0,0,0,-1,-1,66976121,1
157591,6,68151611,205,115,48783,183606,585,0,237.965854,252.28078,...,0,0,0.0,0.0,0,0,-1,-1,68151611,1
158229,6,68065237,214,107,48783,183584,585,0,227.957944,251.492415,...,0,0,0.0,0.0,0,0,-1,-1,68065237,1
175342,6,67291468,210,105,48985,183697,585,0,233.261905,252.845913,...,0,0,0.0,0.0,0,0,-1,-1,67291468,1
177980,6,67053821,206,105,48985,183687,585,0,237.791262,253.172264,...,0,0,0.0,0.0,0,0,-1,-1,67053821,1


# Friday

In [55]:
friday_cic = get_cic_dataset("friday.csv")

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=13&file_name=cic/friday.csv...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 272M/272M [00:37<00:00, 7.52MB/s]


In [56]:
friday_cic["Label"].unique()

array(['BENIGN', 'Botnet - Attempted', 'Botnet', 'Portscan', 'DDoS'],
      dtype=object)

## LOIC

In [57]:
cic_loic = friday_cic[friday_cic["Label"] == "DDoS"].copy()
cic_loic["Label"] = 1
cic_loic

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
87193,6,9157589,8,6,20,11595,20,0,2.500000,7.071068,...,1084925,1084925,8072664.0,0.0,8072664,8072664,-1,-1,9157589,1
87194,6,6150288,8,6,20,11595,20,0,2.500000,7.071068,...,407765,407765,5742523.0,0.0,5742523,5742523,-1,-1,6150288,1
87195,6,8410717,8,6,20,11595,20,0,2.500000,7.071068,...,644213,644213,7766504.0,0.0,7766504,7766504,-1,-1,8410717,1
87198,6,7757913,8,5,20,11595,20,0,2.500000,7.071068,...,683993,683993,7073920.0,0.0,7073920,7073920,-1,-1,7757913,1
87199,6,8063397,8,6,20,11595,20,0,2.500000,7.071068,...,307905,307905,7755492.0,0.0,7755492,7755492,-1,-1,8063397,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547499,6,8809229,8,5,20,11595,20,0,2.500000,7.071068,...,708296,708296,8100933.0,0.0,8100933,8100933,-1,-1,8809229,1
547515,6,5746903,8,4,20,11595,20,0,2.500000,7.071068,...,0,0,0.0,0.0,0,0,-1,-1,5746903,1
547545,6,3966857,8,4,20,11595,20,0,2.500000,7.071068,...,0,0,0.0,0.0,0,0,-1,-1,3966857,1
547546,6,11276716,9,5,20,11595,20,0,2.222222,6.666667,...,113023,113023,11163693.0,0.0,11163693,11163693,-1,-1,11276716,1


In [58]:
concap_loic = get_concap_dataset("concap_loic.csv")
concap_loic.drop(columns=["target", "port", "service"], inplace=True, errors='ignore')
concap_loic

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=13&file_name=concap/concap_loic.csv...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22.4M/22.4M [00:03<00:00, 7.25MB/s]


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,31064,11,9,131.0,11574.0,131.0,0.0,11.909091,39.497986,...,0,0,0,0,0,0,-1,-1,31064,1
1,6,21147,10,8,131.0,11574.0,131.0,0.0,13.100000,41.425837,...,0,0,0,0,0,0,-1,-1,21147,1
2,6,20759,10,8,138.0,11574.0,138.0,0.0,13.800000,43.639432,...,0,0,0,0,0,0,-1,-1,20759,1
3,6,21115,10,8,138.0,11574.0,138.0,0.0,13.800000,43.639432,...,0,0,0,0,0,0,-1,-1,21115,1
4,6,20847,10,8,131.0,11574.0,131.0,0.0,13.100000,41.425837,...,0,0,0,0,0,0,-1,-1,20847,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33876,6,21435,10,8,132.0,11574.0,132.0,0.0,13.200000,41.742065,...,0,0,0,0,0,0,-1,-1,21435,1
33877,6,21625,10,8,138.0,11574.0,138.0,0.0,13.800000,43.639432,...,0,0,0,0,0,0,-1,-1,21625,1
33878,6,21474,10,8,131.0,11574.0,131.0,0.0,13.100000,41.425837,...,0,0,0,0,0,0,-1,-1,21474,1
33879,6,22083,10,8,138.0,11574.0,138.0,0.0,13.800000,43.639432,...,0,0,0,0,0,0,-1,-1,22083,1


In [59]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_bruteforce, concap_bruteforce)

NameError: name 'cic_bruteforce' is not defined

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)

## Portscan

In [None]:
cic_portscan = friday_cic[friday_cic["Label"] == "Portscan"].copy()
cic_portscan["Label"] = 1
cic_portscan

In [None]:
concap_portscan = get_concap_dataset("concap_portscan.csv")
concap_portscan

In [None]:
cic_concap, concap_cic = concap_cic_experiment(benign, cic_portscan, concap_portscan)

In [None]:
cic_concap.sort_values("ROC AUC Score", ascending=False).head(10)

In [None]:
concap_cic.sort_values("ROC AUC Score", ascending=False).head(10)