In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [105]:
# Column names removed from every frame by the loader functions in this cell.
# Both the long ("Source IP") and the short ("Src IP") spellings are listed;
# the loaders drop with errors='ignore', so names absent from a file are skipped.
drop_columns = [
    "id",
    "Flow ID",
    "Source IP",
    "Src IP",
    "Source Port",
    "Src Port",
    "Destination IP",
    "Dst IP",
    "Destination Port",
    "Dst Port",
    "Timestamp",
    "Attempted Category",
]

def xs_y(df, targ):
    """Split ``df`` into a feature frame and a target.

    Args:
        df: Source ``pandas.DataFrame``.
        targ: A single column name, or a list of column names, to use as target.

    Returns:
        ``(xs, y)`` where ``xs`` is a copy of every column of ``df`` not named
        in ``targ`` and ``y`` is a copy of ``df[targ]`` (a Series for a single
        name, a DataFrame for a list). The columns of ``xs`` come from
        ``Index.difference``, so they are in sorted order rather than in the
        original column order.
    """
    # Only a real list is treated as "several targets"; anything else is one name.
    target_names = targ if isinstance(targ, list) else [targ]
    feature_names = df.columns.difference(target_names)
    return df[feature_names].copy(), df[targ].copy()


def get_concap_dataset(file):
    """Load one CSV from the ``concap/`` folder of the thesis Kaggle dataset and clean it.

    Cleaning steps, in order: drop the columns listed in ``drop_columns`` and
    the ``category`` / ``subcategory`` / ``label`` columns (missing ones are
    ignored), turn +/-inf into NaN, drop rows containing NaN, drop duplicate
    rows. Finally a ``Label`` column is set to 1 on every remaining row (the
    value this notebook assigns to attack traffic). The original row index is
    kept.

    Args:
        file: File name relative to ``concap/``.

    Returns:
        The cleaned ``pandas.DataFrame`` with ``Label == 1`` on all rows.
    """
    # dataset_load is the current loader name; the notebook output shows a
    # warning pointing at the older load_dataset call.
    df = kagglehub.dataset_load(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "concap/" + file)
    # clean the dataset
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')
    # Same inf handling as the CIC and extension loaders: infinite values are
    # turned into NaN so dropna removes them before they reach a model.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df["Label"] = 1

    return df

def get_cic_2017_dataset(file):
    """Load one CSV from the ``cic/`` folder of the thesis Kaggle dataset and clean it.

    Cleaning steps, in order: drop the columns listed in ``drop_columns``
    (missing ones are ignored), turn +/-inf into NaN, drop rows containing
    NaN, drop duplicate rows. The original row index is kept and the file's
    ``Label`` column is not modified here.

    Args:
        file: File name relative to ``cic/``, e.g. ``"monday.csv"``.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # dataset_load is the current loader name; the notebook output shows a
    # warning pointing at the older load_dataset call.
    df = kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        "jozefjankaj/thesis-files",
        "cic/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    # inf -> NaN first so that dropna removes those rows as well
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df
    
def get_extension(file):
    """Load one CSV from the ``extensions/`` folder of the thesis Kaggle dataset and clean it.

    Cleaning steps, in order: drop the columns listed in ``drop_columns``
    (missing ones are ignored), turn +/-inf into NaN, drop rows containing
    NaN, drop duplicate rows, then drop the ``category`` / ``subcategory`` /
    ``label`` columns if present. The original row index is kept; no
    ``Label`` column is added here.

    Args:
        file: File name relative to ``extensions/``, e.g. ``"loic_udp.csv"``.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # dataset_load is the current loader name; the notebook output shows a
    # warning pointing at the older load_dataset call.
    df = kagglehub.dataset_load(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "extensions/" + file,
    )
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    # inf -> NaN first so that dropna removes those rows as well
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    # NOTE(review): the metadata columns are removed only after dropna /
    # drop_duplicates, so NaNs or differences in them still influence which
    # rows survive (get_concap_dataset drops them first) - confirm intended.
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')

    return df


def train_verify_one_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, feature: str):
    """Fit a depth-1 decision tree on a single feature and score it on ``test_df``.

    Both frames must contain a ``Label`` column (the target) and a column
    named ``feature``. Only those two columns are read, so the frames are not
    copied; this function is typically called once per feature.

    Args:
        train_df: Frame used to fit the tree.
        test_df: Frame used to evaluate the tree.
        feature: Name of the single predictor column.

    Returns:
        ``(feature, roc_auc, accuracy, precision, recall)``. All metrics are
        computed from the hard class predictions of the tree.

    Raises:
        KeyError: If ``feature`` is ``"Label"`` or is missing from a frame.
    """
    if feature == "Label":
        # The target itself is not a valid predictor.
        raise KeyError(feature)

    # random_state matches the other depth-1 trees in this notebook.
    root = DecisionTreeClassifier(max_depth=1, criterion='gini', random_state=42)

    # sklearn expects a 2-D (n_samples, 1) array for a single feature.
    train_column = train_df[feature].to_numpy().reshape(-1, 1)
    test_column = test_df[feature].to_numpy().reshape(-1, 1)
    train_y = train_df["Label"]
    test_y = test_df["Label"]

    root.fit(train_column, train_y)
    predictions = root.predict(test_column)

    return (
        feature,
        roc_auc_score(test_y, predictions),
        accuracy_score(test_y, predictions),
        precision_score(test_y, predictions),
        recall_score(test_y, predictions),
    )

def concap_cic_experiment(benign_df: pd.DataFrame, cic: pd.DataFrame, concap: pd.DataFrame):
    """Score every single feature when training on CIC and testing on ConCap, and vice versa.

    Each attack frame is paired with an equally sized random sample of
    ``benign_df``, shuffled and re-indexed. ``train_verify_one_feature`` is
    then run for every column except ``Label``, once with the CIC mix as the
    training set and once with the ConCap mix as the training set.

    Args:
        benign_df: Benign flows; must have at least as many rows as ``cic``
            and as ``concap`` (``DataFrame.sample`` is called without
            replacement).
        cic: CIC attack flows including a ``Label`` column.
        concap: ConCap attack flows including a ``Label`` column.

    Returns:
        ``(measurements_cic_concap, measurements_concap_cic)``: two frames
        with the columns ``Feature``, ``ROC AUC Score``, ``Accuracy``,
        ``Precision`` and ``Recall`` - the first for train-on-CIC /
        test-on-ConCap, the second for the reverse direction.
    """
    columns = ["Feature", "ROC AUC Score", "Accuracy", "Precision", "Recall"]

    def balanced_mix(attack_df):
        # As many benign rows as attack rows, then shuffle and renumber.
        benign_part = benign_df.sample(n=attack_df.shape[0])
        return pd.concat([benign_part, attack_df]).sample(frac=1).reset_index(drop=True)

    def sweep_features(train_df, test_df):
        # One result row per predictor column of the training frame.
        rows = [
            train_verify_one_feature(train_df, test_df, feature)
            for feature in train_df.columns
            if feature != "Label"
        ]
        return pd.DataFrame(rows, columns=columns)

    cic_mix = balanced_mix(cic)
    concap_mix = balanced_mix(concap)

    # Train on CIC => Predict ConCap
    measurements_cic_concap = sweep_features(cic_mix, concap_mix)
    # Train on ConCap => Predict CIC, same two mixes with the roles swapped
    measurements_concap_cic = sweep_features(concap_mix, cic_mix)
    return measurements_cic_concap, measurements_concap_cic

def baseline_experiment(benign_df, cic, adversarial, feature):
    """Train a one-feature stump on CIC attacks only and score it on two test sets.

    A random half of ``cic`` plus an equal number of benign rows forms the
    training set. The fitted depth-1 tree is evaluated on (1) all of
    ``adversarial`` mixed with an equal number of benign rows and (2) the
    other half of ``cic`` mixed with an equal number of benign rows.

    NOTE(review): the benign rows for the three sets are drawn independently
    from the same ``benign_df``, so the same benign row can appear in both
    training and test data.

    Args:
        benign_df: Benign flows with a ``Label`` column.
        cic: CIC attack flows with a ``Label`` column.
        adversarial: Modified attack flows with a ``Label`` column.
        feature: Name of the single predictor column.

    Returns:
        ``(auc_on_adversarial_mix, auc_on_cic_mix)``, both ROC AUC values
        computed from hard predictions.
    """
    def balanced_mix(attack_rows):
        # equal-sized benign sample + attack rows, shuffled and renumbered
        benign_rows = benign_df.sample(n=attack_rows.shape[0])
        return pd.concat([benign_rows, attack_rows]).sample(frac=1).reset_index(drop=True)

    def single_feature(frame):
        # (n_samples, 1) predictor array and the matching labels
        features, labels = xs_y(frame, "Label")
        return features[feature].array.reshape(-1, 1), labels

    cic_train_half = cic.sample(frac=0.5)
    train_set = balanced_mix(cic_train_half)
    adversarial_set = balanced_mix(adversarial)

    train_col, train_labels = single_feature(train_set)
    adversarial_col, adversarial_labels = single_feature(adversarial_set)

    stump = DecisionTreeClassifier(max_depth=1, criterion='gini', random_state=42)
    stump.fit(train_col, train_labels)
    adversarial_predictions = stump.predict(adversarial_col)

    # lastly, test the model on cic traffic only (the half not used for training)
    cic_held_out = cic.drop(cic_train_half.index)
    cic_set = balanced_mix(cic_held_out)
    cic_col, cic_labels = single_feature(cic_set)
    cic_predictions = stump.predict(cic_col)

    return (
        roc_auc_score(adversarial_labels, adversarial_predictions),
        roc_auc_score(cic_labels, cic_predictions),
    )

def adversarial_experiment(benign_df, cic, adversarial, feature):
    """Train a one-feature stump on CIC plus adversarial attacks and score it on two test sets.

    Random halves of ``cic`` and ``adversarial`` plus as many benign rows as
    both halves together form the training set. The fitted depth-1 tree is
    evaluated on (1) the remaining halves of both attack frames mixed with an
    equal number of benign rows and (2) the remaining half of ``cic`` alone
    mixed with an equal number of benign rows.

    NOTE(review): the benign rows for the three sets are drawn independently
    from the same ``benign_df``, so the same benign row can appear in both
    training and test data.

    Args:
        benign_df: Benign flows with a ``Label`` column.
        cic: CIC attack flows with a ``Label`` column.
        adversarial: Modified attack flows with a ``Label`` column.
        feature: Name of the single predictor column.

    Returns:
        ``(auc_on_mixed_test, auc_on_cic_only_test)``, both ROC AUC values
        computed from hard predictions.
    """
    def shuffled(parts):
        # concatenate, shuffle all rows, renumber from 0
        return pd.concat(parts).sample(frac=1).reset_index(drop=True)

    def single_feature(frame):
        # (n_samples, 1) predictor array and the matching labels
        features, labels = xs_y(frame, "Label")
        return features[feature].array.reshape(-1, 1), labels

    cic_train_half = cic.sample(frac=0.5)
    adversarial_train_half = adversarial.sample(frac=0.5)
    benign_train = benign_df.sample(n=cic_train_half.shape[0] + adversarial_train_half.shape[0])
    train_set = shuffled([benign_train, cic_train_half, adversarial_train_half])

    cic_held_out = cic.drop(cic_train_half.index)
    adversarial_held_out = adversarial.drop(adversarial_train_half.index)
    benign_test = benign_df.sample(n=cic_held_out.shape[0] + adversarial_held_out.shape[0])
    mixed_test_set = shuffled([benign_test, cic_held_out, adversarial_held_out])

    train_col, train_labels = single_feature(train_set)
    mixed_col, mixed_labels = single_feature(mixed_test_set)

    stump = DecisionTreeClassifier(max_depth=1, criterion='gini', random_state=42)
    stump.fit(train_col, train_labels)
    mixed_predictions = stump.predict(mixed_col)

    # lastly, test the performance of the model on the held-out cic traffic only
    benign_for_cic = benign_df.sample(n=cic_held_out.shape[0])
    cic_test_set = shuffled([benign_for_cic, cic_held_out])
    cic_col, cic_labels = single_feature(cic_test_set)
    cic_predictions = stump.predict(cic_col)

    return (
        roc_auc_score(mixed_labels, mixed_predictions),
        roc_auc_score(cic_labels, cic_predictions),
    )

def robustness_experiment(benign_df, cic, adversarial, feature):
    """Run the baseline and then the adversarial experiment on the same inputs.

    Returns:
        ``(baseline_result, adversarial_result)`` - the return values of
        ``baseline_experiment`` and ``adversarial_experiment``, in that order.
    """
    shared_args = (benign_df, cic, adversarial, feature)
    # Baseline runs first; both experiments draw random samples.
    baseline_result = baseline_experiment(*shared_args)
    adversarial_result = adversarial_experiment(*shared_args)
    return baseline_result, adversarial_result

def calculate_average_results(benign_df, cic, adversarial, feature, n_runs=10):
    """Average the four ROC AUC scores of ``robustness_experiment`` over several runs.

    Args:
        benign_df: Benign flows passed through to ``robustness_experiment``.
        cic: CIC attack flows passed through to ``robustness_experiment``.
        adversarial: Modified attack flows passed through to ``robustness_experiment``.
        feature: Name of the single predictor column.
        n_runs: Number of repetitions to average over; must be positive.

    Returns:
        ``(base, base_cic, adverse, adverse_cic)``: the mean over ``n_runs``
        runs of the baseline model's score on the adversarial mix, the
        baseline model's score on the CIC-only mix, the adversarially trained
        model's score on its mixed test set and its score on the CIC-only mix.

    Raises:
        ValueError: If ``n_runs`` is not positive.
    """
    if n_runs <= 0:
        raise ValueError("n_runs must be a positive integer")

    base_total = 0.0
    base_cic_total = 0.0
    adverse_total = 0.0
    adverse_cic_total = 0.0
    for _ in range(n_runs):
        (base, base_cic), (adverse, adverse_cic) = robustness_experiment(benign_df, cic, adversarial, feature)
        base_total += base
        base_cic_total += base_cic
        adverse_total += adverse
        adverse_cic_total += adverse_cic

    # Divide by the actual number of runs, not a fixed 10.
    return (
        base_total / n_runs,
        base_cic_total / n_runs,
        adverse_total / n_runs,
        adverse_cic_total / n_runs,
    )
        

In [7]:
# Load and clean monday.csv from the CIC files; this frame is the benign pool
# passed to every experiment below.
benign = get_cic_2017_dataset("monday.csv")

  df = kagglehub.load_dataset(


In [8]:
# Overwrite Label with 0 on every row.
# NOTE(review): this assumes monday.csv contains benign traffic only - confirm.
benign["Label"] = 0
benign

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


# Bruteforce

## FTP

In [106]:
# Load tuesday.csv and list the distinct label strings it contains.
tuesday = get_cic_2017_dataset("tuesday.csv")
tuesday["Label"].unique()

  df = kagglehub.load_dataset(


array(['BENIGN', 'FTP-Patator - Attempted', 'FTP-Patator', 'SSH-Patator',
       'SSH-Patator - Attempted'], dtype=object)

In [107]:
# Keep only rows labelled exactly "FTP-Patator" (the "- Attempted" variant is
# excluded by the equality test) and relabel them as 1.
cic_ftp = tuesday[tuesday["Label"] == "FTP-Patator"].copy()
cic_ftp["Label"] = 1

In [108]:
# Load the extension capture ftp_not_persistent.csv and label every row 1.
ftp_non_persist = get_extension("ftp_not_persistent.csv")
ftp_non_persist["Label"] = 1
ftp_non_persist

  df = kagglehub.load_dataset(


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,7765,4,4,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,7765,1
1,6,8932,4,4,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,8932,1
2,6,9109,5,5,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,9109,1
3,6,12650,5,5,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,12650,1
4,6,6574,5,5,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,6574,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,6,1014848,7,8,29.0,76.0,16.0,0.0,4.142857,7.128080,...,0,0,0,0,0,0,-1,-1,1014848,1
16996,6,1043760,7,7,25.0,76.0,14.0,0.0,3.571429,6.160550,...,0,0,0,0,0,0,-1,-1,1043760,1
16997,6,1015409,7,7,29.0,76.0,17.0,0.0,4.142857,7.221001,...,0,0,0,0,0,0,-1,-1,1015409,1
16998,6,1021941,7,7,26.0,76.0,13.0,0.0,3.714286,6.343350,...,0,0,0,0,0,0,-1,-1,1021941,1


In [113]:
# FTP experiment on the single feature "Bwd RST Flags".
# calculate_average_results is called with its default n_runs=10, which is the
# "10 runs" in the printed header. "Base" / "Adversarial" are the scores on the
# adversarial-containing test sets, "... CIC" the scores on CIC-only test sets.
base, base_cic, adversarial, adverse_cic = calculate_average_results(benign, cic_ftp, ftp_non_persist, "Bwd RST Flags")
print("Average ROC AUC Scores across 10 runs")
print("Base: ", round(base, 5), "Adversarial: ", round(adversarial, 5), "Difference: ", round(adversarial - base, 5))
print("Base CIC: ", round(base_cic, 5), "Adversarial CIC: ", round(adverse_cic, 5), "Difference: ", round(adverse_cic - base_cic, 5))


Average ROC AUC Scores across 10 runs
Base:  0.49822 Adversarial:  0.59236 Difference:  0.09414
Base CIC:  0.99577 Adversarial CIC:  0.99615 Difference:  0.00038


# GoldenEye - Post

In [114]:
# Load and clean wednesday.csv from the CIC files.
wednesday = get_cic_2017_dataset("wednesday.csv")

  df = kagglehub.load_dataset(


In [115]:
# Load the extension capture goldeneye_post.csv and label every row 1.
goldeneye_post = get_extension("goldeneye_post.csv")
goldeneye_post["Label"] = 1

  df = kagglehub.load_dataset(


In [116]:
# Keep only rows labelled exactly "DoS GoldenEye" and relabel them as 1.
cic_goldeneye = wednesday[wednesday["Label"] == "DoS GoldenEye"].copy()
cic_goldeneye["Label"] = 1

In [123]:
# GoldenEye experiment on the single feature "Bwd Packet Length Std".
# calculate_average_results is called with its default n_runs=10, which is the
# "10 runs" in the printed header. This cell rebinds base / base_cic /
# adversarial / adverse_cic.
base, base_cic, adversarial, adverse_cic = calculate_average_results(benign, cic_goldeneye, goldeneye_post, "Bwd Packet Length Std")
print("Average ROC AUC Scores across 10 runs")
print("Base: ", round(base, 5), "Adversarial: ", round(adversarial, 5), "Difference: ", round(adversarial - base, 5))
print("Base CIC: ", round(base_cic, 5), "Adversarial CIC: ", round(adverse_cic, 5), "Difference: ", round(adverse_cic - base_cic, 5))


Average ROC AUC Scores across 10 runs
Base:  0.4887 Adversarial:  0.8454 Difference:  0.35671
Base CIC:  0.98522 Adversarial CIC:  0.84716 Difference:  -0.13807


# LOIC UDP

In [124]:
# Load friday.csv, keep only rows labelled exactly "DDoS" and relabel them as 1.
friday = get_cic_2017_dataset("friday.csv")
cic_loic = friday[friday["Label"] == "DDoS"].copy()
cic_loic["Label"] = 1

  df = kagglehub.load_dataset(


In [125]:
# Load the extension capture loic_udp.csv and label every row 1.
loic_udp = get_extension("loic_udp.csv")
loic_udp["Label"] = 1

  df = kagglehub.load_dataset(


In [126]:
# LOIC experiment on the single feature "Fwd Seg Size Min".
# calculate_average_results is called with its default n_runs=10, which is the
# "10 runs" in the printed header. This cell rebinds base / base_cic /
# adversarial / adverse_cic.
base, base_cic, adversarial, adverse_cic = calculate_average_results(benign, cic_loic, loic_udp, "Fwd Seg Size Min")
print("Average ROC AUC Scores across 10 runs")
print("Base: ", round(base, 5), "Adversarial: ", round(adversarial, 5), "Difference: ", round(adversarial - base, 5))
print("Base CIC: ", round(base_cic, 5), "Adversarial CIC: ", round(adverse_cic, 5), "Difference: ", round(adverse_cic - base_cic, 5))

Average ROC AUC Scores across 10 runs
Base:  0.2951 Adversarial:  0.78802 Difference:  0.49292
Base CIC:  0.78831 Adversarial CIC:  0.78879 Difference:  0.00048
