In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score
from fastcore.basics import *
from fastcore.parallel import *
from os import cpu_count
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
drop_columns = [
    "id",
    "Flow ID",        
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
    "Attempted Category",
]

def xs_y(df, targ): 
    if not isinstance(targ, list):
        xs = df[df.columns.difference([targ])].copy()
    else:
        xs = df[df.columns.difference(targ)].copy()
    y = df[targ].copy()
    return xs, y


def get_concap_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "concap/" + file)
    # clean the dataset
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df["Label"] = 1

    return df

def get_cic_2018_dataset(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          "cic_2018/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

def get_cic_2017_dataset(file):
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "jozefjankaj/thesis-files",
        "cic/" + file)
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

def get_extension(file):
    df = kagglehub.load_dataset(
          KaggleDatasetAdapter.PANDAS,
          "jozefjankaj/thesis-files",
          file,
    )
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df.drop(columns=["category", "subcategory", "label"], inplace=True, errors='ignore')

    return df


def train_verify_one_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, feature: str):
    root = DecisionTreeClassifier(max_depth=1, criterion='gini')
    train_x, train_y = xs_y(train_df, "Label")
    test_x, test_y = xs_y(test_df, "Label")
    root.fit(train_x[feature].array.reshape(-1,1), train_y)

    predictions = root.predict(test_x[feature].array.reshape(-1,1))
    return feature, roc_auc_score(test_y, predictions), accuracy_score(test_y, predictions), precision_score(test_y, predictions), recall_score(test_y, predictions)

def concap_cic_experiment(benign_df: pd.DataFrame, cic: pd.DataFrame, concap: pd.DataFrame):
    # Train on CIC => Predict ConCap
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)

    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    columns = ["Feature", "ROC AUC Score", "Accuracy", "Precision", "Recall"]

    measurements_cic_concap = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                           columns=columns
                                          )


    # Train on ConCap => Predict CIC
    benign_balance = benign.sample(n=cic.shape[0])
    training_df = pd.concat([benign_balance, concap]).sample(frac=1).reset_index(drop=True)
    testing_df = pd.concat([benign_balance, cic]).sample(frac=1).reset_index(drop=True)
    
    train_x, train_y = xs_y(training_df, "Label")
    test_x, test_y = xs_y(testing_df, "Label")

    measurements_concap_cic = pd.DataFrame([train_verify_one_feature(training_df, testing_df, feature) for feature in training_df.columns if feature != "Label"],
                                          columns=columns
                                          )
    return measurements_cic_concap, measurements_concap_cic

In [14]:
benign = get_cic_2017_dataset("monday.csv")

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=17&file_name=cic/monday.csv...


100%|████████████████████████████████████████████████████████████████████████████| 198M/198M [00:08<00:00, 25.3MB/s]


In [16]:
benign["Label"] = 0
benign

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,0,119719148,231,0,0,0,0,0,0.000000,0.000000,...,22509459,17,12685486.0,5.296658e+06,20694308,6499982,-1,-1,0,0
1,17,65511209,6,6,288,288,48,48,48.000000,0.000000,...,1506210,1506210,64004884.0,0.000000e+00,64004884,64004884,-1,-1,0,0
2,17,113976922,267,0,20447,0,153,37,76.580524,44.140625,...,10983883,14,25498178.0,1.883305e+07,48523116,5463561,-1,-1,0,0
3,17,67037196,8,8,384,384,48,48,48.000000,0.000000,...,11034681,11034681,55956316.0,0.000000e+00,55956316,55956316,-1,-1,0,0
4,17,68045057,8,8,384,384,48,48,48.000000,0.000000,...,11043596,11043596,56943904.0,0.000000e+00,56943904,56943904,-1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371618,6,5571687,4,2,0,187,0,0,0.000000,0.000000,...,108511,108511,5354085.0,0.000000e+00,5354085,5354085,-1,-1,5571687,0
371619,6,63255945,8,5,169,123,46,0,21.125000,23.111144,...,223699,223699,58780167.0,0.000000e+00,58780167,58780167,-1,-1,183308410,0
371620,17,163,2,2,104,220,52,52,52.000000,0.000000,...,0,0,0.0,0.000000e+00,0,0,-1,-1,0,0
371621,6,4476954,8,9,577,4039,342,0,72.125000,136.576967,...,0,0,0.0,0.000000e+00,0,0,-1,-1,4476954,0


# FTP Patator

In [3]:
tuesday = get_cic_2017_dataset("tuesday.csv")

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/jozefjankaj/thesis-files?dataset_version_number=17&file_name=cic/tuesday.csv...


100%|████████████████████████████████████████████████████████████████████████████| 170M/170M [00:08<00:00, 22.2MB/s]


In [4]:
tuesday["Label"].unique()

array(['BENIGN', 'FTP-Patator - Attempted', 'FTP-Patator', 'SSH-Patator',
       'SSH-Patator - Attempted'], dtype=object)

In [5]:
cic_2017_patator = tuesday[tuesday["Label"] == "FTP-Patator"].copy()
cic_2017_patator["Label"] = 1
cic_2017_patator

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
4630,6,4008190,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4008190,1
4631,6,4018946,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4018946,1
4632,6,4067119,6,6,34,76,20,0,5.666667,8.981462,...,0,0,0.0,0.0,0,0,-1,-1,4067119,1
4633,6,4015015,6,6,30,76,16,0,5.000000,7.771744,...,0,0,0.0,0.0,0,0,-1,-1,4015015,1
4634,6,4099543,6,6,31,76,17,0,5.166667,8.060190,...,0,0,0.0,0.0,0,0,-1,-1,4099543,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321766,6,9567017,11,17,135,188,34,0,12.272727,11.384998,...,0,0,0.0,0.0,0,0,-1,-1,9567017,1
321824,6,8627604,11,17,125,188,25,0,11.363636,9.871907,...,0,0,0.0,0.0,0,0,-1,-1,8627604,1
321857,6,8438103,11,17,116,188,20,0,10.545455,8.721968,...,0,0,0.0,0.0,0,0,-1,-1,8438103,1
321880,6,8958671,11,17,124,188,24,0,11.272727,9.654956,...,0,0,0.0,0.0,0,0,-1,-1,8958671,1


In [10]:
patator_non_persistent = get_extension("ftp_not_persistent.csv")

  df = kagglehub.load_dataset(


In [12]:
patator_non_persistent["Label"] = 1
patator_non_persistent

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label
0,6,7765,4,4,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,7765,1
1,6,8932,4,4,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,8932,1
2,6,9109,5,5,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,9109,1
3,6,12650,5,5,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,12650,1
4,6,6574,5,5,0.0,64.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,-1,-1,6574,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,6,1014848,7,8,29.0,76.0,16.0,0.0,4.142857,7.128080,...,0,0,0,0,0,0,-1,-1,1014848,1
16996,6,1043760,7,7,25.0,76.0,14.0,0.0,3.571429,6.160550,...,0,0,0,0,0,0,-1,-1,1043760,1
16997,6,1015409,7,7,29.0,76.0,17.0,0.0,4.142857,7.221001,...,0,0,0,0,0,0,-1,-1,1015409,1
16998,6,1021941,7,7,26.0,76.0,13.0,0.0,3.714286,6.343350,...,0,0,0,0,0,0,-1,-1,1021941,1


In [17]:
result, _ = concap_cic_experiment(benign, cic_2017_patator, patator_non_persistent)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall
0,Protocol,0.792800,0.921514,0.911724,1.0
1,Flow Duration,0.393253,0.148961,0.000000,0.0
2,Total Fwd Packet,0.378147,0.143239,0.000000,0.0
3,Total Bwd packets,0.450025,0.170465,0.000000,0.0
4,Total Length of Fwd Packet,0.275428,0.104330,0.000000,0.0
...,...,...,...,...,...
77,Idle Max,0.630665,0.860099,0.852814,1.0
78,Idle Min,0.630665,0.860099,0.852814,1.0
79,ICMP Code,0.500000,0.189395,0.000000,0.0
80,ICMP Type,0.500000,0.189395,0.000000,0.0


In [18]:
result.sort_values("ROC AUC Score", ascending=False)

Unnamed: 0,Feature,ROC AUC Score,Accuracy,Precision,Recall
54,Average Packet Size,0.977216,0.991369,0.989465,1.000000
42,Packet Length Mean,0.977216,0.991369,0.989465,1.000000
64,Subflow Fwd Bytes,0.974950,0.990511,0.988430,1.000000
55,Fwd Segment Size Avg,0.974320,0.990273,0.988142,1.000000
8,Fwd Packet Length Mean,0.974320,0.990273,0.988142,1.000000
...,...,...,...,...,...
18,Flow IAT Max,0.351460,0.133130,0.000000,0.000000
4,Total Length of Fwd Packet,0.275428,0.104330,0.000000,0.000000
68,Bwd Init Win Bytes,0.195271,0.074003,0.000413,0.000059
19,Flow IAT Min,0.190329,0.076626,0.047455,0.007294
