In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Show floats with 4 decimal places in rendered frames and exported tables.
# The fully qualified key is used on purpose: in recent pandas releases the
# bare pattern 'precision' also matches 'styler.format.precision', which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 4)

In [2]:
# Per-dataset loading configuration, keyed by a short dataset name.
#
# Every preset is a positional list with seven fields:
#   [0] bool  - True when the dataset ships as two files (fields 1 and 2) that
#               are read separately and concatenated by the loading cell
#   [1] str   - path of the first (or only) feather file
#   [2] str   - path of the second feather file, or None for single-file sets
#   [3] list  - column names; presumably the categorical feature columns
#               (not read by the visible cells - TODO confirm)
#   [4] str   - name of the label column
#   [5] str   - presumably the label value of the normal/benign class
#               (not read by the visible cells - TODO confirm)
#   [6] bool  - purpose not visible in this notebook section - TODO document
dataset_presets = {
    'nslkdd': [
        True,
        "/project/datasets/clean-ids-collection/nsl-kdd/clean/KDDTrain.feather",
        "/project/datasets/clean-ids-collection/nsl-kdd/clean/KDDTest.feather",
        ['protocol_type', 'service', 'flag'],
        'class',
        'normal',
        False,
    ],
    'unswnb15': [
        True,
        "/project/datasets/clean-ids-collection/unsw-nb15/clean/designated-train-test-sets/UNSW_NB15_training-set.feather",
        "/project/datasets/clean-ids-collection/unsw-nb15/clean/designated-train-test-sets/UNSW_NB15_testing-set.feather",
        ['proto', 'service', 'state'],
        'attack_cat',
        'normal',
        False,
    ],
    'cicddos2019': [
        False,
        "/project/datasets/clean-ids-collection/cic-ddos2019/clean/cicddos2019.feather",
        None,
        [],
        'Label',
        'Benign',
        False,
    ],
    'cicdos2017': [
        False,
        "/project/datasets/clean-ids-collection/cic-dos2017/clean/cicdos2017.feather",
        None,
        [],
        'Label',
        'Benign',
        False,
    ],
    'cicids2017': [
        False,
        "/project/datasets/clean-ids-collection/cic-ids2017/clean/cicids2017.feather",
        None,
        [],
        'Label',
        'Benign',
        False,
    ],
}

In [3]:
# Read every configured dataset into memory. Each entry appended to `datasets`
# is a list of [frame, label column name, dataset name].
datasets = []
for d, preset in dataset_presets.items():
    has_split, first_path, second_path = preset[0], preset[1], preset[2]
    if not has_split:
        # Single-file dataset: only the first path is read.
        df = pd.read_feather(first_path)
    else:
        # Two-file dataset: read both parts and stack them into one frame
        # with a fresh 0..n-1 index.
        df_train = pd.read_feather(first_path)
        df_test = pd.read_feather(second_path)
        df = pd.concat([df_train, df_test], ignore_index=True)
    label = preset[4]
    datasets.append([df, label, d])
# `df` deliberately stays bound to the last frame loaded; the preview cells
# that follow read it.
    

In [4]:
# Preview the first 10 rows of `df`, which is still bound to the last frame
# produced by the loading loop (the final key of dataset_presets).
df.head(n=10)

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,1,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,3,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,1,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,609,7,4,484,414,233,0,69.1429,111.9679,207,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
5,879,9,4,656,3064,313,0,72.8889,136.1538,1532,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
6,1160,9,6,3134,3048,1552,0,348.2222,682.4825,1518,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
7,524,7,4,2812,2820,1397,0,401.7143,679.9149,1410,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
8,6,1,1,6,6,6,6,6.0,0.0,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
9,1119,9,6,3160,3060,1565,0,351.1111,688.215,1524,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [5]:
# List the column labels of `df`, the last frame produced by the loading loop.
df.columns

Index(['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
       'Fwd Packets Length Total', 'Bwd Packets Length Total',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',
       'ACK Flag Count', 'URG Flag Count', 'ECE F

In [6]:
# Summarise the label distribution of every loaded dataset and export the
# results as LaTeX: one per-class table per dataset, plus one overview table.
data = ['Dataset', 'Unique attacks', 'Total entries', 'Non-normal entries', 'Total entries after balancing']


def _summarise_labels(frame, label_col, name):
    """Build the per-class count table and the overview record for one dataset.

    Parameters
    ----------
    frame : pandas.DataFrame
        The loaded dataset.
    label_col : str
        Name of the column in ``frame`` that holds the class label.
    name : str
        Dataset name stored in the overview record.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The count table (indexed by class, with a count column named
        ``label_col`` and a ``'fraction'`` column in percent) and a record
        whose keys are the column names listed in ``data``.
    """
    counts = frame[label_col].value_counts()
    total = counts.sum()

    # Name the count column explicitly and drop the index name. Depending on
    # the pandas version, value_counts() names its result either after the
    # source column or 'count', so relying on the implicit name breaks the
    # column lookup below on newer releases.
    counts_frame = counts.rename_axis(None).to_frame(name=label_col)
    counts_frame['fraction'] = (counts_frame[label_col] / total) * 100

    # value_counts() sorts descending, so position 0 is the most frequent class.
    # NOTE(review): this treats the most frequent class as the normal/benign
    # one - confirm that this holds for every dataset.
    # NOTE(review): 'Unique attacks' counts every distinct label value,
    # including the normal class - confirm that this is intended.
    majority = counts.iloc[0] if len(counts) else 0
    non_normal = total - majority

    record = {
        'Dataset': name,
        'Unique attacks': len(counts_frame.index),
        'Total entries': total,
        'Non-normal entries': non_normal,
        'Total entries after balancing': 2 * non_normal,
    }
    return counts_frame, record


# Collect the overview rows in a list and build the frame once afterwards;
# DataFrame.append no longer exists in pandas 2.x and was quadratic in a loop.
records = []

# NOTE(review): both files are opened in append mode, so re-running this cell
# adds the tables again - switch to 'w' if one copy per run is intended.
with open('/project/masterproef-machine-learning-for-network-intrusion-detection/scripts/tex-files/datasets.tex', 'a') as f:
    for ds in datasets:
        df_vc, entry = _summarise_labels(ds[0], ds[1], ds[2])
        records.append(entry)
        f.write(f'------------{ds[2]}------------\n')
        f.write(df_vc.to_latex(index=True))

new_df = pd.DataFrame(records, columns=data)

with open('/project/masterproef-machine-learning-for-network-intrusion-detection/scripts/tex-files/datasets_extra.tex', 'a') as f:
    f.write(new_df.to_latex(index=False))