Loading Libraries

In [117]:
import numba
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import gc  # garbage collector

from fastcore.basics import *
from fastcore.parallel import *
from numba import jit, njit, vectorize, cuda, uint32, f8, uint8

from sklearn.model_selection import train_test_split
from sklearn import metrics  # for accuracy calculation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from functools import partial
from os import cpu_count
import matplotlib.pyplot as plt

import math
from pylab import imshow, show
from timeit import default_timer as timer

from dask import dataframe as dd
from dask.distributed import Client

Setting up paths to csv files / datasets

In [118]:
# Directory that holds the CIC-DDoS-2019 CSV files (trailing separator included).
# Change this single value to point the notebook at another copy of the dataset.
DATASET_DIR = "C:\\CIC-DDOS-2019\\"

# CSV-01-12
path_DdoS_DNS = DATASET_DIR + "DrDoS_DNS.csv"
path_DdoS_MSSQL = DATASET_DIR + "DrDoS_MSSQL.csv"
path_DdoS_LDAP = DATASET_DIR + "DrDoS_LDAP.csv"
path_DdoS_NTP = DATASET_DIR + "DrDoS_NTP.csv"
path_DdoS_NetBIOS = DATASET_DIR + "DrDoS_NetBIOS.csv"
path_DdoS_SNMP = DATASET_DIR + "DrDoS_SNMP.csv"
path_DdoS_SSDP = DATASET_DIR + "DrDoS_SSDP.csv"
path_DdoS_UDP = DATASET_DIR + "DrDoS_UDP.csv"
path_Syn = DATASET_DIR + "Syn.csv"
path_TFTP = DATASET_DIR + "TFTP.csv"
path_UDPLag = DATASET_DIR + "UDPLag.csv"

# Disabled: the CSV-03-11 files are not part of this run.
# # CSV-03-11
# path__LDAP = "../CICDDoS-2019/CSV-03-11/03-11/LDAP.csv"
# path__MSSQL = "../CICDDoS-2019/CSV-03-11/03-11/MSSQL.csv"
# path__NetBIOS = "../CICDDoS-2019/CSV-03-11/03-11/NetBIOS.csv"
# path__Portmap = "../CICDDoS-2019/CSV-03-11/03-11/Portmap.csv"
# path__Syn = "../CICDDoS-2019/CSV-03-11/03-11/Syn.csv"
# path__UDP = "../CICDDoS-2019/CSV-03-11/03-11/UDP.csv"
# path__UDPLag = "../CICDDoS-2019/CSV-03-11/03-11/UDPLag.csv"

# Every active CSV-01-12 path, in the same order as the assignments above.
paths = [path_DdoS_DNS, path_DdoS_MSSQL, path_DdoS_LDAP, path_DdoS_NTP, path_DdoS_NetBIOS, path_DdoS_SNMP,
         path_DdoS_SSDP, path_DdoS_UDP, path_Syn, path_TFTP, path_UDPLag]
# , path__LDAP, path__MSSQL, path__NetBIOS,
#      path__Portmap, path__Syn, path__UDP, path__UDPLag]

Column / feature names

In [119]:
# Maps the different spellings of each column name (long form, abbreviated form, and
# underscore form such as 'Init_Win_bytes_forward') to one canonical name per feature.
# It is passed to DataFrame.rename(columns=...), so an entry that maps a name to itself
# leaves that column unchanged.
# NOTE(review): presumably the abbreviated spellings come from other CIC dataset releases — confirm.
col_name_consistency = {
    'Flow ID': 'Flow ID',
    'Source IP': 'Source IP',
    'Src IP': 'Source IP',
    'Source Port': 'Source Port',
    'Src Port': 'Source Port',
    'Destination IP': 'Destination IP',
    'Dst IP': 'Destination IP',
    'Destination Port': 'Destination Port',
    'Dst Port': 'Destination Port',
    'Protocol': 'Protocol',
    'Timestamp': 'Timestamp',
    'Flow Duration': 'Flow Duration',
    'Total Fwd Packets': 'Total Fwd Packets',
    'Tot Fwd Pkts': 'Total Fwd Packets',
    'Total Backward Packets': 'Total Backward Packets',
    'Tot Bwd Pkts': 'Total Backward Packets',
    'Total Length of Fwd Packets': 'Fwd Packets Length Total',
    'TotLen Fwd Pkts': 'Fwd Packets Length Total',
    'Total Length of Bwd Packets': 'Bwd Packets Length Total',
    'TotLen Bwd Pkts': 'Bwd Packets Length Total',
    'Fwd Packet Length Max': 'Fwd Packet Length Max',
    'Fwd Pkt Len Max': 'Fwd Packet Length Max',
    'Fwd Packet Length Min': 'Fwd Packet Length Min',
    'Fwd Pkt Len Min': 'Fwd Packet Length Min',
    'Fwd Packet Length Mean': 'Fwd Packet Length Mean',
    'Fwd Pkt Len Mean': 'Fwd Packet Length Mean',
    'Fwd Packet Length Std': 'Fwd Packet Length Std',
    'Fwd Pkt Len Std': 'Fwd Packet Length Std',
    'Bwd Packet Length Max': 'Bwd Packet Length Max',
    'Bwd Pkt Len Max': 'Bwd Packet Length Max',
    'Bwd Packet Length Min': 'Bwd Packet Length Min',
    'Bwd Pkt Len Min': 'Bwd Packet Length Min',
    'Bwd Packet Length Mean': 'Bwd Packet Length Mean',
    'Bwd Pkt Len Mean': 'Bwd Packet Length Mean',
    'Bwd Packet Length Std': 'Bwd Packet Length Std',
    'Bwd Pkt Len Std': 'Bwd Packet Length Std',
    'Flow Bytes/s': 'Flow Bytes/s',
    'Flow Byts/s': 'Flow Bytes/s',
    'Flow Packets/s': 'Flow Packets/s',
    'Flow Pkts/s': 'Flow Packets/s',
    'Flow IAT Mean': 'Flow IAT Mean',
    'Flow IAT Std': 'Flow IAT Std',
    'Flow IAT Max': 'Flow IAT Max',
    'Flow IAT Min': 'Flow IAT Min',
    'Fwd IAT Total': 'Fwd IAT Total',
    'Fwd IAT Tot': 'Fwd IAT Total',
    'Fwd IAT Mean': 'Fwd IAT Mean',
    'Fwd IAT Std': 'Fwd IAT Std',
    'Fwd IAT Max': 'Fwd IAT Max',
    'Fwd IAT Min': 'Fwd IAT Min',
    'Bwd IAT Total': 'Bwd IAT Total',
    'Bwd IAT Tot': 'Bwd IAT Total',
    'Bwd IAT Mean': 'Bwd IAT Mean',
    'Bwd IAT Std': 'Bwd IAT Std',
    'Bwd IAT Max': 'Bwd IAT Max',
    'Bwd IAT Min': 'Bwd IAT Min',
    'Fwd PSH Flags': 'Fwd PSH Flags',
    'Bwd PSH Flags': 'Bwd PSH Flags',
    'Fwd URG Flags': 'Fwd URG Flags',
    'Bwd URG Flags': 'Bwd URG Flags',
    'Fwd Header Length': 'Fwd Header Length',
    'Fwd Header Len': 'Fwd Header Length',
    'Bwd Header Length': 'Bwd Header Length',
    'Bwd Header Len': 'Bwd Header Length',
    'Fwd Packets/s': 'Fwd Packets/s',
    'Fwd Pkts/s': 'Fwd Packets/s',
    'Bwd Packets/s': 'Bwd Packets/s',
    'Bwd Pkts/s': 'Bwd Packets/s',
    'Min Packet Length': 'Packet Length Min',
    'Pkt Len Min': 'Packet Length Min',
    'Max Packet Length': 'Packet Length Max',
    'Pkt Len Max': 'Packet Length Max',
    'Packet Length Mean': 'Packet Length Mean',
    'Pkt Len Mean': 'Packet Length Mean',
    'Packet Length Std': 'Packet Length Std',
    'Pkt Len Std': 'Packet Length Std',
    'Packet Length Variance': 'Packet Length Variance',
    'Pkt Len Var': 'Packet Length Variance',
    'FIN Flag Count': 'FIN Flag Count',
    'FIN Flag Cnt': 'FIN Flag Count',
    'SYN Flag Count': 'SYN Flag Count',
    'SYN Flag Cnt': 'SYN Flag Count',
    'RST Flag Count': 'RST Flag Count',
    'RST Flag Cnt': 'RST Flag Count',
    'PSH Flag Count': 'PSH Flag Count',
    'PSH Flag Cnt': 'PSH Flag Count',
    'ACK Flag Count': 'ACK Flag Count',
    'ACK Flag Cnt': 'ACK Flag Count',
    'URG Flag Count': 'URG Flag Count',
    'URG Flag Cnt': 'URG Flag Count',
    'CWE Flag Count': 'CWE Flag Count',
    'CWE Flag Cnt': 'CWE Flag Count',
    'ECE Flag Count': 'ECE Flag Count',
    'ECE Flag Cnt': 'ECE Flag Count',
    'Down/Up Ratio': 'Down/Up Ratio',
    'Average Packet Size': 'Avg Packet Size',
    'Pkt Size Avg': 'Avg Packet Size',
    'Avg Fwd Segment Size': 'Avg Fwd Segment Size',
    'Fwd Seg Size Avg': 'Avg Fwd Segment Size',
    'Avg Bwd Segment Size': 'Avg Bwd Segment Size',
    'Bwd Seg Size Avg': 'Avg Bwd Segment Size',
    'Fwd Avg Bytes/Bulk': 'Fwd Avg Bytes/Bulk',
    'Fwd Byts/b Avg': 'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk': 'Fwd Avg Packets/Bulk',
    'Fwd Pkts/b Avg': 'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate': 'Fwd Avg Bulk Rate',
    'Fwd Blk Rate Avg': 'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk': 'Bwd Avg Bytes/Bulk',
    'Bwd Byts/b Avg': 'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk': 'Bwd Avg Packets/Bulk',
    'Bwd Pkts/b Avg': 'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate': 'Bwd Avg Bulk Rate',
    'Bwd Blk Rate Avg': 'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets': 'Subflow Fwd Packets',
    'Subflow Fwd Pkts': 'Subflow Fwd Packets',
    'Subflow Fwd Bytes': 'Subflow Fwd Bytes',
    'Subflow Fwd Byts': 'Subflow Fwd Bytes',
    'Subflow Bwd Packets': 'Subflow Bwd Packets',
    'Subflow Bwd Pkts': 'Subflow Bwd Packets',
    'Subflow Bwd Bytes': 'Subflow Bwd Bytes',
    'Subflow Bwd Byts': 'Subflow Bwd Bytes',
    'Init_Win_bytes_forward': 'Init Fwd Win Bytes',
    'Init Fwd Win Byts': 'Init Fwd Win Bytes',
    'Init_Win_bytes_backward': 'Init Bwd Win Bytes',
    'Init Bwd Win Byts': 'Init Bwd Win Bytes',
    'act_data_pkt_fwd': 'Fwd Act Data Packets',
    'Fwd Act Data Pkts': 'Fwd Act Data Packets',
    'min_seg_size_forward': 'Fwd Seg Size Min',
    'Fwd Seg Size Min': 'Fwd Seg Size Min',
    'Active Mean': 'Active Mean',
    'Active Std': 'Active Std',
    'Active Max': 'Active Max',
    'Active Min': 'Active Min',
    'Idle Mean': 'Idle Mean',
    'Idle Std': 'Idle Std',
    'Idle Max': 'Idle Max',
    'Idle Min': 'Idle Min',
    'Label': 'Label'
}

The following columns are expected to have little significance for the model

In [120]:
# Columns removed from the frame before any further processing.
drop_columns = [  # this list includes all spellings across CIC NIDS datasets
    "Flow ID",
    'Fwd Header Length.1',
    "Source IP",
    "Src IP",
    "Source Port",
    "Src Port",
    "Destination IP",
    "Dst IP",
    "Destination Port",
    "Dst Port",
    "Timestamp",
    "Unnamed: 0",
    "Inbound",
    "SimillarHTTP"  # CIC-DDoS other undocumented columns
]
len(drop_columns)  # the IP and port columns are each listed under two spellings (e.g. "Source IP" and "Src IP")

14

In [121]:
from tensorflow.python.client import device_lib

# Print every local device TensorFlow reports, then display the physical GPU devices only.
print(device_lib.list_local_devices())
tf.config.experimental.list_physical_devices('GPU')
# Disabled experiment: reading a CSV with cuDF instead of dask.
# import cudf
# dtypes = {'SimillarHTTP': 'object'}
# df_gpu = cudf.read_csv(path_TFTP, blocksize=50e6, low_memory=False, dtype=dtypes)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8515299740152523315
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2966106932
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6389743316616408369
physical_device_desc: "device: 0, name: NVIDIA GeForce 940MX, pci bus id: 0000:01:00.0, compute capability: 5.0"
]


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [122]:
def readHugeCsvFileAsDataFrame(file_path):
    """Read a large CSV with dask and return it as one in-memory pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read (passed straight to ``dd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The result of ``compute()`` on the dask DataFrame. The index is not
        reset here; in this notebook's output it is not unique after loading,
        so callers reset it themselves once cleaning is done.
    """
    # Text columns whose dtype is fixed up front instead of being inferred.
    text_columns = {'SimillarHTTP': 'object',
                    'Timestamp': 'object',
                    'Source IP': 'str',
                    'Destination IP': 'str',
                    'Flow ID': 'object',
                    'Label': 'object',
                    }
    dtypes = dict(text_columns)
    # The raw headers of these files carry a leading space on many columns
    # (e.g. ' Source IP', ' Timestamp', ' Label'). dtype keys are matched against the raw
    # header text, so the stripped spelling alone never applies to those columns.
    # Register the space-prefixed spelling too; keys that match no column are ignored.
    for name, dtype in text_columns.items():
        dtypes[' ' + name] = dtype
    # NOTE(review): the f_0 .. f_81 names do not appear in the headers shown in this notebook;
    # kept for compatibility with files that use generic feature names — confirm they are needed.
    for feature in [f'f_{i}' for i in range(82)]:
        dtypes[feature] = "float32"

    # No blocksize is passed, so dask chooses the partition size itself.
    dask_df = dd.read_csv(file_path, low_memory=False, dtype=dtypes)
    return dask_df.compute()

In [123]:
# Create a dask.distributed Client with default arguments and display its summary.
# NOTE(review): per the dask.distributed documentation a Client registers itself as the default
# scheduler, so dask .compute() calls made after this cell presumably run through it — confirm.
scheduler = Client()
scheduler

Perhaps you already have a cluster running?
Hosting the HTTP server on port 14424 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:14425  Dashboard: http://127.0.0.1:14424/status,Cluster  Workers: 4  Cores: 4  Memory: 8.47 GB


Without Scheduler (see the elapsed time to process)

In [124]:
%%time
# Load one attack CSV into workingDataFrame and time the read.
# Exactly one of the assignments below is meant to be active; the others are alternative inputs.
# workingDataFrame = readHugeCsvFileAsDataFrame(path_UDPLag)
# workingDataFrame = readHugeCsvFileAsDataFrame(path_Syn)
# workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_NTP)
# workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_LDAP)
workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_SSDP)
# workingDataFrame.describe()
workingDataFrame

Wall time: 3min 21s


Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,72,172.16.0.5-192.168.50.1-0-0-0,172.16.0.5,0,192.168.50.1,0,0,2018-12-01 12:23:13.663425,119714230,49476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
1,55171,172.16.0.5-192.168.50.1-700-36081-17,172.16.0.5,700,192.168.50.1,36081,17,2018-12-01 12:23:13.663475,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
2,39545,172.16.0.5-192.168.50.1-701-25269-17,172.16.0.5,701,192.168.50.1,25269,17,2018-12-01 12:23:13.663526,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
3,20334,172.16.0.5-192.168.50.1-702-2533-17,172.16.0.5,702,192.168.50.1,2533,17,2018-12-01 12:23:13.663622,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
4,18397,172.16.0.5-192.168.50.1-703-34942-17,172.16.0.5,703,192.168.50.1,34942,17,2018-12-01 12:23:13.663844,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76262,56822,172.16.0.5-192.168.50.1-58448-24357-17,172.16.0.5,58448,192.168.50.1,24357,17,2018-12-01 12:36:57.627128,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
76263,66655,172.16.0.5-192.168.50.1-43130-54967-17,172.16.0.5,43130,192.168.50.1,54967,17,2018-12-01 12:36:57.627130,46,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
76264,119733,172.16.0.5-192.168.50.1-52668-58190-17,172.16.0.5,52668,192.168.50.1,58190,17,2018-12-01 12:36:57.627177,50,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP
76265,45573,172.16.0.5-192.168.50.1-53694-34958-17,172.16.0.5,53694,192.168.50.1,34958,17,2018-12-01 12:36:57.627606,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_SSDP


With Scheduler (see the elapsed time to process)

%%time
workingDataFrame = scheduler.submit(readHugeCsvFileAsDataFrame, path_UDPLag)
df = workingDataFrame.result()
df

In [125]:
# Show the raw header names; in the output several of them carry a leading space.
workingDataFrame.columns

Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [126]:
# Number of missing values in each column.
workingDataFrame.isnull().sum()

Unnamed: 0         0
Flow ID            0
 Source IP         0
 Source Port       0
 Destination IP    0
                  ..
 Idle Max          0
 Idle Min          0
SimillarHTTP       0
 Inbound           0
 Label             0
Length: 88, dtype: int64

Dropping Unnecessary Features

In [127]:
# Normalise the header first, so that the exact-name drop and rename below can match.
workingDataFrame.columns = workingDataFrame.columns.str.strip()  # sometimes there's leading / trailing whitespace
# errors='ignore' skips the names in drop_columns that this file does not contain.
workingDataFrame.drop(columns=drop_columns, inplace=True, errors='ignore')
# Map each remaining spelling to its canonical feature name.
workingDataFrame.rename(columns=col_name_consistency, inplace=True)

workingDataFrame.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,119714230,49476,214,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
1,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
2,17,1,2,0,2604.0,0.0,1302.0,1302.0,1302.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
3,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
4,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP


In [128]:
# workingDataFrame.dtypes
# Column dtypes and memory usage before downcasting ("deep" also measures object-column contents).
workingDataFrame.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2611374 entries, 0 to 76266
Data columns (total 78 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Protocol                  int64  
 1   Flow Duration             int64  
 2   Total Fwd Packets         int64  
 3   Total Backward Packets    int64  
 4   Fwd Packets Length Total  float64
 5   Bwd Packets Length Total  float64
 6   Fwd Packet Length Max     float64
 7   Fwd Packet Length Min     float64
 8   Fwd Packet Length Mean    float64
 9   Fwd Packet Length Std     float64
 10  Bwd Packet Length Max     float64
 11  Bwd Packet Length Min     float64
 12  Bwd Packet Length Mean    float64
 13  Bwd Packet Length Std     float64
 14  Flow Bytes/s              float64
 15  Flow Packets/s            float64
 16  Flow IAT Mean             float64
 17  Flow IAT Std              float64
 18  Flow IAT Max              float64
 19  Flow IAT Min              float64
 20  Fwd IAT Total             

Data Cleaning Based on Data Types (DownSizing)

In [129]:
# Shrink every 64-bit numeric column to the smallest dtype pandas can downcast it to.
for column in workingDataFrame:
    columnDtype = workingDataFrame[column].dtype
    if columnDtype == 'float64':
        workingDataFrame[column] = pd.to_numeric(workingDataFrame[column], downcast='float')
    elif columnDtype == 'int64':
        workingDataFrame[column] = pd.to_numeric(workingDataFrame[column], downcast='integer')

# Show dtypes and memory usage again to see the effect of the downcast.
workingDataFrame.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2611374 entries, 0 to 76266
Data columns (total 78 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Protocol                  int8   
 1   Flow Duration             int32  
 2   Total Fwd Packets         int32  
 3   Total Backward Packets    int16  
 4   Fwd Packets Length Total  float32
 5   Bwd Packets Length Total  float32
 6   Fwd Packet Length Max     float32
 7   Fwd Packet Length Min     float32
 8   Fwd Packet Length Mean    float32
 9   Fwd Packet Length Std     float32
 10  Bwd Packet Length Max     float32
 11  Bwd Packet Length Min     float32
 12  Bwd Packet Length Mean    float32
 13  Bwd Packet Length Std     float32
 14  Flow Bytes/s              float32
 15  Flow Packets/s            float32
 16  Flow IAT Mean             float32
 17  Flow IAT Std              float32
 18  Flow IAT Max              float32
 19  Flow IAT Min              float32
 20  Fwd IAT Total             

Removing NaN values

In [130]:
# Number of rows that contain at least one NaN.
# Infinite values are not included in this count; they are turned into NaN in the next cell.
workingDataFrame.isna().any(axis=1).sum()

2

In [131]:
# Treat +inf / -inf as missing, then drop every row that has at least one missing value.
workingDataFrame.replace([np.inf, -np.inf], np.nan, inplace=True)
workingDataFrame.dropna(inplace=True)

After dropping the NaN values, we want to see the total number of rows.
To do that, the pandas DataFrame is converted into a dask DataFrame.

In [132]:
# Wrap the cleaned frame in a 10-partition dask DataFrame, then materialise it again for display.
# NOTE(review): len(workingDataFrame) or workingDataFrame.shape would give the row count
# directly, without the round trip through dask — confirm ddf is not needed elsewhere.
ddf = dd.from_pandas(workingDataFrame, npartitions=10)
ddf.compute()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,119714230,49476,214,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
0,17,105367,4,0,1398.0,0.0,369.0,330.0,349.5,22.516661,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
0,17,1,2,0,766.0,0.0,383.0,383.0,383.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
0,17,108461,4,0,1398.0,0.0,369.0,330.0,349.5,22.516661,...,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
0,17,1,2,0,750.0,0.0,375.0,375.0,375.0,0.000000,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138057,17,215404,6,0,2088.0,0.0,393.0,321.0,348.0,35.088459,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
138058,17,218527,6,0,2088.0,0.0,393.0,321.0,348.0,35.088459,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
138059,17,1,2,0,802.0,0.0,401.0,401.0,401.0,0.000000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
138060,17,107540,4,0,1438.0,0.0,389.0,330.0,359.5,34.063667,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP


Dropping Duplicates
There should be no duplicates because they can bias training and can lead to over-optimistic estimates of classification performance during testing.

In [133]:
# Number of rows that are exact copies of an earlier row (first occurrences are not counted).
workingDataFrame.duplicated().sum()

1678578

Fully duplicate rows to be removed

In [134]:
# Keep the first occurrence of every fully identical row, then renumber the index from 0
# (drop=True discards the old index instead of adding it as a column).
workingDataFrame.drop_duplicates(inplace=True)
workingDataFrame.reset_index(inplace=True, drop=True)
workingDataFrame

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,119714230,49476,214,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
1,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
2,17,1,2,0,2604.0,0.0,1302.0,1302.0,1302.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
3,17,2,2,0,2474.0,0.0,1237.0,1237.0,1237.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
4,17,3,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890741,17,105105,4,0,1302.0,0.0,330.0,321.0,325.5,5.196152,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
890742,17,80086,4,0,1498.0,0.0,404.0,345.0,374.5,34.063667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
890743,17,80622,4,0,1498.0,0.0,404.0,345.0,374.5,34.063667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
890744,17,53,2,0,660.0,0.0,330.0,330.0,330.0,0.000000,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP


Memory usage of the DataFrame after data cleaning

In [135]:
# Dtypes, non-null counts and memory usage of the cleaned frame.
workingDataFrame.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890746 entries, 0 to 890745
Data columns (total 78 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Protocol                  890746 non-null  int8   
 1   Flow Duration             890746 non-null  int32  
 2   Total Fwd Packets         890746 non-null  int32  
 3   Total Backward Packets    890746 non-null  int16  
 4   Fwd Packets Length Total  890746 non-null  float32
 5   Bwd Packets Length Total  890746 non-null  float32
 6   Fwd Packet Length Max     890746 non-null  float32
 7   Fwd Packet Length Min     890746 non-null  float32
 8   Fwd Packet Length Mean    890746 non-null  float32
 9   Fwd Packet Length Std     890746 non-null  float32
 10  Bwd Packet Length Max     890746 non-null  float32
 11  Bwd Packet Length Min     890746 non-null  float32
 12  Bwd Packet Length Mean    890746 non-null  float32
 13  Bwd Packet Length Std     890746 non-null  f

In [136]:
from sklearn.feature_selection import VarianceThreshold

# All column names; the last one is expected to be 'Label' and is left out of the fit below.
columnList = list(workingDataFrame.columns)
# columnList[:-1]  # just for now, Omitting 'label' column

# Fit a variance filter (threshold 0.1) on the feature columns to flag constant and
# quasi-constant features.
var_thr = VarianceThreshold(threshold=0.1).fit(workingDataFrame[columnList[:-1]])

# Boolean keep-mask over the feature columns, extended with one True for the final
# column so that it is always kept.
var_threshold_bool_list = var_thr.get_support()
var_threshold_bool_list_after_label_added = np.append(var_threshold_bool_list, True)
var_threshold_bool_list_after_label_added

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True, False, False, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [137]:
# Names selected by the keep-mask; every other column is a low-variance feature to drop.
keptColumns = set(workingDataFrame.columns[var_threshold_bool_list_after_label_added])
concol = [column for column in workingDataFrame.columns if column not in keptColumns]

# List the features that are about to be removed, one per line.
for omittedFeatures in concol:
    print(omittedFeatures)

workingDataFrame.drop(columns=concol, inplace=True, errors='ignore')
workingDataFrame

Bwd IAT Min
Fwd PSH Flags
Bwd PSH Flags
Fwd URG Flags
Bwd URG Flags
FIN Flag Count
SYN Flag Count
RST Flag Count
PSH Flag Count
ACK Flag Count
URG Flag Count
CWE Flag Count
ECE Flag Count
Down/Up Ratio
Fwd Avg Bytes/Bulk
Fwd Avg Packets/Bulk
Fwd Avg Bulk Rate
Bwd Avg Bytes/Bulk
Bwd Avg Packets/Bulk
Bwd Avg Bulk Rate


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,119714230,49476,214,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
1,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
2,17,1,2,0,2604.0,0.0,1302.0,1302.0,1302.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
3,17,2,2,0,2474.0,0.0,1237.0,1237.0,1237.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
4,17,3,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.000000,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890741,17,105105,4,0,1302.0,0.0,330.0,321.0,325.5,5.196152,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
890742,17,80086,4,0,1498.0,0.0,404.0,345.0,374.5,34.063667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
890743,17,80622,4,0,1498.0,0.0,404.0,345.0,374.5,34.063667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP
890744,17,53,2,0,660.0,0.0,330.0,330.0,330.0,0.000000,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SSDP


The remaining problem is that the number of rows has been reduced significantly, but 58 of the 78 columns are still left.
So further feature selection is needed here.

In [138]:

def featueEngineeringBasedOnZero(dataFrameArg, thresholdPercentage, showPercentage):
    """Return the names of the columns whose share of zero values exceeds a threshold.

    Parameters
    ----------
    dataFrameArg : pandas.DataFrame
        Frame to inspect; every column is compared against 0.
    thresholdPercentage : number
        A column is reported when its percentage of zeros is strictly greater
        than this value.
    showPercentage : bool
        When true, print the zero count and zero percentage of every column
        that contains at least one zero.

    Returns
    -------
    list
        Column names, in frame order, that exceed the threshold. A summary line
        with the number of reported columns and the total column count is
        always printed.
    """
    rowCount = len(dataFrameArg)
    columnCount = dataFrameArg.shape[1]
    mostlyZeroFeatures = []

    for name in dataFrameArg:
        zeroCount = (dataFrameArg[name] == 0).sum()
        # Columns without any zero can never exceed the threshold; skip them.
        if zeroCount == 0:
            continue

        zeroShare = ((zeroCount * 100) / rowCount)
        if showPercentage:
            print(name, "\t\t-\t\t", zeroCount, "\t\t-\t\t", zeroShare)
        if zeroShare > thresholdPercentage:
            mostlyZeroFeatures.append(name)

    print("\nTotal features having more than ", thresholdPercentage, "% zero are - ", len(mostlyZeroFeatures),
          "out of ",
          columnCount)
    return mostlyZeroFeatures

Identifying the features in which more than 99% of the values are zero

In [139]:
# Collect the names of the columns in which more than 99% of the values are zero.
featureContainingAlmostZero = featueEngineeringBasedOnZero(dataFrameArg=workingDataFrame, thresholdPercentage=99,
                                                           showPercentage=False)
featureContainingAlmostZero


Total features having more than  99 % zero are -  23 out of  58


['Total Backward Packets',
 'Bwd Packets Length Total',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Bwd IAT Total',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd Header Length',
 'Bwd Packets/s',
 'Avg Bwd Segment Size',
 'Subflow Bwd Packets',
 'Subflow Bwd Bytes',
 'Active Mean',
 'Active Std',
 'Active Max',
 'Active Min',
 'Idle Mean',
 'Idle Std',
 'Idle Max',
 'Idle Min']

Dropping the features identified above (more than 99% zeroes)

In [140]:
# Drop the columns listed in featureContainingAlmostZero; names that are absent are skipped.
workingDataFrame.drop(columns=featureContainingAlmostZero, inplace=True, errors='ignore')
# NOTE(review): the same rename was already applied right after loading and the index was
# reset after de-duplication, so the next two calls look redundant when cells run in order — confirm.
workingDataFrame.rename(columns=col_name_consistency, inplace=True)
workingDataFrame.reset_index(inplace=True, drop=True)
workingDataFrame

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Fwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Flow Bytes/s,Flow Packets/s,...,Packet Length Variance,Avg Packet Size,Avg Fwd Segment Size,Subflow Fwd Packets,Subflow Fwd Bytes,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min,Label
0,0,119714230,49476,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,4.150718e+02,...,0.000000,0.00,0.0,49476,0,-1,-1,0,0,DrDoS_SSDP
1,17,2,2,2944.0,1472.0,1472.0,1472.0,0.000000,1.472000e+09,1.000000e+06,...,0.000000,2208.00,1472.0,2,2944,-1,-1,1,-1,DrDoS_SSDP
2,17,1,2,2604.0,1302.0,1302.0,1302.0,0.000000,2.604000e+09,2.000000e+06,...,0.000000,1953.00,1302.0,2,2604,-1,-1,1,-1,DrDoS_SSDP
3,17,2,2,2474.0,1237.0,1237.0,1237.0,0.000000,1.237000e+09,1.000000e+06,...,0.000000,1855.50,1237.0,2,2474,-1,-1,1,-1,DrDoS_SSDP
4,17,3,2,2944.0,1472.0,1472.0,1472.0,0.000000,9.813333e+08,6.666667e+05,...,0.000000,2208.00,1472.0,2,2944,-1,-1,1,-1,DrDoS_SSDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890741,17,105105,4,1302.0,330.0,321.0,325.5,5.196152,1.238761e+04,3.805718e+01,...,24.299999,405.75,325.5,4,1302,-1,-1,3,20,DrDoS_SSDP
890742,17,80086,4,1498.0,404.0,345.0,374.5,34.063667,1.870489e+04,4.994631e+01,...,1044.300049,460.75,374.5,4,1498,-1,-1,3,20,DrDoS_SSDP
890743,17,80622,4,1498.0,404.0,345.0,374.5,34.063667,1.858054e+04,4.961425e+01,...,1044.300049,460.75,374.5,4,1498,-1,-1,3,20,DrDoS_SSDP
890744,17,53,2,660.0,330.0,330.0,330.0,0.000000,1.245283e+07,3.773585e+04,...,0.000000,495.00,330.0,2,660,-1,-1,1,20,DrDoS_SSDP


Now, let's look at the number of unique values in each column

In [141]:
# Count the distinct values in each column (axis=0 counts down the rows of every column).
workingDataFrame.nunique(axis=0)

Protocol                         3
Flow Duration                82361
Total Fwd Packets               73
Fwd Packets Length Total      1191
Fwd Packet Length Max          656
Fwd Packet Length Min          594
Fwd Packet Length Mean        1190
Fwd Packet Length Std          535
Flow Bytes/s                141907
Flow Packets/s               86923
Flow IAT Mean                86576
Flow IAT Std                416497
Flow IAT Max                 46096
Flow IAT Min                   214
Fwd IAT Total                82118
Fwd IAT Mean                 86230
Fwd IAT Std                 416080
Fwd IAT Max                  45802
Fwd IAT Min                    157
Fwd Header Length             4100
Fwd Packets/s                86905
Packet Length Min              591
Packet Length Max              680
Packet Length Mean            1499
Packet Length Std              640
Packet Length Variance         640
Avg Packet Size               1536
Avg Fwd Segment Size          1190
Subflow Fwd Packets 

In [142]:
# Display the frame once more before it is written to disk.
workingDataFrame

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Fwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Flow Bytes/s,Flow Packets/s,...,Packet Length Variance,Avg Packet Size,Avg Fwd Segment Size,Subflow Fwd Packets,Subflow Fwd Bytes,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min,Label
0,0,119714230,49476,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,4.150718e+02,...,0.000000,0.00,0.0,49476,0,-1,-1,0,0,DrDoS_SSDP
1,17,2,2,2944.0,1472.0,1472.0,1472.0,0.000000,1.472000e+09,1.000000e+06,...,0.000000,2208.00,1472.0,2,2944,-1,-1,1,-1,DrDoS_SSDP
2,17,1,2,2604.0,1302.0,1302.0,1302.0,0.000000,2.604000e+09,2.000000e+06,...,0.000000,1953.00,1302.0,2,2604,-1,-1,1,-1,DrDoS_SSDP
3,17,2,2,2474.0,1237.0,1237.0,1237.0,0.000000,1.237000e+09,1.000000e+06,...,0.000000,1855.50,1237.0,2,2474,-1,-1,1,-1,DrDoS_SSDP
4,17,3,2,2944.0,1472.0,1472.0,1472.0,0.000000,9.813333e+08,6.666667e+05,...,0.000000,2208.00,1472.0,2,2944,-1,-1,1,-1,DrDoS_SSDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890741,17,105105,4,1302.0,330.0,321.0,325.5,5.196152,1.238761e+04,3.805718e+01,...,24.299999,405.75,325.5,4,1302,-1,-1,3,20,DrDoS_SSDP
890742,17,80086,4,1498.0,404.0,345.0,374.5,34.063667,1.870489e+04,4.994631e+01,...,1044.300049,460.75,374.5,4,1498,-1,-1,3,20,DrDoS_SSDP
890743,17,80622,4,1498.0,404.0,345.0,374.5,34.063667,1.858054e+04,4.961425e+01,...,1044.300049,460.75,374.5,4,1498,-1,-1,3,20,DrDoS_SSDP
890744,17,53,2,660.0,330.0,330.0,330.0,0.000000,1.245283e+07,3.773585e+04,...,0.000000,495.00,330.0,2,660,-1,-1,1,20,DrDoS_SSDP


Saving New DataFrame as csv file to new location

In [143]:
def dataCleaningResultToAnotherCSV(dataFrameArg, dirPath, file_name):
    """Write ``dataFrameArg`` as a CSV file named ``file_name`` inside ``dirPath``.

    Parameters
    ----------
    dataFrameArg : pandas.DataFrame
        Frame to save. The index is written as the first column, as before.
    dirPath : str
        Target directory, with or without a trailing path separator. It is
        created (including parents) when it does not exist yet. An empty
        string means the current working directory.
    file_name : str
        Name of the CSV file to create inside ``dirPath``.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell only pulls cpu_count from os

    # Joining the parts (instead of concatenating raw strings) keeps the file inside dirPath
    # even when dirPath has no trailing separator.
    targetPath = os.path.join(dirPath, file_name)
    if dirPath:
        # to_csv does not create missing directories itself.
        os.makedirs(dirPath, exist_ok=True)
    dataFrameArg.to_csv(targetPath)

In [144]:
# Output directory and file name for the cleaned data.
# The active newFileName should correspond to the CSV that was loaded into workingDataFrame.
newCsvPath = "C:\\CIC-DDOS-2019\\AfterDataCleaning\\"
# newFileName = "UDPLag.csv"
# newFileName = "Syn.csv"
# newFileName = "DrDoS_NTP.csv"
# newFileName = "DrDoS_LDAP.csv"
newFileName = "DrDoS_SSDP.csv"
dataCleaningResultToAnotherCSV(dataFrameArg=workingDataFrame, dirPath=newCsvPath, file_name=newFileName)

In [145]:
# Run a full garbage collection; the displayed value is the number of unreachable objects found.
gc.collect()

304