Loading Libraries

In [6]:
import numba
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import gc  # garbage collector

from fastcore.basics import *
from fastcore.parallel import *
from numba import jit, njit, vectorize, cuda, uint32, f8, uint8

from sklearn.model_selection import train_test_split
from sklearn import metrics  # for accuracy calculation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from functools import partial
from os import cpu_count
import matplotlib.pyplot as plt

import math
from pylab import imshow, show
from timeit import default_timer as timer

from dask import dataframe as dd
from dask.distributed import Client

Setting up paths to csv files / datasets

In [7]:
tf.config.list_physical_devices(
    device_type=None
)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]

In [8]:
# CSV-01-12
path_DdoS_DNS = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_DNS.csv"
path_DdoS_MSSQL = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_MSSQL.csv"
path_DdoS_LDAP = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_LDAP.csv"
path_DdoS_NTP = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_NTP.csv"
path_DdoS_NetBIOS = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_NetBIOS.csv"
path_DdoS_SNMP = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_SNMP.csv"
path_DdoS_SSDP = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_SSDP.csv"
path_DdoS_UDP = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\DrDoS_UDP.csv"
path_Syn = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\Syn.csv"
path_TFTP = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\TFTP.csv"
path_UDPLag = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\01-12\\UDPLag.csv"

# # CSV-03-11
# path__LDAP = "../CICDDoS-2019/CSV-03-11/03-11/LDAP.csv"
# path__MSSQL = "../CICDDoS-2019/CSV-03-11/03-11/MSSQL.csv"
# path__NetBIOS = "../CICDDoS-2019/CSV-03-11/03-11/NetBIOS.csv"
# path__Portmap = "../CICDDoS-2019/CSV-03-11/03-11/Portmap.csv"
# path__Syn = "../CICDDoS-2019/CSV-03-11/03-11/Syn.csv"
# path__UDP = "../CICDDoS-2019/CSV-03-11/03-11/UDP.csv"
# path__UDPLag = "../CICDDoS-2019/CSV-03-11/03-11/UDPLag.csv"

paths = [path_DdoS_DNS, path_DdoS_MSSQL, path_DdoS_LDAP, path_DdoS_NTP, path_DdoS_NetBIOS, path_DdoS_SNMP, path_DdoS_SSDP, path_DdoS_UDP, path_Syn, path_TFTP, path_UDPLag]
# , path__LDAP, path__MSSQL, path__NetBIOS,
#      path__Portmap, path__Syn, path__UDP, path__UDPLag]

Column / feature names

In [9]:
col_name_consistency = {
    'Flow ID': 'Flow ID',
    'Source IP': 'Source IP',
    'Src IP': 'Source IP',
    'Source Port': 'Source Port',
    'Src Port': 'Source Port',
    'Destination IP': 'Destination IP',
    'Dst IP': 'Destination IP',
    'Destination Port': 'Destination Port',
    'Dst Port': 'Destination Port',
    'Protocol': 'Protocol',
    'Timestamp': 'Timestamp',
    'Flow Duration': 'Flow Duration',
    'Total Fwd Packets': 'Total Fwd Packets',
    'Tot Fwd Pkts': 'Total Fwd Packets',
    'Total Backward Packets': 'Total Backward Packets',
    'Tot Bwd Pkts': 'Total Backward Packets',
    'Total Length of Fwd Packets': 'Fwd Packets Length Total',
    'TotLen Fwd Pkts': 'Fwd Packets Length Total',
    'Total Length of Bwd Packets': 'Bwd Packets Length Total',
    'TotLen Bwd Pkts': 'Bwd Packets Length Total',
    'Fwd Packet Length Max': 'Fwd Packet Length Max',
    'Fwd Pkt Len Max': 'Fwd Packet Length Max',
    'Fwd Packet Length Min': 'Fwd Packet Length Min',
    'Fwd Pkt Len Min': 'Fwd Packet Length Min',
    'Fwd Packet Length Mean': 'Fwd Packet Length Mean',
    'Fwd Pkt Len Mean': 'Fwd Packet Length Mean',
    'Fwd Packet Length Std': 'Fwd Packet Length Std',
    'Fwd Pkt Len Std': 'Fwd Packet Length Std',
    'Bwd Packet Length Max': 'Bwd Packet Length Max',
    'Bwd Pkt Len Max': 'Bwd Packet Length Max',
    'Bwd Packet Length Min': 'Bwd Packet Length Min',
    'Bwd Pkt Len Min': 'Bwd Packet Length Min',
    'Bwd Packet Length Mean': 'Bwd Packet Length Mean',
    'Bwd Pkt Len Mean': 'Bwd Packet Length Mean',
    'Bwd Packet Length Std': 'Bwd Packet Length Std',
    'Bwd Pkt Len Std': 'Bwd Packet Length Std',
    'Flow Bytes/s': 'Flow Bytes/s',
    'Flow Byts/s': 'Flow Bytes/s',
    'Flow Packets/s': 'Flow Packets/s',
    'Flow Pkts/s': 'Flow Packets/s',
    'Flow IAT Mean': 'Flow IAT Mean',
    'Flow IAT Std': 'Flow IAT Std',
    'Flow IAT Max': 'Flow IAT Max',
    'Flow IAT Min': 'Flow IAT Min',
    'Fwd IAT Total': 'Fwd IAT Total',
    'Fwd IAT Tot': 'Fwd IAT Total',
    'Fwd IAT Mean': 'Fwd IAT Mean',
    'Fwd IAT Std': 'Fwd IAT Std',
    'Fwd IAT Max': 'Fwd IAT Max',
    'Fwd IAT Min': 'Fwd IAT Min',
    'Bwd IAT Total': 'Bwd IAT Total',
    'Bwd IAT Tot': 'Bwd IAT Total',
    'Bwd IAT Mean': 'Bwd IAT Mean',
    'Bwd IAT Std': 'Bwd IAT Std',
    'Bwd IAT Max': 'Bwd IAT Max',
    'Bwd IAT Min': 'Bwd IAT Min',
    'Fwd PSH Flags': 'Fwd PSH Flags',
    'Bwd PSH Flags': 'Bwd PSH Flags',
    'Fwd URG Flags': 'Fwd URG Flags',
    'Bwd URG Flags': 'Bwd URG Flags',
    'Fwd Header Length': 'Fwd Header Length',
    'Fwd Header Len': 'Fwd Header Length',
    'Bwd Header Length': 'Bwd Header Length',
    'Bwd Header Len': 'Bwd Header Length',
    'Fwd Packets/s': 'Fwd Packets/s',
    'Fwd Pkts/s': 'Fwd Packets/s',
    'Bwd Packets/s': 'Bwd Packets/s',
    'Bwd Pkts/s': 'Bwd Packets/s',
    'Min Packet Length': 'Packet Length Min',
    'Pkt Len Min': 'Packet Length Min',
    'Max Packet Length': 'Packet Length Max',
    'Pkt Len Max': 'Packet Length Max',
    'Packet Length Mean': 'Packet Length Mean',
    'Pkt Len Mean': 'Packet Length Mean',
    'Packet Length Std': 'Packet Length Std',
    'Pkt Len Std': 'Packet Length Std',
    'Packet Length Variance': 'Packet Length Variance',
    'Pkt Len Var': 'Packet Length Variance',
    'FIN Flag Count': 'FIN Flag Count',
    'FIN Flag Cnt': 'FIN Flag Count',
    'SYN Flag Count': 'SYN Flag Count',
    'SYN Flag Cnt': 'SYN Flag Count',
    'RST Flag Count': 'RST Flag Count',
    'RST Flag Cnt': 'RST Flag Count',
    'PSH Flag Count': 'PSH Flag Count',
    'PSH Flag Cnt': 'PSH Flag Count',
    'ACK Flag Count': 'ACK Flag Count',
    'ACK Flag Cnt': 'ACK Flag Count',
    'URG Flag Count': 'URG Flag Count',
    'URG Flag Cnt': 'URG Flag Count',
    'CWE Flag Count': 'CWE Flag Count',
    'CWE Flag Cnt': 'CWE Flag Count',
    'ECE Flag Count': 'ECE Flag Count',
    'ECE Flag Cnt': 'ECE Flag Count',
    'Down/Up Ratio': 'Down/Up Ratio',
    'Average Packet Size': 'Avg Packet Size',
    'Pkt Size Avg': 'Avg Packet Size',
    'Avg Fwd Segment Size': 'Avg Fwd Segment Size',
    'Fwd Seg Size Avg': 'Avg Fwd Segment Size',
    'Avg Bwd Segment Size': 'Avg Bwd Segment Size',
    'Bwd Seg Size Avg': 'Avg Bwd Segment Size',
    'Fwd Avg Bytes/Bulk': 'Fwd Avg Bytes/Bulk',
    'Fwd Byts/b Avg': 'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk': 'Fwd Avg Packets/Bulk',
    'Fwd Pkts/b Avg': 'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate': 'Fwd Avg Bulk Rate',
    'Fwd Blk Rate Avg': 'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk': 'Bwd Avg Bytes/Bulk',
    'Bwd Byts/b Avg': 'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk': 'Bwd Avg Packets/Bulk',
    'Bwd Pkts/b Avg': 'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate': 'Bwd Avg Bulk Rate',
    'Bwd Blk Rate Avg': 'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets': 'Subflow Fwd Packets',
    'Subflow Fwd Pkts': 'Subflow Fwd Packets',
    'Subflow Fwd Bytes': 'Subflow Fwd Bytes',
    'Subflow Fwd Byts': 'Subflow Fwd Bytes',
    'Subflow Bwd Packets': 'Subflow Bwd Packets',
    'Subflow Bwd Pkts': 'Subflow Bwd Packets',
    'Subflow Bwd Bytes': 'Subflow Bwd Bytes',
    'Subflow Bwd Byts': 'Subflow Bwd Bytes',
    'Init_Win_bytes_forward': 'Init Fwd Win Bytes',
    'Init Fwd Win Byts': 'Init Fwd Win Bytes',
    'Init_Win_bytes_backward': 'Init Bwd Win Bytes',
    'Init Bwd Win Byts': 'Init Bwd Win Bytes',
    'act_data_pkt_fwd': 'Fwd Act Data Packets',
    'Fwd Act Data Pkts': 'Fwd Act Data Packets',
    'min_seg_size_forward': 'Fwd Seg Size Min',
    'Fwd Seg Size Min': 'Fwd Seg Size Min',
    'Active Mean': 'Active Mean',
    'Active Std': 'Active Std',
    'Active Max': 'Active Max',
    'Active Min': 'Active Min',
    'Idle Mean': 'Idle Mean',
    'Idle Std': 'Idle Std',
    'Idle Max': 'Idle Max',
    'Idle Min': 'Idle Min',
    'Label': 'Label'
}

Following Columns may have little insignificance over model

In [10]:
drop_columns = [  # this list includes all spellings across CIC NIDS datasets
    "Flow ID",
    'Fwd Header Length.1',
    "Timestamp",
    "Unnamed: 0",
    "Inbound",
    "SimillarHTTP"  # CIC-DDoS other undocumented columns
]
len(drop_columns)  # src_port,dst_port, src_ip, dst_ip these are duplicate

6

In [11]:
def readHugeCsvFileAsDataFrame(file_path):
    # start = timer()
    dtypes = {'SimillarHTTP': 'object',
              'Timestamp': 'object',
              'Source IP': 'str',
              'Destination IP': 'str',
              'Flow ID': 'object',
              'Label': 'object',
              }
    for feature in [f'f_{i}' for i in range(82)]:
        dtypes[feature] = "float32"

    dask_df = dd.read_csv(file_path, low_memory=False,blocksize=50000, dtype=dtypes)  # 50MB chunk-size
    # elapsed_time = timer() - start
    # print("Read csv with dask: ", elapsed_time, "sec")
    return dask_df.compute()

In [12]:
scheduler = Client()
scheduler

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 5
Total threads: 20,Total memory: 31.70 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:57645,Workers: 5
Dashboard: http://127.0.0.1:8787/status,Total threads: 20
Started: Just now,Total memory: 31.70 GiB

0,1
Comm: tcp://127.0.0.1:57690,Total threads: 4
Dashboard: http://127.0.0.1:57691/status,Memory: 6.34 GiB
Nanny: tcp://127.0.0.1:57652,
Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-do72v8gi,Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-do72v8gi

0,1
Comm: tcp://127.0.0.1:57680,Total threads: 4
Dashboard: http://127.0.0.1:57683/status,Memory: 6.34 GiB
Nanny: tcp://127.0.0.1:57649,
Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-oph0vm7j,Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-oph0vm7j

0,1
Comm: tcp://127.0.0.1:57679,Total threads: 4
Dashboard: http://127.0.0.1:57682/status,Memory: 6.34 GiB
Nanny: tcp://127.0.0.1:57651,
Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-bwzn_8zr,Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-bwzn_8zr

0,1
Comm: tcp://127.0.0.1:57687,Total threads: 4
Dashboard: http://127.0.0.1:57688/status,Memory: 6.34 GiB
Nanny: tcp://127.0.0.1:57648,
Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-eshpj06k,Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-eshpj06k

0,1
Comm: tcp://127.0.0.1:57678,Total threads: 4
Dashboard: http://127.0.0.1:57681/status,Memory: 6.34 GiB
Nanny: tcp://127.0.0.1:57650,
Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-o29yx8r0,Local directory: G:\Brig_Gen_Razzak_Sir_Thesis_Group\Thesis-on-DDOS-main\Data-PreProcessing\dask-worker-space\worker-o29yx8r0


Without Scheduler (see the elapsed time to process)
%%time
workingDataFrame = readHugeCsvFileAsDataFrame(path_UDPLag)
workingDataFrame = readHugeCsvFileAsDataFrame(path_Syn)
workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_NTP)
workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_LDAP)
workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_SSDP)
workingDataFrame = readHugeCsvFileAsDataFrame(path_DdoS_UDP)
workingDataFrame.describe()
workingDataFrame

With Scheduler (see the elapsed time to process)

In [13]:
%%time
df = scheduler.submit(readHugeCsvFileAsDataFrame, path_UDPLag)
workingDataFrame = df.result()
workingDataFrame

Wall time: 12.7 s


Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,186059,172.16.0.5-192.168.50.1-58445-4463-17,172.16.0.5,58445,192.168.50.1,4463,17,2018-12-01 13:04:45.928673,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
1,135692,172.16.0.5-192.168.50.1-36908-9914-17,172.16.0.5,36908,192.168.50.1,9914,17,2018-12-01 13:04:45.928913,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
2,33822,172.16.0.5-192.168.50.1-41727-32361-17,172.16.0.5,41727,192.168.50.1,32361,17,2018-12-01 13:04:45.928915,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
3,24498,172.16.0.5-192.168.50.1-55447-5691-17,172.16.0.5,55447,192.168.50.1,5691,17,2018-12-01 13:04:45.929024,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
4,117372,172.16.0.5-192.168.50.1-58794-56335-17,172.16.0.5,58794,192.168.50.1,56335,17,2018-12-01 13:04:45.929096,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,350624,172.16.0.5-192.168.50.1-60490-14102-6,172.16.0.5,60490,192.168.50.1,14102,6,2018-12-01 13:30:30.740273,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
123,336856,172.16.0.5-192.168.50.1-60491-58360-6,172.16.0.5,60491,192.168.50.1,58360,6,2018-12-01 13:30:30.740323,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
124,115128,172.16.0.5-192.168.50.1-60492-2905-6,172.16.0.5,60492,192.168.50.1,2905,6,2018-12-01 13:30:30.740374,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag
125,51370,172.16.0.5-192.168.50.1-60493-45714-6,172.16.0.5,60493,192.168.50.1,45714,6,2018-12-01 13:30:30.740424,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,UDP-lag


In [14]:
len(list(workingDataFrame.columns))

88

Dropping Unnecessary Features & maintaing column consistency

In [15]:
workingDataFrame.columns = workingDataFrame.columns.str.strip()  # sometimes there's leading / trailing whitespace
workingDataFrame.drop(columns=drop_columns, inplace=True, errors='ignore')
workingDataFrame.rename(columns=col_name_consistency, inplace=True)

workingDataFrame.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,172.16.0.5,58445,192.168.50.1,4463,17,1,2,0,766.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP-lag
1,172.16.0.5,36908,192.168.50.1,9914,17,1,2,0,778.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP-lag
2,172.16.0.5,41727,192.168.50.1,32361,17,2,2,0,750.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP-lag
3,172.16.0.5,55447,192.168.50.1,5691,17,2,2,0,738.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP-lag
4,172.16.0.5,58794,192.168.50.1,56335,17,1,2,0,750.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP-lag


Data Cleaning Based on Data Types (DownSizing)

In [16]:
# workingDataFrame.dtypes
workingDataFrame.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 370605 entries, 0 to 126
Data columns (total 82 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Source IP                 370605 non-null  object 
 1   Source Port               370605 non-null  int64  
 2   Destination IP            370605 non-null  object 
 3   Destination Port          370605 non-null  int64  
 4   Protocol                  370605 non-null  int64  
 5   Flow Duration             370605 non-null  int64  
 6   Total Fwd Packets         370605 non-null  int64  
 7   Total Backward Packets    370605 non-null  int64  
 8   Fwd Packets Length Total  370605 non-null  float64
 9   Bwd Packets Length Total  370605 non-null  float64
 10  Fwd Packet Length Max     370605 non-null  float64
 11  Fwd Packet Length Min     370605 non-null  float64
 12  Fwd Packet Length Mean    370605 non-null  float64
 13  Fwd Packet Length Std     370605 non-null  floa

In [17]:
# converting 64bit float to 32 bit float & 64bit integer to 8bit integer
for column in workingDataFrame:
    if workingDataFrame[column].dtype == 'float64':
        workingDataFrame[column] = pd.to_numeric(workingDataFrame[column], downcast='float')
    if workingDataFrame[column].dtype == 'int64':
        workingDataFrame[column] = pd.to_numeric(workingDataFrame[column], downcast='integer')

workingDataFrame.info(memory_usage="deep") # observe the dTypes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 370605 entries, 0 to 126
Data columns (total 82 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Source IP                 370605 non-null  object 
 1   Source Port               370605 non-null  int32  
 2   Destination IP            370605 non-null  object 
 3   Destination Port          370605 non-null  int32  
 4   Protocol                  370605 non-null  int8   
 5   Flow Duration             370605 non-null  int32  
 6   Total Fwd Packets         370605 non-null  int16  
 7   Total Backward Packets    370605 non-null  int16  
 8   Fwd Packets Length Total  370605 non-null  float32
 9   Bwd Packets Length Total  370605 non-null  float32
 10  Fwd Packet Length Max     370605 non-null  float32
 11  Fwd Packet Length Min     370605 non-null  float32
 12  Fwd Packet Length Mean    370605 non-null  float32
 13  Fwd Packet Length Std     370605 non-null  floa

Finding number of missing values contains features in the dataset

In [18]:
workingDataFrame.isnull().sum()

Source IP           0
Source Port         0
Destination IP      0
Destination Port    0
Protocol            0
                   ..
Idle Mean           0
Idle Std            0
Idle Max            0
Idle Min            0
Label               0
Length: 82, dtype: int64

Removing NaN values

In [19]:
workingDataFrame.isna().any(axis=1).sum()
# workingDataFrame.isna().sum()

36132

In [20]:
workingDataFrame.replace([np.inf, -np.inf], np.nan, inplace=True)
workingDataFrame.isna().any(axis=1).sum()

36403

After Dropping NaN values, want to see total rows.

In [21]:
workingDataFrame.dropna(inplace=True)
print(f"After dropping NaN values, number of rows: {workingDataFrame.shape[0]}")
print(f"After dropping NaN values, number of columns: {workingDataFrame.shape[1]}")

After dropping NaN values, number of rows: 334202
After dropping NaN values, number of columns: 82


That's why, Converting pandas dataFrame into dask's dataFrame

ddf = dd.from_pandas(workingDataFrame, npartitions=10)
ddf.compute()

Dropping Duplicates
There should be no duplicates because they can bias training and can lead to over-optimistic estimates of classification performance during testing.

In [22]:
workingDataFrame.duplicated().sum()

9

Fully duplicate rows to be removed

In [23]:
workingDataFrame.drop_duplicates(inplace=True)
workingDataFrame.reset_index(inplace=True, drop=True)
print(f"After dropping NaN values, number of rows: {workingDataFrame.shape[0]}")
print(f"After dropping NaN values, number of columns: {workingDataFrame.shape[1]}")
print(f"After dropping NaN values, unique no of Source IP: {workingDataFrame['Source IP'].nunique()}")
print(f"After dropping NaN values, unique no of Destination IP: {workingDataFrame['Destination IP'].nunique()}")
# print(workingDataFrame["Source IP"].unique())
# print(workingDataFrame["Destination IP"].unique())

After dropping NaN values, number of rows: 334193
After dropping NaN values, number of columns: 82
After dropping NaN values, unique no of Source IP: 117
After dropping NaN values, unique no of Destination IP: 138


After data cleaning, how much storage it holds now

In [24]:
workingDataFrame.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334193 entries, 0 to 334192
Data columns (total 82 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Source IP                 334193 non-null  object 
 1   Source Port               334193 non-null  int32  
 2   Destination IP            334193 non-null  object 
 3   Destination Port          334193 non-null  int32  
 4   Protocol                  334193 non-null  int8   
 5   Flow Duration             334193 non-null  int32  
 6   Total Fwd Packets         334193 non-null  int16  
 7   Total Backward Packets    334193 non-null  int16  
 8   Fwd Packets Length Total  334193 non-null  float64
 9   Bwd Packets Length Total  334193 non-null  float64
 10  Fwd Packet Length Max     334193 non-null  float64
 11  Fwd Packet Length Min     334193 non-null  float64
 12  Fwd Packet Length Mean    334193 non-null  float64
 13  Fwd Packet Length Std     334193 non-null  f

In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(workingDataFrame, title="DDOS")
profile.to_file("UdpLag.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
columnList = workingDataFrame.columns.tolist()
# columnList[:-1]  # just for now, Omitting 'label' column

from sklearn.feature_selection import VarianceThreshold

var_thr = VarianceThreshold(threshold=0.1)  #Removing both constant and quasi-constant
var_thr.fit(workingDataFrame[columnList[:-1]])

var_threshold_bool_list = var_thr.get_support()
var_threshold_bool_list_after_label_added = np.append(var_threshold_bool_list,True)
var_threshold_bool_list_after_label_added

In [None]:
concol = [column for column in workingDataFrame.columns
          if column not in workingDataFrame.columns[var_threshold_bool_list_after_label_added]]

for omittedFeatures in concol:
    print(omittedFeatures)

workingDataFrame.drop(columns=concol, inplace=True, errors='ignore')
workingDataFrame

Now arising problem is that, row number has been significantly reduced but feature numbers are still 78!!
So, need feature engineering here

In [None]:

def featueEngineeringBasedOnZero(dataFrameArg, thresholdPercentage, showPercentage):
    totalCols = dataFrameArg.shape[1]
    totalRows = len(dataFrameArg)
    unNecessaryFeatureCount = 0
    unNecessaryFeatureNames = []

    for column in dataFrameArg:
        zerosInCol = (dataFrameArg[column] == 0).sum()
        if zerosInCol != 0:
            percentageOfZerosInRow = ((zerosInCol * 100) / totalRows)

            if showPercentage:
                print(column, "\t\t-\t\t", zerosInCol, "\t\t-\t\t", percentageOfZerosInRow)

            if percentageOfZerosInRow > thresholdPercentage:
                unNecessaryFeatureNames.append(column)
                unNecessaryFeatureCount = unNecessaryFeatureCount + 1

    print("\nTotal features having more than ", thresholdPercentage, "% zero are - ", unNecessaryFeatureCount,
          "out of ",
          totalCols)
    return unNecessaryFeatureNames

Identifying those features containing 99% zeroes

In [None]:
featureContainingAlmostZero = featueEngineeringBasedOnZero(dataFrameArg=workingDataFrame, thresholdPercentage=99,
                                                           showPercentage=False)
featureContainingAlmostZero

Omitting above features containing 99% zeroes

In [None]:
workingDataFrame.drop(columns=featureContainingAlmostZero, inplace=True, errors='ignore')
workingDataFrame.rename(columns=col_name_consistency, inplace=True)
workingDataFrame.reset_index(inplace=True, drop=True)
workingDataFrame

Now, To see the number of unique values in each column

In [None]:
workingDataFrame.nunique(axis=0)

In [None]:
workingDataFrame

Saving New DataFrame as csv file to new location

In [None]:
def dataCleaningResultToAnotherCSV(dataFrameArg, dirPath, file_name):
    dataFrameArg.to_csv(dirPath + file_name)

In [None]:
newCsvPath = "C:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\AfterDataCleaning(Final)\\"
newFileName = "UDPLag.csv"
# newFileName = "Syn.csv"
# newFileName = "DrDoS_NTP.csv"
# newFileName = "DrDoS_LDAP.csv"
# newFileName = "DrDoS_SSDP.csv"
# newFileName = "DrDoS_UDP.csv"
dataCleaningResultToAnotherCSV(dataFrameArg=workingDataFrame, dirPath=newCsvPath, file_name=newFileName)

In [None]:
import gc
gc.collect()
