In [11]:
import pandas as pd
import numpy as np
from dask import dataframe as dd
from dask.distributed import Client
from sklearn.utils import shuffle
from sklearn import preprocessing
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
def readCSV_Function(file_path, blocksize=1_000_000):
    """Read a CSV with dask and return it as one in-memory pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read (passed straight to ``dd.read_csv``).
    blocksize : int or str, default 1_000_000
        Number of bytes per chunk dask uses to cut the file into partitions.
        An integer (or a string such as ``"1MB"``) is the documented type.

    Returns
    -------
    pandas.DataFrame
        All partitions concatenated in file order by ``compute``.
        The index is not reset here; per the dask ``read_csv`` documentation
        index labels restart in every partition, so the result can contain
        duplicate index labels.
    """
    dask_df = dd.read_csv(file_path, blocksize=blocksize)
    # No repartition step: compute() concatenates every partition into a single
    # pandas frame anyway, so regrouping them first only adds scheduling work.
    return dask_df.compute(scheduler='threads')

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
filePath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\Concatenated_Data_Before_Feature_Engineering\\Cleaned_Data.csv"
df = readCSV_Function(filePath)
# "Unnamed: 0" is presumably an index column written by an earlier to_csv call;
# errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(columns=["Unnamed: 0"], errors='ignore')
# Fixed seed: without it every "Restart & Run All" produces a different row
# order, so the previews shown further down could not be reproduced.
SHUFFLE_SEED = 42
df = shuffle(df, random_state=SHUFFLE_SEED)

In [13]:
# Number of distinct values in each column (missing values are not counted by default).
df.nunique()

Protocol                         3
Flow Duration               143257
Total Fwd Packets              365
Total Backward Packets         140
Fwd Packets Length Total      2625
                             ...  
Idle Mean                    65588
Idle Std                     52351
Idle Max                     65560
Idle Min                     65445
Label                           13
Length: 78, dtype: int64

#### Dropping columns that contain only a single unique value (constant features)

In [14]:
# Count distinct values once for the whole frame, then keep the names of the
# columns whose count is exactly 1 (i.e. the column holds a single value).
distinct_counts = df.nunique()
unique_value_columns = distinct_counts.index[distinct_counts == 1].tolist()
unique_value_columns

['Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'FIN Flag Count',
 'PSH Flag Count',
 'ECE Flag Count',
 'Fwd Avg Bytes/Bulk',
 'Fwd Avg Packets/Bulk',
 'Fwd Avg Bulk Rate',
 'Bwd Avg Bytes/Bulk',
 'Bwd Avg Packets/Bulk',
 'Bwd Avg Bulk Rate']

#### Performing the drop operation and saving the result to a new CSV

In [15]:
# errors="ignore" keeps this cell re-runnable: on a second execution the listed
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=unique_value_columns, errors="ignore")
df.shape

(409000, 66)

def dataCleaningResultToAnotherCSV(dataFrameArg, dirPath, file_name):
    """Write ``dataFrameArg`` as a CSV file named ``file_name`` inside ``dirPath``.

    Parameters
    ----------
    dataFrameArg : pandas.DataFrame
        Frame to save. Its index is written as the first column (``to_csv``
        default), exactly as before.
    dirPath : str
        Target directory, with or without a trailing path separator. It is
        created (including parents) when it does not exist yet.
    file_name : str
        Name of the CSV file to create inside ``dirPath``.

    Returns
    -------
    None
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Joining through Path instead of `dirPath + file_name` avoids silently
    # writing to "<dir><file>" when the directory lacks a trailing separator.
    target_dir = Path(dirPath)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataFrameArg.to_csv(target_dir / file_name)

# Save the current frame under the After_Column_Data_Cleaning folder.
csvFileName = "after_dropping_unique_data_containing_features.csv"
newCsvPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\After_Column_Data_Cleaning\\"
dataCleaningResultToAnotherCSV(df, newCsvPath, csvFileName)

#### Having visualized the heatmap of the remaining 66 features, we now need to find the correlated feature pairs
#### But before that, categorical data such as the "Label" feature must be encoded

In [16]:
# Independent snapshot of the frame taken before "Label" is overwritten with
# integer codes (DataFrame.copy() is a deep copy by default).
before_encoding_dataframe = df.copy()
le = preprocessing.LabelEncoder()
before_encoding_dataframe.head()  # first 5 rows; "Label" still holds the original values

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
57,17,48,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,1344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_LDAP
602,17,13,2,0,2808.0,0.0,1404.0,1404.0,1404.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SNMP
888,17,107842,4,0,1438.0,0.0,389.0,330.0,359.5,34.063667,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP-lag
2413,17,2,2,0,1688.0,0.0,844.0,844.0,844.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_MSSQL
1914,17,105815,4,0,1438.0,0.0,389.0,330.0,359.5,34.063667,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_UDP


In [17]:
# Fit on the untouched labels kept in `before_encoding_dataframe` rather than on
# df["Label"] itself: re-running this cell would otherwise refit the encoder on
# the already-encoded integers and lose the original class names in `le.classes_`.
# The snapshot is a row-for-row copy of df, so the codes line up positionally.
df["Label"] = le.fit_transform(before_encoding_dataframe["Label"])
# Shrink the code column to the smallest integer dtype that can hold the values.
df["Label"] = pd.to_numeric(df["Label"], downcast="integer")
df.head(5)  # "Label" now shows the integer codes

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
57,17,48,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,1344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
602,17,13,2,0,2808.0,0.0,1404.0,1404.0,1404.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
888,17,107842,4,0,1438.0,0.0,389.0,330.0,359.5,34.063667,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
2413,17,2,2,0,1688.0,0.0,844.0,844.0,844.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1914,17,105815,4,0,1438.0,0.0,389.0,330.0,359.5,34.063667,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8


<center><h3 style="background:#FF00FF;color:black">
Encoding of "Label" visualization
</h3></center>

In [18]:
# Pair every original label with the code it received, row by row, then keep
# one row per distinct (Label, Encoding) combination in first-seen order.
Label_keys = before_encoding_dataframe["Label"].tolist()
Label_values = df["Label"].tolist()
Label_dict = {"Label": Label_keys, "Encoding": Label_values}
Label_visualize_df = (
    pd.DataFrame(Label_dict)
    .drop_duplicates()
    .reset_index(drop=True)
)
Label_visualize_df

Unnamed: 0,Label,Encoding
0,DrDoS_LDAP,2
1,DrDoS_SNMP,6
2,UDP-lag,11
3,DrDoS_MSSQL,3
4,DrDoS_UDP,8
5,TFTP,10
6,DrDoS_SSDP,7
7,DrDoS_NetBIOS,5
8,Syn,9
9,DrDoS_NTP,4


<center><h4 style="background:#ADFF2F;color:black">
Saving the label-encoded dataframe to a new CSV
</h4></center>

In [19]:
dirPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\After_Column_Data_Cleaning\\"
encodingFileName = "Label_Encoding.csv"
csvFileName = "After_Label_Encoding.csv"

# Both files go through the notebook's CSV helper: first the label/code lookup
# table, then the full frame with the encoded "Label" column.
dataCleaningResultToAnotherCSV(Label_visualize_df, dirPath, encodingFileName)
dataCleaningResultToAnotherCSV(df, dirPath, csvFileName)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 409000 entries, 57 to 1209
Data columns (total 66 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Protocol                  409000 non-null  int64  
 1   Flow Duration             409000 non-null  int64  
 2   Total Fwd Packets         409000 non-null  int64  
 3   Total Backward Packets    409000 non-null  int64  
 4   Fwd Packets Length Total  409000 non-null  float64
 5   Bwd Packets Length Total  409000 non-null  float64
 6   Fwd Packet Length Max     409000 non-null  float64
 7   Fwd Packet Length Min     409000 non-null  float64
 8   Fwd Packet Length Mean    409000 non-null  float64
 9   Fwd Packet Length Std     409000 non-null  float64
 10  Bwd Packet Length Max     409000 non-null  float64
 11  Bwd Packet Length Min     409000 non-null  float64
 12  Bwd Packet Length Mean    409000 non-null  float64
 13  Bwd Packet Length Std     409000 non-null  fl

<center><h3 style="background:yellow;color:black">
Pearson correlation implementation
</h3></center>

In [20]:
Pearson_Threshold = 0.9

# One vectorised pass gives the full Pearson matrix instead of calling
# stats.pearsonr separately for every pair of columns.
pearson_matrix = df.corr(method="pearson")
column_names = list(pearson_matrix.columns)

# Pearson correlation is symmetric, so only pairs (i, j) with j positioned after
# i are inspected. Iterating over ordered pairs would report "A & B" and
# "B & A" separately and double the count.
# NOTE(review): as before, only positive correlations >= threshold are reported;
# strongly negative pairs are not — confirm that this is intended.
count = 0
for position, i in enumerate(column_names):
    for j in column_names[position + 1:]:
        corr = pearson_matrix.at[i, j]
        if corr >= Pearson_Threshold:
            count += 1
            print(f"Pearson co-relation between {i} & {j} : {corr:.3f}")

print("\n###########################################################################")
print(f"Found above pearson threshold co-related pairs: {count}")

Pearson co-relation between Flow Duration & Fwd IAT Total : 1.000
Pearson co-relation between Total Fwd Packets & Subflow Fwd Packets : 1.000
Pearson co-relation between Total Backward Packets & Subflow Bwd Packets : 1.000
Pearson co-relation between Fwd Packets Length Total & Subflow Fwd Bytes : 1.000
Pearson co-relation between Fwd Packets Length Total & Fwd Act Data Packets : 0.997
Pearson co-relation between Bwd Packets Length Total & Subflow Bwd Bytes : 1.000
Pearson co-relation between Fwd Packet Length Max & Fwd Packet Length Min : 0.987
Pearson co-relation between Fwd Packet Length Max & Fwd Packet Length Mean : 0.995
Pearson co-relation between Fwd Packet Length Max & Packet Length Min : 0.986
Pearson co-relation between Fwd Packet Length Max & Packet Length Max : 0.979
Pearson co-relation between Fwd Packet Length Max & Packet Length Mean : 0.995
Pearson co-relation between Fwd Packet Length Max & Avg Packet Size : 0.986
Pearson co-relation between Fwd Packet Length Max & Avg