In [3]:
import pandas as pd
import numpy as np
from dask import dataframe as dd
from dask.distributed import Client
from sklearn.utils import shuffle
from sklearn import preprocessing
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def readCSV_Function(file_path):
    dask_df = dd.read_csv(file_path, blocksize=1e6)
    dask_df = dask_df.repartition(npartitions=8)
    return dask_df.compute(scheduler='threads')

filePath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\Concatenated_Data_Before_Feature_Engineering\\Cleaned_Data.csv"
df = readCSV_Function(filePath)
df.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
df = shuffle(df)

In [5]:
df.nunique()

Protocol                         3
Flow Duration               143257
Total Fwd Packets              365
Total Backward Packets         140
Fwd Packets Length Total      2625
                             ...  
Idle Mean                    65588
Idle Std                     52351
Idle Max                     65560
Idle Min                     65445
Label                           13
Length: 78, dtype: int64

#### Dropping categorical columns that have one category predominance

In [6]:
unique_value_columns = [col for col in df.columns if df[col].nunique() == 1]
unique_value_columns

['Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'FIN Flag Count',
 'PSH Flag Count',
 'ECE Flag Count',
 'Fwd Avg Bytes/Bulk',
 'Fwd Avg Packets/Bulk',
 'Fwd Avg Bulk Rate',
 'Bwd Avg Bytes/Bulk',
 'Bwd Avg Packets/Bulk',
 'Bwd Avg Bulk Rate']

#### Performing drop operation & saving that in new csv

In [7]:
df.drop(columns=unique_value_columns, inplace=True)
df.shape

(409000, 66)

def dataCleaningResultToAnotherCSV(dataFrameArg, dirPath, file_name):
    dataFrameArg.to_csv(dirPath + file_name)

newCsvPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\After_Column_Data_Cleaning\\"
csvFileName = "after_dropping_unique_data_containing_features.csv"
dataCleaningResultToAnotherCSV(dataFrameArg=df, dirPath=newCsvPath, file_name=csvFileName)

#### As we visualized the heatmap of the remaining 66 features, now need to find out the co-relation pairs
#### But Before that, need to apply encoding of categorical data, such as "Label" feature

In [8]:
le = preprocessing.LabelEncoder()
before_encoding_dataframe = df.copy(deep=True)
before_encoding_dataframe.head(5) # see the Label feature

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1728,17,452,8,0,11776.0,0.0,1472.0,1472.0,1472.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_LDAP
773,17,218953,6,0,2088.0,0.0,393.0,321.0,348.0,35.088459,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_UDP
59,17,47,2,0,1192.0,0.0,596.0,596.0,596.0,0.0,...,449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_MSSQL
1951,17,6,2,0,476.0,0.0,238.0,238.0,238.0,0.0,...,1456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
1461,17,49,2,0,2634.0,0.0,1317.0,1317.0,1317.0,0.0,...,-1062718975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_SNMP


In [9]:
df["Label"] = le.fit_transform(df["Label"])
df["Label"] = pd.to_numeric(df['Label'], downcast='integer')
df.head(5) # see the Label feature

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1728,17,452,8,0,11776.0,0.0,1472.0,1472.0,1472.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
773,17,218953,6,0,2088.0,0.0,393.0,321.0,348.0,35.088459,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
59,17,47,2,0,1192.0,0.0,596.0,596.0,596.0,0.0,...,449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1951,17,6,2,0,476.0,0.0,238.0,238.0,238.0,0.0,...,1456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1461,17,49,2,0,2634.0,0.0,1317.0,1317.0,1317.0,0.0,...,-1062718975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6


<center><h3 style="background:#FF00FF;color:black">
Encoding of "Label" visualization
</h3></center>

In [10]:
Label_keys = list(before_encoding_dataframe.Label)
Label_values = list(df.Label)
# Label_tuples = list(zip(Label_keys,Label_values)) # # creating tuples from above two lists
# Label_visualize_df = pd.DataFrame(Label_tuples, columns=['Label','Encoding'])
Label_dict = dict(Label = Label_keys,Encoding = Label_values)
Label_visualize_df = pd.DataFrame.from_dict(Label_dict)
Label_visualize_df.drop_duplicates(inplace=True)
Label_visualize_df.reset_index(inplace=True, drop=True)
Label_visualize_df

Unnamed: 0,Label,Encoding
0,DrDoS_LDAP,2
1,DrDoS_UDP,8
2,DrDoS_MSSQL,3
3,DrDoS_DNS,1
4,DrDoS_SNMP,6
5,UDP-lag,11
6,TFTP,10
7,Syn,9
8,BENIGN,0
9,DrDoS_NTP,4


<center><h4 style="background:#ADFF2F;color:black">
Saving the after encoding dataframe into new csv
</h4></center>

In [11]:
# dirPath = "D:\\Thesis_Group_of_Brig_Gen_Razzak_Sir\\After_Column_Data_Cleaning\\"
# csvFileName = "After_Label_Encoding.csv"
# encodingFileName = "Label_Encoding.csv"
#
# Label_visualize_df.to_csv(dirPath+encodingFileName)
# df.to_csv(dirPath + csvFileName)
# df.info(memory_usage="deep")

<center><h3 style="background:orange;color:black">
Hypothesis Testing
</h3></center>

##### Null Hypothesis(H0): There exists no co-relation between two features
##### Alternative Hypothesis (H1): opposite of null hypothesis

<center><h3 style="background:yellow;color:black">
Chi-Squared Test
</h3></center>

In [12]:
from scipy.stats import chi2_contingency
def chi_squared_test(feature1,feature2):
    stat ,p, dof, expected = chi2_contingency(pd.crosstab(feature1,feature2))
    alpha = 0.05
    if p > alpha:
        return f"They are independent & P-value: {p}"
    else:
        return f"Co-related & P-value: {p}"

In [13]:
present_columns = sorted(df)
present_columns

['ACK Flag Count',
 'Active Max',
 'Active Mean',
 'Active Min',
 'Active Std',
 'Avg Bwd Segment Size',
 'Avg Fwd Segment Size',
 'Avg Packet Size',
 'Bwd Header Length',
 'Bwd IAT Max',
 'Bwd IAT Mean',
 'Bwd IAT Min',
 'Bwd IAT Std',
 'Bwd IAT Total',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Packets Length Total',
 'Bwd Packets/s',
 'CWE Flag Count',
 'Down/Up Ratio',
 'Flow Bytes/s',
 'Flow Duration',
 'Flow IAT Max',
 'Flow IAT Mean',
 'Flow IAT Min',
 'Flow IAT Std',
 'Flow Packets/s',
 'Fwd Act Data Packets',
 'Fwd Header Length',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Fwd IAT Min',
 'Fwd IAT Std',
 'Fwd IAT Total',
 'Fwd PSH Flags',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Fwd Packet Length Std',
 'Fwd Packets Length Total',
 'Fwd Packets/s',
 'Fwd Seg Size Min',
 'Idle Max',
 'Idle Mean',
 'Idle Min',
 'Idle Std',
 'Init Bwd Win Bytes',
 'Init Fwd Win Bytes',
 'Label',
 'P

##### Relation between Syn Flag count vs Protocol

In [14]:
pd.crosstab(df['SYN Flag Count'],df['Protocol'])

Protocol,0,6,17
SYN Flag Count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,344,76009,332533
1,0,114,0


In [15]:
chi_squared_test(df['SYN Flag Count'],df['Protocol'])

'Co-related & P-value: 5.24843500768918e-109'

##### Relation between RST Flag count vs Protocol

In [16]:
pd.crosstab(df['RST Flag Count'],df['Protocol'])

Protocol,0,6,17
RST Flag Count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,344,75097,332533
1,0,1026,0


In [17]:
chi_squared_test(df['RST Flag Count'],df['Protocol'])

'Co-related & P-value: 0.0'

##### Relation between CWE Flag count vs Protocol

In [18]:
pd.crosstab(df['CWE Flag Count'],df['Protocol'])

Protocol,0,6,17
CWE Flag Count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,344,73778,332533
1,0,2345,0


In [19]:
chi_squared_test(df['CWE Flag Count'],df['Protocol'])

'Co-related & P-value: 0.0'

##### Relation between URG Flag count vs Protocol

In [20]:
pd.crosstab(df['URG Flag Count'],df['Protocol'])

Protocol,0,6,17
URG Flag Count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,344,71593,332533
1,0,4530,0


In [21]:
chi_squared_test(df['URG Flag Count'],df['Protocol'])

'Co-related & P-value: 0.0'

<center><h3 style="background:yellow;color:black">
T Test
</h3></center>

In [24]:
from scipy.stats import ttest_ind
def t_value_test(feature1,feature2):
    score = ttest_ind(feature1,feature2,equal_var=False)[1]
    if score > 0.05:
        return f"Co-related & T-score: {score}"
    return f"They are independent & T-score: {score}"

##### Relation between Flow Duration vs FWD IAT Total

In [25]:
t_value_test(df['Flow Duration'],df['Fwd IAT Total'])

'Co-related & T-score: 0.8868497655037039'

<center><h3 style="background:yellow;color:black">
Pearson co-variance implementation
</h3></center>

In [26]:
Pearson_Threshold = 0.9
count = 0
for i in df.columns:
   for j in df.columns:
       if i!= j:
           corr,_ = stats.pearsonr(df[i],df[j])
           if corr >= Pearson_Threshold:
               count = count + 1
               print(f"Pearson co-relation between {i} & {j} : {corr:.3f}")

print("\n###########################################################################")
print(f"Found above pearson threshold co-related pairs: {count}")

Pearson co-relation between Flow Duration & Fwd IAT Total : 1.000
Pearson co-relation between Total Fwd Packets & Subflow Fwd Packets : 1.000
Pearson co-relation between Total Backward Packets & Subflow Bwd Packets : 1.000
Pearson co-relation between Fwd Packets Length Total & Subflow Fwd Bytes : 1.000
Pearson co-relation between Fwd Packets Length Total & Fwd Act Data Packets : 0.997
Pearson co-relation between Bwd Packets Length Total & Subflow Bwd Bytes : 1.000
Pearson co-relation between Fwd Packet Length Max & Fwd Packet Length Min : 0.987
Pearson co-relation between Fwd Packet Length Max & Fwd Packet Length Mean : 0.995
Pearson co-relation between Fwd Packet Length Max & Packet Length Min : 0.986
Pearson co-relation between Fwd Packet Length Max & Packet Length Max : 0.979
Pearson co-relation between Fwd Packet Length Max & Packet Length Mean : 0.995
Pearson co-relation between Fwd Packet Length Max & Avg Packet Size : 0.986
Pearson co-relation between Fwd Packet Length Max & Avg

<center><h3 style="background:orange;color:black">
Need to reduce dimentionality of those above co-related pairs using PCA
</h3></center>

##### But before applying PCA, need to data standardization & Compute the covariance matrix

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

X_std = StandardScaler().fit_transform(df) # data standardized here
X_norm = preprocessing.normalize(df)
mean_vec = np.mean(X_std,axis=0)
cov_mat = ((X_std - mean_vec).T.dot(X_std - mean_vec)) / (X_std.shape[0]-1)
cov_mat # 66 rows & 66 columns

array([[ 1.00000244, -0.7531957 , -0.06263539, ..., -0.75308395,
        -0.63974858, -0.43794953],
       [-0.7531957 ,  1.00000244, -0.00272005, ...,  0.8703841 ,
         0.64706931,  0.37749434],
       [-0.06263539, -0.00272005,  1.00000244, ..., -0.00814798,
        -0.00761259, -0.02526513],
       ...,
       [-0.75308395,  0.8703841 , -0.00814798, ...,  1.00000244,
         0.88018429,  0.38636973],
       [-0.63974858,  0.64706931, -0.00761259, ...,  0.88018429,
         1.00000244,  0.3459005 ],
       [-0.43794953,  0.37749434, -0.02526513, ...,  0.38636973,
         0.3459005 ,  1.00000244]])

#### Eigen decomposition of co-variance matrix

In [34]:
eig_values, eigen_vec = np.linalg.eig(cov_mat)
print(f"Eigen Vectors of co-variance matrix:\n {eigen_vec}")
print("\n###################################################################\n")
print(f"Eigen values of co-variance matrix:\n {eig_values}")

Eigen Vectors of co-variance matrix:
 [[-2.05454278e-01+0.j -6.45378474e-03+0.j  5.70199983e-02+0.j ...
   6.23846711e-14+0.j  1.05675346e-13+0.j  1.26844631e-14+0.j]
 [ 2.04908583e-01+0.j -3.78796076e-02+0.j -1.32903200e-01+0.j ...
  -1.33179662e-11+0.j -2.68750416e-11+0.j -3.25269919e-12+0.j]
 [ 2.31937691e-03+0.j  1.00417961e-02+0.j  1.25317518e-02+0.j ...
  -1.93169351e-01+0.j -4.33142433e-01+0.j  2.48326347e-02+0.j]
 ...
 [ 2.19464811e-01+0.j -4.97904868e-02+0.j -1.50166961e-01+0.j ...
  -1.22490784e-13+0.j -1.59846403e-13+0.j -3.65894936e-14+0.j]
 [ 1.94389440e-01+0.j -3.82159303e-02+0.j -1.31065048e-01+0.j ...
   2.29048963e-14+0.j  4.48744998e-14+0.j  1.60027735e-15+0.j]
 [ 1.36203332e-01+0.j -4.44433655e-02+0.j  5.54129387e-02+0.j ...
   8.43775346e-14+0.j  1.70032905e-13+0.j  2.42190234e-14+0.j]]

###################################################################

Eigen values of co-variance matrix:
 [ 1.64753971e+01+0.0000000e+00j  6.84879341e+00+0.0000000e+00j
  5.93273097

#### Now need to select the principal components

In [41]:
eig_pairs = [(np.abs(eig_values[i]),eigen_vec[:,i]) for i in range(len(eig_values))]
# here, eig_pairs data_types are list of tuples(eig_values,eigen_vec)
# need to sort in descending order
eig_pairs.sort(key=lambda x: x[0],reverse=True)
# ok need to visualize the descending sort of eigen pairs
for i,j in enumerate(eig_pairs):
    print(i,j[0])

0 16.475397052151536
1 6.848793408416348
2 5.932730965636069
3 3.5930312909164903
4 3.0909848350011333
5 3.015878860916777
6 2.612387777951956
7 2.489698726025646
8 2.0095953717664297
9 1.9974643992971517
10 1.9302519052437004
11 1.8376504960534132
12 1.2849150460412404
13 1.238546571850254
14 1.148480119420974
15 1.1241359618594615
16 1.0254346544512165
17 0.9818929539502012
18 0.9570973538169335
19 0.8990151404665084
20 0.8389531283311618
21 0.7735005532368867
22 0.6958247885763185
23 0.6010350671488337
24 0.5284440022068833
25 0.3668665228252984
26 0.32342648013437797
27 0.29849673402195165
28 0.23031009938556488
29 0.21510660932656891
30 0.14903006298668098
31 0.12098494397156888
32 0.09442751866155702
33 0.08381812838506587
34 0.04171413339790374
35 0.036672191923011965
36 0.022333565592598276
37 0.02165790294090773
38 0.017226165018425173
39 0.011579697284042166
40 0.0071775943918611865
41 0.0056372008995470135
42 0.004147264519501435
43 0.003907413060418367
44 0.0029354292850901

##### after 55, it seems it has too little insignificance