# Step 1: Loading all the Packages to be used

In [24]:
import json #will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

# Step 2: Having a suitable real world dataset

In [23]:
data= pd.read_csv(r'C:\Users\Michael Owen\OneDrive\Desktop\Datasets\train_mosaic.csv')
#Load dataset
data.head(4)

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0,739228.5,743103.4661,1264682,213775,49700000.0,41400000.0,79000000,20500000,DoS
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Step 3: Data Preprocessing/Cleaning

In [25]:
data.shape

(809361, 78)

In [10]:
original_features=list(data.columns)
len(original_features)

78

In [11]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

0

In [26]:
len(data)

809361

# Step 4: Encoding the dataset

In [12]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['Label']

In [13]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['Bwd_Packet_Length_Mean',
 'URG_Flag_Count',
 'Subflow_Bwd_Bytes',
 'Init_Win_bytes_backward',
 'Active_Min',
 'Packet_Length_Variance',
 'Idle_Std',
 'Fwd_Avg_Bulk_Rate',
 'min_seg_size_forward',
 'Bwd_PSH_Flags',
 'Fwd_IAT_Mean',
 'Fwd_IAT_Total',
 'Total_Length_of_Fwd_Packets',
 'Flow_IAT_Mean',
 'Packet_Length_Std',
 'PSH_Flag_Count',
 'Flow_IAT_Max',
 'Fwd_PSH_Flags',
 'Destination_Port',
 'Average_Packet_Size',
 'Max_Packet_Length',
 'Packet_Length_Mean',
 'Active_Std',
 'Bwd_Avg_Packets_Bulk',
 'Bwd_Packet_Length_Std',
 'CWE_Flag_Count',
 'Down_Up_Ratio',
 'Fwd_IAT_Min',
 'Bwd_Packets_Sec',
 'Fwd_Packet_Length_Std',
 'Active_Mean',
 'Fwd_Header_Length',
 'Flow_Duration',
 'ECE_Flag_Count',
 'Min_Packet_Length',
 'Flow_Packets_Sec',
 'Subflow_Fwd_Bytes',
 'Flow_Bytes_Sec',
 'Fwd_IAT_Std',
 'Fwd_Packet_Length_Min',
 'Active_Max',
 'Avg_Fwd_Segment_Size',
 'Bwd_Avg_Bulk_Rate',
 'RST_Flag_Count',
 'Bwd_Avg_Bytes_Bulk',
 'Idle_Max',
 'Fwd_IAT_Max',
 'Init_Win_bytes_forward',
 'Avg_B

In [14]:
encoders = {}
for column in ['Label']:
    categorical_convert = LabelEncoder()

In [15]:
nominal=['Label']
ordinal=list(set(categorical_features)-set(nominal))

In [16]:
df_nominal=pd.get_dummies(data[nominal])

In [17]:
target=['Label']

In [18]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [19]:
data[numerical]

Unnamed: 0,Bwd_Packet_Length_Mean,URG_Flag_Count,Subflow_Bwd_Bytes,Init_Win_bytes_backward,Active_Min,Packet_Length_Variance,Idle_Std,Fwd_Avg_Bulk_Rate,min_seg_size_forward,Bwd_PSH_Flags,...,Bwd_Header_Length,Bwd_IAT_Total,Fwd_Packets_Sec,Bwd_IAT_Std,Bwd_Packet_Length_Min,Total_Backward_Packets,FIN_Flag_Count,Bwd_IAT_Min,Bwd_URG_Flags,Idle_Mean
0,0.0000,0,0,29200,213775,13033.569260,4.140000e+07,0,0,0,...,40,0,0.197689,0.000000e+00,0,1,0,0,0,49700000.00
1,0.0000,1,0,33304,0,0.000000,0.000000e+00,0,32,0,...,32,0,17241.379310,0.000000e+00,0,1,0,0,0,0.00
2,122.0000,0,244,-1,0,1720.238095,0.000000e+00,0,20,0,...,40,1,128.427406,0.000000e+00,122,2,0,1,0,0.00
3,97.2500,0,389,237,0,33932.214290,0.000000e+00,0,20,0,...,88,130788,11.778378,7.520757e+04,0,4,0,4,0,0.00
4,981.4375,0,15703,100,0,584290.852200,0.000000e+00,0,20,0,...,332,11846006,1.005692,2.839742e+06,0,16,0,50,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809356,0.0000,0,0,29200,3026,2238.628788,3.499753e+06,0,0,0,...,40,0,0.146720,0.000000e+00,0,1,0,0,0,22553695.33
809357,0.0000,0,0,29200,0,13148.444440,0.000000e+00,0,0,0,...,40,0,40.320723,0.000000e+00,0,1,0,0,0,0.00
809358,0.0000,0,0,29200,1423,2995.750000,1.052882e+07,0,0,0,...,40,0,0.113042,0.000000e+00,0,1,0,0,0,29688734.00
809359,100.0000,0,200,-1,0,940.800000,0.000000e+00,0,20,0,...,40,4,2688.172043,0.000000e+00,100,2,0,4,0,0.00


In [20]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)
new_data.shape

(809361, 79)

# Step 5: Standardize the dataset

In [27]:
# set input matrix and target column
x_cols = [c for c in data.columns if c != 'Label']
x_cols2= [c for c in new_data.columns]
X = data[x_cols]
y = data['Label']

In [28]:
X=new_data.to_numpy()

In [29]:
X[0]

array([ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  2.92000000e+04,  2.13775000e+05,  1.30335693e+04,
        4.14000000e+07,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        5.32369611e+06,  1.01000000e+08,  9.69000000e+02,  5.05843970e+06,
        1.14164659e+02,  0.00000000e+00,  7.90000000e+07,  0.00000000e+00,
        8.00000000e+01,  4.61428571e+01,  3.53000000e+02,  4.40454546e+01,
        7.43103466e+05,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  2.00000000e+00,  9.88447100e-03,  1.19083551e+02,
        7.39228500e+05,  7.28000000e+02,  1.01168794e+08,  0.00000000e+00,
        0.00000000e+00,  2.07573889e-01,  9.69000000e+02,  9.57805230e+00,
        1.84000000e+07,  0.00000000e+00,  1.26468200e+06,  4.84500000e+01,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  7.90000000e+07,
        7.90000000e+07, -1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [30]:
X=StandardScaler().fit_transform(X)

In [31]:
X[0]

array([-9.19101676e-01,  9.19101676e-01, -3.70547215e-01, -2.40910824e-01,
       -4.33692669e-03,  1.01857280e+00,  2.68840421e-02, -1.85656577e-01,
        5.54507241e+00,  0.00000000e+00,  4.55176286e-03, -4.64564156e-02,
        2.69704704e-01,  2.20626751e+00,  1.32981740e-01,  4.89100470e-01,
        1.49630887e-02, -3.61911677e-01,  2.39292378e+00, -1.73935450e-01,
       -3.13659533e-01, -2.49174756e-01,  1.01609589e-02, -2.02537363e-01,
        9.19315893e-01,  0.00000000e+00, -3.03892094e-01,  0.00000000e+00,
       -6.39163643e-01, -1.03343892e-01, -1.12373622e-01,  3.44684549e-01,
        4.66782295e-01,  3.11894084e-03,  2.14160569e+00, -8.09246851e-03,
       -4.59960676e-01, -7.21472692e-01,  1.32981740e-01, -6.48067346e-01,
        1.59825624e+00, -3.53325184e-01,  6.12413133e-01, -6.09452103e-03,
        0.00000000e+00, -3.30276985e-01,  0.00000000e+00,  2.40334688e+00,
        2.48921670e+00, -3.17178725e-01, -3.70547215e-01, -1.74021030e-01,
       -4.25006181e-01,  

In [32]:
y=data[target]

In [33]:
y=y.to_numpy()

In [34]:
y

array([['DoS'],
       ['BENIGN'],
       ['BENIGN'],
       ...,
       ['DoS'],
       ['BENIGN'],
       ['DoS']], dtype=object)

In [35]:
y=categorical_convert.fit_transform(y)
encoders[column] = categorical_convert

  return f(*args, **kwargs)


In [36]:
y

array([1, 0, 0, ..., 1, 0, 1])

# Step 6: Feature Engineering