# Step 1: Loading all the Packages to be used

In [1]:
import json #will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

# Step 2: Having a suitable real world dataset

In [2]:
data= pd.read_csv(r'C:\Users\Michael Owen\OneDrive\Desktop\Datasets\train_mosaic.csv')
#Load dataset
data.head(4)

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0,739228.5,743103.4661,1264682,213775,49700000.0,41400000.0,79000000,20500000,DoS
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Step 3: Data Preprocessing/Cleaning

In [3]:
data.shape

(809361, 78)

In [4]:
original_features=list(data.columns)
len(original_features)

78

In [5]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

0

In [6]:
len(data)

809361

# Step 4: Encoding the Dataset

In [7]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['Label']

In [8]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['act_data_pkt_fwd',
 'Active_Mean',
 'Bwd_Packets_Sec',
 'Flow_IAT_Std',
 'Idle_Max',
 'Bwd_Avg_Packets_Bulk',
 'Total_Fwd_Packets',
 'Packet_Length_Variance',
 'Average_Packet_Size',
 'Init_Win_bytes_forward',
 'Active_Min',
 'Flow_IAT_Max',
 'Idle_Std',
 'CWE_Flag_Count',
 'Subflow_Fwd_Bytes',
 'Bwd_IAT_Std',
 'Total_Backward_Packets',
 'Bwd_Packet_Length_Mean',
 'Fwd_PSH_Flags',
 'Bwd_IAT_Mean',
 'Fwd_Packet_Length_Max',
 'Bwd_Packet_Length_Max',
 'Bwd_Packet_Length_Min',
 'Bwd_IAT_Total',
 'min_seg_size_forward',
 'Fwd_Packet_Length_Mean',
 'Fwd_IAT_Min',
 'Idle_Min',
 'Bwd_IAT_Max',
 'Fwd_URG_Flags',
 'Fwd_Avg_Packets_Bulk',
 'Idle_Mean',
 'Total_Length_of_Bwd_Packets',
 'Fwd_Packet_Length_Std',
 'Min_Packet_Length',
 'Bwd_PSH_Flags',
 'Flow_IAT_Min',
 'PSH_Flag_Count',
 'Packet_Length_Std',
 'Down_Up_Ratio',
 'RST_Flag_Count',
 'Packet_Length_Mean',
 'Subflow_Bwd_Packets',
 'Subflow_Bwd_Bytes',
 'Bwd_Avg_Bytes_Bulk',
 'Fwd_Avg_Bytes_Bulk',
 'ECE_Flag_Count',
 'Fwd_IAT_Std',
 'In

In [9]:
encoders = {}
for column in ['Label']:
    categorical_convert = LabelEncoder()

In [10]:
nominal=['Label']
ordinal=list(set(categorical_features)-set(nominal))

In [11]:
df_nominal=pd.get_dummies(data[nominal])

In [12]:
target=['Label']

In [13]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [14]:
data[numerical]

Unnamed: 0,act_data_pkt_fwd,Active_Mean,Bwd_Packets_Sec,Flow_IAT_Std,Idle_Max,Bwd_Avg_Packets_Bulk,Total_Fwd_Packets,Packet_Length_Variance,Average_Packet_Size,Init_Win_bytes_forward,...,FIN_Flag_Count,Bwd_URG_Flags,ACK_Flag_Count,Flow_Bytes_Sec,Flow_Packets_Sec,Avg_Fwd_Segment_Size,Bwd_Header_Length,Bwd_Avg_Bulk_Rate,Bwd_IAT_Min,Destination_Port
0,3,739228.5,0.009884,1.800000e+07,79000000,0,20,13033.569260,46.142857,-1,...,0,0,0,9.578052,0.207574,48.450000,40,0,0,80
1,0,0.0,17241.379310,0.000000e+00,0,0,1,0.000000,0.000000,60,...,0,0,1,2.000000,176182.000000,0.000000,32,0,0,60711
2,3,0.0,64.213703,1.340626e+04,0,0,4,1720.238095,71.500000,-1,...,0,0,0,36909.000000,127126.000000,37.000000,40,0,1,53
3,2,0.0,15.704504,6.313298e+04,0,0,3,33932.214290,116.857143,8192,...,0,0,0,158570.000000,157963.000000,143.000000,88,0,4,80
4,11,0.0,1.340923,2.120637e+06,0,0,12,584290.852200,740.464286,8192,...,0,0,0,79335.000000,133390.000000,419.166667,332,0,50,443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809356,6,165265.0,0.014672,1.098526e+07,26048320,0,10,2238.628788,21.181818,-1,...,0,0,0,3.418584,0.161392,23.300000,40,0,0,80
809357,1,0.0,5.760103,4.998758e+04,0,0,7,13148.444440,43.000000,-1,...,0,0,0,1981.475508,46.080826,49.142857,40,0,0,80
809358,4,1273143.0,0.016149,1.490070e+07,37133731,0,7,2995.750000,26.250000,-1,...,0,0,0,3.391267,0.129191,30.000000,40,0,0,80
809359,1,0.0,2688.172043,3.851441e+02,0,0,2,940.800000,83.000000,-1,...,0,0,0,183577.000000,204389.000000,44.000000,40,0,4,53


In [15]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)
new_data.shape

(809361, 79)

# Step 5: Standardizing the dataset

In [16]:
# set input matrix and target column
x_cols = [c for c in data.columns if c != 'Label']
x_cols2= [c for c in new_data.columns]
X = data[x_cols]
y = data['Label']

In [17]:
X=new_data.to_numpy()

In [18]:
X[0]

array([ 0.00000000e+00,  1.00000000e+00,  3.00000000e+00,  7.39228500e+05,
        9.88447100e-03,  1.80000000e+07,  7.90000000e+07,  0.00000000e+00,
        2.00000000e+01,  1.30335693e+04,  4.61428571e+01, -1.00000000e+00,
        2.13775000e+05,  7.90000000e+07,  4.14000000e+07,  0.00000000e+00,
        9.69000000e+02,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  3.53000000e+02,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  4.84500000e+01,
        2.00000000e+00,  2.05000000e+07,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  4.97000000e+07,  0.00000000e+00,  1.19083551e+02,
        0.00000000e+00,  0.00000000e+00,  2.00000000e+00,  0.00000000e+00,
        1.14164659e+02,  0.00000000e+00,  0.00000000e+00,  4.40454546e+01,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.84000000e+07,  2.92000000e+04,  1.26468200e+06,
        7.28000000e+02,  

In [19]:
X=StandardScaler().fit_transform(X)

In [20]:
X[0]

array([-9.19101676e-01,  9.19101676e-01, -2.54114316e-03,  4.66782295e-01,
       -1.12373622e-01,  1.42710518e+00,  2.40334688e+00,  0.00000000e+00,
        2.01311408e-02, -1.85656577e-01, -2.49174756e-01, -3.17178725e-01,
        2.68840421e-02,  2.39292378e+00,  5.54507241e+00,  0.00000000e+00,
        1.32981740e-01, -1.50363405e-01, -6.00445430e-03, -3.70547215e-01,
       -1.73935450e-01, -1.25612557e-01,  3.92497130e-01, -3.27908244e-01,
       -4.05879306e-01, -2.17181969e-01,  4.55176286e-03, -6.09452103e-03,
       -1.03343892e-01,  3.68190006e-01, -1.74021030e-01,  0.00000000e+00,
        0.00000000e+00,  1.47778855e+00, -4.33692669e-03,  3.44684549e-01,
       -4.59960676e-01, -4.64564156e-02, -5.84993856e-02, -3.61911677e-01,
        1.49630887e-02, -6.39163643e-01, -3.30276985e-01, -2.02537363e-01,
       -6.00445430e-03, -4.33692669e-03,  0.00000000e+00,  0.00000000e+00,
       -8.09246851e-03,  1.59825624e+00,  1.01857280e+00,  6.12413133e-01,
        3.11894084e-03, -

In [21]:
y=data[target]

In [22]:
y=y.to_numpy()

In [23]:
y

array([['DoS'],
       ['BENIGN'],
       ['BENIGN'],
       ...,
       ['DoS'],
       ['BENIGN'],
       ['DoS']], dtype=object)

In [24]:
y=categorical_convert.fit_transform(y)
encoders[column] = categorical_convert

  return f(*args, **kwargs)


In [25]:
y

array([1, 0, 0, ..., 1, 0, 1])

# Step 6: Feature Engineering

In [26]:
pca=PCA(n_components=2)

In [27]:
p_components=pca.fit_transform(X)

In [28]:
p_components

array([[-6.29910151,  1.87856524],
       [ 2.15924028, -3.30069319],
       [ 2.48911831, -1.63410274],
       ...,
       [-4.46395957,  0.38074199],
       [ 3.20123472, -1.67298794],
       [-0.08797942, -1.49656168]])

In [29]:
pca.explained_variance_ratio_

array([0.17348344, 0.15744644])

# Step 7: New Dataset Ready for Training

In [30]:
# data split train / test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [31]:
X1 = data[x_cols]
train_mode = dict(X1.mode().iloc[-1])
print(train_mode)

{'Destination_Port': 80.0, 'Flow_Duration': 3.0, 'Total_Fwd_Packets': 2.0, 'Total_Backward_Packets': 1.0, 'Total_Length_of_Fwd_Packets': 0.0, 'Total_Length_of_Bwd_Packets': 0.0, 'Fwd_Packet_Length_Max': 168.0, 'Fwd_Packet_Length_Min': 0.0, 'Fwd_Packet_Length_Mean': 0.0, 'Fwd_Packet_Length_Std': 0.0, 'Bwd_Packet_Length_Max': 0.0, 'Bwd_Packet_Length_Min': 0.0, 'Bwd_Packet_Length_Mean': 0.0, 'Bwd_Packet_Length_Std': 0.0, 'Flow_Bytes_Sec': 0.0, 'Flow_Packets_Sec': 219948.0, 'Flow_IAT_Mean': 3.0, 'Flow_IAT_Std': 0.0, 'Flow_IAT_Max': 3.0, 'Flow_IAT_Min': 3.0, 'Fwd_IAT_Total': 0.0, 'Fwd_IAT_Mean': 0.0, 'Fwd_IAT_Std': 0.0, 'Fwd_IAT_Max': 0.0, 'Fwd_IAT_Min': 3.0, 'Bwd_IAT_Total': 0.0, 'Bwd_IAT_Mean': 0.0, 'Bwd_IAT_Std': 0.0, 'Bwd_IAT_Max': 0.0, 'Bwd_IAT_Min': 0.0, 'Fwd_PSH_Flags': 0.0, 'Bwd_PSH_Flags': 0.0, 'Fwd_URG_Flags': 0.0, 'Bwd_URG_Flags': 0.0, 'Fwd_Header_Length': 40.0, 'Bwd_Header_Length': 40.0, 'Fwd_Packets_Sec': 666666.6667, 'Bwd_Packets_Sec': 0.0, 'Min_Packet_Length': 0.0, 'Max_Packe

In [32]:
new_data.head(2)

Unnamed: 0,Label_BENIGN,Label_DoS,act_data_pkt_fwd,Active_Mean,Bwd_Packets_Sec,Flow_IAT_Std,Idle_Max,Bwd_Avg_Packets_Bulk,Total_Fwd_Packets,Packet_Length_Variance,...,FIN_Flag_Count,Bwd_URG_Flags,ACK_Flag_Count,Flow_Bytes_Sec,Flow_Packets_Sec,Avg_Fwd_Segment_Size,Bwd_Header_Length,Bwd_Avg_Bulk_Rate,Bwd_IAT_Min,Destination_Port
0,0,1,3,739228.5,0.009884,18000000.0,79000000,0,20,13033.56926,...,0,0,0,9.578052,0.207574,48.45,40,0,0,80
1,1,0,0,0.0,17241.37931,0.0,0,0,1,0.0,...,0,0,1,2.0,176182.0,0.0,32,0,0,60711


In [33]:
X2=new_data[x_cols2]
dict(X2.loc[2])

{'Label_BENIGN': 1.0,
 'Label_DoS': 0.0,
 'act_data_pkt_fwd': 3.0,
 'Active_Mean': 0.0,
 'Bwd_Packets_Sec': 64.2137032,
 'Flow_IAT_Std': 13406.25627,
 'Idle_Max': 0.0,
 'Bwd_Avg_Packets_Bulk': 0.0,
 'Total_Fwd_Packets': 4.0,
 'Packet_Length_Variance': 1720.238095,
 'Average_Packet_Size': 71.5,
 'Init_Win_bytes_forward': -1.0,
 'Active_Min': 0.0,
 'Flow_IAT_Max': 30200.0,
 'Idle_Std': 0.0,
 'CWE_Flag_Count': 0.0,
 'Subflow_Fwd_Bytes': 148.0,
 'Bwd_IAT_Std': 0.0,
 'Total_Backward_Packets': 2.0,
 'Bwd_Packet_Length_Mean': 122.0,
 'Fwd_PSH_Flags': 0.0,
 'Bwd_IAT_Mean': 1.0,
 'Fwd_Packet_Length_Max': 37.0,
 'Bwd_Packet_Length_Max': 122.0,
 'Bwd_Packet_Length_Min': 122.0,
 'Bwd_IAT_Total': 1.0,
 'min_seg_size_forward': 20.0,
 'Fwd_Packet_Length_Mean': 37.0,
 'Fwd_IAT_Min': 1.0,
 'Idle_Min': 0.0,
 'Bwd_IAT_Max': 1.0,
 'Fwd_URG_Flags': 0.0,
 'Fwd_Avg_Packets_Bulk': 0.0,
 'Idle_Mean': 0.0,
 'Total_Length_of_Bwd_Packets': 244.0,
 'Fwd_Packet_Length_Std': 0.0,
 'Min_Packet_Length': 37.0,
 'Bwd_PS

In [34]:
# train the Random Forest algorithm
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

RandomForestClassifier()

In [42]:
# train the Extra Trees algorithm
et = ExtraTreesClassifier(n_estimators = 100)

# Step 8: Testing the Dataset

In [43]:
rf_model = rf.predict(X_test)

# Step 9: Viewing the Model Metrics

In [44]:
#Comparing the Accuracy scores of the 2 algorithms
print(accuracy_score(rf_model, y_test))

1.0


In [45]:
#Comparing the Recall scores of the 2 algorithms
print(recall_score(rf_model, y_test, average ='macro'))

1.0


In [46]:
#Comparing the Precision scores of the 2 algorithms
print(precision_score(rf_model, y_test, average = 'macro'))

1.0


In [47]:
#Comparing the F1 Scores of the 2 algorithms
print(f1_score(rf_model, y_test, average='macro'))

1.0


# Step 10: Saving the Models as objects in Joblibs

In [48]:
# save preprocessing objects and RF algorithm
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']