# Step 1: Loading all the Packages to be used

In [1]:
import json #will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

# Step 2: Having a suitable real world dataset

In [2]:
data= pd.read_csv(r'C:\Users\Michael Owen\OneDrive\Desktop\Datasets\train_mosaic.csv')
#Load dataset
data.head(4)

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0,739228.5,743103.4661,1264682,213775,49700000.0,41400000.0,79000000,20500000,DoS
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Step 3: Data Preprocessing/Cleaning

In [3]:
data.shape

(809361, 78)

In [4]:
original_features=list(data.columns)
len(original_features)

78

In [5]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

0

In [6]:
len(data)

809361

# Step 4: Encoding the Dataset

In [7]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['Label']

In [8]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['CWE_Flag_Count',
 'Total_Fwd_Packets',
 'Bwd_IAT_Max',
 'Bwd_Header_Length',
 'Min_Packet_Length',
 'Idle_Max',
 'Fwd_Packet_Length_Mean',
 'Fwd_IAT_Std',
 'Bwd_URG_Flags',
 'RST_Flag_Count',
 'Init_Win_bytes_forward',
 'Active_Max',
 'Bwd_IAT_Std',
 'Subflow_Bwd_Bytes',
 'Bwd_Packets_Sec',
 'Flow_Duration',
 'Init_Win_bytes_backward',
 'Fwd_Packet_Length_Min',
 'Bwd_Packet_Length_Min',
 'Packet_Length_Std',
 'FIN_Flag_Count',
 'Avg_Bwd_Segment_Size',
 'min_seg_size_forward',
 'Flow_Packets_Sec',
 'SYN_Flag_Count',
 'Bwd_Avg_Bulk_Rate',
 'Idle_Min',
 'Down_Up_Ratio',
 'Flow_IAT_Mean',
 'Bwd_IAT_Total',
 'Flow_Bytes_Sec',
 'Flow_IAT_Min',
 'Fwd_Packet_Length_Std',
 'URG_Flag_Count',
 'Fwd_Avg_Bulk_Rate',
 'Bwd_Packet_Length_Mean',
 'Fwd_URG_Flags',
 'Destination_Port',
 'Packet_Length_Variance',
 'Fwd_IAT_Min',
 'act_data_pkt_fwd',
 'Bwd_Packet_Length_Std',
 'Fwd_IAT_Total',
 'Fwd_Avg_Packets_Bulk',
 'Subflow_Bwd_Packets',
 'Avg_Fwd_Segment_Size',
 'Average_Packet_Size',
 'Bwd_PSH_Fla

In [9]:
encoders = {}
for column in ['Label']:
    categorical_convert = LabelEncoder()

In [10]:
nominal=['Label']
ordinal=list(set(categorical_features)-set(nominal))

In [11]:
df_nominal=pd.get_dummies(data[nominal])

In [12]:
target=['Label']

In [13]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [14]:
data[numerical]

Unnamed: 0,CWE_Flag_Count,Total_Fwd_Packets,Bwd_IAT_Max,Bwd_Header_Length,Min_Packet_Length,Idle_Max,Fwd_Packet_Length_Mean,Fwd_IAT_Std,Bwd_URG_Flags,RST_Flag_Count,...,ECE_Flag_Count,Fwd_Packets_Sec,Fwd_Packet_Length_Max,Subflow_Fwd_Bytes,Max_Packet_Length,Idle_Mean,Flow_IAT_Max,Fwd_PSH_Flags,Fwd_Avg_Bytes_Bulk,Idle_Std
0,0,20,0,40,0,79000000,48.450000,1.840000e+07,0,0,...,0,0.197689,353,969,353,49700000.00,79000000,0,0,4.140000e+07
1,0,1,0,32,0,0,0.000000,0.000000e+00,0,0,...,0,17241.379310,0,0,0,0.00,58,0,0,0.000000e+00
2,0,4,1,40,37,0,37.000000,1.743482e+04,0,0,...,0,128.427406,37,148,122,0.00,30200,0,0,0.000000e+00
3,0,3,130438,88,0,0,143.000000,8.303272e+04,0,0,...,0,11.778378,423,429,423,0.00,123916,0,0,0.000000e+00
4,0,12,11049530,332,0,0,419.166667,6.855032e+04,0,0,...,0,1.005692,1525,5030,1793,0.00,11049290,0,0,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809356,0,10,0,40,0,26048320,23.300000,1.137151e+07,0,0,...,0,0.146720,168,233,168,22553695.33,26048320,0,0,3.499753e+06
809357,0,7,0,40,0,0,49.142857,5.380612e+04,0,0,...,0,40.320723,344,344,344,0.00,137216,0,0,0.000000e+00
809358,0,7,0,40,0,37133731,30.000000,1.575375e+07,0,0,...,0,0.113042,168,210,168,29688734.00,37133731,0,0,1.052882e+07
809359,0,2,4,40,44,0,44.000000,0.000000e+00,0,0,...,0,2688.172043,44,88,100,0.00,692,0,0,0.000000e+00


In [15]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)
new_data.shape

(809361, 79)

# Step 5: Standardizing the dataset

In [16]:
# set input matrix and target column
x_cols = [c for c in data.columns if c != 'Label']
x_cols2= [c for c in new_data.columns]
X = data[x_cols]
y = data['Label']

In [17]:
X=new_data.to_numpy()

In [18]:
X[0]

array([ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  2.00000000e+01,
        0.00000000e+00,  4.00000000e+01,  0.00000000e+00,  7.90000000e+07,
        4.84500000e+01,  1.84000000e+07,  0.00000000e+00,  0.00000000e+00,
       -1.00000000e+00,  1.26468200e+06,  0.00000000e+00,  0.00000000e+00,
        9.88447100e-03,  1.01168794e+08,  2.92000000e+04,  0.00000000e+00,
        0.00000000e+00,  1.14164659e+02,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  2.07573889e-01,  1.00000000e+00,  0.00000000e+00,
        2.05000000e+07,  0.00000000e+00,  5.05843970e+06,  0.00000000e+00,
        9.57805230e+00,  2.00000000e+00,  1.19083551e+02,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  8.00000000e+01,
        1.30335693e+04,  2.00000000e+00,  3.00000000e+00,  0.00000000e+00,
        1.01000000e+08,  0.00000000e+00,  1.00000000e+00,  4.84500000e+01,
        4.61428571e+01,  0.00000000e+00,  7.90000000e+07,  0.00000000e+00,
        0.00000000e+00,  

In [19]:
X=StandardScaler().fit_transform(X)

In [20]:
X[0]

array([-9.19101676e-01,  9.19101676e-01,  0.00000000e+00,  2.01311408e-02,
       -1.74021030e-01,  3.73063922e-03, -4.59960676e-01,  2.40334688e+00,
       -6.09452103e-03,  1.59825624e+00,  0.00000000e+00, -3.30276985e-01,
       -3.17178725e-01,  6.12413133e-01, -1.50363405e-01, -4.33692669e-03,
       -1.12373622e-01,  2.14160569e+00,  1.01857280e+00, -3.53325184e-01,
       -4.05879306e-01,  1.49630887e-02, -1.20779767e-01, -3.70547215e-01,
        4.55176286e-03, -7.21472692e-01,  1.08238486e+00,  0.00000000e+00,
        3.68190006e-01, -6.39163643e-01,  4.89100470e-01, -2.17181969e-01,
       -6.48067346e-01, -5.84993856e-02,  3.44684549e-01, -2.40910824e-01,
        0.00000000e+00, -3.70547215e-01,  0.00000000e+00, -3.13659533e-01,
       -1.85656577e-01, -1.03343892e-01, -2.54114316e-03, -3.03892094e-01,
        2.20626751e+00,  0.00000000e+00, -6.00445430e-03, -6.09452103e-03,
       -2.49174756e-01, -4.64564156e-02,  2.48921670e+00, -3.27908244e-01,
       -1.25612557e-01, -

In [21]:
y=data[target]

In [22]:
y=y.to_numpy()

In [23]:
y

array([['DoS'],
       ['BENIGN'],
       ['BENIGN'],
       ...,
       ['DoS'],
       ['BENIGN'],
       ['DoS']], dtype=object)

In [24]:
y=categorical_convert.fit_transform(y)
encoders[column] = categorical_convert

  return f(*args, **kwargs)


In [25]:
y

array([1, 0, 0, ..., 1, 0, 1])

# Step 6: Feature Engineering

In [26]:
pca=PCA(n_components=2)

In [27]:
p_components=pca.fit_transform(X)

In [28]:
p_components

array([[-6.2991014 ,  1.87856481],
       [ 2.1592398 , -3.30069346],
       [ 2.48911839, -1.63410266],
       ...,
       [-4.46395953,  0.38074172],
       [ 3.20123475, -1.67298792],
       [-0.08797942, -1.49656167]])

In [29]:
pca.explained_variance_ratio_

array([0.17348344, 0.15744644])

# Step 7: New Dataset Ready for Training

In [30]:
# data split train / test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [31]:
X1 = data[x_cols]
train_mode = dict(X1.mode().iloc[-1])
print(train_mode)

{'Destination_Port': 80.0, 'Flow_Duration': 3.0, 'Total_Fwd_Packets': 2.0, 'Total_Backward_Packets': 1.0, 'Total_Length_of_Fwd_Packets': 0.0, 'Total_Length_of_Bwd_Packets': 0.0, 'Fwd_Packet_Length_Max': 168.0, 'Fwd_Packet_Length_Min': 0.0, 'Fwd_Packet_Length_Mean': 0.0, 'Fwd_Packet_Length_Std': 0.0, 'Bwd_Packet_Length_Max': 0.0, 'Bwd_Packet_Length_Min': 0.0, 'Bwd_Packet_Length_Mean': 0.0, 'Bwd_Packet_Length_Std': 0.0, 'Flow_Bytes_Sec': 0.0, 'Flow_Packets_Sec': 219948.0, 'Flow_IAT_Mean': 3.0, 'Flow_IAT_Std': 0.0, 'Flow_IAT_Max': 3.0, 'Flow_IAT_Min': 3.0, 'Fwd_IAT_Total': 0.0, 'Fwd_IAT_Mean': 0.0, 'Fwd_IAT_Std': 0.0, 'Fwd_IAT_Max': 0.0, 'Fwd_IAT_Min': 3.0, 'Bwd_IAT_Total': 0.0, 'Bwd_IAT_Mean': 0.0, 'Bwd_IAT_Std': 0.0, 'Bwd_IAT_Max': 0.0, 'Bwd_IAT_Min': 0.0, 'Fwd_PSH_Flags': 0.0, 'Bwd_PSH_Flags': 0.0, 'Fwd_URG_Flags': 0.0, 'Bwd_URG_Flags': 0.0, 'Fwd_Header_Length': 40.0, 'Bwd_Header_Length': 40.0, 'Fwd_Packets_Sec': 666666.6667, 'Bwd_Packets_Sec': 0.0, 'Min_Packet_Length': 0.0, 'Max_Packe

In [32]:
new_data.head(2)

Unnamed: 0,Label_BENIGN,Label_DoS,CWE_Flag_Count,Total_Fwd_Packets,Bwd_IAT_Max,Bwd_Header_Length,Min_Packet_Length,Idle_Max,Fwd_Packet_Length_Mean,Fwd_IAT_Std,...,ECE_Flag_Count,Fwd_Packets_Sec,Fwd_Packet_Length_Max,Subflow_Fwd_Bytes,Max_Packet_Length,Idle_Mean,Flow_IAT_Max,Fwd_PSH_Flags,Fwd_Avg_Bytes_Bulk,Idle_Std
0,0,1,0,20,0,40,0,79000000,48.45,18400000.0,...,0,0.197689,353,969,353,49700000.0,79000000,0,0,41400000.0
1,1,0,0,1,0,32,0,0,0.0,0.0,...,0,17241.37931,0,0,0,0.0,58,0,0,0.0


In [33]:
X2=new_data[x_cols2]
dict(X2.loc[2])

{'Label_BENIGN': 1.0,
 'Label_DoS': 0.0,
 'CWE_Flag_Count': 0.0,
 'Total_Fwd_Packets': 4.0,
 'Bwd_IAT_Max': 1.0,
 'Bwd_Header_Length': 40.0,
 'Min_Packet_Length': 37.0,
 'Idle_Max': 0.0,
 'Fwd_Packet_Length_Mean': 37.0,
 'Fwd_IAT_Std': 17434.82346,
 'Bwd_URG_Flags': 0.0,
 'RST_Flag_Count': 0.0,
 'Init_Win_bytes_forward': -1.0,
 'Active_Max': 0.0,
 'Bwd_IAT_Std': 0.0,
 'Subflow_Bwd_Bytes': 244.0,
 'Bwd_Packets_Sec': 64.2137032,
 'Flow_Duration': 31146.0,
 'Init_Win_bytes_backward': -1.0,
 'Fwd_Packet_Length_Min': 37.0,
 'Bwd_Packet_Length_Min': 122.0,
 'Packet_Length_Std': 41.4757531,
 'FIN_Flag_Count': 0.0,
 'Avg_Bwd_Segment_Size': 122.0,
 'min_seg_size_forward': 20.0,
 'Flow_Packets_Sec': 127126.0,
 'SYN_Flag_Count': 0.0,
 'Bwd_Avg_Bulk_Rate': 0.0,
 'Idle_Min': 0.0,
 'Down_Up_Ratio': 0.0,
 'Flow_IAT_Mean': 6229.2,
 'Bwd_IAT_Total': 1.0,
 'Flow_Bytes_Sec': 36909.0,
 'Flow_IAT_Min': 1.0,
 'Fwd_Packet_Length_Std': 0.0,
 'URG_Flag_Count': 0.0,
 'Fwd_Avg_Bulk_Rate': 0.0,
 'Bwd_Packet_Lengt

In [34]:
# train the Random Forest algorithm
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

RandomForestClassifier()

In [35]:
# train the Extra Trees algorithm
et = ExtraTreesClassifier(n_estimators = 100)
et.fit(X_train, y_train)

ExtraTreesClassifier()

# Step 8: Testing the Dataset

In [36]:
rf_model = rf.predict(X_test)
et_model = et.predict(X_test)

# Step 9: Viewing the Model Metrics

In [37]:
#Comparing the Accuracy scores of the 2 algorithms
print(accuracy_score(rf_model, y_test))

1.0


In [38]:
#Comparing the Recall scores of the 2 algorithms
print(recall_score(rf_model, y_test, average ='macro'))

1.0


In [39]:
#Comparing the Precision scores of the 2 algorithms
print(precision_score(rf_model, y_test, average = 'macro'))

1.0


In [40]:
#Comparing the F1 Scores of the 2 algorithms
print(f1_score(rf_model, y_test, average='macro'))

1.0


# Step 10: Saving the Models as objects in Joblibs

In [41]:
# save preprocessing objects and RF algorithm
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']