# Step 1: Loading all the Packages to be used

In [2]:
import json #will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

# Step 2: Having a suitable real world dataset

In [3]:
data= pd.read_csv(r'C:\Users\Michael Owen\OneDrive\Desktop\Datasets\train_mosaic.csv')
#Load dataset
data.head(4)

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0,739228.5,743103.4661,1264682,213775,49700000.0,41400000.0,79000000,20500000,DoS
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Step 3: Data Preprocessing/Cleaning

In [4]:
data.shape

(809361, 78)

In [5]:
original_features=list(data.columns)
len(original_features)

78

In [6]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

0

In [7]:
len(data)

809361

# Step 4: Encoding the Dataset

In [8]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['Label']

In [9]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['Total_Length_of_Fwd_Packets',
 'Fwd_Packet_Length_Std',
 'Fwd_Header_Length',
 'Idle_Min',
 'Fwd_URG_Flags',
 'PSH_Flag_Count',
 'Flow_IAT_Std',
 'Avg_Fwd_Segment_Size',
 'Flow_IAT_Mean',
 'Flow_Bytes_Sec',
 'Active_Std',
 'Total_Length_of_Bwd_Packets',
 'Bwd_Packet_Length_Std',
 'Idle_Max',
 'Fwd_IAT_Min',
 'Avg_Bwd_Segment_Size',
 'CWE_Flag_Count',
 'Bwd_IAT_Min',
 'Packet_Length_Std',
 'URG_Flag_Count',
 'Idle_Mean',
 'Bwd_Avg_Bulk_Rate',
 'ACK_Flag_Count',
 'Total_Backward_Packets',
 'Total_Fwd_Packets',
 'Bwd_Packet_Length_Mean',
 'Bwd_URG_Flags',
 'SYN_Flag_Count',
 'Packet_Length_Mean',
 'Bwd_PSH_Flags',
 'Fwd_Packets_Sec',
 'Subflow_Bwd_Packets',
 'Destination_Port',
 'Init_Win_bytes_forward',
 'Max_Packet_Length',
 'act_data_pkt_fwd',
 'Bwd_Packet_Length_Max',
 'Bwd_Avg_Bytes_Bulk',
 'Min_Packet_Length',
 'Down_Up_Ratio',
 'Fwd_Packet_Length_Max',
 'Bwd_IAT_Total',
 'Fwd_PSH_Flags',
 'Fwd_IAT_Total',
 'Packet_Length_Variance',
 'Bwd_Packets_Sec',
 'Average_Packet_Size',
 'Fl

In [10]:
encoders = {}
for column in ['Label']:
    categorical_convert = LabelEncoder()

In [11]:
nominal=['Label']
ordinal=list(set(categorical_features)-set(nominal))

In [12]:
df_nominal=pd.get_dummies(data[nominal])

In [13]:
target=['Label']

In [14]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [15]:
data[numerical]

Unnamed: 0,Total_Length_of_Fwd_Packets,Fwd_Packet_Length_Std,Fwd_Header_Length,Idle_Min,Fwd_URG_Flags,PSH_Flag_Count,Flow_IAT_Std,Avg_Fwd_Segment_Size,Flow_IAT_Mean,Flow_Bytes_Sec,...,Bwd_IAT_Max,Bwd_Header_Length,FIN_Flag_Count,Fwd_Avg_Packets_Bulk,ECE_Flag_Count,Active_Max,Init_Win_bytes_backward,Idle_Std,Fwd_Avg_Bulk_Rate,Subflow_Fwd_Bytes
0,969,119.083551,728,20500000,0,0,1.800000e+07,48.450000,5.058440e+06,9.578052,...,0,40,0,0,0,1264682,29200,4.140000e+07,0,969
1,0,0.000000,32,0,0,0,0.000000e+00,0.000000,5.800000e+01,2.000000,...,0,32,0,0,0,0,33304,0.000000e+00,0,0
2,148,0.000000,80,0,0,0,1.340626e+04,37.000000,6.229200e+03,36909.000000,...,1,40,0,0,0,0,-1,0.000000e+00,0,148
3,429,242.505670,72,0,0,1,6.313298e+04,143.000000,4.245067e+04,158570.000000,...,130438,88,0,0,0,0,237,0.000000e+00,0,429
4,5030,644.896586,252,0,0,1,2.120637e+06,419.166667,4.419288e+05,79335.000000,...,11049530,332,0,0,0,0,100,0.000000e+00,0,5030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809356,233,51.337770,344,19048836,0,0,1.098526e+07,23.300000,6.815688e+06,3.418584,...,0,40,0,0,0,306895,29200,3.499753e+06,0,233
809357,344,130.019779,284,0,0,0,4.998758e+04,49.142857,2.480114e+04,1981.475508,...,0,40,0,0,0,0,29200,0.000000e+00,0,344
809358,210,61.326992,224,22243737,0,0,1.490070e+07,30.000000,8.846251e+06,3.391267,...,0,40,0,0,0,2544863,29200,1.052882e+07,0,210
809359,88,0.000000,40,0,0,0,3.851441e+02,44.000000,2.480000e+02,183577.000000,...,4,40,0,0,0,0,-1,0.000000e+00,0,88


In [16]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)
new_data.shape

(809361, 79)

# Step 5: Standardizing the dataset

In [17]:
# set input matrix and target column
x_cols = [c for c in data.columns if c != 'Label']
x_cols2= [c for c in new_data.columns]
X = data[x_cols]
y = data['Label']

In [18]:
X=new_data.to_numpy()

In [19]:
X[0]

array([ 0.00000000e+00,  1.00000000e+00,  9.69000000e+02,  1.19083551e+02,
        7.28000000e+02,  2.05000000e+07,  0.00000000e+00,  0.00000000e+00,
        1.80000000e+07,  4.84500000e+01,  5.05843970e+06,  9.57805230e+00,
        7.43103466e+05,  0.00000000e+00,  0.00000000e+00,  7.90000000e+07,
        2.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.14164659e+02,  0.00000000e+00,  4.97000000e+07,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  2.00000000e+01,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  4.40454546e+01,  0.00000000e+00,
        1.97689418e-01,  1.00000000e+00,  8.00000000e+01, -1.00000000e+00,
        3.53000000e+02,  3.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  3.53000000e+02,  0.00000000e+00,
        0.00000000e+00,  1.01000000e+08,  1.30335693e+04,  9.88447100e-03,
        4.61428571e+01,  2.07573889e-01,  7.39228500e+05,  0.00000000e+00,
        7.90000000e+07,  

In [20]:
X=StandardScaler().fit_transform(X)

In [21]:
X[0]

array([-9.19101676e-01,  9.19101676e-01,  1.32981740e-01,  3.44684549e-01,
        3.11894084e-03,  3.68190006e-01,  0.00000000e+00, -3.61911677e-01,
        1.42710518e+00, -6.09452103e-03,  4.89100470e-01, -6.48067346e-01,
        9.19315893e-01, -4.33692669e-03, -3.03892094e-01,  2.40334688e+00,
       -1.03343892e-01, -3.70547215e-01,  0.00000000e+00, -8.90209854e-02,
        1.49630887e-02, -2.40910824e-01,  1.47778855e+00,  0.00000000e+00,
       -4.25006181e-01, -6.00445430e-03,  2.01311408e-02, -3.70547215e-01,
        0.00000000e+00,  1.08238486e+00, -2.02537363e-01, -4.64564156e-02,
       -1.79938277e-01, -6.00445430e-03, -3.13659533e-01, -3.17178725e-01,
        1.01609589e-02, -2.54114316e-03, -3.27908244e-01,  0.00000000e+00,
       -4.59960676e-01, -6.39163643e-01,  3.92497130e-01, -2.17181969e-01,
       -1.73935450e-01,  2.20626751e+00, -1.85656577e-01, -1.12373622e-01,
       -2.49174756e-01, -7.21472692e-01,  4.66782295e-01, -3.53325184e-01,
        2.39292378e+00,  

In [22]:
y=data[target]

In [23]:
y=y.to_numpy()

In [24]:
y

array([['DoS'],
       ['BENIGN'],
       ['BENIGN'],
       ...,
       ['DoS'],
       ['BENIGN'],
       ['DoS']], dtype=object)

In [25]:
y=categorical_convert.fit_transform(y)
encoders[column] = categorical_convert

  return f(*args, **kwargs)


In [26]:
y

array([1, 0, 0, ..., 1, 0, 1])

# Step 6: Feature Engineering

In [27]:
pca=PCA(n_components=2)

In [28]:
p_components=pca.fit_transform(X)

In [29]:
p_components

array([[-6.29910168,  1.87856403],
       [ 2.15924105, -3.30069123],
       [ 2.48911825, -1.63410299],
       ...,
       [-4.46395976,  0.38074116],
       [ 3.20123472, -1.6729881 ],
       [-0.08797939, -1.49656174]])

In [30]:
pca.explained_variance_ratio_

array([0.17348344, 0.15744644])

# Step 7: New Dataset Ready for Training

In [31]:
# data split train / test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [32]:
X1 = data[x_cols]
train_mode = dict(X1.mode().iloc[-1])
print(train_mode)

{'Destination_Port': 80.0, 'Flow_Duration': 3.0, 'Total_Fwd_Packets': 2.0, 'Total_Backward_Packets': 1.0, 'Total_Length_of_Fwd_Packets': 0.0, 'Total_Length_of_Bwd_Packets': 0.0, 'Fwd_Packet_Length_Max': 168.0, 'Fwd_Packet_Length_Min': 0.0, 'Fwd_Packet_Length_Mean': 0.0, 'Fwd_Packet_Length_Std': 0.0, 'Bwd_Packet_Length_Max': 0.0, 'Bwd_Packet_Length_Min': 0.0, 'Bwd_Packet_Length_Mean': 0.0, 'Bwd_Packet_Length_Std': 0.0, 'Flow_Bytes_Sec': 0.0, 'Flow_Packets_Sec': 219948.0, 'Flow_IAT_Mean': 3.0, 'Flow_IAT_Std': 0.0, 'Flow_IAT_Max': 3.0, 'Flow_IAT_Min': 3.0, 'Fwd_IAT_Total': 0.0, 'Fwd_IAT_Mean': 0.0, 'Fwd_IAT_Std': 0.0, 'Fwd_IAT_Max': 0.0, 'Fwd_IAT_Min': 3.0, 'Bwd_IAT_Total': 0.0, 'Bwd_IAT_Mean': 0.0, 'Bwd_IAT_Std': 0.0, 'Bwd_IAT_Max': 0.0, 'Bwd_IAT_Min': 0.0, 'Fwd_PSH_Flags': 0.0, 'Bwd_PSH_Flags': 0.0, 'Fwd_URG_Flags': 0.0, 'Bwd_URG_Flags': 0.0, 'Fwd_Header_Length': 40.0, 'Bwd_Header_Length': 40.0, 'Fwd_Packets_Sec': 666666.6667, 'Bwd_Packets_Sec': 0.0, 'Min_Packet_Length': 0.0, 'Max_Packe

In [33]:
new_data.head(2)

Unnamed: 0,Label_BENIGN,Label_DoS,Total_Length_of_Fwd_Packets,Fwd_Packet_Length_Std,Fwd_Header_Length,Idle_Min,Fwd_URG_Flags,PSH_Flag_Count,Flow_IAT_Std,Avg_Fwd_Segment_Size,...,Bwd_IAT_Max,Bwd_Header_Length,FIN_Flag_Count,Fwd_Avg_Packets_Bulk,ECE_Flag_Count,Active_Max,Init_Win_bytes_backward,Idle_Std,Fwd_Avg_Bulk_Rate,Subflow_Fwd_Bytes
0,0,1,969,119.083551,728,20500000,0,0,18000000.0,48.45,...,0,40,0,0,0,1264682,29200,41400000.0,0,969
1,1,0,0,0.0,32,0,0,0,0.0,0.0,...,0,32,0,0,0,0,33304,0.0,0,0


In [34]:
X2=new_data[x_cols2]
dict(X2.loc[2])

{'Label_BENIGN': 1.0,
 'Label_DoS': 0.0,
 'Total_Length_of_Fwd_Packets': 148.0,
 'Fwd_Packet_Length_Std': 0.0,
 'Fwd_Header_Length': 80.0,
 'Idle_Min': 0.0,
 'Fwd_URG_Flags': 0.0,
 'PSH_Flag_Count': 0.0,
 'Flow_IAT_Std': 13406.25627,
 'Avg_Fwd_Segment_Size': 37.0,
 'Flow_IAT_Mean': 6229.2,
 'Flow_Bytes_Sec': 36909.0,
 'Active_Std': 0.0,
 'Total_Length_of_Bwd_Packets': 244.0,
 'Bwd_Packet_Length_Std': 0.0,
 'Idle_Max': 0.0,
 'Fwd_IAT_Min': 1.0,
 'Avg_Bwd_Segment_Size': 122.0,
 'CWE_Flag_Count': 0.0,
 'Bwd_IAT_Min': 1.0,
 'Packet_Length_Std': 41.4757531,
 'URG_Flag_Count': 0.0,
 'Idle_Mean': 0.0,
 'Bwd_Avg_Bulk_Rate': 0.0,
 'ACK_Flag_Count': 0.0,
 'Total_Backward_Packets': 2.0,
 'Total_Fwd_Packets': 4.0,
 'Bwd_Packet_Length_Mean': 122.0,
 'Bwd_URG_Flags': 0.0,
 'SYN_Flag_Count': 0.0,
 'Packet_Length_Mean': 61.28571429,
 'Bwd_PSH_Flags': 0.0,
 'Fwd_Packets_Sec': 128.4274064,
 'Subflow_Bwd_Packets': 2.0,
 'Destination_Port': 53.0,
 'Init_Win_bytes_forward': -1.0,
 'Max_Packet_Length': 122.

In [35]:
# train the Random Forest algorithm
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

RandomForestClassifier()

In [43]:
# train the Extra Trees algorithm
et = ExtraTreesClassifier(n_estimators = 100)
et.fit(X_train, y_train)

ExtraTreesClassifier()

# Step 8: Testing the Dataset

In [44]:
rf_model = rf.predict(X_test)
et_model = et.predict(X_test)

# Step 9: Viewing the Model Metrics

In [45]:
#Comparing the Accuracy scores of the 2 algorithms
print(accuracy_score(rf_model, y_test))
print(accuracy_score(et_model, y_test))

1.0
1.0


In [46]:
#Comparing the Recall scores of the 2 algorithms
print(recall_score(rf_model, y_test, average ='macro'))
print(recall_score(et_model, y_test, average ='macro'))

1.0
1.0


In [47]:
#Comparing the Precision scores of the 2 algorithms
print(precision_score(rf_model, y_test, average = 'macro'))
print(precision_score(et_model, y_test, average = 'macro'))

1.0
1.0


In [48]:
#Comparing the F1 Scores of the 2 algorithms
print(f1_score(rf_model, y_test, average='macro'))
print(f1_score(et_model, y_test, average='macro'))

1.0
1.0


# Step 10: Saving the Models as objects in Joblibs

In [42]:
# save preprocessing objects and RF algorithm
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']