#### NSL-KDD MultiClass Classifier 

NSL-KDD dataset has data in the following categories, Normal , DoS, Probe, R2L, U2R.KNN, Support Vector Machine(SVM), Decision Tree , Random Forest , Logistic Regression and XGboost learning algorithms are trained separately and their performances are analysed separately.

In [1]:
#Importing essential libraries
from sklearn.datasets import fetch_openml

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os 
import cv2
import random 
import math
import seaborn as sns

from datetime import datetime

from sklearn import metrics
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow import keras
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing the training and test dataset 
pickle_in = open("X_train_NSL_MC.pickle","rb")
X_train = pickle.load(pickle_in)

pickle_in = open("y_train_NSL_MC.pickle","rb")
y_train = pickle.load(pickle_in)

pickle_in = open("X_test_NSL_MC.pickle","rb")
X_test = pickle.load(pickle_in)

pickle_in = open("y_test_NSL_MC.pickle","rb")
y_test = pickle.load(pickle_in)

In [3]:
#shape of the training and test datasets 
print('X_train ',X_train.shape)
print('y_train ',y_train.shape)
print('X_test ',X_test.shape)
print('y_test ',y_test.shape)

X_train  (125973, 41)
y_train  (125973, 1)
X_test  (22544, 41)
y_test  (22544, 1)


In [4]:
digit_train, counts_train = np.unique(y_train, return_counts = True)

distribution_train = dict(zip(digit_train, counts_train))
print(distribution_train)

{0: 67343, 1: 45927, 2: 11656, 3: 995, 4: 52}


#### KNN  Classifier

In [5]:
#Importing KNN
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train) #training the model 
knn_prediction = knn_model.predict(X_test) #Predicting the test data

In [6]:
#Evaluating Performance
print("Train score is: ", knn_model.score(X_train, y_train))
print("Test score is: ",knn_model.score(X_test,y_test))

Train score is:  0.9953323331190017
Test score is:  0.7274662881476224


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

print('Confusion Matrix')
print(confusion_matrix(knn_prediction, y_test))
print('Classification Report')
print(classification_report(knn_prediction, y_test))

Confusion Matrix
[[9435 1589  648 2583  100]
 [ 131 5548  370    2   59]
 [ 137  321 1403  163   33]
 [   8    0    0    6    0]
 [   0    0    0    0    8]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.66      0.78     14355
           1       0.74      0.91      0.82      6110
           2       0.58      0.68      0.63      2057
           3       0.00      0.43      0.00        14
           4       0.04      1.00      0.08         8

    accuracy                           0.73     22544
   macro avg       0.47      0.74      0.46     22544
weighted avg       0.87      0.73      0.78     22544



#### Support Vector Machine(SVM)

In [8]:
#Importing SVC Classifier
from sklearn.svm import SVC
svc_model=SVC(random_state=0)
svc_model.fit(X_train, y_train) #training the model 
svc_prediction = svc_model.predict(X_test) #Predicting the test data

In [9]:
#Evaluating Performance
print("Train score is: ", svc_model.score(X_train, y_train))
print("Test score is: ",svc_model.score(X_test,y_test))

Train score is:  0.5351146674287347
Test score is:  0.43066891412349184


In [10]:
print('Confusion Matrix')
print(confusion_matrix(svc_prediction, y_test))
print('Classification Report')
print(classification_report(svc_prediction, y_test))

Confusion Matrix
[[9709 7458 2421 2752  200]
 [   0    0    0    0    0]
 [   0    0    0    2    0]
 [   2    0    0    0    0]
 [   0    0    0    0    0]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.43      0.60     22540
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         0

    accuracy                           0.43     22544
   macro avg       0.20      0.09      0.12     22544
weighted avg       1.00      0.43      0.60     22544



#### Logistic Regression

In [11]:
#Importing Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_model=LogisticRegression(random_state=0)
lr_model.fit(X_train, y_train.ravel()) #training the model 
lr_prediction = lr_model.predict(X_test) #Predicting the test data

In [12]:
#Evaluating Performance
print("Train score is: ", lr_model.score(X_train, y_train))
print("Test score is: ",lr_model.score(X_test,y_test))

Train score is:  0.8427202654537084
Test score is:  0.6282381121362669


In [13]:
print('Confusion Matrix')
print(confusion_matrix(lr_prediction, y_test))
print('Classification Report')
print(classification_report(lr_prediction, y_test))

Confusion Matrix
[[9379 1717 1142 2118   81]
 [ 277 4784 1279   17   92]
 [  55  957    0  619   27]
 [   0    0    0    0    0]
 [   0    0    0    0    0]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.65      0.78     14437
           1       0.64      0.74      0.69      6449
           2       0.00      0.00      0.00      1658
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

    accuracy                           0.63     22544
   macro avg       0.32      0.28      0.29     22544
weighted avg       0.80      0.63      0.69     22544



#### Decision Tree

In [14]:
#Importing Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(max_depth=3)                   # decision tree
dt_model.fit(X_train,y_train) #training the model 
dt_prediction = dt_model.predict(X_test) #Predicting the test data

In [15]:
#Evaluating Performance
print("Train score is: ", dt_model.score(X_train, y_train))
print("Test score is: ",dt_model.score(X_test,y_test))

Train score is:  0.9524421899930937
Test score is:  0.7233410220014195


In [16]:
print('Confusion Matrix')
print(confusion_matrix(dt_prediction, y_test))
print('Classification Report')
print(classification_report(dt_prediction, y_test))

Confusion Matrix
[[9477 1925  857 2525  163]
 [  57 5442  176    5   32]
 [ 177   91 1388  224    5]
 [   0    0    0    0    0]
 [   0    0    0    0    0]]
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.63      0.77     14947
           1       0.73      0.95      0.83      5712
           2       0.57      0.74      0.64      1885
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

    accuracy                           0.72     22544
   macro avg       0.46      0.46      0.45     22544
weighted avg       0.88      0.72      0.77     22544



#### Random Forest

In [17]:
#Importing Random Forest
from scipy.io.arff import loadarff
from sklearn.ensemble import RandomForestClassifier

In [18]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train.ravel()) #training the model 
rfc_prediction = rfc_model.predict(X_test) #Predicting the test data

In [19]:
#Evaluating Performance
print("Train score is: ", rfc_model.score(X_train, y_train))
print("Test score is: ",rfc_model.score(X_test,y_test))

Train score is:  0.999944432537131
Test score is:  0.7531937544357701


In [20]:
print('Confusion Matrix')
print(confusion_matrix(rfc_prediction, y_test))
print('Classification Report')
print(classification_report(rfc_prediction, y_test))

Confusion Matrix
[[9456 1394  819 2694  193]
 [  65 6024  162    0    0]
 [ 190   40 1440    2    1]
 [   0    0    0   57    3]
 [   0    0    0    1    3]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.65      0.78     14556
           1       0.81      0.96      0.88      6251
           2       0.59      0.86      0.70      1673
           3       0.02      0.95      0.04        60
           4       0.01      0.75      0.03         4

    accuracy                           0.75     22544
   macro avg       0.48      0.83      0.49     22544
weighted avg       0.90      0.75      0.80     22544



#### XG-Boost

In [21]:
#Importing XGBoost 
from xgboost import XGBClassifier
xgb_model =XGBClassifier(max_depth=9,
                          subsample=0.9,
                          objective='multi:softmax',
                          num_class = 3,
                          min_child_weight=2,
                          colsample_bytree=0.7,
                          n_estimators=1000,
                          learning_rate=0.08,
                          n_jobs = -1)

xgb_model.fit(X_train,y_train,verbose=True) #training the model 
xgb_prediction = xgb_model.predict(X_test) #Predicting the test data

In [22]:
#Evaluating Performance
print("Train score is: ", xgb_model.score(X_train, y_train))
print("Test score is: ",xgb_model.score(X_test,y_test))

Train score is:  0.999944432537131
Test score is:  0.7668115684882896


In [23]:
print('Confusion Matrix')
print(confusion_matrix(xgb_prediction, y_test))
print('Classification Report')
print(classification_report(xgb_prediction, y_test))

Confusion Matrix
[[9444 1199  751 2583  189]
 [  67 6162  163    0    0]
 [ 198   97 1507    1    2]
 [   1    0    0  169    4]
 [   1    0    0    1    5]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.67      0.79     14166
           1       0.83      0.96      0.89      6392
           2       0.62      0.83      0.71      1805
           3       0.06      0.97      0.12       174
           4       0.03      0.71      0.05         7

    accuracy                           0.77     22544
   macro avg       0.50      0.83      0.51     22544
weighted avg       0.90      0.77      0.81     22544

