#### NSL-KDD Binary Classifier 

NSL-KDD dataset is converted into two categories with '0' representing Normal Traffic and '1' representing the Attack. KNN, Support Vector Machine(SVM), Decision Tree , Random Forest , Logistic Regression and XGboost learning algorithms are trained separately and their performances are analysed separately.   

In [1]:
#Importing essential libraries
from sklearn.datasets import fetch_openml

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os 
import cv2
import random 
import math
import seaborn as sns

from datetime import datetime

from sklearn import metrics
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow import keras
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing the datasets from the pickle file 
pickle_in = open("X_train_NSL_B.pickle","rb")
X_train = pickle.load(pickle_in)

pickle_in = open("y_train_NSL_B.pickle","rb")
y_train = pickle.load(pickle_in)

pickle_in = open("X_test_NSL_B.pickle","rb")
X_test = pickle.load(pickle_in)

pickle_in = open("y_test_NSL_B.pickle","rb")
y_test = pickle.load(pickle_in)

In [3]:
#shape of the training and test datasets 
print('X_train ',X_train.shape)
print('y_train ',y_train.shape)
print('X_test ',X_test.shape)
print('y_test ',y_test.shape)

X_train  (125973, 41)
y_train  (125973, 1)
X_test  (22544, 41)
y_test  (22544, 1)


In [4]:
digit_train, counts_train = np.unique(y_train, return_counts = True)

distribution_train = dict(zip(digit_train, counts_train))
print(distribution_train)

{0: 67343, 1: 58630}


#### KNN Binary Classifier

In [5]:
#Importing KNN
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train)    #training the model 
knn_prediction = knn_model.predict(X_test)  #Predicting the test data

In [6]:
#Evaluating performance
print("Train score is: ", knn_model.score(X_train, y_train))
print("Test score is: ",knn_model.score(X_test,y_test))

Train score is:  0.9971343065577545
Test score is:  0.7699609652235628


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

print('Confusion Matrix')
print(confusion_matrix(knn_prediction, y_test))
print('Classification Report')
print(classification_report(knn_prediction, y_test))

Confusion Matrix
[[9427 4902]
 [ 284 7931]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.66      0.78     14329
           1       0.62      0.97      0.75      8215

    accuracy                           0.77     22544
   macro avg       0.79      0.81      0.77     22544
weighted avg       0.84      0.77      0.77     22544



#### Support Vector Machine(SVM)

In [8]:
#Importing SVC Classifier
from sklearn.svm import SVC
svc_model=SVC(random_state=0)
svc_model.fit(X_train, y_train)  #training the model 
svc_prediction = svc_model.predict(X_test) #Predicting the test data

In [9]:
#Evaluating performance
print("Train score is: ", svc_model.score(X_train, y_train))
print("Test score is: ",svc_model.score(X_test,y_test))

Train score is:  0.535122605637716
Test score is:  0.43080198722498225


In [10]:
print('Confusion Matrix')
print(confusion_matrix(svc_prediction, y_test))
print('Classification Report')
print(classification_report(svc_prediction, y_test))

Confusion Matrix
[[ 9710 12831]
 [    1     2]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.43      0.60     22541
           1       0.00      0.67      0.00         3

    accuracy                           0.43     22544
   macro avg       0.50      0.55      0.30     22544
weighted avg       1.00      0.43      0.60     22544



#### Logistic Regression

In [11]:
#Importing Logistic Regression
from sklearn.linear_model import LogisticRegression 
lr_model=LogisticRegression(random_state=0)
lr_model.fit(X_train, y_train.ravel())  #training the model
lr_prediction = lr_model.predict(X_test) #Predicting the test data

In [12]:
#Evaluating performance
print("Train score is: ", lr_model.score(X_train, y_train))
print("Test score is: ",lr_model.score(X_test,y_test))

Train score is:  0.8765926031768713
Test score is:  0.7036462029808375


In [13]:
print('Confusion Matrix')
print(confusion_matrix(lr_prediction, y_test))
print('Classification Report')
print(classification_report(lr_prediction, y_test))

Confusion Matrix
[[8746 5716]
 [ 965 7117]]
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.60      0.72     14462
           1       0.55      0.88      0.68      8082

    accuracy                           0.70     22544
   macro avg       0.73      0.74      0.70     22544
weighted avg       0.78      0.70      0.71     22544



#### Decision Tree

In [14]:
#Importing Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(max_depth=3)                   # decision tree
dt_model.fit(X_train,y_train)       #training the model 
dt_prediction = dt_model.predict(X_test) #Predicting the test data

In [15]:
#Evaluating performance
print("Train score is: ", dt_model.score(X_train, y_train))
print("Test score is: ",dt_model.score(X_test,y_test))

Train score is:  0.9585387344907242
Test score is:  0.7876153300212917


In [16]:
print('Confusion Matrix')
print(confusion_matrix(dt_prediction, y_test))
print('Classification Report')
print(classification_report(dt_prediction, y_test))

Confusion Matrix
[[9388 4465]
 [ 323 8368]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.68      0.80     13853
           1       0.65      0.96      0.78      8691

    accuracy                           0.79     22544
   macro avg       0.81      0.82      0.79     22544
weighted avg       0.85      0.79      0.79     22544



#### Random Forest

In [17]:
#Importing Random Forest
from scipy.io.arff import loadarff
from sklearn.ensemble import RandomForestClassifier

In [18]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train.ravel())  #training the model 
rfc_prediction = rfc_model.predict(X_test) #Predicting the test data

In [19]:
#Evaluating performance
print("Train score is: ", rfc_model.score(X_train, y_train))
print("Test score is: ",rfc_model.score(X_test,y_test))

Train score is:  0.999944432537131
Test score is:  0.7774574166075231


In [20]:
print('Confusion Matrix')
print(confusion_matrix(rfc_prediction, y_test))
print('Classification Report')
print(classification_report(rfc_prediction, y_test))

Confusion Matrix
[[9449 4755]
 [ 262 8078]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.67      0.79     14204
           1       0.63      0.97      0.76      8340

    accuracy                           0.78     22544
   macro avg       0.80      0.82      0.78     22544
weighted avg       0.85      0.78      0.78     22544



#### XG-Boost

In [21]:
#Importing XGBoost 
from xgboost import XGBClassifier
xgb_model = XGBClassifier(silent=0,
                     scale_pos_weight=1,
                     learning_rate=0.01,
                     colsample_bytree=0.6,
                     subsample=0.8,
                     objective='binary:logistic',
                     n_estimators=100,
                     reg_alpha=0.3,
                     max_depth=3,
                     gamma=1)

xgb_model.fit(X_train,y_train,verbose=True)  #training the model 
xgb_prediction = xgb_model.predict(X_test) #Predicting the test data

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [22]:
#Evaluating performance
print("Train score is: ", xgb_model.score(X_train, y_train))
print("Test score is: ",xgb_model.score(X_test,y_test))

Train score is:  0.9798845784414121
Test score is:  0.7590046132008517


In [23]:
print('Confusion Matrix')
print(confusion_matrix(xgb_prediction, y_test))
print('Classification Report')
print(classification_report(xgb_prediction, y_test))

Confusion Matrix
[[9445 5167]
 [ 266 7666]]
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.65      0.78     14612
           1       0.60      0.97      0.74      7932

    accuracy                           0.76     22544
   macro avg       0.78      0.81      0.76     22544
weighted avg       0.84      0.76      0.76     22544

