## Importing Libraries 

In [52]:
import pandas as pd
import numpy as np
from statistics import mean
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

### Loading Datasets

In [3]:
data = pd.read_csv('ids_binary.csv')
data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag,label
0,0.0,491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.17,0.0,0.0,0.0,0.05,0.0,1.0,20.0,9.0,0.0
1,0.0,146.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.88,0.0,0.0,0.0,0.0,0.0,2.0,44.0,9.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,49.0,5.0,1.0
3,0.0,232.0,8153.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.03,0.04,0.03,0.01,0.0,0.01,1.0,24.0,9.0,0.0
4,0.0,199.0,420.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,24.0,9.0,0.0


In [47]:
x = data.iloc[:,0:-1].values
y = data.iloc[:,-1].values
x.shape,y.shape

((160367, 41), (160367,))

### splitting the dataset into train test split

In [9]:
from sklearn.model_selection import train_test_split

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((120275, 41), (40092, 41), (120275,), (40092,))

In [35]:
target_names = ['Normal','Abnormal']

# Applying Machine Learning Algorithms

## ADABoost

In [14]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()

In [19]:
abc.fit(x_train,y_train)

AdaBoostClassifier()

### Training accuracy

In [20]:
abc.score(x_train,y_train)



0.9649054250675535

### Testing accuracy

In [21]:
abc.score(x_test,y_test)



0.9638331836775417

### Predicted y values for train and test datas

In [27]:
abc_y_train = abc.predict(x_train)



In [28]:
abc_y_test = abc.predict(x_test)




## Confusion matrix

### For training

In [29]:
print(confusion_matrix(y_train,abc_y_train))

[[57350  2051]
 [ 2170 58704]]


### For testing

In [30]:
print(confusion_matrix(y_test,abc_y_test))

[[19101   704]
 [  746 19541]]


## Classification report

### For training

In [36]:
print(classification_report(y_train,abc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.96      0.97      0.96     59401
    Abnormal       0.97      0.96      0.97     60874

    accuracy                           0.96    120275
   macro avg       0.96      0.96      0.96    120275
weighted avg       0.96      0.96      0.96    120275



### For testing

In [37]:
print(classification_report(y_test,abc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.96      0.96      0.96     19805
    Abnormal       0.97      0.96      0.96     20287

    accuracy                           0.96     40092
   macro avg       0.96      0.96      0.96     40092
weighted avg       0.96      0.96      0.96     40092



## K fold Cross Validation

In [38]:
from sklearn.model_selection import KFold

In [48]:

abc_training_scores = []
abc_testing_scores = []
best_abc = AdaBoostClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_abc.fit(X_train, Y_train)
    abc_training_scores.append(best_abc.score(X_train, Y_train))
    abc_testing_scores.append(best_abc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]
Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


In [49]:
abc_training_scores

[0.9608396405103942,
 0.9579088492747071,
 0.9589302695371569,
 0.9621494380095718,
 0.9822049355386846]

In [50]:
abc_testing_scores

[0.9742782315894494,
 0.9701627486437613,
 0.9720949084900071,
 0.9715960465188788,
 0.7879213045240545]

In [53]:
abc_avg_train = mean(abc_training_scores)
abc_avg_test = mean(abc_testing_scores)
abc_max_train = max(abc_training_scores)
abc_max_test = max(abc_testing_scores)
print("Average Accuracy for training data is : ",abc_avg_train)
print("Max Accuracy for training data is : ",abc_max_train)
print("Average Accuracy for testing data is : ",abc_avg_test)
print("Max Accuracy for testing data is : ",abc_max_test)

Average Accuracy for training data is :  0.9644066265741029
Max Accuracy for training data is :  0.9822049355386846
Average Accuracy for testing data is :  0.9352106479532302
Max Accuracy for testing data is :  0.9742782315894494


## XGBoost

In [54]:
from xgboost import XGBClassifier
xgbc = XGBClassifier()

In [55]:
xgbc.fit(x_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Training accuracy

In [56]:
xgbc.score(x_train,y_train)

0.9989856578673872

### Testing accuracy

In [57]:
xgbc.score(x_test,y_test)

0.9970817120622568

### Predicted Y values

In [59]:
xgbc_y_train = xgbc.predict(x_train)
xgbc_y_test = xgbc.predict(x_test)

## Confusion matrix

### For training

In [63]:
print(confusion_matrix(y_train,xgbc_y_train))

[[59349    52]
 [   70 60804]]


### For testing

In [65]:
print(confusion_matrix(y_test,xgbc_y_test))

[[19748    57]
 [   60 20227]]


## Classification Report

### For training

In [60]:
print(classification_report(y_train,xgbc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     59401
    Abnormal       1.00      1.00      1.00     60874

    accuracy                           1.00    120275
   macro avg       1.00      1.00      1.00    120275
weighted avg       1.00      1.00      1.00    120275



### For testing

In [62]:
print(classification_report(y_test,xgbc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     19805
    Abnormal       1.00      1.00      1.00     20287

    accuracy                           1.00     40092
   macro avg       1.00      1.00      1.00     40092
weighted avg       1.00      1.00      1.00     40092



## K Fold Cross Validation

In [71]:

xgbc_training_scores = []
xgbc_testing_scores = []
best_xgbc = XGBClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_xgbc.fit(X_train, Y_train)
    xgbc_training_scores.append(best_xgbc.score(X_train, Y_train))
    xgbc_testing_scores.append(best_xgbc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]




Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]






In [75]:
xgbc_training_scores

[0.9984956310944478,
 0.9985190150670731,
 0.9984176968525418,
 0.9986749185464636,
 0.9998674918546464]

In [76]:
xgbc_testing_scores

[0.9986905281536447,
 0.9989087734613706,
 0.9988775605649611,
 0.9988775605649611,
 0.9646119789230817]

In [77]:
xgbc_avg_train = mean(xgbc_training_scores)
xgbc_avg_test = mean(xgbc_testing_scores)
xgbc_max_train = max(xgbc_training_scores)
xgbc_max_test = max(xgbc_testing_scores)
print("Average Accuracy for training data is : ",xgbc_avg_train)
print("Max Accuracy for training data is : ",xgbc_max_train)
print("Average Accuracy for testing data is : ",xgbc_avg_test)
print("Max Accuracy for testing data is : ",xgbc_max_test)

Average Accuracy for training data is :  0.9987949506830346
Max Accuracy for training data is :  0.9998674918546464
Average Accuracy for testing data is :  0.9919932803336039
Max Accuracy for testing data is :  0.9989087734613706
