## Importing Libraries 

In [22]:
import pandas as pd
import numpy as np
from statistics import mean
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score,recall_score,f1_score

### Loading Datasets

In [23]:
data = pd.read_csv('ids_binary.csv')
data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag,label
0,0.0,491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.17,0.0,0.0,0.0,0.05,0.0,1.0,20.0,9.0,0.0
1,0.0,146.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.88,0.0,0.0,0.0,0.0,0.0,2.0,44.0,9.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,49.0,5.0,1.0
3,0.0,232.0,8153.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.03,0.04,0.03,0.01,0.0,0.01,1.0,24.0,9.0,0.0
4,0.0,199.0,420.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,24.0,9.0,0.0


In [24]:
x = data.iloc[:,0:-1].values
y = data.iloc[:,-1].values
x.shape,y.shape

((160367, 41), (160367,))

### splitting the dataset into train test split

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
x_train,x_test,y_train,y_test = train_test_split(x,y)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((120275, 41), (40092, 41), (120275,), (40092,))

In [27]:
target_names = ['Normal','Abnormal']

# Applying Machine Learning Algorithms

## ADABoost

In [28]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()

In [29]:
abc.fit(x_train,y_train)

AdaBoostClassifier()

### Training accuracy

In [123]:
abc_train_score = abc.score(x_train,y_train)
abc_train_score

0.9635086260652671

### Testing accuracy

In [125]:
abc_test_score = abc.score(x_test,y_test)
abc_test_score

0.9634091589344508

### Predicted y values for train and test datas

In [32]:
abc_y_train = abc.predict(x_train)

In [33]:
abc_y_test = abc.predict(x_test)


## Confusion matrix

### For training

In [34]:
print(confusion_matrix(y_train,abc_y_train))

[[57446  1965]
 [ 2424 58440]]


### For testing

In [35]:
print(confusion_matrix(y_test,abc_y_test))

[[19106   689]
 [  778 19519]]


## Classification report

### For training

In [36]:
print(classification_report(y_train,abc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.96      0.97      0.96     59411
    Abnormal       0.97      0.96      0.96     60864

    accuracy                           0.96    120275
   macro avg       0.96      0.96      0.96    120275
weighted avg       0.96      0.96      0.96    120275



### For testing

In [37]:
print(classification_report(y_test,abc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.96      0.97      0.96     19795
    Abnormal       0.97      0.96      0.96     20297

    accuracy                           0.96     40092
   macro avg       0.96      0.96      0.96     40092
weighted avg       0.96      0.96      0.96     40092



### Score

In [38]:
abc_precision_train = precision_score(y_train,abc_y_train)
abc_precision_test = precision_score(y_test,abc_y_test)
abc_recall_train = recall_score(y_train,abc_y_train)
abc_recall_test = recall_score(y_test,abc_y_test)
abc_f1_train = f1_score(y_train,abc_y_train)
abc_f1_test = f1_score(y_test,abc_y_test)
print("\nTraining scores\n")
print("Precision score:",abc_precision_train)
print("Recall score:",abc_recall_train)
print("F1 score:",abc_f1_train)
print("\nTesting scores\n")
print("Precision score:",abc_precision_test)
print("Recall score:",abc_recall_test)
print("F1 score:",abc_f1_test)


Training scores

Precision score: 0.9674695803327539
Recall score: 0.9601735015772871
F1 score: 0.963807733221186

Testing scores

Precision score: 0.9659045922406968
Recall score: 0.9616692121988472
F1 score: 0.9637822491050487


## K fold Cross Validation

In [39]:
from sklearn.model_selection import KFold

In [40]:

abc_training_scores = []
abc_testing_scores = []
best_abc = AdaBoostClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_abc.fit(X_train, Y_train)
    abc_training_scores.append(best_abc.score(X_train, Y_train))
    abc_testing_scores.append(best_abc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]
Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


In [41]:
abc_training_scores

[0.9608396405103942,
 0.9579088492747071,
 0.9589302695371569,
 0.9621494380095718,
 0.9822049355386846]

In [42]:
abc_testing_scores

[0.9742782315894494,
 0.9701627486437613,
 0.9720949084900071,
 0.9715960465188788,
 0.7879213045240545]

In [43]:
abc_avg_train = mean(abc_training_scores)
abc_avg_test = mean(abc_testing_scores)
abc_max_train = max(abc_training_scores)
abc_max_test = max(abc_testing_scores)
print("Average Accuracy for training data is : ",abc_avg_train)
print("Max Accuracy for training data is : ",abc_max_train)
print("Average Accuracy for testing data is : ",abc_avg_test)
print("Max Accuracy for testing data is : ",abc_max_test)

Average Accuracy for training data is :  0.9644066265741029
Max Accuracy for training data is :  0.9822049355386846
Average Accuracy for testing data is :  0.9352106479532302
Max Accuracy for testing data is :  0.9742782315894494


## XGBoost

In [44]:
from xgboost import XGBClassifier
xgbc = XGBClassifier()

In [45]:
xgbc.fit(x_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Training accuracy

In [126]:
xgbc_train_score = xgbc.score(x_train,y_train)
xgbc_train_score

0.9988276865516524

### Testing accuracy

In [127]:
xgbc_test_score = xgbc.score(x_test,y_test)
xgbc_test_score

0.9971565399580964

### Predicted Y values

In [48]:
xgbc_y_train = xgbc.predict(x_train)
xgbc_y_test = xgbc.predict(x_test)

## Confusion matrix

### For training

In [49]:
print(confusion_matrix(y_train,xgbc_y_train))

[[59344    67]
 [   74 60790]]


### For testing

In [50]:
print(confusion_matrix(y_test,xgbc_y_test))

[[19742    53]
 [   61 20236]]


### Classification Report

### For training

In [51]:
print(classification_report(y_train,xgbc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     59411
    Abnormal       1.00      1.00      1.00     60864

    accuracy                           1.00    120275
   macro avg       1.00      1.00      1.00    120275
weighted avg       1.00      1.00      1.00    120275



### For testing

In [52]:
print(classification_report(y_test,xgbc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     19795
    Abnormal       1.00      1.00      1.00     20297

    accuracy                           1.00     40092
   macro avg       1.00      1.00      1.00     40092
weighted avg       1.00      1.00      1.00     40092



### Score

In [53]:
xgbc_precision_train = precision_score(y_train,xgbc_y_train)
xgbc_precision_test = precision_score(y_test,xgbc_y_test)
xgbc_recall_train = recall_score(y_train,xgbc_y_train)
xgbc_recall_test = recall_score(y_test,xgbc_y_test)
xgbc_f1_train = f1_score(y_train,xgbc_y_train)
xgbc_f1_test = f1_score(y_test,xgbc_y_test)
print("\nTraining scores\n")
print("Precision score:",xgbc_precision_train)
print("Recall score:",xgbc_recall_train)
print("F1 score:",xgbc_f1_train)
print("\nTesting scores\n")
print("Precision score:",xgbc_precision_test)
print("Recall score:",xgbc_recall_test)
print("F1 score:",xgbc_f1_test)


Training scores

Precision score: 0.998899058448494
Recall score: 0.998784174553102
F1 score: 0.998841613197394

Testing scores

Precision score: 0.9973877470550545
Recall score: 0.9969946297482386
F1 score: 0.9971911496575173


### K Fold Cross Validation

In [54]:

xgbc_training_scores = []
xgbc_testing_scores = []
best_xgbc = XGBClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_xgbc.fit(X_train, Y_train)
    xgbc_training_scores.append(best_xgbc.score(X_train, Y_train))
    xgbc_testing_scores.append(best_xgbc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]




Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]






In [55]:
xgbc_training_scores

[0.9984956310944478,
 0.9985190150670731,
 0.9984176968525418,
 0.9986749185464636,
 0.9998674918546464]

In [56]:
xgbc_testing_scores

[0.9986905281536447,
 0.9989087734613706,
 0.9988775605649611,
 0.9988775605649611,
 0.9646119789230817]

In [57]:
xgbc_avg_train = mean(xgbc_training_scores)
xgbc_avg_test = mean(xgbc_testing_scores)
xgbc_max_train = max(xgbc_training_scores)
xgbc_max_test = max(xgbc_testing_scores)
print("Average Accuracy for training data is : ",xgbc_avg_train)
print("Max Accuracy for training data is : ",xgbc_max_train)
print("Average Accuracy for testing data is : ",xgbc_avg_test)
print("Max Accuracy for testing data is : ",xgbc_max_test)

Average Accuracy for training data is :  0.9987949506830346
Max Accuracy for training data is :  0.9998674918546464
Average Accuracy for testing data is :  0.9919932803336039
Max Accuracy for testing data is :  0.9989087734613706


## Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [59]:
rfc.fit(x_train,y_train)

RandomForestClassifier()

### Training accuracy

In [128]:
rfc_train_score = rfc.score(x_train,y_train)
rfc_train_score

0.9994928289336936

### Testing accuracy

In [129]:
rfc_test_score=rfc.score(x_test,y_test)
rfc_test_score

0.997281253117829

### Predicted y values

In [62]:
rfc_y_train = rfc.predict(x_train)
rfc_y_test = rfc.predict(x_test)

## Confusion matrix

### For training

In [63]:
print(confusion_matrix(y_train,rfc_y_train))

[[59384    27]
 [   34 60830]]


### For testing

In [64]:
print(confusion_matrix(y_test,rfc_y_test))

[[19746    49]
 [   60 20237]]


## Classification Report

### For training

In [65]:
print(classification_report(y_train,rfc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     59411
    Abnormal       1.00      1.00      1.00     60864

    accuracy                           1.00    120275
   macro avg       1.00      1.00      1.00    120275
weighted avg       1.00      1.00      1.00    120275



### For testing

In [66]:
print(classification_report(y_test,rfc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     19795
    Abnormal       1.00      1.00      1.00     20297

    accuracy                           1.00     40092
   macro avg       1.00      1.00      1.00     40092
weighted avg       1.00      1.00      1.00     40092



### Score

In [67]:
rfc_precision_train = precision_score(y_train,rfc_y_train)
rfc_precision_test = precision_score(y_test,rfc_y_test)
rfc_recall_train = recall_score(y_train,rfc_y_train)
rfc_recall_test = recall_score(y_test,rfc_y_test)
rfc_f1_train = f1_score(y_train,rfc_y_train)
rfc_f1_test = f1_score(y_test,rfc_y_test)
print("\nTraining scores\n")
print("Precision score:",rfc_precision_train)
print("Recall score:",rfc_recall_train)
print("F1 score:",rfc_f1_train)
print("\nTesting scores\n")
print("Precision score:",rfc_precision_test)
print("Recall score:",rfc_recall_test)
print("F1 score:",rfc_f1_test)


Training scores

Precision score: 0.9995563369867065
Recall score: 0.9994413774973712
F1 score: 0.9994988539364612

Testing scores

Precision score: 0.9975845410628019
Recall score: 0.9970438981130216
F1 score: 0.9973141463174235


### K Fold Cross Validation

In [68]:
rfc_training_scores = []
rfc_testing_scores = []
best_rfc = RandomForestClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_rfc.fit(X_train, Y_train)
    rfc_training_scores.append(best_rfc.score(X_train, Y_train))
    rfc_testing_scores.append(best_rfc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]
Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


In [69]:
rfc_training_scores

[0.9991425876704108,
 0.999134793012869,
 0.9991425943535941,
 0.9992283349182347,
 0.9999454378225014]

In [70]:
rfc_testing_scores

[0.9988464176591632,
 0.9990023071646816,
 0.9989399183113522,
 0.9986593084525925,
 0.962180026813831]

In [71]:
rfc_avg_train = mean(rfc_training_scores)
rfc_avg_test = mean(rfc_testing_scores)
rfc_max_train = max(rfc_training_scores)
rfc_max_test = max(rfc_testing_scores)
print("Average Accuracy for training data is : ",rfc_avg_train)
print("Max Accuracy for training data is : ",rfc_max_train)
print("Average Accuracy for testing data is : ",rfc_avg_test)
print("Max Accuracy for testing data is : ",rfc_max_test)

Average Accuracy for training data is :  0.999318749555522
Max Accuracy for training data is :  0.9999454378225014
Average Accuracy for testing data is :  0.9915255956803241
Max Accuracy for testing data is :  0.9990023071646816


## Bernoulli Naive bayes

In [72]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

In [73]:
bnb.fit(x_train,y_train)

BernoulliNB()

### Training accuracy

In [131]:
bnb_train_score=bnb.score(x_train,y_train)
bnb_train_score

0.8554811889420079

### Testing accuracy

In [130]:
bnb_test_score=bnb.score(x_test,y_test)
bnb_test_score

0.8549585952309687

### Predicting y values

In [76]:
bnb_y_train = bnb.predict(x_train)
bnb_y_test = bnb.predict(x_test)

## Confusion Matrix

### For training

In [77]:
print(confusion_matrix(y_train,bnb_y_train))

[[54831  4580]
 [12802 48062]]


### For testing

In [78]:
print(confusion_matrix(y_test,bnb_y_test))

[[18266  1529]
 [ 4286 16011]]


## Classification Report

### For training

In [79]:
print(classification_report(y_train,bnb_y_train, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.81      0.92      0.86     59411
    Abnormal       0.91      0.79      0.85     60864

    accuracy                           0.86    120275
   macro avg       0.86      0.86      0.86    120275
weighted avg       0.86      0.86      0.85    120275



### For testing

In [80]:
print(classification_report(y_test,bnb_y_test, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.81      0.92      0.86     19795
    Abnormal       0.91      0.79      0.85     20297

    accuracy                           0.85     40092
   macro avg       0.86      0.86      0.85     40092
weighted avg       0.86      0.85      0.85     40092



### Scores

In [81]:
bnb_precision_train = precision_score(y_train,bnb_y_train)
bnb_precision_test = precision_score(y_test,bnb_y_test)
bnb_recall_train = recall_score(y_train,bnb_y_train)
bnb_recall_test = recall_score(y_test,bnb_y_test)
bnb_f1_train = f1_score(y_train,bnb_y_train)
bnb_f1_test = f1_score(y_test,bnb_y_test)
print("\nTraining scores\n")
print("Precision score:",bnb_precision_train)
print("Recall score:",bnb_recall_train)
print("F1 score:",bnb_f1_train)
print("\nTesting scores\n")
print("Precision score:",bnb_precision_test)
print("Recall score:",bnb_recall_test)
print("F1 score:",bnb_f1_test)


Training scores

Precision score: 0.9129972265491433
Recall score: 0.7896621976866456
F1 score: 0.8468627209134318

Testing scores

Precision score: 0.9128278221208665
Recall score: 0.7888357885401783
F1 score: 0.8463144541057694


## K Fold Cross Validation

In [82]:
bnb_training_scores = []
bnb_testing_scores = []
best_bnb = BernoulliNB()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_bnb.fit(X_train, Y_train)
    bnb_training_scores.append(best_bnb.score(X_train, Y_train))
    bnb_testing_scores.append(best_bnb.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]
Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


In [83]:
bnb_training_scores

[0.847762543552649,
 0.8481444817721934,
 0.8483639141347218,
 0.849042044055061,
 0.8925514833117683]

In [84]:
bnb_testing_scores

[0.8849535449273555,
 0.8832387603666522,
 0.8841393072054375,
 0.8795871917188913,
 0.6912667976179341]

In [85]:
bnb_avg_train = mean(bnb_training_scores)
bnb_avg_test = mean(bnb_testing_scores)
bnb_max_train = max(bnb_training_scores)
bnb_max_test = max(bnb_testing_scores)
print("Average Accuracy for training data is : ",bnb_avg_train)
print("Max Accuracy for training data is : ",bnb_max_train)
print("Average Accuracy for testing data is : ",bnb_avg_test)
print("Max Accuracy for testing data is : ",bnb_max_test)

Average Accuracy for training data is :  0.8571728933652787
Max Accuracy for training data is :  0.8925514833117683
Average Accuracy for testing data is :  0.8446371203672541
Max Accuracy for testing data is :  0.8849535449273555


## Support Vector Machine

In [90]:
from sklearn.svm import LinearSVC
svc = LinearSVC()

In [91]:
svc.fit(x_train,y_train)



LinearSVC()

### Training Accuracy

In [132]:
svc_train_score=svc.score(x_train,y_train)
svc_train_score

0.7397879858657244

### Testing Accurcy

In [133]:
svc_test_score=svc.score(x_test,y_test)
svc_test_score

0.7382520203531877

### Predicting y values

In [94]:
svc_y_train = svc.predict(x_train)
svc_y_test = svc.predict(x_test)

### Confusion Matrix

### For training 

In [95]:
print(confusion_matrix(y_train,svc_y_train))

[[42791 16620]
 [14677 46187]]


### For testing

In [96]:
print(confusion_matrix(y_test,svc_y_test))

[[14257  5538]
 [ 4956 15341]]


### Classification Report

### For training

In [97]:
print(classification_report(y_train,svc_y_train, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.74      0.72      0.73     59411
    Abnormal       0.74      0.76      0.75     60864

    accuracy                           0.74    120275
   macro avg       0.74      0.74      0.74    120275
weighted avg       0.74      0.74      0.74    120275



### For testing

In [98]:
print(classification_report(y_test,svc_y_test, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.74      0.72      0.73     19795
    Abnormal       0.73      0.76      0.75     20297

    accuracy                           0.74     40092
   macro avg       0.74      0.74      0.74     40092
weighted avg       0.74      0.74      0.74     40092



### Score

In [99]:
svc_precision_train = precision_score(y_train,svc_y_train)
svc_precision_test = precision_score(y_test,svc_y_test)
svc_recall_train = recall_score(y_train,svc_y_train)
svc_recall_test = recall_score(y_test,svc_y_test)
svc_f1_train = f1_score(y_train,svc_y_train)
svc_f1_test = f1_score(y_test,svc_y_test)
print("\nTraining scores\n")
print("Precision score:",svc_precision_train)
print("Recall score:",svc_recall_train)
print("F1 score:",svc_f1_train)
print("\nTesting scores\n")
print("Precision score:",svc_precision_test)
print("Recall score:",svc_recall_test)
print("F1 score:",svc_f1_test)


Training scores

Precision score: 0.7353798143519035
Recall score: 0.7588558096740273
F1 score: 0.746933395864835

Testing scores

Precision score: 0.7347574117534365
Recall score: 0.7558259841355865
F1 score: 0.7451428016320187


### K Fold Cross Validation

In [103]:
svc_training_scores = []
svc_testing_scores = []
best_svc = LinearSVC()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_svc.fit(X_train, Y_train)
    svc_training_scores.append(best_svc.score(X_train, Y_train))
    svc_testing_scores.append(best_svc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]




Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]




In [106]:
svc_training_scores

[0.8384713117629177,
 0.7457382709890641,
 0.8674217032752896,
 0.8252139616817622,
 0.9369495066020235]

In [107]:
svc_testing_scores

[0.8356301053813058,
 0.7501714784560704,
 0.9240482648957067,
 0.8848564212889346,
 0.6869952919901475]

In [108]:
svc_avg_train = mean(svc_training_scores)
svc_avg_test = mean(svc_testing_scores)
svc_max_train = max(svc_training_scores)
svc_max_test = max(svc_testing_scores)

print("Average Accuracy for training data is : ",svc_avg_train)
print("Max Accuracy for training data is : ",svc_max_train)
print("Average Accuracy for testing data is : ",svc_avg_test)
print("Max Accuracy for testing data is : ",svc_max_test)

Average Accuracy for training data is :  0.8427589508622114
Max Accuracy for training data is :  0.9369495066020235
Average Accuracy for testing data is :  0.816340312402433
Max Accuracy for testing data is :  0.9240482648957067


## Logistic Regression

In [109]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Training accuracy

In [110]:
lr_train_score = lr.score(x_train,y_train)
lr_train_score

0.8130118478486801

### Testing accuracy

In [112]:
lr_test_score = lr.score(x_test,y_test)
lr_test_score

0.8148757856929063

### Predicting y values

In [113]:
lr_y_train = lr.predict(x_train)
lr_y_test = lr.predict(x_test)

## Confusion Matrix

### For training 

In [114]:
print(confusion_matrix(y_train,lr_y_train))

[[49669  9742]
 [12748 48116]]


### For testing 

In [115]:
print(confusion_matrix(y_test,lr_y_test))

[[16504  3291]
 [ 4131 16166]]


## Classification report

### For training

In [116]:
print(classification_report(y_train,lr_y_train,target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.80      0.84      0.82     59411
    Abnormal       0.83      0.79      0.81     60864

    accuracy                           0.81    120275
   macro avg       0.81      0.81      0.81    120275
weighted avg       0.81      0.81      0.81    120275



### For testing

In [117]:
print(classification_report(y_test,lr_y_test,target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.80      0.83      0.82     19795
    Abnormal       0.83      0.80      0.81     20297

    accuracy                           0.81     40092
   macro avg       0.82      0.82      0.81     40092
weighted avg       0.82      0.81      0.81     40092



### Score

In [118]:
lr_precision_train = precision_score(y_train,lr_y_train)
lr_precision_test = precision_score(y_test,lr_y_test)
lr_recall_train = recall_score(y_train,lr_y_train)
lr_recall_test = recall_score(y_test,lr_y_test)
lr_f1_train = f1_score(y_train,lr_y_train)
lr_f1_test = f1_score(y_test,lr_y_test)
print("\nTraining scores\n")
print("Precision score:",lr_precision_train)
print("Recall score:",lr_recall_train)
print("F1 score:",lr_f1_train)
print("\nTesting scores\n")
print("Precision score:",lr_precision_test)
print("Recall score:",lr_recall_test)
print("F1 score:",lr_f1_test)


Training scores

Precision score: 0.8316222475716409
Recall score: 0.7905494216614091
F1 score: 0.8105658597395596

Testing scores

Precision score: 0.8308577889705504
Recall score: 0.7964723850815392
F1 score: 0.8133018061075614


### K Fold Cross Validation

In [119]:
lr_training_scores = []
lr_testing_scores = []
best_lr = LogisticRegression()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_lr.fit(X_train, Y_train)
    lr_training_scores.append(best_lr.score(X_train, Y_train))
    lr_testing_scores.append(best_lr.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [120]:
lr_training_scores

[0.8021715915911234,
 0.7975415650113412,
 0.8021809281805852,
 0.8038567664894695,
 0.8736729698972672]

In [121]:
lr_testing_scores

[0.8562075201097462,
 0.8511255222298435,
 0.8557977114707075,
 0.8493748635924298,
 0.6103264428023571]

In [122]:
lr_avg_train = mean(lr_training_scores)
lr_avg_test = mean(lr_testing_scores)
lr_max_train = max(lr_training_scores)
lr_max_test = max(lr_testing_scores)

print("Average Accuracy for training data is : ",lr_avg_train)
print("Max Accuracy for training data is : ",lr_max_train)
print("Average Accuracy for testing data is : ",lr_avg_test)
print("Max Accuracy for testing data is : ",lr_max_test)

Average Accuracy for training data is :  0.8158847642339573
Max Accuracy for training data is :  0.8736729698972672
Average Accuracy for testing data is :  0.8045664120410169
Max Accuracy for testing data is :  0.8562075201097462


## Decision Tree Classifier

In [134]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)

DecisionTreeClassifier()

### Training accuracy

In [135]:
dtc_train_score = dtc.score(x_train,y_train)
dtc_train_score

0.9994928289336936

### Testing accuracy

In [136]:
dtc_test_score = dtc.score(x_test,y_test)
dtc_test_score

0.996233662576075

### Predicting y values

In [138]:
dtc_y_train = dtc.predict(x_train)
dtc_y_test = dtc.predict(x_test)

## Confusion Matrix

### for training

In [139]:
print(confusion_matrix(dtc_y_train,y_train))

[[59411    61]
 [    0 60803]]


### for testing

In [140]:
print(confusion_matrix(dtc_y_test,y_test))

[[19719    75]
 [   76 20222]]


### Classification Report

### For training

In [142]:
print(classification_report(dtc_y_train,y_train, target_names=target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     59472
    Abnormal       1.00      1.00      1.00     60803

    accuracy                           1.00    120275
   macro avg       1.00      1.00      1.00    120275
weighted avg       1.00      1.00      1.00    120275



### For testing

In [143]:
print(classification_report(dtc_y_test,y_test, target_names=target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     19794
    Abnormal       1.00      1.00      1.00     20298

    accuracy                           1.00     40092
   macro avg       1.00      1.00      1.00     40092
weighted avg       1.00      1.00      1.00     40092



### Score

In [144]:
dtc_precision_train = precision_score(y_train,dtc_y_train)
dtc_precision_test = precision_score(y_test,dtc_y_test)
dtc_recall_train = recall_score(y_train,dtc_y_train)
dtc_recall_test = recall_score(y_test,dtc_y_test)
dtc_f1_train = f1_score(y_train,dtc_y_train)
dtc_f1_test = f1_score(y_test,dtc_y_test)
print("\nTraining scores\n")
print("Precision score:",dtc_precision_train)
print("Recall score:",dtc_recall_train)
print("F1 score:",dtc_f1_train)
print("\nTesting scores\n")
print("Precision score:",dtc_precision_test)
print("Recall score:",dtc_recall_test)
print("F1 score:",dtc_f1_test)


Training scores

Precision score: 1.0
Recall score: 0.9989977655099895
F1 score: 0.9994986315105986

Testing scores

Precision score: 0.9962557887476599
Recall score: 0.996304872641277
F1 score: 0.9962803300899126


### K Fold Cross Validation

In [145]:
dtc_training_scores = []
dtc_testing_scores = []
best_dtc = DecisionTreeClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_dtc.fit(X_train, Y_train)
    dtc_training_scores.append(best_dtc.score(X_train, Y_train))
    dtc_testing_scores.append(best_dtc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]
Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


In [146]:
dtc_training_scores

[0.9991425876704108,
 0.999134793012869,
 0.9991425943535941,
 0.9992283349182347,
 0.9999454378225014]

In [147]:
dtc_testing_scores

[0.9975057679117042,
 0.9975993016150153,
 0.9976615845103358,
 0.9979110154959,
 0.9530446169675428]

In [148]:
dtc_avg_train = mean(dtc_training_scores)
dtc_avg_test = mean(dtc_testing_scores)
dtc_max_train = max(dtc_training_scores)
dtc_max_test = max(dtc_testing_scores)

print("Average Accuracy for training data is : ",dtc_avg_train)
print("Max Accuracy for training data is : ",dtc_max_train)
print("Average Accuracy for testing data is : ",dtc_avg_test)
print("Max Accuracy for testing data is : ",dtc_max_test)

Average Accuracy for training data is :  0.999318749555522
Max Accuracy for training data is :  0.9999454378225014
Average Accuracy for testing data is :  0.9887444573000996
Max Accuracy for testing data is :  0.9979110154959
