## Importing Libraries 

In [25]:
import pandas as pd
import numpy as np
from statistics import mean
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score,recall_score,f1_score

### Loading Datasets

In [26]:
data = pd.read_csv('ids_multi.csv')
data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag,label
0,0.0,491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.17,0.0,0.0,0.0,0.05,0.0,1.0,20.0,9.0,0.0
1,0.0,146.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.88,0.0,0.0,0.0,0.0,0.0,2.0,44.0,9.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,49.0,5.0,2.0
3,0.0,232.0,8153.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.03,0.04,0.03,0.01,0.0,0.01,1.0,24.0,9.0,0.0
4,0.0,199.0,420.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,24.0,9.0,0.0


In [27]:
x = data.iloc[:,0:-1].values
y = data.iloc[:,-1].values
x.shape,y.shape

((160367, 41), (160367,))

### splitting the dataset into train test split

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train,x_test,y_train,y_test = train_test_split(x,y)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((120275, 41), (40092, 41), (120275,), (40092,))

In [30]:
target_names = ['Normal','Probe','DOS','u2r','r2l']

# Applying Machine Learning Algorithms

## ADABoost

In [31]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()

In [32]:
abc.fit(x_train,y_train)

AdaBoostClassifier()

### Training accuracy

In [33]:
abc_train_score = abc.score(x_train,y_train)
abc_train_score

0.8599293286219081

### Testing accuracy

In [34]:
abc_test_score = abc.score(x_test,y_test)
abc_test_score

0.8568542352589045

### Predicted y values for train and test datas

In [35]:
abc_y_train = abc.predict(x_train)

In [36]:
abc_y_test = abc.predict(x_test)


## Confusion matrix

### For training

In [37]:
print(confusion_matrix(y_train,abc_y_train))

[[56103  1149  1021   908   101]
 [  892 10835   658     0     2]
 [ 6653   942 35648   121     0]
 [   52     1     1    81     7]
 [ 4032    42     1   264   761]]


### For testing

In [38]:
print(confusion_matrix(y_test,abc_y_test))

[[18868   382   335   305    34]
 [  329  3512   248     0     3]
 [ 2287   340 11685    51     0]
 [   20     1     1    22     0]
 [ 1309    10     2    82   266]]


## Classification report

### For training

In [39]:
print(classification_report(y_train,abc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.83      0.95      0.88     59282
       Probe       0.84      0.87      0.85     12387
         DOS       0.95      0.82      0.88     43364
         u2r       0.06      0.57      0.11       142
         r2l       0.87      0.15      0.25      5100

    accuracy                           0.86    120275
   macro avg       0.71      0.67      0.60    120275
weighted avg       0.88      0.86      0.85    120275



### For testing

In [40]:
print(classification_report(y_test,abc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       0.83      0.95      0.88     19924
       Probe       0.83      0.86      0.84      4092
         DOS       0.95      0.81      0.88     14363
         u2r       0.05      0.50      0.09        44
         r2l       0.88      0.16      0.27      1669

    accuracy                           0.86     40092
   macro avg       0.71      0.66      0.59     40092
weighted avg       0.87      0.86      0.85     40092



### Score

In [41]:
abc_precision_train = precision_score(y_train,abc_y_train,average='micro')
abc_precision_test = precision_score(y_test,abc_y_test,average='micro')
abc_recall_train = recall_score(y_train,abc_y_train,average='micro')
abc_recall_test = recall_score(y_test,abc_y_test,average='micro')
abc_f1_train = f1_score(y_train,abc_y_train,average='micro')
abc_f1_test = f1_score(y_test,abc_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",abc_precision_train)
print("Recall score:",abc_recall_train)
print("F1 score:",abc_f1_train)
print("\nTesting scores\n")
print("Precision score:",abc_precision_test)
print("Recall score:",abc_recall_test)
print("F1 score:",abc_f1_test)


Training scores

Precision score: 0.8599293286219081
Recall score: 0.8599293286219081
F1 score: 0.8599293286219082

Testing scores

Precision score: 0.8568542352589045
Recall score: 0.8568542352589045
F1 score: 0.8568542352589045


## K fold Cross Validation

In [42]:
from sklearn.model_selection import KFold

In [43]:

abc_training_scores = []
abc_testing_scores = []
best_abc = AdaBoostClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_abc.fit(X_train, Y_train)
    abc_training_scores.append(best_abc.score(X_train, Y_train))
    abc_testing_scores.append(best_abc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [64148 64149 64150 ... 96218 96219 96220]
Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [ 96221  96222  96223 ... 128291 128292 128293]
Train Index:  [     0      1      2 ... 128291 128292 128293] 

Test Index:  [128294 128295 128296 ... 160364 160365 160366]


In [44]:
abc_training_scores

[0.8260856009291232,
 0.8915373403069536,
 0.8793786147442593,
 0.8737197374779803,
 0.9152883221350959]

In [45]:
abc_testing_scores

[0.8487560017459624,
 0.9219928914385483,
 0.9138839522339662,
 0.8972656128207527,
 0.7217597356031553]

In [46]:
abc_avg_train = mean(abc_training_scores)
abc_avg_test = mean(abc_testing_scores)
abc_max_train = max(abc_training_scores)
abc_max_test = max(abc_testing_scores)
print("Average Accuracy for training data is : ",abc_avg_train)
print("Max Accuracy for training data is : ",abc_max_train)
print("Average Accuracy for testing data is : ",abc_avg_test)
print("Max Accuracy for testing data is : ",abc_max_test)

Average Accuracy for training data is :  0.8772019231186825
Max Accuracy for training data is :  0.9152883221350959
Average Accuracy for testing data is :  0.860731638768477
Max Accuracy for testing data is :  0.9219928914385483


## XGBoost

In [47]:
from xgboost import XGBClassifier
xgbc = XGBClassifier()

In [48]:
xgbc.fit(x_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

### Training accuracy

In [49]:
xgbc_train_score = xgbc.score(x_train,y_train)
xgbc_train_score

0.9993348576179588

### Testing accuracy

In [50]:
xgbc_test_score = xgbc.score(x_test,y_test)
xgbc_test_score

0.9972563104858825

### Predicted Y values

In [51]:
xgbc_y_train = xgbc.predict(x_train)
xgbc_y_test = xgbc.predict(x_test)

## Confusion matrix

### For training

In [52]:
print(confusion_matrix(y_train,xgbc_y_train))

[[59245     1     4     0    32]
 [    0 12387     0     0     0]
 [    0     0 43364     0     0]
 [    1     0     0   141     0]
 [   42     0     0     0  5058]]


### For testing

In [53]:
print(confusion_matrix(y_test,xgbc_y_test))

[[19885     7     6     2    24]
 [    9  4082     0     0     1]
 [    4     0 14359     0     0]
 [    3     0     0    39     2]
 [   51     0     0     1  1617]]


### Classification Report

### For training

In [54]:
print(classification_report(y_train,xgbc_y_train, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     59282
       Probe       1.00      1.00      1.00     12387
         DOS       1.00      1.00      1.00     43364
         u2r       1.00      0.99      1.00       142
         r2l       0.99      0.99      0.99      5100

    accuracy                           1.00    120275
   macro avg       1.00      1.00      1.00    120275
weighted avg       1.00      1.00      1.00    120275



### For testing

In [55]:
print(classification_report(y_test,xgbc_y_test, target_names = target_names))

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     19924
       Probe       1.00      1.00      1.00      4092
         DOS       1.00      1.00      1.00     14363
         u2r       0.93      0.89      0.91        44
         r2l       0.98      0.97      0.98      1669

    accuracy                           1.00     40092
   macro avg       0.98      0.97      0.98     40092
weighted avg       1.00      1.00      1.00     40092



### Score

In [56]:
xgbc_precision_train = precision_score(y_train,xgbc_y_train,average='micro')
xgbc_precision_test = precision_score(y_test,xgbc_y_test,average='micro')
xgbc_recall_train = recall_score(y_train,xgbc_y_train,average='micro')
xgbc_recall_test = recall_score(y_test,xgbc_y_test,average='micro')
xgbc_f1_train = f1_score(y_train,xgbc_y_train,average='micro')
xgbc_f1_test = f1_score(y_test,xgbc_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",xgbc_precision_train)
print("Recall score:",xgbc_recall_train)
print("F1 score:",xgbc_f1_train)
print("\nTesting scores\n")
print("Precision score:",xgbc_precision_test)
print("Recall score:",xgbc_recall_test)
print("F1 score:",xgbc_f1_test)


Training scores

Precision score: 0.9993348576179588
Recall score: 0.9993348576179588
F1 score: 0.9993348576179588

Testing scores

Precision score: 0.9972563104858825
Recall score: 0.9972563104858825
F1 score: 0.9972563104858825


### K Fold Cross Validation

In [None]:

xgbc_training_scores = []
xgbc_testing_scores = []
best_xgbc = XGBClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_xgbc.fit(X_train, Y_train)
    xgbc_training_scores.append(best_xgbc.score(X_train, Y_train))
    xgbc_testing_scores.append(best_xgbc.score(X_test, Y_test))


Train Index:  [ 32074  32075  32076 ... 160364 160365 160366] 

Test Index:  [    0     1     2 ... 32071 32072 32073]




Train Index:  [     0      1      2 ... 160364 160365 160366] 

Test Index:  [32074 32075 32076 ... 64145 64146 64147]






In [None]:
xgbc_training_scores

In [None]:
xgbc_testing_scores

In [None]:
xgbc_avg_train = mean(xgbc_training_scores)
xgbc_avg_test = mean(xgbc_testing_scores)
xgbc_max_train = max(xgbc_training_scores)
xgbc_max_test = max(xgbc_testing_scores)
print("Average Accuracy for training data is : ",xgbc_avg_train)
print("Max Accuracy for training data is : ",xgbc_max_train)
print("Average Accuracy for testing data is : ",xgbc_avg_test)
print("Max Accuracy for testing data is : ",xgbc_max_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [None]:
rfc.fit(x_train,y_train)

### Training accuracy

In [None]:
rfc_train_score = rfc.score(x_train,y_train)
rfc_train_score

### Testing accuracy

In [None]:
rfc_test_score=rfc.score(x_test,y_test)
rfc_test_score

### Predicted y values

In [None]:
rfc_y_train = rfc.predict(x_train)
rfc_y_test = rfc.predict(x_test)

## Confusion matrix

### For training

In [None]:
print(confusion_matrix(y_train,rfc_y_train))

### For testing

In [None]:
print(confusion_matrix(y_test,rfc_y_test))

## Classification Report

### For training

In [None]:
print(classification_report(y_train,rfc_y_train, target_names = target_names))

### For testing

In [None]:
print(classification_report(y_test,rfc_y_test, target_names = target_names))

### Score

In [None]:
rfc_precision_train = precision_score(y_train,rfc_y_train,average='micro')
rfc_precision_test = precision_score(y_test,rfc_y_test,average='micro')
rfc_recall_train = recall_score(y_train,rfc_y_train,average='micro')
rfc_recall_test = recall_score(y_test,rfc_y_test,average='micro')
rfc_f1_train = f1_score(y_train,rfc_y_train,average='micro')
rfc_f1_test = f1_score(y_test,rfc_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",rfc_precision_train)
print("Recall score:",rfc_recall_train)
print("F1 score:",rfc_f1_train)
print("\nTesting scores\n")
print("Precision score:",rfc_precision_test)
print("Recall score:",rfc_recall_test)
print("F1 score:",rfc_f1_test)

### K Fold Cross Validation

In [None]:
rfc_training_scores = []
rfc_testing_scores = []
best_rfc = RandomForestClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_rfc.fit(X_train, Y_train)
    rfc_training_scores.append(best_rfc.score(X_train, Y_train))
    rfc_testing_scores.append(best_rfc.score(X_test, Y_test))


In [None]:
rfc_training_scores

In [None]:
rfc_testing_scores

In [None]:
rfc_avg_train = mean(rfc_training_scores)
rfc_avg_test = mean(rfc_testing_scores)
rfc_max_train = max(rfc_training_scores)
rfc_max_test = max(rfc_testing_scores)
print("Average Accuracy for training data is : ",rfc_avg_train)
print("Max Accuracy for training data is : ",rfc_max_train)
print("Average Accuracy for testing data is : ",rfc_avg_test)
print("Max Accuracy for testing data is : ",rfc_max_test)

## Bernoulli Naive bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

In [None]:
bnb.fit(x_train,y_train)

### Training accuracy

In [None]:
bnb_train_score=bnb.score(x_train,y_train)
bnb_train_score

### Testing accuracy

In [None]:
bnb_test_score=bnb.score(x_test,y_test)
bnb_test_score

### Predicting y values

In [None]:
bnb_y_train = bnb.predict(x_train)
bnb_y_test = bnb.predict(x_test)

## Confusion Matrix

### For training

In [None]:
print(confusion_matrix(y_train,bnb_y_train))

### For testing

In [None]:
print(confusion_matrix(y_test,bnb_y_test))

## Classification Report

### For training

In [None]:
print(classification_report(y_train,bnb_y_train, target_names=target_names))

### For testing

In [None]:
print(classification_report(y_test,bnb_y_test, target_names=target_names))

### Scores

In [None]:
bnb_precision_train = precision_score(y_train,bnb_y_train,average='micro')
bnb_precision_test = precision_score(y_test,bnb_y_test,average='micro')
bnb_recall_train = recall_score(y_train,bnb_y_train,average='micro')
bnb_recall_test = recall_score(y_test,bnb_y_test,average='micro')
bnb_f1_train = f1_score(y_train,bnb_y_train,average='micro')
bnb_f1_test = f1_score(y_test,bnb_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",bnb_precision_train)
print("Recall score:",bnb_recall_train)
print("F1 score:",bnb_f1_train)
print("\nTesting scores\n")
print("Precision score:",bnb_precision_test)
print("Recall score:",bnb_recall_test)
print("F1 score:",bnb_f1_test)

## K Fold Cross Validation

In [None]:
bnb_training_scores = []
bnb_testing_scores = []
best_bnb = BernoulliNB()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_bnb.fit(X_train, Y_train)
    bnb_training_scores.append(best_bnb.score(X_train, Y_train))
    bnb_testing_scores.append(best_bnb.score(X_test, Y_test))


In [None]:
bnb_training_scores

In [None]:
bnb_testing_scores

In [None]:
bnb_avg_train = mean(bnb_training_scores)
bnb_avg_test = mean(bnb_testing_scores)
bnb_max_train = max(bnb_training_scores)
bnb_max_test = max(bnb_testing_scores)
print("Average Accuracy for training data is : ",bnb_avg_train)
print("Max Accuracy for training data is : ",bnb_max_train)
print("Average Accuracy for testing data is : ",bnb_avg_test)
print("Max Accuracy for testing data is : ",bnb_max_test)

## Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC
svc = LinearSVC()

In [None]:
svc.fit(x_train,y_train)

### Training Accuracy

In [None]:
svc_train_score=svc.score(x_train,y_train)
svc_train_score

### Testing Accurcy

In [None]:
svc_test_score=svc.score(x_test,y_test)
svc_test_score

### Predicting y values

In [None]:
svc_y_train = svc.predict(x_train)
svc_y_test = svc.predict(x_test)

### Confusion Matrix

### For training 

In [None]:
print(confusion_matrix(y_train,svc_y_train))

### For testing

In [None]:
print(confusion_matrix(y_test,svc_y_test))

### Classification Report

### For training

In [None]:
print(classification_report(y_train,svc_y_train, target_names=target_names))

### For testing

In [None]:
print(classification_report(y_test,svc_y_test, target_names=target_names))

### Score

In [None]:
svc_precision_train = precision_score(y_train,svc_y_train,average='micro')
svc_precision_test = precision_score(y_test,svc_y_test,average='micro')
svc_recall_train = recall_score(y_train,svc_y_train,average='micro')
svc_recall_test = recall_score(y_test,svc_y_test,average='micro')
svc_f1_train = f1_score(y_train,svc_y_train,average='micro')
svc_f1_test = f1_score(y_test,svc_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",svc_precision_train)
print("Recall score:",svc_recall_train)
print("F1 score:",svc_f1_train)
print("\nTesting scores\n")
print("Precision score:",svc_precision_test)
print("Recall score:",svc_recall_test)
print("F1 score:",svc_f1_test)

### K Fold Cross Validation

In [None]:
svc_training_scores = []
svc_testing_scores = []
best_svc = LinearSVC()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_svc.fit(X_train, Y_train)
    svc_training_scores.append(best_svc.score(X_train, Y_train))
    svc_testing_scores.append(best_svc.score(X_test, Y_test))


In [None]:
svc_training_scores

In [None]:
svc_testing_scores

In [None]:
svc_avg_train = mean(svc_training_scores)
svc_avg_test = mean(svc_testing_scores)
svc_max_train = max(svc_training_scores)
svc_max_test = max(svc_testing_scores)

print("Average Accuracy for training data is : ",svc_avg_train)
print("Max Accuracy for training data is : ",svc_max_train)
print("Average Accuracy for testing data is : ",svc_avg_test)
print("Max Accuracy for testing data is : ",svc_max_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)

### Training accuracy

In [None]:
lr_train_score = lr.score(x_train,y_train)
lr_train_score

### Testing accuracy

In [None]:
lr_test_score = lr.score(x_test,y_test)
lr_test_score

### Predicting y values

In [None]:
lr_y_train = lr.predict(x_train)
lr_y_test = lr.predict(x_test)

## Confusion Matrix

### For training 

In [None]:
print(confusion_matrix(y_train,lr_y_train))

### For testing 

In [None]:
print(confusion_matrix(y_test,lr_y_test))

## Classification report

### For training

In [None]:
print(classification_report(y_train,lr_y_train,target_names = target_names))

### For testing

In [None]:
print(classification_report(y_test,lr_y_test,target_names = target_names))

### Score

In [None]:
lr_precision_train = precision_score(y_train,lr_y_train,average='micro')
lr_precision_test = precision_score(y_test,lr_y_test,average='micro')
lr_recall_train = recall_score(y_train,lr_y_train,average='micro')
lr_recall_test = recall_score(y_test,lr_y_test,average='micro')
lr_f1_train = f1_score(y_train,lr_y_train,average='micro')
lr_f1_test = f1_score(y_test,lr_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",lr_precision_train)
print("Recall score:",lr_recall_train)
print("F1 score:",lr_f1_train)
print("\nTesting scores\n")
print("Precision score:",lr_precision_test)
print("Recall score:",lr_recall_test)
print("F1 score:",lr_f1_test)

### K Fold Cross Validation

In [None]:
lr_training_scores = []
lr_testing_scores = []
best_lr = LogisticRegression()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_lr.fit(X_train, Y_train)
    lr_training_scores.append(best_lr.score(X_train, Y_train))
    lr_testing_scores.append(best_lr.score(X_test, Y_test))


In [None]:
lr_training_scores

In [None]:
lr_testing_scores

In [None]:
lr_avg_train = mean(lr_training_scores)
lr_avg_test = mean(lr_testing_scores)
lr_max_train = max(lr_training_scores)
lr_max_test = max(lr_testing_scores)

print("Average Accuracy for training data is : ",lr_avg_train)
print("Max Accuracy for training data is : ",lr_max_train)
print("Average Accuracy for testing data is : ",lr_avg_test)
print("Max Accuracy for testing data is : ",lr_max_test)

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)

### Training accuracy

In [None]:
dtc_train_score = dtc.score(x_train,y_train)
dtc_train_score

### Testing accuracy

In [None]:
dtc_test_score = dtc.score(x_test,y_test)
dtc_test_score

### Predicting y values

In [None]:
dtc_y_train = dtc.predict(x_train)
dtc_y_test = dtc.predict(x_test)

## Confusion Matrix

### for training

In [None]:
print(confusion_matrix(dtc_y_train,y_train))

### for testing

In [None]:
print(confusion_matrix(dtc_y_test,y_test))

### Classification Report

### For training

In [None]:
print(classification_report(dtc_y_train,y_train, target_names=target_names))

### For testing

In [None]:
print(classification_report(dtc_y_test,y_test, target_names=target_names))

### Score

In [None]:
dtc_precision_train = precision_score(y_train,dtc_y_train,average='micro')
dtc_precision_test = precision_score(y_test,dtc_y_test,average='micro')
dtc_recall_train = recall_score(y_train,dtc_y_train,average='micro')
dtc_recall_test = recall_score(y_test,dtc_y_test,average='micro')
dtc_f1_train = f1_score(y_train,dtc_y_train,average='micro')
dtc_f1_test = f1_score(y_test,dtc_y_test,average='micro')
print("\nTraining scores\n")
print("Precision score:",dtc_precision_train)
print("Recall score:",dtc_recall_train)
print("F1 score:",dtc_f1_train)
print("\nTesting scores\n")
print("Precision score:",dtc_precision_test)
print("Recall score:",dtc_recall_test)
print("F1 score:",dtc_f1_test)

### K Fold Cross Validation

In [None]:
dtc_training_scores = []
dtc_testing_scores = []
best_dtc = DecisionTreeClassifier()
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(x):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, Y_train, Y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    best_dtc.fit(X_train, Y_train)
    dtc_training_scores.append(best_dtc.score(X_train, Y_train))
    dtc_testing_scores.append(best_dtc.score(X_test, Y_test))


In [None]:
dtc_training_scores

In [None]:
dtc_testing_scores

In [None]:
dtc_avg_train = mean(dtc_training_scores)
dtc_avg_test = mean(dtc_testing_scores)
dtc_max_train = max(dtc_training_scores)
dtc_max_test = max(dtc_testing_scores)

print("Average Accuracy for training data is : ",dtc_avg_train)
print("Max Accuracy for training data is : ",dtc_max_train)
print("Average Accuracy for testing data is : ",dtc_avg_test)
print("Max Accuracy for testing data is : ",dtc_max_test)