## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering


## Import Train and test datasets

In [2]:
train = pd.read_csv(r'D:\Project Phase II\Dataset\train.csv',encoding='cp1252')
test = pd.read_csv(r'D:\Project Phase II\Dataset\test.csv',encoding='cp1252')

In [3]:
train

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.9,0,149.171.126.8,83,120,2,0.001046,146,178,31,...,0,1,3,3,1,1,1,1,-1,0
1,59.166.0.0,0,149.171.126.4,0,114,5,1.005964,1580,10168,31,...,0,8,8,7,4,4,4,4,-1,0
2,59.166.0.7,0,149.171.126.5,0,114,5,0.042514,5174,90266,31,...,0,7,7,8,9,1,1,1,-1,0
3,59.166.0.9,0,149.171.126.0,0,114,5,7.320488,13454,548216,31,...,0,3,6,1,4,1,1,1,-1,0
4,59.166.0.1,0,149.171.126.5,0,114,5,5.814120,16744,687116,31,...,0,5,10,3,5,2,1,3,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82457,175.45.176.1,0,149.171.126.10,128,114,5,0.238637,744,5570,62,...,0,1,1,1,1,1,1,1,3,1
82458,175.45.176.2,0,149.171.126.16,1093,114,5,0.662554,1758,766,254,...,0,1,2,3,1,1,1,1,4,1
82459,175.45.176.0,0,149.171.126.10,1300,120,6,0.000010,632,0,254,...,0,5,5,2,2,2,1,6,4,1
82460,175.45.176.1,0,149.171.126.12,0,132,6,0.000004,200,0,254,...,0,21,16,6,6,6,6,6,4,1


In [4]:
test

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.9,0,149.171.126.0,86853,114,5,0.010884,2438,19502,31,...,0,6,2,4,2,1,1,2,-1,0
1,59.166.0.1,0,149.171.126.0,37,114,5,0.910855,37342,3380,31,...,0,1,2,1,1,1,1,3,-1,0
2,59.166.0.5,0,149.171.126.7,0,114,5,0.071288,3390,44988,31,...,0,3,2,1,3,1,1,1,-1,0
3,59.166.0.7,0,149.171.126.9,0,114,5,0.146954,2438,20194,31,...,0,12,5,4,3,1,1,1,-1,0
4,59.166.0.6,0,149.171.126.8,267044,120,2,0.001664,544,304,31,...,0,3,12,4,4,1,1,3,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35337,175.45.176.1,0,149.171.126.10,377,114,5,0.741769,4678,354,254,...,0,4,3,1,1,1,1,3,4,1
35338,175.45.176.1,0,149.171.126.11,377,114,5,1.201228,26576,642,254,...,0,4,4,1,1,1,1,1,4,1
35339,175.45.176.0,2294806,149.171.126.19,32896,114,5,0.532201,2628,354,254,...,0,2,1,1,1,1,1,1,3,1
35340,175.45.176.0,0,149.171.126.12,33,114,5,1.099823,872,682,254,...,1,2,2,1,1,1,1,2,4,1


## Train-Test Sample Dataset loading

In [3]:
featureset = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime',
       'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat']


todrop = ['srcip','dstip','Stime', 'Ltime','attack_cat']

In [4]:
todrop = ['srcip','dstip','sport', 'dsport', 'Stime', 'Ltime','attack_cat',  

        'sttl', 'dttl', 'proto', 'state', 'dur', 'sbytes', 'dbytes', 'ct_state_ttl']
reducedTrain = train.drop(todrop, axis = 1)
reducedTest = test.drop(todrop, axis = 1)

In [7]:
reducedTrain.columns

Index(['sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin',
       'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
       'res_bdy_len', 'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack',
       'ackdat', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'Label'],
      dtype='object')

In [5]:
x_train = reducedTrain.drop(['Label'], axis = 1)
y_train = reducedTrain['Label']
x_test = reducedTest.drop(['Label'], axis = 1)
y_test = reducedTest['Label']

## Baseline model evaluation

In [6]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier

#Skip SVM
classifiers = [
 ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
 ('LinearDiscriminantAnalysis',  sklearn.discriminant_analysis.LinearDiscriminantAnalysis),
 ('LogisticRegression', sklearn.linear_model._logistic.LogisticRegression),
 ('Perceptron', sklearn.linear_model._perceptron.Perceptron),
 ('QuadraticDiscriminantAnalysis',  sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis),
 ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
 ('StackingClassifier', sklearn.ensemble._stacking.StackingClassifier),
 ('XGBClassifier', xgboost.sklearn.XGBClassifier),
 ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers=classifiers,predictions=True)


In [7]:
models,predictions = clf.fit(x_train, x_test, y_train, y_test)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  7%|▋         | 1/15 [00:19<04:28, 19.15s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.9871823892252843, 'Balanced Accuracy': 0.9871823892252845, 'ROC AUC': 0.9871823892252845, 'F1 Score': 0.987181783245811, 'Time taken': 19.1525821685791}


 13%|█▎        | 2/15 [00:42<04:40, 21.54s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.9909739120593062, 'Balanced Accuracy': 0.9909739120593062, 'ROC AUC': 0.9909739120593061, 'F1 Score': 0.9909735841963899, 'Time taken': 23.20749044418335}


 20%|██        | 3/15 [00:43<02:24, 12.02s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.931865768773697, 'Balanced Accuracy': 0.931865768773697, 'ROC AUC': 0.931865768773697, 'F1 Score': 0.9318559608440856, 'Time taken': 0.699282169342041}


 27%|██▋       | 4/15 [00:46<01:36,  8.73s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.9825703129421085, 'Balanced Accuracy': 0.9825703129421085, 'ROC AUC': 0.9825703129421086, 'F1 Score': 0.9825701789242001, 'Time taken': 3.686598300933838}


 33%|███▎      | 5/15 [00:47<00:57,  5.70s/it]

{'Model': 'DummyClassifier', 'Accuracy': 0.5, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.3333333333333333, 'Time taken': 0.32899951934814453}


 40%|████      | 6/15 [00:47<00:35,  3.93s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.8379831362118726, 'Balanced Accuracy': 0.8379831362118726, 'ROC AUC': 0.8379831362118726, 'F1 Score': 0.8354885900045967, 'Time taken': 0.4950072765350342}


 47%|████▋     | 7/15 [04:06<09:01, 67.65s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.9872106841718069, 'Balanced Accuracy': 0.9872106841718069, 'ROC AUC': 0.9872106841718069, 'F1 Score': 0.9872101234908305, 'Time taken': 198.8054440021515}


 53%|█████▎    | 8/15 [04:08<05:27, 46.72s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.9599909456171127, 'Balanced Accuracy': 0.9599909456171127, 'ROC AUC': 0.9599909456171127, 'F1 Score': 0.9599794877767857, 'Time taken': 1.8565797805786133}


 60%|██████    | 9/15 [04:12<03:21, 33.56s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.9779016467658876, 'Balanced Accuracy': 0.9779016467658876, 'ROC AUC': 0.9779016467658876, 'F1 Score': 0.9779016416528896, 'Time taken': 4.612891435623169}


 67%|██████▋   | 10/15 [04:14<01:57, 23.59s/it]

{'Model': 'Perceptron', 'Accuracy': 0.9660460641729387, 'Balanced Accuracy': 0.9660460641729387, 'ROC AUC': 0.9660460641729387, 'F1 Score': 0.9660425798492275, 'Time taken': 1.2501451969146729}


 73%|███████▎  | 11/15 [04:15<01:07, 16.75s/it]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.8910361609416558, 'Balanced Accuracy': 0.8910361609416558, 'ROC AUC': 0.8910361609416559, 'F1 Score': 0.8903000874590185, 'Time taken': 1.2551119327545166}


 80%|████████  | 12/15 [05:20<01:34, 31.46s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.9922754795993436, 'Balanced Accuracy': 0.9922754795993436, 'ROC AUC': 0.9922754795993435, 'F1 Score': 0.9922750711121064, 'Time taken': 65.10864877700806}


 93%|█████████▎| 14/15 [05:39<00:21, 21.15s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.9920491200271632, 'Balanced Accuracy': 0.9920491200271632, 'ROC AUC': 0.9920491200271632, 'F1 Score': 0.9920487379192848, 'Time taken': 18.566524028778076}
[LightGBM] [Info] Number of positive: 41231, number of negative: 41231
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4917
[LightGBM] [Info] Number of data points in the train set: 82462, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


100%|██████████| 15/15 [06:01<00:00, 24.08s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.9922754795993436, 'Balanced Accuracy': 0.9922754795993436, 'ROC AUC': 0.9922754795993435, 'F1 Score': 0.9922750647292472, 'Time taken': 22.017250537872314}





In [None]:
from sklearn.metrics import roc_auc_score
for model in predictions.columns:
    print(model)
    print(classification_report(y_test, predictions[model]))
    auc_scores = roc_auc_score(y_test, predictions[model], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     17671
           1       0.98      0.99      0.99     17671

    accuracy                           0.99     35342
   macro avg       0.99      0.99      0.99     35342
weighted avg       0.99      0.99      0.99     35342

AUC Scores for each class: 0.9871823892252845
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     17671
           1       0.99      1.00      0.99     17671

    accuracy                           0.99     35342
   macro avg       0.99      0.99      0.99     35342
weighted avg       0.99      0.99      0.99     35342

AUC Scores for each class: 0.9909739120593061
-------------------------------------------------