## Libraries

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering


## Import Train and test datasets

In [39]:
train = pd.read_csv(r'D:\Project Phase II\Dataset\UNSW_NB15_training-set.csv',encoding='cp1252')
test = pd.read_csv(r'D:\Project Phase II\Dataset\UNSW_NB15_testing-set.csv',encoding='cp1252')

In [4]:
train.columns

Index(['ï»¿id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')

## Train-Test Sample Dataset loading

In [5]:
featureset = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime',
       'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat']


todrop = ['srcip','dstip','Stime', 'Ltime','attack_cat']

In [16]:
fs.sort()
fs

['ackdat',
 'attack_cat',
 'ct_dst_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_flw_http_mthd',
 'ct_ftp_cmd',
 'ct_src_ ltm',
 'ct_src_dport_ltm',
 'ct_srv_dst',
 'ct_srv_src',
 'ct_state_ttl',
 'dbytes',
 'dintpkt',
 'djit',
 'dload',
 'dloss',
 'dmeansz',
 'dpkts',
 'dsport',
 'dstip',
 'dtcpb',
 'dttl',
 'dur',
 'dwin',
 'is_ftp_login',
 'ltime',
 'proto',
 'res_bdy_len',
 'sbytes',
 'service',
 'sintpkt',
 'sjit',
 'sload',
 'sloss',
 'smeansz',
 'spkts',
 'sport',
 'srcip',
 'state',
 'stcpb',
 'stime',
 'sttl',
 'swin',
 'synack',
 'tcprtt',
 'trans_depth']

In [40]:
todrop = ['ï»¿id','attack_cat']
reducedTrain = train.drop(todrop, axis = 1)
reducedTest = test.drop(todrop, axis = 1)

In [41]:
redtr = reducedTrain.drop_duplicates()

In [47]:
redtr['label'].value_counts()

0    51890
1    49379
Name: label, dtype: int64

In [48]:
redte['label'].value_counts()

0    34206
1    19746
Name: label, dtype: int64

In [42]:
len(redtr)

101269

In [43]:
len(reducedTrain)

175341

In [44]:
redte = reducedTest.drop_duplicates()

In [45]:
len(redte)

53952

In [46]:
len(reducedTest)

82332

In [55]:
redte.columns

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'label'],
      dtype='object')

In [67]:
# Fill any null values in proto
redtr['proto'].fillna('Any', inplace=True)
redte['proto'].fillna('Any', inplace=True)
redtr['service'].fillna('Any', inplace=True)
redte['service'].fillna('Any', inplace=True)
redtr['state'].fillna('Any', inplace=True)
redte['state'].fillna('Any', inplace=True)


In [63]:
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Label encode the 'proto' column
redte['proto'] = label_encoder.fit_transform(redte['proto'])
redte['proto'] = label_encoder.transform(redte['proto'])

# Label encode the 'service' column
redte['service'] = label_encoder.fit_transform(redte['service'])
redte['service'] = label_encoder.transform(redte['service'])

# Label encode the 'state' column
redte['state'] = label_encoder.fit_transform(redte['state'])
redte['state'] = label_encoder.transform(redte['state'])
#label encoding proto, service, state



In [68]:
label_encoder = LabelEncoder()

# Label encode the 'proto' column
redtr['proto'] = label_encoder.fit_transform(redtr['proto'])
redtr['proto'] = label_encoder.transform(redtr['proto'])

# Label encode the 'service' column
redtr['service'] = label_encoder.fit_transform(redtr['service'])
redtr['service'] = label_encoder.transform(redtr['service'])

# Label encode the 'state' column
redtr['state'] = label_encoder.fit_transform(redtr['state'])
redtr['state'] = label_encoder.transform(redtr['state'])

In [7]:
reducedTrain.columns

Index(['sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin',
       'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
       'res_bdy_len', 'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack',
       'ackdat', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'Label'],
      dtype='object')

In [69]:
x_train = redtr.drop(['label'], axis = 1)
y_train = redtr['label']
x_test = redte.drop(['label'], axis = 1)
y_test = redte['label']

## Baseline model evaluation

In [70]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier

#Skip SVM
classifiers = [
 ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
 ('LinearDiscriminantAnalysis',  sklearn.discriminant_analysis.LinearDiscriminantAnalysis),
 ('LogisticRegression', sklearn.linear_model._logistic.LogisticRegression),
 ('Perceptron', sklearn.linear_model._perceptron.Perceptron),
 ('QuadraticDiscriminantAnalysis',  sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis),
 ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
 ('StackingClassifier', sklearn.ensemble._stacking.StackingClassifier),
 ('XGBClassifier', xgboost.sklearn.XGBClassifier),
 ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers=classifiers,predictions=True)


In [71]:
models,predictions = clf.fit(x_train, x_test, y_train, y_test)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  7%|▋         | 1/15 [00:23<05:33, 23.83s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.7397501482799526, 'Balanced Accuracy': 0.7928207574913914, 'ROC AUC': 0.7928207574913912, 'F1 Score': 0.7407055187046573, 'Time taken': 23.827000856399536}


 13%|█▎        | 2/15 [01:01<06:56, 32.07s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.8311462040332147, 'Balanced Accuracy': 0.853926883346229, 'ROC AUC': 0.853926883346229, 'F1 Score': 0.8342258298489342, 'Time taken': 37.83800792694092}


 20%|██        | 3/15 [01:02<03:33, 17.77s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.6454626334519573, 'Balanced Accuracy': 0.6816933057193486, 'ROC AUC': 0.6816933057193486, 'F1 Score': 0.6491861034508466, 'Time taken': 0.7480061054229736}


 27%|██▋       | 4/15 [01:06<02:17, 12.50s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.8115361803084223, 'Balanced Accuracy': 0.8309580701394288, 'ROC AUC': 0.8309580701394288, 'F1 Score': 0.8149479725218911, 'Time taken': 4.416992902755737}


 33%|███▎      | 5/15 [01:07<01:21,  8.17s/it]

{'Model': 'DummyClassifier', 'Accuracy': 0.6340080071174378, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.49200022440298263, 'Time taken': 0.5070083141326904}


 40%|████      | 6/15 [01:08<00:50,  5.63s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.6629782028469751, 'Balanced Accuracy': 0.7080092539219318, 'ROC AUC': 0.7080092539219318, 'F1 Score': 0.6647836017629168, 'Time taken': 0.6869993209838867}


 47%|████▋     | 7/15 [04:23<09:01, 67.68s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.7820099347568209, 'Balanced Accuracy': 0.8110552295151761, 'ROC AUC': 0.8110552295151761, 'F1 Score': 0.7858337826925647, 'Time taken': 195.40999698638916}


 53%|█████▎    | 8/15 [04:24<05:25, 46.46s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.6244810201660735, 'Balanced Accuracy': 0.703735378262005, 'ROC AUC': 0.703735378262005, 'F1 Score': 0.6092204135118854, 'Time taken': 1.019000768661499}


 60%|██████    | 9/15 [04:27<03:16, 32.76s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.716785290628707, 'Balanced Accuracy': 0.7762513139819045, 'ROC AUC': 0.7762513139819045, 'F1 Score': 0.7156813707288585, 'Time taken': 2.6260006427764893}


 67%|██████▋   | 10/15 [04:27<01:54, 22.86s/it]

{'Model': 'Perceptron', 'Accuracy': 0.7106687425860023, 'Balanced Accuracy': 0.7695757609093832, 'ROC AUC': 0.7695757609093833, 'F1 Score': 0.7095663395331344, 'Time taken': 0.7050056457519531}


 73%|███████▎  | 11/15 [04:28<01:04, 16.11s/it]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.7056642941874258, 'Balanced Accuracy': 0.7663462706774209, 'ROC AUC': 0.7663462706774209, 'F1 Score': 0.7038753765582604, 'Time taken': 0.8059971332550049}


 80%|████████  | 12/15 [04:55<00:58, 19.41s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.8252520759193357, 'Balanced Accuracy': 0.8565467723192774, 'ROC AUC': 0.8565467723192774, 'F1 Score': 0.8283510901951779, 'Time taken': 26.96200394630432}


 93%|█████████▎| 14/15 [04:58<00:11, 11.04s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.7456998813760379, 'Balanced Accuracy': 0.797234605858195, 'ROC AUC': 0.7972346058581947, 'F1 Score': 0.7470419772189233, 'Time taken': 2.826979160308838}
[LightGBM] [Info] Number of positive: 49379, number of negative: 51890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6164
[LightGBM] [Info] Number of data points in the train set: 101269, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.487602 -> initscore=-0.049601
[LightGBM] [Info] Start training from score -0.049601


100%|██████████| 15/15 [05:01<00:00, 20.11s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.7447175266903915, 'Balanced Accuracy': 0.7973483420105396, 'ROC AUC': 0.7973483420105396, 'F1 Score': 0.7458269192283654, 'Time taken': 3.159001350402832}





In [72]:
from sklearn.metrics import roc_auc_score
for model in predictions.columns:
    print(model)
    print(classification_report(y_test, predictions[model]))
    auc_scores = roc_auc_score(y_test, predictions[model], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.99      0.59      0.74     34206
           1       0.59      0.99      0.74     19746

    accuracy                           0.74     53952
   macro avg       0.79      0.79      0.74     53952
weighted avg       0.84      0.74      0.74     53952

AUC Scores for each class: 0.7928207574913912
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       0.96      0.77      0.85     34206
           1       0.70      0.94      0.80     19746

    accuracy                           0.83     53952
   macro avg       0.83      0.85      0.83     53952
weighted avg       0.86      0.83      0.83     53952

AUC Scores for each class: 0.853926883346229
--------------------------------------------------