# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv


# Dataset Loading

In [6]:
train = pd.read_csv(r'D:\Project Phase II\Dataset\finaltrain.csv',encoding='cp1252')
test = pd.read_csv(r'D:\Project Phase II\Dataset\finaltest.csv',encoding='cp1252') #test_set1 

autrain = pd.read_csv(r'D:\Project Phase II\Dataset\UNSW_NB15_training-set.csv',encoding='cp1252')
autest = pd.read_csv(r'D:\Project Phase II\Dataset\UNSW_NB15_testing-set.csv',encoding='cp1252') #test_set2

  train = pd.read_csv(r'D:\Project Phase II\Dataset\finaltrain.csv',encoding='cp1252')
  test = pd.read_csv(r'D:\Project Phase II\Dataset\finaltest.csv',encoding='cp1252') #test_set1


# Data Preprocessing

## Preprocess user made train-test sets

In [3]:
todrop = ['sloss', 'dloss','stcpb', 'dtcpb', 'trans_depth', 'Stime', 'Ltime','ct_flw_http_mthd', 
        'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat']
        
reducedTrain = train.drop(todrop, axis = 1)
reducedTest = test.drop(todrop, axis = 1)
reducedTrain = reducedTrain.drop_duplicates()
reducedTest = reducedTest.drop_duplicates()

trainAttributes = reducedTrain.drop(['srcip','sport','dstip','dsport','Label'], axis = 1)
trainLabel = reducedTrain['Label']
testAttributes = reducedTest.drop(['srcip','sport','dstip','dsport','Label'], axis = 1)
testLabel = reducedTest['Label']

train = reducedTrain
test = reducedTest

train = train.drop_duplicates(['srcip','sport', 'dstip', 'dsport'], keep = 'last')
test = test.drop_duplicates(['srcip','sport', 'dstip', 'dsport'], keep = 'last')

## preprocess author's test dataset

In [4]:
autestdrop = ['ï»¿id', 'rate', 'sloss', 'dloss','stcpb', 'dtcpb','trans_depth','ct_srv_src','ct_dst_ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
        'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']

autest = autest.drop(autestdrop, axis = 1)
#Change column names of autest
autest.rename(columns={'dinpkt':'Dintpkt','djit':'Djit','dload':'Dload','dpkts':'Dpkts','label':'Label','sinpkt':'Sintpkt','sjit':'Sjit', 'sload':'Sload', 'spkts':'Spkts','dmean':'dmeansz','response_body_len':'res_bdy_len', 'smean':'smeansz'}, inplace=True)
autest = autest[(autest['proto']=='tcp') | (autest['proto']=='udp') | (autest['proto']=='ospf')]
autest = autest[(autest['service']=='ssh') | (autest['service']=='ftp-data ')  | (autest['service']=='ftp') | (autest['service']=='-') | (autest['service']=='dns') | (autest['service']=='smtp') | (autest['service']=='http') | (autest['service']=='radius') | (autest['service']=='pop3') ]
autest = autest[(autest['state'] == 'CON') | (autest['state'] == 'RST') | (autest['state'] == 'FIN') | (autest['state'] == 'ACC') | (autest['state'] == 'REQ') | (autest['state'] == 'INT')]

autest['proto'].replace("tcp", 0, inplace = True)
autest['proto'].replace("udp", 1, inplace = True)
autest['proto'].replace("ospf", 2, inplace = True)

autest['service'].replace("ssh", 0, inplace = True)
autest['service'].replace("ftp-data", 1, inplace = True)
autest['service'].replace("ftp", 2, inplace = True)
autest['service'].replace("-", 3, inplace = True)
autest['service'].replace("dns", 4, inplace = True)
autest['service'].replace("smtp", 5, inplace = True)
autest['service'].replace("http", 6, inplace = True)
autest['service'].replace("radius", 7, inplace = True)
autest['service'].replace("pop3", 8, inplace = True)

autest['state'].replace("CON", 0, inplace = True)
autest['state'].replace("RST", 1, inplace = True)
autest['state'].replace("FIN", 2, inplace = True)
autest['state'].replace("ACC", 3, inplace = True)
autest['state'].replace("REQ", 4, inplace = True)
autest['state'].replace("INT", 5, inplace = True)

for column in autest.columns:
        if column != 'Label':
                col_mean = sum(autest[column]) / len(autest[column])
                col_std = (sum((x - col_mean) ** 2 for x in autest[column]) / len(autest[column])) ** 0.5
                autest[column] = [(x - col_mean) / col_std for x in autest[column]]


autest = autest.drop_duplicates()
autestAttributes = autest.drop(['Label'], axis = 1)
autestAttributes = autestAttributes[trainAttributes.columns]
autestLabel = autest['Label']

## Preprocess author's training and testing dataset

In [7]:
autraindrop = ['ï»¿id', 'attack_cat']

autrain = autrain.drop(autraindrop, axis = 1)
autrain.rename(columns={'dinpkt':'Dintpkt','djit':'Djit','dload':'Dload','dpkts':'Dpkts','label':'Label','sinpkt':'Sintpkt','sjit':'Sjit', 'sload':'Sload', 'spkts':'Spkts','dmean':'dmeansz','response_body_len':'res_bdy_len', 'smean':'smeansz'}, inplace=True)
autrain = autrain[(autrain['proto']=='tcp') | (autrain['proto']=='udp') | (autrain['proto']=='ospf')]
autrain = autrain[(autrain['service']=='ssh') | (autrain['service']=='ftp-data ')  | (autrain['service']=='ftp') | (autrain['service']=='-') | (autrain['service']=='dns') | (autrain['service']=='smtp') | (autrain['service']=='http') | (autrain['service']=='radius') | (autrain['service']=='pop3') ]
autrain = autrain[(autrain['state'] == 'CON') | (autrain['state'] == 'RST') | (autrain['state'] == 'FIN') | (autrain['state'] == 'ACC') | (autrain['state'] == 'REQ') | (autrain['state'] == 'INT')]

autrain['proto'].replace("tcp", 0, inplace = True)
autrain['proto'].replace("udp", 1, inplace = True)
autrain['proto'].replace("ospf", 2, inplace = True)

autrain['service'].replace("ssh", 0, inplace = True)
autrain['service'].replace("ftp-data", 1, inplace = True)
autrain['service'].replace("ftp", 2, inplace = True)
autrain['service'].replace("-", 3, inplace = True)
autrain['service'].replace("dns", 4, inplace = True)
autrain['service'].replace("smtp", 5, inplace = True)
autrain['service'].replace("http", 6, inplace = True)
autrain['service'].replace("radius", 7, inplace = True)
autrain['service'].replace("pop3", 8, inplace = True)

autrain['state'].replace("CON", 0, inplace = True)
autrain['state'].replace("RST", 1, inplace = True)
autrain['state'].replace("FIN", 2, inplace = True)
autrain['state'].replace("ACC", 3, inplace = True)
autrain['state'].replace("REQ", 4, inplace = True)
autrain['state'].replace("INT", 5, inplace = True)

for column in autrain.columns:
        if column != 'Label':
                col_mean = sum(autrain[column]) / len(autrain[column])
                col_std = (sum((x - col_mean) ** 2 for x in autrain[column]) / len(autrain[column])) ** 0.5
                autrain[column] = [(x - col_mean) / col_std for x in autrain[column]]


autrain = autrain.drop_duplicates()
autrainAttributes = autrain.drop(['Label'], axis = 1)
# autrainAttributes = autrainAttributes[trainAttributes.columns]
autrainLabel = autrain['Label']

In [8]:
autestdrop = autraindrop = ['ï»¿id', 'attack_cat']

autest = autest.drop(autestdrop, axis = 1)
#Change column names of autest
autest.rename(columns={'dinpkt':'Dintpkt','djit':'Djit','dload':'Dload','dpkts':'Dpkts','label':'Label','sinpkt':'Sintpkt','sjit':'Sjit', 'sload':'Sload', 'spkts':'Spkts','dmean':'dmeansz','response_body_len':'res_bdy_len', 'smean':'smeansz'}, inplace=True)
autest = autest[(autest['proto']=='tcp') | (autest['proto']=='udp') | (autest['proto']=='ospf')]
autest = autest[(autest['service']=='ssh') | (autest['service']=='ftp-data ')  | (autest['service']=='ftp') | (autest['service']=='-') | (autest['service']=='dns') | (autest['service']=='smtp') | (autest['service']=='http') | (autest['service']=='radius') | (autest['service']=='pop3') ]
autest = autest[(autest['state'] == 'CON') | (autest['state'] == 'RST') | (autest['state'] == 'FIN') | (autest['state'] == 'ACC') | (autest['state'] == 'REQ') | (autest['state'] == 'INT')]

autest['proto'].replace("tcp", 0, inplace = True)
autest['proto'].replace("udp", 1, inplace = True)
autest['proto'].replace("ospf", 2, inplace = True)

autest['service'].replace("ssh", 0, inplace = True)
autest['service'].replace("ftp-data", 1, inplace = True)
autest['service'].replace("ftp", 2, inplace = True)
autest['service'].replace("-", 3, inplace = True)
autest['service'].replace("dns", 4, inplace = True)
autest['service'].replace("smtp", 5, inplace = True)
autest['service'].replace("http", 6, inplace = True)
autest['service'].replace("radius", 7, inplace = True)
autest['service'].replace("pop3", 8, inplace = True)

autest['state'].replace("CON", 0, inplace = True)
autest['state'].replace("RST", 1, inplace = True)
autest['state'].replace("FIN", 2, inplace = True)
autest['state'].replace("ACC", 3, inplace = True)
autest['state'].replace("REQ", 4, inplace = True)
autest['state'].replace("INT", 5, inplace = True)

for column in autest.columns:
        if column != 'Label':
                col_mean = sum(autest[column]) / len(autest[column])
                col_std = (sum((x - col_mean) ** 2 for x in autest[column]) / len(autest[column])) ** 0.5
                autest[column] = [(x - col_mean) / col_std for x in autest[column]]


autest = autest.drop_duplicates()
autestAttributes = autest.drop(['Label'], axis = 1)
#autestAttributes = autestAttributes[trainAttributes.columns]
autestLabel = autest['Label']

# LazyPredict Predictions

## Train-test with user dataset

In [5]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier


classifiers = [
    ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
    ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
    ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
    ('ExtraTreeClassifier', sklearn.ensemble._forest.ExtraTreeClassifier),
    ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
    ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
    ('NuSVC', sklearn.svm._classes.NuSVC),
    ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
    ('SVC', sklearn.svm._classes.SVC),
    ('XGBClassifier', xgboost.sklearn.XGBClassifier),
    ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers= classifiers, predictions=True)

models,predictions = clf.fit(trainAttributes, testAttributes, trainLabel, testLabel)




'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  9%|▉         | 1/11 [00:15<02:36, 15.63s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.9924071828627519, 'Balanced Accuracy': 0.992390566389497, 'ROC AUC': 0.9923905663894969, 'F1 Score': 0.992406645432784, 'Time taken': 15.63097095489502}


 18%|█▊        | 2/11 [00:29<02:13, 14.89s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.9926670131069923, 'Balanced Accuracy': 0.9926568341046715, 'ROC AUC': 0.9926568341046716, 'F1 Score': 0.9926667899884984, 'Time taken': 14.363081455230713}


 27%|██▋       | 3/11 [00:32<01:12,  9.04s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.9883653790634563, 'Balanced Accuracy': 0.9883665552867074, 'ROC AUC': 0.9883665552867074, 'F1 Score': 0.9883653899340559, 'Time taken': 2.0720555782318115}


 36%|███▋      | 4/11 [00:32<00:39,  5.59s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.9880189387378024, 'Balanced Accuracy': 0.9880203732508865, 'ROC AUC': 0.9880203732508867, 'F1 Score': 0.9880189513700693, 'Time taken': 0.29500317573547363}


 45%|████▌     | 5/11 [00:39<00:36,  6.15s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.9930711934869219, 'Balanced Accuracy': 0.9930597163011612, 'ROC AUC': 0.993059716301161, 'F1 Score': 0.9930709353069457, 'Time taken': 7.154617547988892}


 55%|█████▍    | 6/11 [00:53<00:44,  8.82s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.9924649229170276, 'Balanced Accuracy': 0.9924516873423717, 'ROC AUC': 0.9924516873423717, 'F1 Score': 0.9924645646448986, 'Time taken': 14.003910779953003}


 64%|██████▎   | 7/11 [17:48<22:31, 337.87s/it]

{'Model': 'NuSVC', 'Accuracy': 0.9813210924418269, 'Balanced Accuracy': 0.9812899871657002, 'ROC AUC': 0.9812899871657002, 'F1 Score': 0.9813169698652552, 'Time taken': 1015.310685634613}


 73%|███████▎  | 8/11 [18:22<12:02, 240.88s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.9933598937583001, 'Balanced Accuracy': 0.9933477679200112, 'ROC AUC': 0.9933477679200112, 'F1 Score': 0.9933596219820435, 'Time taken': 33.21791219711304}


 82%|████████▏ | 9/11 [21:55<07:44, 232.19s/it]

{'Model': 'SVC', 'Accuracy': 0.9923494428084763, 'Balanced Accuracy': 0.992332956065727, 'ROC AUC': 0.9923329560657269, 'F1 Score': 0.992348908766216, 'Time taken': 213.05647206306458}


 91%|█████████ | 10/11 [21:56<02:41, 161.00s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.9932732836768866, 'Balanced Accuracy': 0.9932603122479549, 'ROC AUC': 0.9932603122479549, 'F1 Score': 0.993272974605986, 'Time taken': 1.611938714981079}
[LightGBM] [Info] Number of positive: 40502, number of negative: 40190
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4392
[LightGBM] [Info] Number of data points in the train set: 80692, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501933 -> initscore=0.007733
[LightGBM] [Info] Start training from score 0.007733


100%|██████████| 11/11 [22:00<00:00, 120.03s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.9935331139211271, 'Balanced Accuracy': 0.9935202088214208, 'ROC AUC': 0.9935202088214208, 'F1 Score': 0.9935328193491417, 'Time taken': 3.561455488204956}





In [6]:
from sklearn.metrics import roc_auc_score
for m in predictions.columns:
    print(m)
    print(classification_report(testLabel, predictions[m]))
    auc_scores = roc_auc_score(testLabel, predictions[m], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     17280
           1       0.99      1.00      0.99     17358

    accuracy                           0.99     34638
   macro avg       0.99      0.99      0.99     34638
weighted avg       0.99      0.99      0.99     34638

AUC Scores for each class: 0.9923905663894969
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     17280
           1       0.99      1.00      0.99     17358

    accuracy                           0.99     34638
   macro avg       0.99      0.99      0.99     34638
weighted avg       0.99      0.99      0.99     34638

AUC Scores for each class: 0.9926568341046716
-------------------------------------------------

## Train Test with Author's test dataset

In [7]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier


classifiers = [
    ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
    ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
    ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
    ('ExtraTreeClassifier', sklearn.ensemble._forest.ExtraTreeClassifier),
    ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
    ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
    ('NuSVC', sklearn.svm._classes.NuSVC),
    ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
    ('SVC', sklearn.svm._classes.SVC),
    ('XGBClassifier', xgboost.sklearn.XGBClassifier),
    ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers= classifiers, predictions=True)

models,predictions = clf.fit(trainAttributes, autestAttributes, trainLabel, autestLabel)


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  9%|▉         | 1/11 [00:16<02:41, 16.14s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 16.136648416519165}


 18%|█▊        | 2/11 [00:31<02:20, 15.56s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.6470232495632308, 'Balanced Accuracy': 0.6488946203661992, 'ROC AUC': 0.648894620366199, 'F1 Score': 0.6563292583496728, 'Time taken': 15.149586200714111}


 27%|██▋       | 3/11 [00:33<01:16,  9.54s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 2.388719081878662}


 36%|███▋      | 4/11 [00:33<00:41,  5.88s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.6649195896608878, 'Balanced Accuracy': 0.5186149884607107, 'ROC AUC': 0.5186149884607107, 'F1 Score': 0.5701449312898044, 'Time taken': 0.2602548599243164}


 45%|████▌     | 5/11 [00:42<00:40,  6.79s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.6656363392017202, 'Balanced Accuracy': 0.5016185066122086, 'ROC AUC': 0.5016185066122086, 'F1 Score': 0.533342819749092, 'Time taken': 8.416309595108032}


 55%|█████▍    | 6/11 [01:02<00:57, 11.44s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.6646284101599247, 'Balanced Accuracy': 0.513565347089257, 'ROC AUC': 0.5135653470892569, 'F1 Score': 0.560811830249161, 'Time taken': 20.452399015426636}


 64%|██████▎   | 7/11 [21:35<27:22, 410.69s/it]

{'Model': 'NuSVC', 'Accuracy': 0.6434171034359181, 'Balanced Accuracy': 0.49219848381013637, 'ROC AUC': 0.49219848381013637, 'F1 Score': 0.5380150398946357, 'Time taken': 1232.6560406684875}


 73%|███████▎  | 8/11 [21:55<14:18, 286.18s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.6941719302961071, 'Balanced Accuracy': 0.5743529199456876, 'ROC AUC': 0.5743529199456876, 'F1 Score': 0.6396549687367908, 'Time taken': 19.57996892929077}


 82%|████████▏ | 9/11 [24:34<08:12, 246.46s/it]

{'Model': 'SVC', 'Accuracy': 0.6612686466872733, 'Balanced Accuracy': 0.5025842695195301, 'ROC AUC': 0.5025842695195301, 'F1 Score': 0.5411394125503298, 'Time taken': 159.12514328956604}


 91%|█████████ | 10/11 [24:35<02:50, 170.81s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.6480983738744792, 'Balanced Accuracy': 0.6493560312313048, 'ROC AUC': 0.6493560312313048, 'F1 Score': 0.6573183918975563, 'Time taken': 1.430095911026001}
[LightGBM] [Info] Number of positive: 40502, number of negative: 40190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022764 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4392
[LightGBM] [Info] Number of data points in the train set: 80692, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501933 -> initscore=0.007733
[LightGBM] [Info] Start training from score 0.007733


100%|██████████| 11/11 [24:37<00:00, 134.32s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 1.8584394454956055}





In [9]:
from sklearn.metrics import roc_auc_score
for m in predictions.columns:
    print(m)
    print(classification_report(autestLabel, predictions[m]))
    auc_scores = roc_auc_score(autestLabel, predictions[m], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.79      0.64      0.71     29673
           1       0.48      0.66      0.56     14973

    accuracy                           0.65     44646
   macro avg       0.64      0.65      0.63     44646
weighted avg       0.69      0.65      0.66     44646

AUC Scores for each class: 0.6508268315380992
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       0.79      0.64      0.71     29673
           1       0.48      0.65      0.55     14973

    accuracy                           0.65     44646
   macro avg       0.63      0.65      0.63     44646
weighted avg       0.68      0.65      0.66     44646

AUC Scores for each class: 0.648894620366199
--------------------------------------------------

## Test purely with author's train-test datasets

In [11]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier


classifiers = [
    ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
    ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
    ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
    ('ExtraTreeClassifier', sklearn.ensemble._forest.ExtraTreeClassifier),
    ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
    ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
    ('NuSVC', sklearn.svm._classes.NuSVC),
    ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
    ('SVC', sklearn.svm._classes.SVC),
    ('XGBClassifier', xgboost.sklearn.XGBClassifier),
    ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers= classifiers, predictions=True)

models,predictions = clf.fit(autrainAttributes, autestAttributes, autrainLabel, autestLabel)




'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  9%|▉         | 1/11 [00:14<02:28, 14.90s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.6633957317190723, 'Balanced Accuracy': 0.6771129492044956, 'ROC AUC': 0.6771129492044955, 'F1 Score': 0.6696266095815283, 'Time taken': 14.898641109466553}


 18%|█▊        | 2/11 [00:32<02:26, 16.22s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.7033217019143645, 'Balanced Accuracy': 0.6744816399782381, 'ROC AUC': 0.6744816399782381, 'F1 Score': 0.7015389019937077, 'Time taken': 17.14996361732483}


 27%|██▋       | 3/11 [00:34<01:18,  9.83s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.6611208575119045, 'Balanced Accuracy': 0.6533534498722411, 'ROC AUC': 0.6533534498722411, 'F1 Score': 0.6658171335982364, 'Time taken': 2.2295007705688477}


 36%|███▋      | 4/11 [00:34<00:42,  6.12s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.6657862774960961, 'Balanced Accuracy': 0.6184209212393726, 'ROC AUC': 0.6184209212393726, 'F1 Score': 0.6563742883245662, 'Time taken': 0.4300193786621094}


 45%|████▌     | 5/11 [00:44<00:45,  7.56s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.7274777814192902, 'Balanced Accuracy': 0.7851804902830828, 'ROC AUC': 0.7851804902830828, 'F1 Score': 0.7273278257581929, 'Time taken': 10.100873947143555}


 55%|█████▍    | 6/11 [01:04<00:58, 11.73s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.7733030016772378, 'Balanced Accuracy': 0.8036092566667059, 'ROC AUC': 0.8036092566667059, 'F1 Score': 0.7772621130066119, 'Time taken': 19.81660747528076}


 64%|██████▎   | 7/11 [21:31<27:15, 408.88s/it]

{'Model': 'NuSVC', 'Accuracy': 0.6579398893408649, 'Balanced Accuracy': 0.7302494962065427, 'ROC AUC': 0.7302494962065427, 'F1 Score': 0.649623413179047, 'Time taken': 1226.5459833145142}


 73%|███████▎  | 8/11 [21:57<14:20, 286.97s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.697480287636637, 'Balanced Accuracy': 0.7616128674116235, 'ROC AUC': 0.7616128674116236, 'F1 Score': 0.6945338995894083, 'Time taken': 25.9184730052948}


 82%|████████▏ | 9/11 [28:59<10:58, 329.39s/it]

{'Model': 'SVC', 'Accuracy': 0.7322781515683137, 'Balanced Accuracy': 0.7874799052856725, 'ROC AUC': 0.7874799052856724, 'F1 Score': 0.7328297781453849, 'Time taken': 422.6756272315979}


 91%|█████████ | 10/11 [29:01<03:48, 228.38s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.6802837809180466, 'Balanced Accuracy': 0.6803150360061321, 'ROC AUC': 0.6803150360061321, 'F1 Score': 0.6855254210437851, 'Time taken': 2.1903645992279053}
[LightGBM] [Info] Number of positive: 47166, number of negative: 48673
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6027
[LightGBM] [Info] Number of data points in the train set: 95839, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492138 -> initscore=-0.031451
[LightGBM] [Info] Start training from score -0.031451


100%|██████████| 11/11 [29:04<00:00, 158.63s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.6723988355728634, 'Balanced Accuracy': 0.6750815644715507, 'ROC AUC': 0.6750815644715507, 'F1 Score': 0.6780733034885725, 'Time taken': 2.885566234588623}





In [12]:
from sklearn.metrics import roc_auc_score
for m in predictions.columns:
    print(m)
    print(classification_report(autestLabel, predictions[m]))
    auc_scores = roc_auc_score(autestLabel, predictions[m], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.80      0.63      0.70     32933
           1       0.53      0.73      0.61     18938

    accuracy                           0.66     51871
   macro avg       0.66      0.68      0.66     51871
weighted avg       0.70      0.66      0.67     51871

AUC Scores for each class: 0.6771129492044955
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       0.76      0.78      0.77     32933
           1       0.60      0.57      0.58     18938

    accuracy                           0.70     51871
   macro avg       0.68      0.67      0.68     51871
weighted avg       0.70      0.70      0.70     51871

AUC Scores for each class: 0.6744816399782381
-------------------------------------------------

In [14]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNeighborsClassifier,0.77,0.8,0.8,0.78,19.82
SVC,0.73,0.79,0.79,0.73,422.68
ExtraTreesClassifier,0.73,0.79,0.79,0.73,10.1
RandomForestClassifier,0.7,0.76,0.76,0.69,25.92
NuSVC,0.66,0.73,0.73,0.65,1226.55
XGBClassifier,0.68,0.68,0.68,0.69,2.19
AdaBoostClassifier,0.66,0.68,0.68,0.67,14.9
LGBMClassifier,0.67,0.68,0.68,0.68,2.89
BaggingClassifier,0.7,0.67,0.67,0.7,17.15
DecisionTreeClassifier,0.66,0.65,0.65,0.67,2.23
