# Libraries

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv


# Dataset Loading

In [70]:
train = pd.read_csv(r'D:\Project Phase II\Dataset\finaltrain.csv',encoding='cp1252')
test = pd.read_csv(r'D:\Project Phase II\Dataset\finaltest.csv',encoding='cp1252') #test_set1 
autest = pd.read_csv(r'D:\Project Phase II\Dataset\UNSW_NB15_testing-set.csv',encoding='cp1252') #test_set2

  train = pd.read_csv(r'D:\Project Phase II\Dataset\finaltrain.csv',encoding='cp1252')
  test = pd.read_csv(r'D:\Project Phase II\Dataset\finaltest.csv',encoding='cp1252') #test_set1


# Data Preprocessing

## Preprocess user-made train-test datasets

In [71]:
todrop = ['sloss', 'dloss','stcpb', 'dtcpb', 'trans_depth', 'Stime', 'Ltime','tcprtt', 'ct_flw_http_mthd', 
        'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat']
        
reducedTrain = train.drop(todrop, axis = 1)
reducedTest = test.drop(todrop, axis = 1)
reducedTrain = reducedTrain.drop_duplicates()
reducedTest = reducedTest.drop_duplicates()

trainAttributes = reducedTrain.drop(['srcip','sport','dstip','dsport','Label'], axis = 1)
trainLabel = reducedTrain['Label']
testAttributes = reducedTest.drop(['srcip','sport','dstip','dsport','Label'], axis = 1)
testLabel = reducedTest['Label']

train = reducedTrain
test = reducedTest

train = train.drop_duplicates(['srcip','sport', 'dstip', 'dsport', 'sttl', 'dttl', 'swin', 'dwin'], keep = 'last')
test = test.drop_duplicates(['srcip','sport', 'dstip', 'dsport', 'sttl', 'dttl', 'swin', 'dwin'], keep = 'last')

In [4]:
print("Train Set:\n",train.Label.value_counts())
print("Test Set:\n", test.Label.value_counts())

Train Set:
 Label
0    39990
1    39821
Name: count, dtype: int64
Test Set:
 Label
1    17217
0    17201
Name: count, dtype: int64


## Preprocess Author's testing dataset

Use stcpb, dtcpb to substitute the unavailablity of ip port

In [72]:
autestdrop = ['ï»¿id', 'rate', 'sloss', 'dloss','trans_depth','tcprtt','ct_srv_src','ct_dst_ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
        'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']

autest = autest.drop(autestdrop, axis = 1)
#Change column names of autest
autest.rename(columns={'dinpkt':'Dintpkt','djit':'Djit','dload':'Dload','dpkts':'Dpkts','label':'Label','sinpkt':'Sintpkt','sjit':'Sjit', 'sload':'Sload', 'spkts':'Spkts','dmean':'dmeansz','response_body_len':'res_bdy_len', 'smean':'smeansz'}, inplace=True)
autest = autest[(autest['proto']=='tcp') | (autest['proto']=='udp') | (autest['proto']=='ospf')]
autest = autest[(autest['service']=='ssh') | (autest['service']=='ftp-data ')  | (autest['service']=='ftp') | (autest['service']=='-') | (autest['service']=='dns') | (autest['service']=='smtp') | (autest['service']=='http') | (autest['service']=='radius') | (autest['service']=='pop3') ]
autest = autest[(autest['state'] == 'CON') | (autest['state'] == 'RST') | (autest['state'] == 'FIN') | (autest['state'] == 'ACC') | (autest['state'] == 'REQ') | (autest['state'] == 'INT')]

autest['proto'].replace("tcp", 0, inplace = True)
autest['proto'].replace("udp", 1, inplace = True)
autest['proto'].replace("ospf", 2, inplace = True)

autest['service'].replace("ssh", 0, inplace = True)
autest['service'].replace("ftp-data", 1, inplace = True)
autest['service'].replace("ftp", 2, inplace = True)
autest['service'].replace("-", 3, inplace = True)
autest['service'].replace("dns", 4, inplace = True)
autest['service'].replace("smtp", 5, inplace = True)
autest['service'].replace("http", 6, inplace = True)
autest['service'].replace("radius", 7, inplace = True)
autest['service'].replace("pop3", 8, inplace = True)

autest['state'].replace("CON", 0, inplace = True)
autest['state'].replace("RST", 1, inplace = True)
autest['state'].replace("FIN", 2, inplace = True)
autest['state'].replace("ACC", 3, inplace = True)
autest['state'].replace("REQ", 4, inplace = True)
autest['state'].replace("INT", 5, inplace = True)

for column in autest.columns:
        if column != 'Label':
                col_mean = sum(autest[column]) / len(autest[column])
                col_std = (sum((x - col_mean) ** 2 for x in autest[column]) / len(autest[column])) ** 0.5
                autest[column] = [(x - col_mean) / col_std for x in autest[column]]


autest = autest.drop_duplicates(['sttl', 'dttl', 'swin', 'dwin','stcpb','dtcpb'], keep = 'last')
autestAttributes = autest.drop(['Label'], axis = 1)
autestAttributes = autestAttributes[trainAttributes.columns]
autestLabel = autest['Label']

In [62]:
print("Author's Testing Set:\n", autest.Label.value_counts())

Author's Testing Set:
 Label
0    24354
1    13671
Name: count, dtype: int64


# Input Transformation

## Training

In [73]:
nodes = set()
trainUnique = train
for i in range(len(train)):
    src = str(train['srcip'].iloc[i])+':'+ str(train['sport'].iloc[i]) + ':' + str(train['sttl'].iloc[i])+':'+str(train['swin'].iloc[i])
    dst = str(train['dstip'].iloc[i])+':'+str(train['dsport'].iloc[i]) + ':' + str(train['dttl'].iloc[i])+':'+str(train['dwin'].iloc[i])
    nodes.add(src)
    nodes.add(dst)


#Find unique (srcip,sport,dstip,dsport) tuples in train
train_tuples = set()
for i in range(len(train)):
    train_tuples.add((train['srcip'].iloc[i], train['sport'].iloc[i], train['dstip'].iloc[i], train['dsport'].iloc[i], train['sttl'].iloc[i], train['swin'].iloc[i], train['dttl'].iloc[i], train['dwin'].iloc[i]))

In [74]:
xLookUp = {}
x = []
count = 0
edge_index = []
edge_attr = []
for i in range(len(train)):
    src = str(train['srcip'].iloc[i])+':'+ str(train['sport'].iloc[i]) + ':' + str(train['sttl'].iloc[i])+':'+str(train['swin'].iloc[i])
    dst = str(train['dstip'].iloc[i])+':'+str(train['dsport'].iloc[i]) + ':' + str(train['dttl'].iloc[i])+':'+str(train['dwin'].iloc[i])
    if src not in xLookUp:
        xLookUp[src] = count
        x.append([int(train['sport'].iloc[i]), int(train['sttl'].iloc[i]), int(train['swin'].iloc[i])])
        count += 1
    if dst not in xLookUp:
        xLookUp[dst] = count
        x.append([int(train['dsport'].iloc[i]), int(train['dttl'].iloc[i]), int(train['dwin'].iloc[i])])
        count += 1
    edge_index.append([xLookUp[src], xLookUp[dst]])
    edge_attr.append(list(train.iloc[i].drop(['srcip', 'sport', 'dstip', 'dsport', 'sttl', 'dttl','swin','dwin','Label']).values))


# Convert x to tensor
x_tensor = torch.tensor(x, dtype=torch.float)

# Convert edge_index to tensor
edge_index_tensor = torch.tensor(edge_index, dtype=torch.long).t()  # Transpose for correct format

# Convert edge_attr to tensor
edge_attr_tensor = torch.tensor(edge_attr, dtype=torch.float)

print(x_tensor.shape, edge_index_tensor.shape, edge_attr_tensor.shape)

torch.Size([97138, 3]) torch.Size([2, 79856]) torch.Size([79856, 20])


## Testing

In [75]:
testnodes = set()
testUnique = test
for i in range(len(test)):
    src = str(test['srcip'].iloc[i])+':'+ str(test['sport'].iloc[i]) + ':' + str(test['sttl'].iloc[i])+':'+str(test['swin'].iloc[i])
    dst = str(test['dstip'].iloc[i])+':'+str(test['dsport'].iloc[i]) + ':' + str(test['dttl'].iloc[i])+':'+str(test['dwin'].iloc[i])
    testnodes.add(src)
    testnodes.add(dst)


#Find unique (srcip,sport,dstip,dsport) tuples in train
test_tuples = set()
for i in range(len(test)):
    test_tuples.add((test['srcip'].iloc[i], test['sport'].iloc[i], test['dstip'].iloc[i], test['dsport'].iloc[i], test['sttl'].iloc[i], test['swin'].iloc[i], test['dttl'].iloc[i], test['dwin'].iloc[i]))

In [76]:
xtestLookUp = {}
xtest = []
count = 0
edge_index_test = []
edge_attr_test = []
for i in range(len(test)):
    src = str(test['srcip'].iloc[i])+':'+ str(test['sport'].iloc[i]) + ':' + str(test['sttl'].iloc[i])+':'+str(test['swin'].iloc[i])
    dst = str(test['dstip'].iloc[i])+':'+str(test['dsport'].iloc[i]) + ':' + str(test['dttl'].iloc[i])+':'+str(test['dwin'].iloc[i])
    if src not in xtestLookUp:
        xtestLookUp[src] = count
        xtest.append([int(test['sport'].iloc[i]), int(test['sttl'].iloc[i]), int(test['swin'].iloc[i])])
        count += 1
    if dst not in xtestLookUp:
        xtestLookUp[dst] = count
        xtest.append([int(test['dsport'].iloc[i]), int(test['dttl'].iloc[i]), int(test['dwin'].iloc[i])])
        count += 1
    edge_index_test.append([xtestLookUp[src], xtestLookUp[dst]])
    edge_attr_test.append(list(test.iloc[i].drop(['srcip', 'sport', 'dstip', 'dsport', 'sttl', 'dttl','swin','dwin','Label']).values))


# Convert x to tensor
x_test_tensor = torch.tensor(xtest, dtype=torch.float)

# Convert edge_index to tensor
edge_index_test_tensor = torch.tensor(edge_index_test, dtype=torch.long).t()  # Transpose for correct format

# Convert edge_attr to tensor
edge_attr_test_tensor = torch.tensor(edge_attr_test, dtype=torch.float)

print(x_test_tensor.shape, edge_index_test_tensor.shape, edge_attr_test_tensor.shape)

torch.Size([42906, 3]) torch.Size([2, 34430]) torch.Size([34430, 20])


## Author's Testing Dataset

Since srcip, sport, dstip, dsport information is not available, we proceed with having each node feature value as 0, and assume each edge as an unique connection between two imaginary nodes, we add stcpb and dtcpb as placeholders 

In [77]:
autestnodes = set()
autestUnique = autest
for i in range(len(autest)):
    src = str(autest['sttl'].iloc[i])+':'+str(autest['swin'].iloc[i]) + str(autest['stcpb'].iloc[i])
    dst = str(autest['dttl'].iloc[i])+':'+str(autest['dwin'].iloc[i]) + str(autest['dtcpb'].iloc[i])
    autestnodes.add(src)
    autestnodes.add(dst)


#Find unique (srcip,sport,dstip,dsport) tuples in train
autest_tuples = set()
for i in range(len(autest)):
    autest_tuples.add((autest['sttl'].iloc[i], autest['swin'].iloc[i], autest['dttl'].iloc[i], autest['dwin'].iloc[i], autest['stcpb'].iloc[i], autest['dtcpb'].iloc[i]))

In [78]:
xautestLookUp = {}
xautest = []
count = 0
edge_index_autest = []
edge_attr_autest = []

for i in range(len(autest)):
    src = str(autest['sttl'].iloc[i])+':'+str(autest['swin'].iloc[i]) + str(autest['stcpb'].iloc[i])
    dst = str(autest['dttl'].iloc[i])+':'+str(autest['dwin'].iloc[i]) + str(autest['dtcpb'].iloc[i])
    if src not in xautestLookUp:
        xautestLookUp[src] = count
        xautest.append([0,int(autest['sttl'].iloc[i]), int(autest['swin'].iloc[i])])
        count += 1
    if dst not in xautestLookUp:
        xautestLookUp[dst] = count
        xautest.append([0, int(autest['dttl'].iloc[i]), int(autest['dwin'].iloc[i])])
        count += 1
    edge_index_autest.append([xautestLookUp[src], xautestLookUp[dst]])
    edge_attr_autest.append(list(autest.iloc[i].drop(['sttl', 'dttl','swin','dwin','Label','stcpb','dtcpb']).values))



# Convert x to tensor
x_autest_tensor = torch.tensor(xautest, dtype=torch.float)

# Convert edge_index to tensor
edge_index_autest_tensor = torch.tensor(edge_index_autest, dtype=torch.long).t()  # Transpose for correct format

# Convert edge_attr to tensor
edge_attr_autest_tensor = torch.tensor(edge_attr_autest, dtype=torch.float)

print(x_autest_tensor.shape, edge_index_autest_tensor.shape, edge_attr_autest_tensor.shape)

torch.Size([75920, 3]) torch.Size([2, 38025]) torch.Size([38025, 20])


# GAT Model

nn.Linear appplies linear transformation to the incoming data: y = x * transpose(A) + b 


In [79]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class GATClassifier(torch.nn.Module):
    def __init__(self, num_node_features, num_edge_features, hidden_channels, num_heads):
        super(GATClassifier, self).__init__()
        self.conv1 = GATConv(num_node_features, hidden_channels, heads=num_heads, add_self_loops=False)
        self.conv2 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads, add_self_loops=False)
        # self.edge_classifier = nn.Linear(hidden_channels * num_heads + num_edge_features, 1)  # Output size changed to 1
        self.edge_classifier = nn.Linear(hidden_channels * 2 * num_heads + num_edge_features, 1)  # Output size changed to 1

    def forward(self, x, edge_index, edge_attr):
        # Apply graph attentional layers
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)

        # Concatenate node features with edge features
        num_edges = edge_index.size(1)
        node_x = x[edge_index[0]]
        node_x_other = x[edge_index[1]]
        edge_attr_padded = torch.cat([edge_attr, torch.zeros(num_edges - edge_attr.size(0), edge_attr.size(1))], dim=0)
        edge_x = torch.cat([node_x, node_x_other, edge_attr_padded], dim=1)
        print("edge_x.shape:", edge_x.shape)
        print("self.edge_classifier.weight.shape:",self.edge_classifier.weight.shape)
        # Classify edges into binary class using BCELogit
        edge_scores = self.edge_classifier(edge_x)
        return edge_scores


Training

In [86]:
# Example usage
num_nodes = len(nodes)
num_edges = len(train_tuples)
num_node_features = 3
num_edge_features = 20
hidden_channels = 16
num_heads = 4

# Instantiate GAT classifier
model = GATClassifier(num_node_features, num_edge_features, hidden_channels, num_heads)

# Forward pass
edge_scores_train = model(x_tensor, edge_index_tensor, edge_attr_tensor)

edge_x.shape: torch.Size([79856, 148])
self.edge_classifier.weight.shape: torch.Size([1, 148])


Testing

In [87]:
# Example usage
num_nodes_test = len(testnodes)
num_edges_test = len(test_tuples)
num_node_features = 3
num_edge_features = 20
hidden_channels = 4
num_heads = 16

# Forward pass
edge_scores_test = model(x_test_tensor, edge_index_test_tensor, edge_attr_test_tensor)

edge_x.shape: torch.Size([34430, 148])
self.edge_classifier.weight.shape: torch.Size([1, 148])


Author's testing dataset

In [88]:
# Example usage
num_nodes_test = len(autestnodes)
num_edges_test = len(autest_tuples)
num_node_features = 3
num_edge_features = 20
hidden_channels = 16
num_heads = 4

# Forward pass
edge_scores_autest = model(x_autest_tensor, edge_index_autest_tensor, edge_attr_autest_tensor)

edge_x.shape: torch.Size([38025, 148])
self.edge_classifier.weight.shape: torch.Size([1, 148])


# KNN

In [89]:
import pandas as pd
# Convert edge_scores tensor to a NumPy array
edge_scores_train_array = edge_scores_train.detach().numpy()
edge_scores_test_array = edge_scores_test.detach().numpy()
# Create a DataFrame from the NumPy array
edge_scores_train_df = pd.DataFrame(edge_scores_train_array, columns=['Edge_Scores'])
edge_scores_test_df = pd.DataFrame(edge_scores_test_array, columns=['Edge_Scores'])
trainLabel = train['Label']
testLabel = test['Label']

In [90]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train = edge_scores_train_df
X_test = edge_scores_test_df
y_train = trainLabel
y_test = testLabel

# Initialize the KNN classifier
k = 10  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Calculate accuracy
report = classification_report(y_test, y_pred)
print("Classification Report: \n", report)

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.93      0.90     17201
           1       0.92      0.86      0.89     17229

    accuracy                           0.89     34430
   macro avg       0.90      0.89      0.89     34430
weighted avg       0.90      0.89      0.89     34430



Author's test dataset

In [91]:
import pandas as pd
# Convert edge_scores tensor to a NumPy array
edge_scores_autest_array = edge_scores_autest.detach().numpy()
# Create a DataFrame from the NumPy array
testLabeldf = pd.DataFrame(auEdgeLabel, columns=['Label'])
edge_scores_autest_df = pd.DataFrame(edge_scores_autest_array, columns=['Edge_Scores'])
trainLabel = train['Label']
autestLabel = autest['Label']
auX_test = edge_scores_autest_df
auy_test = autestLabel 

auy_pred = knn_classifier.predict(edge_scores_autest_df)

# Calculate accuracy
report = classification_report(autestLabel, auy_pred)
print("Classification Report: \n", report)

Classification Report: 
               precision    recall  f1-score   support

           0       0.73      0.64      0.68     24354
           1       0.48      0.58      0.52     13671

    accuracy                           0.62     38025
   macro avg       0.60      0.61      0.60     38025
weighted avg       0.64      0.62      0.63     38025



# Lazy Predict Scores

In [92]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier


classifiers = [
    ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
    ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
    ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
    # ('ExtraTreeClassifier', sklearn.ensemble._forest.ExtraTreeClassifier),
    # ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
    # ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
    # ('NuSVC', sklearn.svm._classes.NuSVC),
    ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
    # ('SVC', sklearn.svm._classes.SVC),
    ('XGBClassifier', xgboost.sklearn.XGBClassifier),
    ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers= classifiers, predictions=True)

models,predictions = clf.fit(X_train, X_test, y_train, y_test)




'tuple' object has no attribute '__name__'
Invalid Classifier(s)


 17%|█▋        | 1/6 [00:01<00:09,  1.94s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.886494336334592, 'Balanced Accuracy': 0.8865193459587084, 'ROC AUC': 0.8865193459587084, 'F1 Score': 0.8863890964369112, 'Time taken': 1.9350829124450684}


 33%|███▎      | 2/6 [00:04<00:08,  2.06s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.8571304095265757, 'Balanced Accuracy': 0.8571403063981533, 'ROC AUC': 0.8571403063981532, 'F1 Score': 0.857110269484069, 'Time taken': 2.138676166534424}


 50%|█████     | 3/6 [00:04<00:03,  1.24s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.8525994772001162, 'Balanced Accuracy': 0.8526028518893403, 'ROC AUC': 0.8526028518893403, 'F1 Score': 0.852597302023816, 'Time taken': 0.268186092376709}


 67%|██████▋   | 4/6 [00:24<00:17,  8.69s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.8527446993900668, 'Balanced Accuracy': 0.8527477671122392, 'ROC AUC': 0.8527477671122391, 'F1 Score': 0.8527429349306606, 'Time taken': 20.119638681411743}


 83%|████████▎ | 5/6 [00:24<00:05,  5.68s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.8928550682544293, 'Balanced Accuracy': 0.8928748620126774, 'ROC AUC': 0.8928748620126774, 'F1 Score': 0.8927932343956324, 'Time taken': 0.343428373336792}
[LightGBM] [Info] Number of positive: 39866, number of negative: 39990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 79856, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499224 -> initscore=-0.003106
[LightGBM] [Info] Start training from score -0.003106


100%|██████████| 6/6 [00:25<00:00,  4.24s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.8938135347081034, 'Balanced Accuracy': 0.8938354785419262, 'ROC AUC': 0.8938354785419262, 'F1 Score': 0.893738011755283, 'Time taken': 0.5980427265167236}





In [93]:
from sklearn.metrics import roc_auc_score
for m in predictions.columns:
    print(m)
    print(classification_report(y_test, predictions[m]))
    auc_scores = roc_auc_score(y_test, predictions[m], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.86      0.92      0.89     17201
           1       0.91      0.86      0.88     17229

    accuracy                           0.89     34430
   macro avg       0.89      0.89      0.89     34430
weighted avg       0.89      0.89      0.89     34430

AUC Scores for each class: 0.8865193459587084
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     17201
           1       0.87      0.84      0.86     17229

    accuracy                           0.86     34430
   macro avg       0.86      0.86      0.86     34430
weighted avg       0.86      0.86      0.86     34430

AUC Scores for each class: 0.8571403063981532
-------------------------------------------------

Lazy Predict for author's testing dataset

In [94]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier

classifiers = [
    ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
    ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
    ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
    # ('ExtraTreeClassifier', sklearn.ensemble._forest.ExtraTreeClassifier),
    # ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
    # ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
    # ('NuSVC', sklearn.svm._classes.NuSVC),
    ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
    # ('SVC', sklearn.svm._classes.SVC),
    ('XGBClassifier', xgboost.sklearn.XGBClassifier),
    ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers= classifiers, predictions=True)

aumodels,aupredictions = clf.fit(X_train, auX_test, y_train, auy_test)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


 17%|█▋        | 1/6 [00:02<00:11,  2.38s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.6285075608152532, 'Balanced Accuracy': 0.6168067796144676, 'ROC AUC': 0.6168067796144676, 'F1 Score': 0.6340335316967551, 'Time taken': 2.3837015628814697}


 33%|███▎      | 2/6 [00:04<00:08,  2.22s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.591163708086785, 'Balanced Accuracy': 0.5980655341394917, 'ROC AUC': 0.5980655341394916, 'F1 Score': 0.5993969804128663, 'Time taken': 2.0966784954071045}


 50%|█████     | 3/6 [00:04<00:03,  1.33s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.5834319526627219, 'Balanced Accuracy': 0.5940028857866331, 'ROC AUC': 0.5940028857866331, 'F1 Score': 0.5917383654112943, 'Time taken': 0.2701249122619629}


 67%|██████▋   | 4/6 [00:24<00:17,  8.77s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.5835634451019066, 'Balanced Accuracy': 0.5941697113721391, 'ROC AUC': 0.5941697113721393, 'F1 Score': 0.5918663115286102, 'Time taken': 20.169023990631104}


 83%|████████▎ | 5/6 [00:25<00:05,  5.71s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.6235108481262327, 'Balanced Accuracy': 0.6095529414906733, 'ROC AUC': 0.6095529414906733, 'F1 Score': 0.6287588517430759, 'Time taken': 0.28249549865722656}
[LightGBM] [Info] Number of positive: 39866, number of negative: 39990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 79856, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499224 -> initscore=-0.003106
[LightGBM] [Info] Start training from score -0.003106


100%|██████████| 6/6 [00:25<00:00,  4.27s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.6260092044707429, 'Balanced Accuracy': 0.6112466476072186, 'ROC AUC': 0.6112466476072185, 'F1 Score': 0.6310145015398337, 'Time taken': 0.3919703960418701}





In [95]:
from sklearn.metrics import roc_auc_score
for model in aupredictions.columns:
    print(model)
    print(classification_report(auy_test, aupredictions[model]))
    auc_scores = roc_auc_score(auy_test, aupredictions[model], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.73      0.66      0.69     24354
           1       0.49      0.58      0.53     13671

    accuracy                           0.63     38025
   macro avg       0.61      0.62      0.61     38025
weighted avg       0.64      0.63      0.63     38025

AUC Scores for each class: 0.6168067796144676
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       0.73      0.57      0.64     24354
           1       0.45      0.62      0.52     13671

    accuracy                           0.59     38025
   macro avg       0.59      0.60      0.58     38025
weighted avg       0.63      0.59      0.60     38025

AUC Scores for each class: 0.5980655341394916
-------------------------------------------------