## Lib

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering


## Dataset loading

In [2]:
todrop = ['srcip', 'sport', 'dstip', 'dsport','sloss', 'dloss','stcpb', 'dtcpb', 'trans_depth', 'Stime', 'Ltime','ct_flw_http_mthd', 
        'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat']

In [3]:
keep = ['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'smeansz',
       'dmeansz',  'res_bdy_len', 'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'ct_state_ttl','Label']


In [3]:
train = pd.read_csv(r'D:\Project Phase II\Dataset\finaltrain.csv',encoding='cp1252')
test = pd.read_csv(r'D:\Project Phase II\Dataset\finaltest.csv',encoding='cp1252')

In [93]:
print("Training set:")
print('Attack: ', len(train[train['Label'] == 0]), ' Non attack: ', len(train[train['Label'] == 0]), 'Total: ', len(train))
print("Testing set:")
print('Attack: ', len(test[test['Label'] == 0]), ' Non attack: ', len(test[test['Label'] == 0]), 'Total: ', len(test))

Training set:
Attack:  40502  Non attack:  40502 Total:  81004
Testing set:
Attack:  17358  Non attack:  17358 Total:  34716


## Data preprocessing

Extract edge features alone from train and test

In [35]:
todrop = ['srcip', 'sport', 'dstip', 'dsport','sloss', 'dloss','stcpb', 'dtcpb', 'trans_depth', 'Stime', 'Ltime','ct_flw_http_mthd', 
        'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']

trainAttributes = train.drop(todrop, axis = 1)
testAttributes = test.drop(todrop, axis = 1)

trainLabel = train['Label']
testLabel = test['Label']

Use LDA to reduce dimension to 1

In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components = 1)
lda_x_train = lda.fit_transform(trainAttributes, trainLabel)
lda_x_test = lda.transform(testAttributes)

Compute nodes and edges

Consider only 7000 train records since memory becomes high

In [7]:
train = pd.concat([train.head(3500), train.tail(3500)])
lda_x_train = np.concatenate((lda_x_train[:3500], lda_x_train[len(lda_x_train)-3500:]))

In [8]:
nodes = set()
edges = set()
for i in range(7000):
    src = str(train['srcip'].iloc[i])+':'+str(train['sport'].iloc[i])
    dst = str(train['dstip'].iloc[i])+':'+str(train['dsport'].iloc[i])
    nodes.add(src)
    nodes.add(dst)
    edges.add((src,dst))

In [9]:
len(nodes)

8963

Train Data

In [13]:
import numpy as np
# Prepare an adjacency matrix of the nodes
adjacency_matrix = np.zeros((len(nodes), len(nodes)), dtype=float)
label_matrix = np.zeros((len(nodes), len(nodes)))


# Add the columns srcip:sport and dstip:dsport to the adjacency matrix
for i in range(7000):
    src = str(train['srcip'].iloc[i])+':'+str(train['sport'].iloc[i])
    dst = str(train['dstip'].iloc[i])+':'+str(train['dsport'].iloc[i])
    src_index = list(nodes).index(src)
    dst_index = list(nodes).index(dst)
    adjacency_matrix[src_index, dst_index] = lda_x_train[i]
    label_matrix[src_index, dst_index] = train['Label'].iloc[i]
# Flatten tuple values in the adjacency matrix
adjacency_matrix_flat = adjacency_matrix.reshape(-1, 1)
label_flat = label_matrix.reshape(-1, 1)

  adjacency_matrix[src_index, dst_index] = lda_x_train[i]


Test Dataset

Consider only 5000 records for test

In [9]:
test = pd.concat([test.head(2500), test.tail(2500)])
lda_x_test = np.concatenate((lda_x_test[:2500], lda_x_test[len(lda_x_test)-2500:]))

In [10]:
tenodes = set()
teedges = set()
for i in range(len(test)):
    src = str(test['srcip'].iloc[i])+':'+str(test['sport'].iloc[i])
    dst = str(test['dstip'].iloc[i])+':'+str(test['dsport'].iloc[i])
    tenodes.add(src)
    tenodes.add(dst)
    teedges.add((src,dst))

In [11]:
import numpy as np
# Prepare an adjacency matrix of the nodes
adjacency_matrix_test = np.zeros((len(tenodes), len(tenodes)), dtype=float)
label_matrix_test = np.zeros((len(tenodes), len(tenodes)))


# Add the columns srcip:sport and dstip:dsport to the adjacency matrix
for i in range(len(test)):
    src = str(test['srcip'].iloc[i])+':'+str(test['sport'].iloc[i])
    dst = str(test['dstip'].iloc[i])+':'+str(test['dsport'].iloc[i])
    src_index = list(tenodes).index(src)
    dst_index = list(tenodes).index(dst)
    adjacency_matrix_test[src_index, dst_index] = lda_x_test[i]
    label_matrix_test[src_index, dst_index] = test['Label'].iloc[i]
# Flatten tuple values in the adjacency matrix
adjacency_matrix_test_flat = adjacency_matrix_test.reshape(-1, 1)
label_test_flat = label_matrix_test.reshape(-1, 1)

  adjacency_matrix_test[src_index, dst_index] = lda_x_test[i]


## LSTM Model

In [16]:
# Create an instance of the model
# Hyperparameters
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Modify the call method of the LSTMClassifier
class LSTMClassifier(tf.keras.Model):
    def __init__(self, input_shape, hidden_units, output_units):
        super(LSTMClassifier, self).__init__()
        self.lstm = LSTM(hidden_units, return_sequences=False)  # Set return_sequences=False since we only need the output at the last timestep
        self.dense = Dense(output_units, activation='sigmoid')

    def call(self, inputs):
        x = tf.expand_dims(inputs, axis=-1)  # Add a new dimension to represent the input features
        x = self.lstm(x)
        output = self.dense(x)
        return output
    

# Hyperparameters
input_shape = adjacency_matrix_flat.shape[1:]  # Assuming each tuple has 2 attributes
hidden_units = 16
output_units = 1  # Binary classification

# Convert adjacency_matrix_flat to a TensorFlow tensor
adjacency_tensor = tf.convert_to_tensor(adjacency_matrix_flat, dtype=tf.float32)

# Initialize model, optimizer, and loss function
lstmmodel = LSTMClassifier(input_shape, hidden_units, output_units)
lstmmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstmhistory = lstmmodel.fit(adjacency_tensor, label_flat, epochs=2, batch_size=32, validation_split=0.2)





Epoch 1/2


Epoch 2/2


In [17]:
lstmmodel.save_weights("lstmfinalldaweights.h5")

Load model

In [14]:
# Create an instance of the model
# Hyperparameters
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Modify the call method of the LSTMClassifier
class LSTMClassifier(tf.keras.Model):
    def __init__(self, input_shape, hidden_units, output_units):
        super(LSTMClassifier, self).__init__()
        self.lstm = LSTM(hidden_units, return_sequences=False)  # Set return_sequences=False since we only need the output at the last timestep
        self.dense = Dense(output_units, activation='sigmoid')

    def call(self, inputs):
        x = tf.expand_dims(inputs, axis=-1)  # Add a new dimension to represent the input features
        x = self.lstm(x)
        output = self.dense(x)
        return output
    
input_shape = adjacency_matrix_flat.shape[1:]  # Assuming each tuple has 2 attributes
hidden_units = 16
output_units = 1  # Binary classification
# Initialize model, optimizer, and loss function
lstmmodel = LSTMClassifier(input_shape, hidden_units, output_units)

dummy_input = np.zeros((4, 4), dtype=float)
dummy_label = np.zeros((4, 4))
#Use dummy data to initialize the model
dummy_flat = dummy_input.reshape(-1, 1)
dummy_label_flat = dummy_label.reshape(-1, 1) 

lstmmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstmhistory = lstmmodel.fit(dummy_flat, dummy_label_flat, epochs=2, batch_size=32, validation_split=0.2)




Epoch 1/2


Epoch 2/2


In [15]:
# Load the saved weights
lstmmodel.load_weights("lstmfinalldaweights.h5")

In [16]:
lstmpredictions = lstmmodel.predict(adjacency_matrix_test_flat)
lstmpredictions = (lstmpredictions > 0.5).astype(int)



In [17]:
from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(label_test_flat, lstmpredictions)

print("Classification Report for LSTM:")
print(report)

Classification Report for LSTM:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00  43214979
         1.0       0.98      0.99      0.99      2497

    accuracy                           1.00  43217476
   macro avg       0.99      1.00      0.99  43217476
weighted avg       1.00      1.00      1.00  43217476



## Test lstm with author's test set

In [37]:
autest = pd.read_csv(r'D:\Project Phase II\Dataset\UNSW_NB15_testing-set.csv',encoding='cp1252')

In [38]:
autestdrop = ['ï»¿id', 'rate', 'sloss', 'dloss','stcpb', 'dtcpb','trans_depth','ct_srv_src','ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']


autest = autest.drop(autestdrop, axis = 1)

In [7]:
#Change column names of autest
autest.rename(columns={'dinpkt':'Dintpkt','djit':'Djit','dload':'Dload','dpkts':'Dpkts','label':'Label','sinpkt':'Sintpkt','sjit':'Sjit', 'sload':'Sload', 'spkts':'Spkts','dmean':'dmeansz','response_body_len':'res_bdy_len', 'smean':'smeansz'}, inplace=True)

In [8]:
autest = autest[(autest['proto']=='tcp') | (autest['proto']=='udp') | (autest['proto']=='ospf')]
autest = autest[(autest['service']=='ssh') | (autest['service']=='ftp-data ')  | (autest['service']=='ftp') | (autest['service']=='-') | (autest['service']=='dns') | (autest['service']=='smtp') | (autest['service']=='http') | (autest['service']=='radius') | (autest['service']=='pop3') ]
autest = autest[(autest['state'] == 'CON') | (autest['state'] == 'RST') | (autest['state'] == 'FIN') | (autest['state'] == 'ACC') | (autest['state'] == 'REQ') | (autest['state'] == 'INT')]

In [9]:
autest['proto'].replace("tcp", 0, inplace = True)
autest['proto'].replace("udp", 1, inplace = True)
autest['proto'].replace("ospf", 2, inplace = True)

autest['service'].replace("ssh", 0, inplace = True)
autest['service'].replace("ftp-data", 1, inplace = True)
autest['service'].replace("ftp", 2, inplace = True)
autest['service'].replace("-", 3, inplace = True)
autest['service'].replace("dns", 4, inplace = True)
autest['service'].replace("smtp", 5, inplace = True)
autest['service'].replace("http", 6, inplace = True)
autest['service'].replace("radius", 7, inplace = True)
autest['service'].replace("pop3", 8, inplace = True)

autest['state'].replace("CON", 0, inplace = True)
autest['state'].replace("RST", 1, inplace = True)
autest['state'].replace("FIN", 2, inplace = True)
autest['state'].replace("ACC", 3, inplace = True)
autest['state'].replace("REQ", 4, inplace = True)
autest['state'].replace("INT", 5, inplace = True)



In [10]:
for column in autest.columns:
    if column != 'Label':
        col_mean = sum(autest[column]) / len(autest[column])
        col_std = (sum((x - col_mean) ** 2 for x in autest[column]) / len(autest[column])) ** 0.5
        autest[column] = [(x - col_mean) / col_std for x in autest[column]]


In [11]:
autest = autest.drop_duplicates()


In [12]:
autestAttributes = autest.drop(['Label'], axis = 1)
autestAttributes = autestAttributes[trainAttributes.columns]
autestLabel = autest['Label']
lda_x_autest = lda.transform(autestAttributes)

NameError: name 'lda' is not defined

In [55]:
import numpy as np
# Prepare an adjacency matrix of the nodes
autestlen = int(len(autest) ** 0.5)
adjacency_matrix_autest = np.zeros((autestlen, autestlen), dtype=float)
label_matrix_autest = np.zeros((autestlen, autestlen))
counter = 0


# Add the columns srcip:sport and dstip:dsport to the adjacency matrix
for i in range(autestlen):
    for j in range(autestlen):
        adjacency_matrix_autest[i, j] = lda_x_autest[counter]
        label_matrix_autest[i, j] = autest['Label'].iloc[counter]
        counter += 1
# Flatten tuple values in the adjacency matrix
adjacency_matrix_autest_flat = adjacency_matrix_autest.reshape(-1, 1)
label_autest_flat = label_matrix_autest.reshape(-1, 1)

  adjacency_matrix_autest[i, j] = lda_x_autest[counter]


In [63]:
lstmpredictions = lstmmodel.predict(adjacency_matrix_autest_flat)
lstmpredictions = (lstmpredictions >= 0.5).astype(int)



In [66]:
autest.Label.value_counts()

Label
0    29673
1    14973
Name: count, dtype: int64

In [67]:
unique_values, counts = np.unique(lstmpredictions, return_counts=True)

# Display the unique values and their counts
for value, count in zip(unique_values, counts):
    print(f"Value: {value}, Count: {count}")

Value: 0, Count: 44178
Value: 1, Count: 343


In [68]:
from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(label_autest_flat, lstmpredictions)

print("Classification Report for LSTM using Author's test set:")
print(report)

Classification Report for LSTM using Author's test set:
              precision    recall  f1-score   support

         0.0       0.66      0.99      0.79     29548
         1.0       0.05      0.00      0.00     14973

    accuracy                           0.66     44521
   macro avg       0.36      0.50      0.40     44521
weighted avg       0.46      0.66      0.53     44521



Performance is down bad

## GRU

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense

# Modify the call method of the GRUClassifier
class GRUClassifier(tf.keras.Model):
    def __init__(self, input_shape, hidden_units, output_units):
        super(GRUClassifier, self).__init__()
        self.gru = GRU(hidden_units, return_sequences=False)  # Set return_sequences=False since we only need the output at the last timestep
        self.dense = Dense(output_units, activation='sigmoid')

    def call(self, inputs):
        x = tf.expand_dims(inputs, axis=-1)  # Add a new dimension to represent the input features
        x = self.gru(x)
        output = self.dense(x)
        return output

# Hyperparameters
input_shape = adjacency_matrix_flat.shape[1:]  # Assuming each tuple has 2 attributes
hidden_units = 16
output_units = 1  # Binary classification

grumodel = GRUClassifier(input_shape, hidden_units, output_units)


grumodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
gruhistory = grumodel.fit(adjacency_tensor, label_flat, epochs=2, batch_size=32, validation_split=0.2)

# Convert adjacency_matrix_flat to a TensorFlow tensor


In [None]:
grumodel.save_weights("grufinalldaweights.h5")

In [None]:
grupredictions = grumodel.predict(adjacency_matrix_test_flat)
grupredictions = (grupredictions > 0.5).astype(int)

In [None]:
from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(label_test_flat, grupredictions)

print("Classification Report for GRU:")
print(report)

## Try out baseline models with the same datasets

In [46]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier

#Skip SVM
classifiers = [
    ('AdaBoostClassifier', sklearn.ensemble.AdaBoostClassifier),
    ('BaggingClassifier', sklearn.ensemble.BaggingClassifier),
    ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
    ('DecisionTreeClassifier', sklearn.tree.DecisionTreeClassifier),
    ('DummyClassifier', sklearn.dummy.DummyClassifier),
    ('GaussianNB', sklearn.naive_bayes.GaussianNB),
    ('KNeighborsClassifier',  sklearn.neighbors.KNeighborsClassifier),
    ('LinearDiscriminantAnalysis',  sklearn.discriminant_analysis.LinearDiscriminantAnalysis),
    ('LogisticRegression', sklearn.linear_model.LogisticRegression),
    ('Perceptron', sklearn.linear_model.Perceptron),
    ('QuadraticDiscriminantAnalysis',  sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis),
    ('RandomForestClassifier', sklearn.ensemble.RandomForestClassifier),
    ('StackingClassifier', sklearn.ensemble.StackingClassifier),
    ('XGBClassifier', xgboost.XGBClassifier),
    ('LGBMClassifier', lightgbm.LGBMClassifier)
]

clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers=classifiers,predictions=True)

In [42]:
len(testAttributes), len(testLabel)

(34716, 34716)

In [47]:
# Import necessary modules
# Execute the code block
models, predictions = clf.fit(trainAttributes,  testAttributes, trainLabel, testLabel)


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  0%|          | 0/15 [00:00<?, ?it/s]

  7%|▋         | 1/15 [00:10<02:20, 10.04s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.9923090217767024, 'Balanced Accuracy': 0.9923090217767023, 'ROC AUC': 0.9923090217767024, 'F1 Score': 0.9923086002624436, 'Time taken': 10.04209017753601}


 20%|██        | 3/15 [00:20<01:06,  5.55s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.9927699043668625, 'Balanced Accuracy': 0.9927699043668625, 'ROC AUC': 0.9927699043668625, 'F1 Score': 0.9927697527009295, 'Time taken': 10.1019868850708}
{'Model': 'BernoulliNB', 'Accuracy': 0.9578292430003457, 'Balanced Accuracy': 0.9578292430003457, 'ROC AUC': 0.9578292430003457, 'F1 Score': 0.9577980384976114, 'Time taken': 0.1698317527770996}


 33%|███▎      | 5/15 [00:21<00:25,  2.54s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.988391519760341, 'Balanced Accuracy': 0.988391519760341, 'ROC AUC': 0.988391519760341, 'F1 Score': 0.9883915162831916, 'Time taken': 1.3259849548339844}
{'Model': 'DummyClassifier', 'Accuracy': 0.5, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.3333333333333333, 'Time taken': 0.14907026290893555}


 40%|████      | 6/15 [00:22<00:15,  1.75s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.9806717363751585, 'Balanced Accuracy': 0.9806717363751585, 'ROC AUC': 0.9806717363751584, 'F1 Score': 0.9806682680699299, 'Time taken': 0.20907211303710938}


 47%|████▋     | 7/15 [00:30<00:32,  4.08s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.9925106579098975, 'Balanced Accuracy': 0.9925106579098975, 'ROC AUC': 0.9925106579098975, 'F1 Score': 0.9925103992913666, 'Time taken': 8.876134872436523}


 53%|█████▎    | 8/15 [00:31<00:20,  2.89s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.9890828436455813, 'Balanced Accuracy': 0.9890828436455813, 'ROC AUC': 0.9890828436455813, 'F1 Score': 0.9890825062198392, 'Time taken': 0.3450310230255127}


 60%|██████    | 9/15 [00:31<00:13,  2.20s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.9915888927295772, 'Balanced Accuracy': 0.9915888927295772, 'ROC AUC': 0.9915888927295771, 'F1 Score': 0.9915884424476723, 'Time taken': 0.6943261623382568}


 67%|██████▋   | 10/15 [00:32<00:08,  1.61s/it]

{'Model': 'Perceptron', 'Accuracy': 0.9891980642931213, 'Balanced Accuracy': 0.9891980642931213, 'ROC AUC': 0.9891980642931214, 'F1 Score': 0.9891977373124287, 'Time taken': 0.2919578552246094}


 73%|███████▎  | 11/15 [00:32<00:04,  1.19s/it]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.9901198294734417, 'Balanced Accuracy': 0.9901198294734417, 'ROC AUC': 0.9901198294734417, 'F1 Score': 0.9901191447252966, 'Time taken': 0.24010252952575684}


 80%|████████  | 12/15 [00:52<00:20,  6.78s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.9934324230902177, 'Balanced Accuracy': 0.9934324230902178, 'ROC AUC': 0.9934324230902178, 'F1 Score': 0.9934322304822528, 'Time taken': 19.548397541046143}


 93%|█████████▎| 14/15 [00:52<00:03,  3.83s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.9934612282521028, 'Balanced Accuracy': 0.9934612282521028, 'ROC AUC': 0.9934612282521027, 'F1 Score': 0.9934610219423573, 'Time taken': 0.872687578201294}
[LightGBM] [Info] Number of positive: 40502, number of negative: 40502
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4392
[LightGBM] [Info] Number of data points in the train set: 81004, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


100%|██████████| 15/15 [00:54<00:00,  3.66s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.9934612282521028, 'Balanced Accuracy': 0.9934612282521027, 'ROC AUC': 0.9934612282521028, 'F1 Score': 0.993461017688523, 'Time taken': 2.0775115489959717}





In [49]:
from sklearn.metrics import roc_auc_score
for model in predictions.columns:
    print(model)
    print(classification_report(testLabel, predictions[model]))
    auc_scores = roc_auc_score(testLabel, predictions[model], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     17358
           1       0.99      1.00      0.99     17358

    accuracy                           0.99     34716
   macro avg       0.99      0.99      0.99     34716
weighted avg       0.99      0.99      0.99     34716

AUC Scores for each class: 0.9923090217767024
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     17358
           1       0.99      1.00      0.99     17358

    accuracy                           0.99     34716
   macro avg       0.99      0.99      0.99     34716
weighted avg       0.99      0.99      0.99     34716

AUC Scores for each class: 0.9927699043668625
-------------------------------------------------

In [50]:
# List of classifiers to include
import sklearn
import xgboost
import lightgbm
import sklearn.discriminant_analysis
from lazypredict.Supervised import LazyClassifier

#Skip SVM
classifiers = [
 ('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('KNeighborsClassifier',  sklearn.neighbors._classification.KNeighborsClassifier),
 ('LinearDiscriminantAnalysis',  sklearn.discriminant_analysis.LinearDiscriminantAnalysis),
 ('LogisticRegression', sklearn.linear_model._logistic.LogisticRegression),
 ('Perceptron', sklearn.linear_model._perceptron.Perceptron),
 ('QuadraticDiscriminantAnalysis',  sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis),
 ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
 ('StackingClassifier', sklearn.ensemble._stacking.StackingClassifier),
 ('XGBClassifier', xgboost.sklearn.XGBClassifier),
 ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None,classifiers=classifiers,predictions=True)

In [51]:
models, predictions = clf.fit(trainAttributes,  autestAttributes, trainLabel, autestLabel)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  7%|▋         | 1/15 [00:10<02:32, 10.92s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 10.917290210723877}


 13%|█▎        | 2/15 [00:24<02:43, 12.57s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 13.720137596130371}


 20%|██        | 3/15 [00:25<01:23,  7.00s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.6435514939748241, 'Balanced Accuracy': 0.48480555871056225, 'ROC AUC': 0.48480555871056225, 'F1 Score': 0.5220059637540395, 'Time taken': 0.3721601963043213}


 27%|██▋       | 4/15 [00:26<00:54,  4.94s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 1.7805242538452148}


 33%|███▎      | 5/15 [00:27<00:32,  3.24s/it]

{'Model': 'DummyClassifier', 'Accuracy': 0.6646284101599247, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.5307261619417766, 'Time taken': 0.2160952091217041}


 40%|████      | 6/15 [00:27<00:20,  2.26s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.6612910451104242, 'Balanced Accuracy': 0.505711223821657, 'ROC AUC': 0.505711223821657, 'F1 Score': 0.5479663117789038, 'Time taken': 0.3502204418182373}


 47%|████▋     | 7/15 [00:44<00:56,  7.10s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.6628589347309949, 'Balanced Accuracy': 0.5122837999174641, 'ROC AUC': 0.5122837999174641, 'F1 Score': 0.5598802062969052, 'Time taken': 17.066572666168213}


 53%|█████▎    | 8/15 [00:44<00:34,  4.98s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.6646956054293778, 'Balanced Accuracy': 0.5001498096432034, 'ROC AUC': 0.5001498096432034, 'F1 Score': 0.5310053960773934, 'Time taken': 0.4419279098510742}


 60%|██████    | 9/15 [00:45<00:22,  3.75s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.667383416207499, 'Balanced Accuracy': 0.5041570226265736, 'ROC AUC': 0.5041570226265735, 'F1 Score': 0.5371914151920912, 'Time taken': 1.0497419834136963}


 67%|██████▋   | 10/15 [00:46<00:13,  2.76s/it]

{'Model': 'Perceptron', 'Accuracy': 0.6665546745509117, 'Balanced Accuracy': 0.502904922183828, 'ROC AUC': 0.502904922183828, 'F1 Score': 0.5352524020153651, 'Time taken': 0.5434849262237549}


 73%|███████▎  | 11/15 [00:46<00:08,  2.03s/it]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.6646284101599247, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.5307261619417766, 'Time taken': 0.38330578804016113}


 80%|████████  | 12/15 [01:12<00:27,  9.21s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.6782018545894369, 'Balanced Accuracy': 0.5605188891786296, 'ROC AUC': 0.5605188891786296, 'F1 Score': 0.6256727936623417, 'Time taken': 25.623571395874023}


 93%|█████████▎| 14/15 [01:14<00:05,  5.37s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.6462169063297943, 'Balanced Accuracy': 0.6391065843500608, 'ROC AUC': 0.6391065843500608, 'F1 Score': 0.6546524492172303, 'Time taken': 1.8930308818817139}
[LightGBM] [Info] Number of positive: 40502, number of negative: 40502
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4392
[LightGBM] [Info] Number of data points in the train set: 81004, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


100%|██████████| 15/15 [01:16<00:00,  5.09s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.6479863817587241, 'Balanced Accuracy': 0.6508268315380992, 'ROC AUC': 0.6508268315380992, 'F1 Score': 0.6573292986953801, 'Time taken': 1.9653925895690918}





In [52]:
from sklearn.metrics import roc_auc_score
for model in predictions.columns:
    print(model)
    print(classification_report(autestLabel, predictions[model]))
    auc_scores = roc_auc_score(autestLabel, predictions[model], multi_class='ovr')  # Or multi_class='ovo' for one-vs-one
    print("AUC Scores for each class:", auc_scores)
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')

AdaBoostClassifier
              precision    recall  f1-score   support

           0       0.79      0.64      0.71     29673
           1       0.48      0.66      0.56     14973

    accuracy                           0.65     44646
   macro avg       0.64      0.65      0.63     44646
weighted avg       0.69      0.65      0.66     44646

AUC Scores for each class: 0.6508268315380992
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
BaggingClassifier
              precision    recall  f1-score   support

           0       0.79      0.64      0.71     29673
           1       0.48      0.66      0.56     14973

    accuracy                           0.65     44646
   macro avg       0.64      0.65      0.63     44646
weighted avg       0.69      0.65      0.66     44646

AUC Scores for each class: 0.6508268315380992
-------------------------------------------------