In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df= pd.read_csv('C://Users//User//Desktop//MSc Westminster//Dissertation//DataSets//Heart_Attack_Prediction.csv')

In [2]:
columns=df.columns

for column in columns:
    if df[column].dtype=="int32":
        df[column]=df[column].astype("int16")
    elif df[column].dtype=="float64":
        df[column]=df[column].astype("float16")
    elif df[column].dtype=="object":
        df[column]=df[column].astype("category")

In [3]:
df['Sex']= df['Sex'].map({'Female': 0, 'Male': 1})
df['Sex']= pd.to_numeric(df['Sex'])

df['Diet']= df['Diet'].map({'Healthy': 0, 'Average': 1, 'Unhealthy':2})
df['Diet']= pd.to_numeric(df['Diet'])

df[['HBP', 'LBP']]= df['Blood Pressure'].str.split('/', expand= True)
df['HBP']= pd.to_numeric(df['HBP'])
df['LBP']= pd.to_numeric(df['LBP'])

df['Diabetes'] = df['Diabetes'].map({0: 1, 1: 0})

df['Exercise Hours Per Week']= round(df['Exercise Hours Per Week'], 0)

df['Sedentary Hours Per Day']= round(df['Sedentary Hours Per Day'], 0)

df['Income']= round(df['Income'], 0)

df['BMI']= round(df['BMI'], 0)

df = df.drop(columns=['Patient ID', 'Blood Pressure', 'Country', 'Continent', 'Hemisphere'])

In [4]:
X= df.drop(['Heart Attack Risk'], axis= 1)
y= df['Heart Attack Risk']

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 15, stratify= y)

from sklearn.preprocessing import RobustScaler
scaler= RobustScaler()
scaler.fit(X_train)

X_train= scaler.transform(X_train)
X_test= scaler.transform(X_test)

In [6]:
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans

sm= ClusterCentroids(random_state= 15, estimator= KMeans(n_init= 10))
tl= SMOTE(random_state= 15)

X_sm, y_sm= sm.fit_resample(X_train, y_train)
X_sm_tl, y_sm_tl= tl.fit_resample(X_sm, y_sm)

In [7]:
from sklearn.feature_selection import SelectKBest, f_classif

selector= SelectKBest(score_func= f_classif, k= 10)
X_sm_tl_selected= selector.fit_transform(X_sm_tl, y_sm_tl)
X_sm_tl_test_selected= selector.transform(X_test)

In [8]:
feature_names= X.columns
selected_indices= selector.get_support(indices=True)
selected_features= feature_names[selected_indices]
print("Selected Features: ", selected_features)

Selected Features:  Index(['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Smoking',
       'Stress Level', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'HBP'],
      dtype='object')


--- DecisionTreeClassifier ---

In [11]:
from sklearn.metrics import confusion_matrix, classification_report, precision_score, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
dt= DecisionTreeClassifier(random_state=15)
dt.fit(X_sm_tl_selected, y_sm_tl)
y_pred_dt= dt.predict(X_sm_tl_test_selected)
y_pred_prob_dt= dt.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_dt, labels= dt.classes_)
print(cm)
print(classification_report(y_test, y_pred_dt, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_dt, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_dt))
print('AUC:', roc_auc_score(y_test, y_pred_prob_dt))

[[485 640]
 [280 348]]
              precision    recall  f1-score   support

           0       0.63      0.43      0.51      1125
           1       0.35      0.55      0.43       628

    accuracy                           0.48      1753
   macro avg       0.49      0.49      0.47      1753
weighted avg       0.53      0.48      0.48      1753

Precision: 0.3522267206477733
Accuracy: 0.47518539646320596
AUC: 0.4926256192498231


--- Tuned - DecisionTreeClassifier ---

In [12]:
from sklearn.model_selection import GridSearchCV
param_grid= {
             'max_depth': [None, 10, 20, 30, 40, 50],
             'min_samples_split': [2, 10, 20],
             'min_samples_leaf': [1, 5, 10],
             'max_features': [None, 'sqrt', 'log2'],
             'criterion': ['gini', 'entropy']
            }

gs_dt= GridSearchCV(estimator= dt, param_grid= param_grid, cv= 5, scoring= 'precision')
gs_dt.fit(X_sm_tl_selected, y_sm_tl)

tuned_dt= gs_dt.best_estimator_
y_pred_tuned_dt= tuned_dt.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_dt= tuned_dt.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_dt, labels= tuned_dt.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_dt, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_dt, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_dt))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_dt))

[[419 706]
 [221 407]]
              precision    recall  f1-score   support

           0       0.65      0.37      0.47      1125
           1       0.37      0.65      0.47       628

    accuracy                           0.47      1753
   macro avg       0.51      0.51      0.47      1753
weighted avg       0.55      0.47      0.47      1753

Precision: 0.3656783468104223
Accuracy: 0.47119224187107817
AUC: 0.5109193205944799


--- Tuned - DecisionTreeClassifier 1 ---

In [13]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2', 0.5, 0.75],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

gs_dt= GridSearchCV(estimator= dt, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1)
gs_dt.fit(X_sm_tl_selected, y_sm_tl)

tuned_dt1= gs_dt.best_estimator_
y_pred_tuned_dt1= tuned_dt1.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_dt1= tuned_dt1.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_dt1, labels= tuned_dt1.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_dt1, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_dt1, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_dt1))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_dt1))

[[483 642]
 [249 379]]
              precision    recall  f1-score   support

           0       0.66      0.43      0.52      1125
           1       0.37      0.60      0.46       628

    accuracy                           0.49      1753
   macro avg       0.52      0.52      0.49      1753
weighted avg       0.56      0.49      0.50      1753

Precision: 0.3712047012732615
Accuracy: 0.4917284654877353
AUC: 0.5164182590233546


--- Tuned - RandomizedSearchCV 2 ---

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
param_dist = {
    'max_depth': [None] + list(range(5, 51, 5)),
    'min_samples_split': randint(2, 21),
    'min_samples_leaf': randint(1, 21),
    'max_features': [None, 'sqrt', 'log2', 0.5, 0.75],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

rs_dt = RandomizedSearchCV(estimator= dt, param_distributions= param_dist, n_iter= 100, cv= 5, scoring= 'precision', random_state= 15, n_jobs= -1)
gs_dt.fit(X_sm_tl_selected, y_sm_tl)

tuned_dt2= gs_dt.best_estimator_
y_pred_tuned_dt2= tuned_dt2.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_dt2= tuned_dt2.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_dt2, labels= tuned_dt2.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_dt2, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_dt2, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_dt2))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_dt2))

[[483 642]
 [249 379]]
              precision    recall  f1-score   support

           0       0.66      0.43      0.52      1125
           1       0.37      0.60      0.46       628

    accuracy                           0.49      1753
   macro avg       0.52      0.52      0.49      1753
weighted avg       0.56      0.49      0.50      1753

Precision: 0.3712047012732615
Accuracy: 0.4917284654877353
AUC: 0.5164182590233546


--- Tuned - DecisionTreeClassifier 3 ---

In [15]:
from sklearn.model_selection import GridSearchCV

best_params = tuned_dt2.get_params()
param_grid = {
    'max_depth': [best_params['max_depth'] - 5, best_params['max_depth'], best_params['max_depth'] + 5] if best_params['max_depth'] is not None else [None, 5, 10],
    'min_samples_split': [max(2, best_params['min_samples_split'] - 5), best_params['min_samples_split'], best_params['min_samples_split'] + 5],
    'min_samples_leaf': [max(1, best_params['min_samples_leaf'] - 2), best_params['min_samples_leaf'], best_params['min_samples_leaf'] + 2],
    'max_features': [best_params['max_features']],
    'criterion': [best_params['criterion']],
    'class_weight': [best_params['class_weight']]
}

gs_dt= GridSearchCV(estimator= dt, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1)
gs_dt.fit(X_sm_tl_selected, y_sm_tl)

tuned_dt3= gs_dt.best_estimator_
y_pred_tuned_dt3= tuned_dt3.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_dt3= tuned_dt3.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_dt3, labels= tuned_dt3.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_dt3, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_dt3, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_dt3))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_dt3))

[[483 642]
 [249 379]]
              precision    recall  f1-score   support

           0       0.66      0.43      0.52      1125
           1       0.37      0.60      0.46       628

    accuracy                           0.49      1753
   macro avg       0.52      0.52      0.49      1753
weighted avg       0.56      0.49      0.50      1753

Precision: 0.3712047012732615
Accuracy: 0.4917284654877353
AUC: 0.5164182590233546


--- GaussianNB ---

In [16]:
from sklearn.naive_bayes import GaussianNB
nb= GaussianNB()
nb.fit(X_sm_tl_selected, y_sm_tl)
y_pred_nb= nb.predict(X_sm_tl_test_selected)
y_pred_prob_nb= nb.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_nb, labels= nb.classes_)
print(cm)
print(classification_report(y_test, y_pred_nb, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_nb, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_nb))
print('AUC:', roc_auc_score(y_test, y_pred_prob_nb))

[[346 779]
 [179 449]]
              precision    recall  f1-score   support

           0       0.66      0.31      0.42      1125
           1       0.37      0.71      0.48       628

    accuracy                           0.45      1753
   macro avg       0.51      0.51      0.45      1753
weighted avg       0.55      0.45      0.44      1753

Precision: 0.36563517915309446
Accuracy: 0.4535082715345123
AUC: 0.5039207360226469


--- Tuned GaussianNB ---

In [40]:
from sklearn.model_selection import GridSearchCV

param_grid= {'var_smoothing': np.logspace(-9, 0, 100)}

gs_nb= GridSearchCV(estimator= nb, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1)
gs_nb.fit(X_sm_tl_selected, y_sm_tl)

tuned_nb= gs_nb.best_estimator_
y_pred_tuned_nb= tuned_nb.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_nb= tuned_nb.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_nb, labels= tuned_nb1.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_nb, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_nb, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_nb))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_nb))

[[517 608]
 [288 340]]
              precision    recall  f1-score   support

           0       0.64      0.46      0.54      1125
           1       0.36      0.54      0.43       628

    accuracy                           0.49      1753
   macro avg       0.50      0.50      0.48      1753
weighted avg       0.54      0.49      0.50      1753

Precision: 0.35864978902953587
Accuracy: 0.48887621220764405
AUC: 0.4991309271054494


--- Tuned GaussianNB 1 ---

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'var_smoothing': np.logspace(-9, 0, 100),
    'priors': [[0.3, 0.7], [0.2, 0.8], [0.1, 0.9]]
}

gs_nb= GridSearchCV(estimator= nb, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1)
gs_nb.fit(X_sm_tl_selected, y_sm_tl)

tuned_nb1= gs_nb.best_estimator_
y_pred_tuned_nb1= tuned_nb1.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_nb2= tuned_nb1.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_nb1, labels= tuned_nb2.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_nb1, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_nb1, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_nb1))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_nb1))

[[ 123 1002]
 [  67  561]]
              precision    recall  f1-score   support

           0       0.65      0.11      0.19      1125
           1       0.36      0.89      0.51       628

    accuracy                           0.39      1753
   macro avg       0.50      0.50      0.35      1753
weighted avg       0.54      0.39      0.30      1753

Precision: 0.35892514395393477
Accuracy: 0.390188248716486
AUC: 0.5038145789101203


--- Tuned GaussianNB RandomizedSearchCV 2 ---

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

param_dist = {
    'var_smoothing': loguniform(1e-9, 1e0),
    'priors': [[0.3, 0.7], [0.2, 0.8], [0.1, 0.9]]
}

rs_nb = RandomizedSearchCV(estimator=nb, param_distributions=param_dist, n_iter=100, cv=5, scoring='precision', random_state=15, n_jobs=-1)
rs_nb.fit(X_sm_tl_selected, y_sm_tl)

tuned_nb2= rs_nb.best_estimator_
y_pred_tuned_nb2= tuned_nb2.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_nb2= tuned_nb2.predict_proba(X_sm_tl_test_selected)[:, 1]

print("Best parameters:", rs_nb.best_params_)
cm = confusion_matrix(y_test, y_pred_tuned_nb2, labels=tuned_nb2.classes_)
print("Confusion Matrix:\n", cm)
print(classification_report(y_test, y_pred_tuned_nb2, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_nb2, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_nb2))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_nb2))


Best parameters: {'priors': [0.3, 0.7], 'var_smoothing': 0.003956967935062682}
Confusion Matrix:
 [[ 123 1002]
 [  67  561]]
              precision    recall  f1-score   support

           0       0.65      0.11      0.19      1125
           1       0.36      0.89      0.51       628

    accuracy                           0.39      1753
   macro avg       0.50      0.50      0.35      1753
weighted avg       0.54      0.39      0.30      1753

Precision: 0.35892514395393477
Accuracy: 0.390188248716486
AUC: 0.5038273177636234


--- SVM ---

In [25]:
from sklearn.svm import SVC
svc= SVC(kernel= 'rbf',probability= True, gamma= 1, random_state=15)
svc.fit(X_sm_tl_selected, y_sm_tl)
y_pred_svc= svc.predict(X_sm_tl_test_selected)
y_pred_prob_svc= svc.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_svc, labels= svc.classes_)
print(cm)
print(classification_report(y_test, y_pred_svc, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_svc, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_svc))
print('AUC:', roc_auc_score(y_test, y_pred_prob_svc))

[[510 615]
 [268 360]]
              precision    recall  f1-score   support

           0       0.66      0.45      0.54      1125
           1       0.37      0.57      0.45       628

    accuracy                           0.50      1753
   macro avg       0.51      0.51      0.49      1753
weighted avg       0.55      0.50      0.50      1753

Precision: 0.36923076923076925
Accuracy: 0.49629207073588133
AUC: 0.5207954706298655


--- Tuned SVM ---

In [26]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

gs_svc= GridSearchCV(estimator= svc, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1)
gs_svc.fit(X_sm_tl_selected, y_sm_tl)

tuned_svc= gs_svc.best_estimator_
y_pred_tuned_svc= tuned_svc.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_svc= tuned_svc.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_svc, labels= tuned_svc.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_svc, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_svc, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_svc))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_svc))

[[510 615]
 [268 360]]
              precision    recall  f1-score   support

           0       0.64      0.35      0.45      1125
           1       0.36      0.65      0.46       628

    accuracy                           0.45      1753
   macro avg       0.50      0.50      0.45      1753
weighted avg       0.54      0.45      0.45      1753

Precision: 0.356140350877193
Accuracy: 0.45464917284654877
AUC: 0.5


--- Tuned SVM 1 ---

In [28]:
param_grid = {
    'C': np.logspace(-2, 2, 10),
    'gamma': np.logspace(-3, 1, 10),
    'kernel': ['rbf', 'sigmoid']
}

gs_svc= GridSearchCV(estimator= svc, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1)
gs_svc.fit(X_sm_tl_selected, y_sm_tl)

tuned_svc1= gs_svc.best_estimator_
y_pred_tuned_svc1= tuned_svc1.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_svc1= tuned_svc1.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_svc1, labels= tuned_svc1.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_svc1, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_svc1, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_svc1))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_svc1))

[[426 699]
 [244 384]]
              precision    recall  f1-score   support

           0       0.64      0.38      0.47      1125
           1       0.35      0.61      0.45       628

    accuracy                           0.46      1753
   macro avg       0.50      0.50      0.46      1753
weighted avg       0.54      0.46      0.47      1753

Precision: 0.3545706371191136
Accuracy: 0.4620650313747861
AUC: 0.5


--- Tuned SVM - RandomizedSearchCV 2 ---

In [30]:
from scipy.stats import expon
param_dist= {
    'C': expon(scale=100),
    'gamma': expon(scale=.1),
    'kernel': ['rbf', 'sigmoid'],
    'class_weight': [None, 'balanced']
}

rs_svc= RandomizedSearchCV(estimator= svc, param_distributions= param_dist, n_iter= 100, cv= 5, scoring= 'precision', random_state= 15,  n_jobs= -1)
rs_svc.fit(X_sm_tl_selected, y_sm_tl)

tuned_svc2= rs_svc.best_estimator_
y_pred_tuned_svc2= tuned_svc2.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_svc2= tuned_svc2.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_svc2, labels= tuned_svc1.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_svc2, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_svc2, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_svc2))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_svc2))

[[457 668]
 [241 387]]
              precision    recall  f1-score   support

           0       0.65      0.41      0.50      1125
           1       0.37      0.62      0.46       628

    accuracy                           0.48      1753
   macro avg       0.51      0.51      0.48      1753
weighted avg       0.55      0.48      0.49      1753

Precision: 0.36682464454976305
Accuracy: 0.4814603536794067
AUC: 0.5102788393489031


--- LGBM ---

In [31]:
from lightgbm import LGBMClassifier
lgm= LGBMClassifier(random_state=15)
lgm.fit(X_sm_tl_selected, y_sm_tl)
y_pred_lgm= lgm.predict(X_sm_tl_test_selected)
y_pred_prob_lgm= lgm.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_lgm, labels= lgm.classes_)
print(cm)
print(classification_report(y_test, y_pred_lgm, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_lgm, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_lgm))
print('AUC:', roc_auc_score(y_test, y_pred_prob_lgm))

[LightGBM] [Info] Number of positive: 2511, number of negative: 2511
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 5022, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[258 867]
 [133 495]]
              precision    recall  f1-score   support

           0       0.66      0.23      0.34      1125
           1       0.36      0.79      0.50       628

    accuracy                           0.43      1753
   macro avg       0.51      0.51      0.42      1753
weighted avg       0.55      0.43      0.40      1753

Precision: 0.3634361233480176
Accuracy: 0.4295493439817456
AUC: 0.5105024769992923


In [32]:
params_LGB= {'learning_rate': [0.001, 0.01, 0.1, 1.0],
             'num_leaves': [31, 127],
             'reg_alpha': [0.1, 0.5],
             'min_data_in_leaf': [30, 50, 100, 300, 400]}

gs_lgm= GridSearchCV(estimator= lgm, param_grid= params_LGB, cv=5, scoring='precision', n_jobs=-1, verbose=2)
gs_lgm.fit(X_sm_tl_selected, y_sm_tl)

tuned_lgm= gs_lgm.best_estimator_
y_pred_tuned_lgm= tuned_lgm.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_lgm= tuned_lgm.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_lgm, labels= tuned_lgm.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_lgm, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_lgm, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_lgm))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_lgm))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[LightGBM] [Info] Number of positive: 2511, number of negative: 2511
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 5022, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[317 808]
 [167 461]]


              precision    recall  f1-score   support

           0       0.65      0.28      0.39      1125
           1       0.36      0.73      0.49       628

    accuracy                           0.44      1753
   macro avg       0.51      0.51      0.44      1753
weighted avg       0.55      0.44      0.43      1753

Precision: 0.3632781717888101
Accuracy: 0.44381061038220193
AUC: 0.5027133757961784


--- Tuned LGBM 1 ---

In [33]:
params_LGB= {
    'learning_rate': np.logspace(-3, 0, 10),
    'num_leaves': [20, 31, 50, 70, 100],
    'reg_alpha': [0.01, 0.1, 0.5, 1.0],
    'min_data_in_leaf': [20, 50, 100, 200],
    'max_depth': [-1, 10, 20, 30],
    'boosting_type': ['gbdt', 'dart']
}

gs_lgm= GridSearchCV(estimator= lgm, param_grid= params_LGB, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_lgm.fit(X_sm_tl_selected, y_sm_tl)


tuned_lgm1= gs_lgm.best_estimator_
y_pred_tuned_lgm1= tuned_lgm1.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_lgm1= tuned_lgm1.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_lgm1, labels= tuned_lgm1.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_lgm1, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_lgm1, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_lgm1))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_lgm1))

Fitting 5 folds for each of 6400 candidates, totalling 32000 fits
[LightGBM] [Info] Number of positive: 2511, number of negative: 2511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 5022, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[292 833]
 [140 488]]
              precision    recall  f1-score   support

           0       0.68      0.26      0.38      1125
           1       0.37      0.78      0.50       628

    accuracy                           0.44      1753
   macro avg       0.52      0.52      0.44      1753
weighted avg       0.57      0.44      0.42      1753

Precision: 0.36941710825132473
Accuracy: 0.44495151169423847
AUC: 0.5131012031139419


--- Tuned LGBM RandomizedSearchCV 2 ---

In [35]:
from scipy.stats import randint, uniform
param_dist = {
    'learning_rate': uniform(0.001, 0.1),
    'num_leaves': randint(20, 100),
    'reg_alpha': uniform(0.01, 1.0),
    'min_data_in_leaf': randint(20, 200),
    'max_depth': randint(5, 30),
    'boosting_type': ['gbdt', 'dart']
}

rs_lgm= RandomizedSearchCV(estimator=lgm, param_distributions= param_dist, n_iter= 100, cv= 5, scoring= 'precision', random_state= 15, n_jobs= -1, verbose= 2)
rs_lgm.fit(X_sm_tl_selected, y_sm_tl)


tuned_lgm2= rs_lgm.best_estimator_
y_pred_tuned_lgm2= tuned_lgm2.predict(X_sm_tl_test_selected)
y_pred_prob_tuned_lgm2= tuned_lgm2.predict_proba(X_sm_tl_test_selected)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_lgm2, labels= tuned_lgm2.classes_)
print(cm)
print(classification_report(y_test, y_pred_tuned_lgm2, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_lgm2, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_lgm2))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_lgm2))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Number of positive: 2511, number of negative: 2511
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 5022, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[261 864]
 [121 507]]
              precision    recall  f1-score   support

           0       0.68      0.23      0.35      1125
           1       0.37      0.81      0.51       628

    accuracy                           0.44      1753
   macro avg       0.53      0.52      0.43      1753
weighted avg       0.57      0.44      0.40      1753

Precision: 0.36980306345733044
Accuracy: 0.4381061038220194
AUC: 0.5177070063694268


--- Visualisation ---

In [41]:
precision_scores= {
                    'Decision Tree Precision:': precision_score(y_test, y_pred_dt, zero_division= 0),
                    'Tuned Decision Tree Precision:': precision_score(y_test, y_pred_tuned_dt, zero_division= 0),
                    'Tuned Decision Tree Precision1:': precision_score(y_test, y_pred_tuned_dt1, zero_division= 0),
                    'Tuned Decision Tree - RS - Precision2:': precision_score(y_test, y_pred_tuned_dt2, zero_division= 0),
                    'Tuned Decision Tree Precision3:': precision_score(y_test, y_pred_tuned_dt3, zero_division= 0),
                    'GaussianNB Precision:': precision_score(y_test, y_pred_nb, zero_division= 0),
                    'Tuned GaussianNB Precision:': precision_score(y_test, y_pred_tuned_nb, zero_division= 0),
                    'Tuned GaussianNB Precision1:': precision_score(y_test, y_pred_tuned_nb1, zero_division= 0),
                    'Tuned GaussianNB - RS - Precision2:': precision_score(y_test, y_pred_tuned_nb2, zero_division= 0),
                    'SVM Precision:': precision_score(y_test, y_pred_svc, zero_division= 0),
                    'Tuned SVM Precision:': precision_score(y_test, y_pred_tuned_svc, zero_division= 0),
                    'Tuned SVM Precision1:': precision_score(y_test, y_pred_tuned_svc1, zero_division= 0),
                    'Tuned SVM - RS - Precision2:': precision_score(y_test, y_pred_tuned_svc2, zero_division= 0),
                    'LGBM Precision:': precision_score(y_test, y_pred_lgm, zero_division= 0),
                    'Tuned LGBM Precision:': precision_score(y_test, y_pred_tuned_lgm, zero_division= 0),
                    'Tuned LGBM Precision1:': precision_score(y_test, y_pred_tuned_lgm1, zero_division= 0),
                    'Tuned LGBM Precision2:': precision_score(y_test, y_pred_tuned_lgm2, zero_division= 0)
                  }

annova_precision= pd.DataFrame(list(precision_scores.items()), columns= ['Model', 'Precision Score'])
annova_precision= annova_precision.sort_values(by= 'Precision Score', ascending=False)
print(annova_precision)

                                     Model  Precision Score
2          Tuned Decision Tree Precision1:         0.371205
3   Tuned Decision Tree - RS - Precision2:         0.371205
4          Tuned Decision Tree Precision3:         0.371205
16                  Tuned LGBM Precision2:         0.369803
15                  Tuned LGBM Precision1:         0.369417
9                           SVM Precision:         0.369231
12            Tuned SVM - RS - Precision2:         0.366825
1           Tuned Decision Tree Precision:         0.365678
5                    GaussianNB Precision:         0.365635
13                         LGBM Precision:         0.363436
14                   Tuned LGBM Precision:         0.363278
8      Tuned GaussianNB - RS - Precision2:         0.358925
7             Tuned GaussianNB Precision1:         0.358650
6              Tuned GaussianNB Precision:         0.358650
10                    Tuned SVM Precision:         0.356140
11                   Tuned SVM Precision

In [42]:
accuracy_scores= {
                    'Decision Tree Accuracy:': accuracy_score(y_test, y_pred_dt),
                    'Tuned Decision Tree Accuracy:': accuracy_score(y_test, y_pred_tuned_dt),
                    'Tuned Decision Tree Accuracy1:': accuracy_score(y_test, y_pred_tuned_dt1),
                    'Tuned Decision Tree - RS - Accuracy2:': accuracy_score(y_test, y_pred_tuned_dt2),
                    'Tuned Decision Tree Accuracy3:': accuracy_score(y_test, y_pred_tuned_dt3),
                    'GaussianNB Accuracy:': accuracy_score(y_test, y_pred_nb),
                    'Tuned GaussianNB Accuracy:': accuracy_score(y_test, y_pred_tuned_nb),
                    'Tuned GaussianNB Accuracy1:': accuracy_score(y_test, y_pred_tuned_nb1),
                    'Tuned GaussianNB - RS - Accuracy2:': accuracy_score(y_test, y_pred_tuned_nb2),
                    'SVM Accuracy:': accuracy_score(y_test, y_pred_svc),
                    'Tuned SVM Accuracy:': accuracy_score(y_test, y_pred_tuned_svc),
                    'Tuned SVM Accuracy1:': accuracy_score(y_test, y_pred_tuned_svc1),
                    'Tuned SVM - RS - Accuracy2:': accuracy_score(y_test, y_pred_tuned_svc2),
                    'LGBM Accuracy:': accuracy_score(y_test, y_pred_lgm),
                    'Tuned LGBM Accuracy:': accuracy_score(y_test, y_pred_tuned_lgm),
                    'Tuned LGBM Accuracy1:': accuracy_score(y_test, y_pred_tuned_lgm1),
                    'Tuned LGBM Accuracy2:': accuracy_score(y_test, y_pred_tuned_lgm2)
                  }

annova_accuracy= pd.DataFrame(list(accuracy_scores.items()), columns= ['Model', 'Accuracy Score'])
annova_accuracy= annova_accuracy.sort_values(by= 'Accuracy Score', ascending=False)
print(annova_accuracy)

                                    Model  Accuracy Score
9                           SVM Accuracy:        0.496292
2          Tuned Decision Tree Accuracy1:        0.491728
3   Tuned Decision Tree - RS - Accuracy2:        0.491728
4          Tuned Decision Tree Accuracy3:        0.491728
6              Tuned GaussianNB Accuracy:        0.488876
7             Tuned GaussianNB Accuracy1:        0.488876
12            Tuned SVM - RS - Accuracy2:        0.481460
0                 Decision Tree Accuracy:        0.475185
1           Tuned Decision Tree Accuracy:        0.471192
11                   Tuned SVM Accuracy1:        0.462065
10                    Tuned SVM Accuracy:        0.454649
5                    GaussianNB Accuracy:        0.453508
15                  Tuned LGBM Accuracy1:        0.444952
14                   Tuned LGBM Accuracy:        0.443811
16                  Tuned LGBM Accuracy2:        0.438106
13                         LGBM Accuracy:        0.429549
8      Tuned G

In [43]:
auc_scores= {
                    'Decision Tree AUC:': roc_auc_score(y_test, y_pred_prob_dt),
                    'Tuned Decision Tree AUC:': roc_auc_score(y_test, y_pred_prob_tuned_dt),
                    'Tuned Decision Tree AUC1:': roc_auc_score(y_test, y_pred_prob_tuned_dt1),
                    'Tuned Decision Tree - RS - AUC2:': roc_auc_score(y_test, y_pred_prob_tuned_dt2),
                    'Tuned Decision Tree AUC3:': roc_auc_score(y_test, y_pred_prob_tuned_dt3),
                    'GaussianNB AUC:': roc_auc_score(y_test, y_pred_prob_nb),
                    'Tuned GaussianNB AUC:': roc_auc_score(y_test, y_pred_prob_tuned_nb),
                    'Tuned GaussianNB AUC1:': roc_auc_score(y_test, y_pred_prob_tuned_nb1),
                    'Tuned GaussianNB - RS - AUC2:': roc_auc_score(y_test, y_pred_prob_tuned_nb2),
                    'SVM AUC:': roc_auc_score(y_test, y_pred_prob_svc),
                    'Tuned SVM AUC:': roc_auc_score(y_test, y_pred_prob_tuned_svc),
                    'Tuned SVM AUC1:': roc_auc_score(y_test, y_pred_prob_tuned_svc1),
                    'Tuned SVM - RS - AUC2:': roc_auc_score(y_test, y_pred_prob_tuned_svc2),
                    'LGBM AUC:': roc_auc_score(y_test, y_pred_prob_lgm),
                    'Tuned LGBM AUC:': roc_auc_score(y_test, y_pred_prob_tuned_lgm),
                    'Tuned LGBM AUC1:': roc_auc_score(y_test, y_pred_prob_tuned_lgm1),
                    'Tuned LGBM AUC2:': roc_auc_score(y_test, y_pred_prob_tuned_lgm2)                    
                  }

annova_auc= pd.DataFrame(list(auc_scores.items()), columns= ['Model', 'AUC Score'])
annova_auc= annova_auc.sort_values(by= 'AUC Score', ascending=False)
print(annova_auc)

                               Model  AUC Score
9                           SVM AUC:   0.520795
16                  Tuned LGBM AUC2:   0.517707
2          Tuned Decision Tree AUC1:   0.516418
3   Tuned Decision Tree - RS - AUC2:   0.516418
4          Tuned Decision Tree AUC3:   0.516418
15                  Tuned LGBM AUC1:   0.513101
1           Tuned Decision Tree AUC:   0.510919
13                         LGBM AUC:   0.510502
12            Tuned SVM - RS - AUC2:   0.510279
5                    GaussianNB AUC:   0.503921
8      Tuned GaussianNB - RS - AUC2:   0.503815
14                   Tuned LGBM AUC:   0.502713
11                   Tuned SVM AUC1:   0.500000
10                    Tuned SVM AUC:   0.500000
7             Tuned GaussianNB AUC1:   0.499131
6              Tuned GaussianNB AUC:   0.499131
0                 Decision Tree AUC:   0.492626
