In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df= pd.read_csv('C://Users//User//Desktop//MSc Westminster//Dissertation//DataSets//Heart_Attack_Prediction.csv')

In [2]:
columns=df.columns

for column in columns:
    if df[column].dtype=="int32":
        df[column]=df[column].astype("int16")
    elif df[column].dtype=="float64":
        df[column]=df[column].astype("float16")
    elif df[column].dtype=="object":
        df[column]=df[column].astype("category")

In [3]:
df['Sex']= df['Sex'].map({'Female': 0, 'Male': 1})
df['Sex']= pd.to_numeric(df['Sex'])

df['Diet']= df['Diet'].map({'Healthy': 0, 'Average': 1, 'Unhealthy':2})
df['Diet']= pd.to_numeric(df['Diet'])

df[['HBP', 'LBP']]= df['Blood Pressure'].str.split('/', expand= True)
df['HBP']= pd.to_numeric(df['HBP'])
df['LBP']= pd.to_numeric(df['LBP'])

df['Diabetes'] = df['Diabetes'].map({0: 1, 1: 0})

df['Exercise Hours Per Week']= round(df['Exercise Hours Per Week'], 0)

df['Sedentary Hours Per Day']= round(df['Sedentary Hours Per Day'], 0)

df['Income']= round(df['Income'], 0)

df['BMI']= round(df['BMI'], 0)

df = df.drop(columns=['Patient ID', 'Blood Pressure', 'Country', 'Continent', 'Hemisphere'])

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Sex                              8763 non-null   int64  
 2   Cholesterol                      8763 non-null   int64  
 3   Heart Rate                       8763 non-null   int64  
 4   Diabetes                         8763 non-null   int64  
 5   Family History                   8763 non-null   int64  
 6   Smoking                          8763 non-null   int64  
 7   Obesity                          8763 non-null   int64  
 8   Alcohol Consumption              8763 non-null   int64  
 9   Exercise Hours Per Week          8763 non-null   float16
 10  Diet                             8763 non-null   int64  
 11  Previous Heart Problems          8763 non-null   int64  
 12  Medication Use      

In [5]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,8763.0,53.70798,21.249509,18.0,35.0,54.0,72.0,90.0
Sex,8763.0,0.6973639,0.459425,0.0,0.0,1.0,1.0,1.0
Cholesterol,8763.0,259.8772,80.863276,120.0,192.0,259.0,330.0,400.0
Heart Rate,8763.0,75.02168,20.550948,40.0,57.0,75.0,93.0,110.0
Diabetes,8763.0,0.347712,0.476271,0.0,0.0,0.0,1.0,1.0
Family History,8763.0,0.4929819,0.499979,0.0,0.0,0.0,1.0,1.0
Smoking,8763.0,0.896839,0.304186,0.0,1.0,1.0,1.0,1.0
Obesity,8763.0,0.5014265,0.500026,0.0,0.0,1.0,1.0,1.0
Alcohol Consumption,8763.0,0.5980828,0.490313,0.0,0.0,1.0,1.0,1.0
Exercise Hours Per Week,8763.0,inf,5.796875,0.0,5.0,10.0,15.0,20.0


In [6]:
X= df.drop(['Heart Attack Risk'], axis= 1)
y= df['Heart Attack Risk']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 15, stratify= y)


from sklearn.preprocessing import RobustScaler
scaler= RobustScaler()
scaler.fit(X_train)

X_train= scaler.transform(X_train)
X_test= scaler.transform(X_test)

--- LogisticRegression ---

In [8]:
from sklearn.metrics import confusion_matrix, classification_report, precision_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
lr_cs= LogisticRegression(random_state=15, class_weight= 'balanced', max_iter= 1000)
lr_cs.fit(X_train, y_train)
y_pred_lr_cs= lr_cs.predict(X_test)
y_pred_prob_lr_cs= lr_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_lr_cs, labels= lr_cs.classes_)
print(classification_report(y_test, y_pred_lr_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_lr_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_lr_cs))
print('AUC:', roc_auc_score(y_test, y_pred_lr_cs))

              precision    recall  f1-score   support

           0       0.63      0.49      0.55      1125
           1       0.35      0.49      0.41       628

    accuracy                           0.49      1753
   macro avg       0.49      0.49      0.48      1753
weighted avg       0.53      0.49      0.50      1753

Precision: 0.34650112866817157
Accuracy: 0.48659440958357103
AUC: 0.48709341825902336


--- DecisionTreeClassifier ---

In [9]:
from sklearn.tree import DecisionTreeClassifier
dt_cs= DecisionTreeClassifier(class_weight= 'balanced', random_state= 15)
dt_cs.fit(X_train, y_train)
y_pred_dt_cs= dt_cs.predict(X_test)
y_pred_prob_dt_cs= dt_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_dt_cs, labels= dt_cs.classes_)
print(classification_report(y_test, y_pred_dt_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_dt_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_dt_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_dt_cs))

              precision    recall  f1-score   support

           0       0.64      0.63      0.64      1125
           1       0.35      0.35      0.35       628

    accuracy                           0.53      1753
   macro avg       0.49      0.49      0.49      1753
weighted avg       0.53      0.53      0.53      1753

Precision: 0.3501577287066246
Accuracy: 0.5333713633770679
AUC: 0.4936404812455768


--- Tuned - DecisionTreeClassifier ---

In [10]:
from sklearn.model_selection import GridSearchCV
param_grid= {
             'max_depth': [None, 10, 20, 30, 40, 50],
             'min_samples_split': [2, 10, 20],
             'min_samples_leaf': [1, 5, 10],
             'max_features': [None, 'sqrt', 'log2'],
             'criterion': ['gini', 'entropy']
            }

gs_dt_cs= GridSearchCV(estimator= dt_cs, param_grid= param_grid, cv= 5, scoring= 'precision')
gs_dt_cs.fit(X_train, y_train)

print("Best Parameters:", gs_dt_cs.best_params_)
print("Best Precision Score:", gs_dt_cs.best_score_)

Best Parameters: {'criterion': 'gini', 'max_depth': 30, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Precision Score: 0.37954070946562746


In [11]:
tuned_dt_cs= gs_dt_cs.best_estimator_
y_pred_tuned_dt_cs= tuned_dt_cs.predict(X_test)
y_pred_prob_tuned_dt_cs= tuned_dt_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_dt_cs, labels= tuned_dt_cs.classes_)
print(classification_report(y_test, y_pred_tuned_dt_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_tuned_dt_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_dt_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_dt_cs))

              precision    recall  f1-score   support

           0       0.64      0.64      0.64      1125
           1       0.36      0.36      0.36       628

    accuracy                           0.54      1753
   macro avg       0.50      0.50      0.50      1753
weighted avg       0.54      0.54      0.54      1753

Precision: 0.3563579277864992
Accuracy: 0.5373645179691957
AUC: 0.4985102618542108


--- SVM ---

In [12]:
from sklearn.svm import SVC
svc_cs= SVC(kernel= 'rbf',probability= True, class_weight= 'balanced', gamma= 1, random_state= 15)
svc_cs.fit(X_train, y_train)
y_pred_svc_cs= svc_cs.predict(X_test)
y_pred_prob_svc_cs= svc_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_svc_cs, labels= svc_cs.classes_)
print(classification_report(y_test, y_pred_svc_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_svc_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_svc_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_svc_cs))

              precision    recall  f1-score   support

           0       0.64      0.99      0.78      1125
           1       0.30      0.01      0.02       628

    accuracy                           0.64      1753
   macro avg       0.47      0.50      0.40      1753
weighted avg       0.52      0.64      0.51      1753

Precision: 0.30434782608695654
Accuracy: 0.6366229321163719
AUC: 0.489381457891012


--- Random Forest ---

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_cs= RandomForestClassifier(class_weight= 'balanced', random_state= 15)
rf_cs.fit(X_train, y_train)
y_pred_rf_cs= rf_cs.predict(X_test)
y_pred_prob_rf_cs= rf_cs.predict_proba(X_test)[:,1]

In [14]:
cm= confusion_matrix(y_test, y_pred_rf_cs, labels= rf_cs.classes_)
print(classification_report(y_test, y_pred_rf_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_rf_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_rf_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_rf_cs))

              precision    recall  f1-score   support

           0       0.64      0.98      0.78      1125
           1       0.36      0.02      0.03       628

    accuracy                           0.64      1753
   macro avg       0.50      0.50      0.40      1753
weighted avg       0.54      0.64      0.51      1753

Precision: 0.35714285714285715
Accuracy: 0.6371933827723902
AUC: 0.5208662420382166


In [15]:
param_grid= {
             'n_estimators': [100, 200, 300],
             'max_depth': [None, 10, 20, 30],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4],
            }

gs_tuned_rf_cs= GridSearchCV(estimator= rf_cs, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_tuned_rf_cs.fit(X_train, y_train)
print("Best Parameters:", gs_tuned_rf_cs.best_params_)
print("Best Precision Score:", gs_tuned_rf_cs.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Precision Score: 0.569047619047619


In [16]:
tuned_rf_cs= gs_tuned_rf_cs.best_estimator_
y_pred_tuned_rf_cs= tuned_rf_cs.predict(X_test)
y_pred_prob_tuned_rf_cs= tuned_rf_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_rf_cs, labels= tuned_rf_cs.classes_)
print(classification_report(y_test, y_pred_tuned_rf_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_tuned_rf_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_rf_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_rf_cs))

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1125
           1       0.00      0.00      0.00       628

    accuracy                           0.64      1753
   macro avg       0.32      0.50      0.39      1753
weighted avg       0.41      0.64      0.50      1753

Precision: 0.0
Accuracy: 0.6406160867084997
AUC: 0.5340311394196744


--- AdaBoost ---

In [17]:
from sklearn.ensemble import AdaBoostClassifier
ada_cs= AdaBoostClassifier(random_state=15)
ada_cs.fit(X_train, y_train, sample_weight= (y_train== 0).astype(int) + (y_train== 1).astype(int) * 2)
y_pred_ada_cs= ada_cs.predict(X_test)
y_pred_prob_ada_cs= ada_cs.predict_proba(X_test)[:,1]

In [18]:
cm= confusion_matrix(y_test, y_pred_ada_cs, labels= ada_cs.classes_)
print(classification_report(y_test, y_pred_ada_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_ada_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_ada_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_ada_cs))

              precision    recall  f1-score   support

           0       0.65      0.35      0.45      1125
           1       0.36      0.66      0.47       628

    accuracy                           0.46      1753
   macro avg       0.51      0.51      0.46      1753
weighted avg       0.55      0.46      0.46      1753

Precision: 0.3622936576889661
Accuracy: 0.46092413006274957
AUC: 0.49761358811040346


In [19]:
param_grid= {'n_estimators': [50, 100, 200]}

gs_ada_cs= GridSearchCV(estimator= ada_cs, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_ada_cs.fit(X_train, y_train, sample_weight=(y_train == 0).astype(int) + (y_train == 1).astype(int) * 2)
print("Best Parameters:", gs_ada_cs.best_params_)
print("Best Precision Score:", gs_ada_cs.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'n_estimators': 200}
Best Precision Score: 0.3682910882671858


In [20]:
tuned_ada_cs= gs_ada_cs.best_estimator_
y_pred_tuned_ada_cs= tuned_ada_cs.predict(X_test)
y_pred_prob_tuned_ada_cs= tuned_ada_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_ada_cs, labels= tuned_ada_cs.classes_)
print(classification_report(y_test, y_pred_tuned_ada_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_tuned_ada_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_ada_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_ada_cs))

              precision    recall  f1-score   support

           0       0.63      0.38      0.47      1125
           1       0.35      0.59      0.44       628

    accuracy                           0.46      1753
   macro avg       0.49      0.49      0.46      1753
weighted avg       0.53      0.46      0.46      1753

Precision: 0.3477038425492034
Accuracy: 0.45636052481460354
AUC: 0.4934819532908705


--- GradientBoosting ---

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
grb_cs= GradientBoostingClassifier(random_state= 15)
grb_cs.fit(X_train, y_train, sample_weight= (y_train== 0).astype(int) + (y_train== 1).astype(int) * 2)
y_pred_grb_cs= grb_cs.predict(X_test)
y_pred_prob_grb_cs= grb_cs.predict_proba(X_test)[:,1]


cm= confusion_matrix(y_test, y_pred_grb_cs, labels= grb_cs.classes_)
print(classification_report(y_test, y_pred_grb_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_grb_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_grb_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_grb_cs))

              precision    recall  f1-score   support

           0       0.66      0.33      0.44      1125
           1       0.36      0.69      0.48       628

    accuracy                           0.46      1753
   macro avg       0.51      0.51      0.46      1753
weighted avg       0.55      0.46      0.45      1753

Precision: 0.36486486486486486
Accuracy: 0.4592127780946948
AUC: 0.5038726114649681


In [22]:
param_grid= {'learning_rate': [0.01, 0.1, 0.2]}

gs_grb_cs= GridSearchCV(estimator= grb_cs, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_grb_cs.fit(X_train, y_train, sample_weight= (y_train== 0).astype(int) + (y_train== 1).astype(int) * 2)

print("Best Parameters:", gs_grb_cs.best_params_)
print("Best Precision Score:", gs_grb_cs.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'learning_rate': 0.1}
Best Precision Score: 0.3656010184908228


In [23]:
tuned_grb_cs= gs_grb_cs.best_estimator_
y_pred_tuned_grb_cs= tuned_grb_cs.predict(X_test)
y_pred_prob_tuned_grb_cs= tuned_grb_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_grb_cs, labels= tuned_grb_cs.classes_)
print(classification_report(y_test, y_pred_tuned_grb_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_tuned_grb_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_grb_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_grb_cs))

              precision    recall  f1-score   support

           0       0.66      0.33      0.44      1125
           1       0.36      0.69      0.48       628

    accuracy                           0.46      1753
   macro avg       0.51      0.51      0.46      1753
weighted avg       0.55      0.46      0.45      1753

Precision: 0.36486486486486486
Accuracy: 0.4592127780946948
AUC: 0.5038726114649681


--- XGB ---

In [24]:
from xgboost import XGBClassifier
xgb_cs= XGBClassifier(scale_pos_weight= (y_train== 0).sum() / (y_train== 1).sum(), random_state= 15)
xgb_cs.fit(X_train, y_train)
y_pred_xgb_cs= xgb_cs.predict(X_test)
y_pred_prob_xgb_cs= xgb_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_xgb_cs, labels= xgb_cs.classes_)
print(classification_report(y_test, y_pred_xgb_cs, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_xgb_cs, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_xgb_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_xgb_cs))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      1125
           1       0.37      0.34      0.35       628

    accuracy                           0.55      1753
   macro avg       0.51      0.51      0.51      1753
weighted avg       0.55      0.55      0.55      1753

Precision: 0.36660929432013767
Accuracy: 0.5533371363377068
AUC: 0.504761500353857


In [25]:
params_XGBoost= {'learning_rate': [0.01, 0.1, 1.0]}

gs_xgb_cs= GridSearchCV(estimator= xgb_cs, param_grid= params_XGBoost, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_xgb_cs.fit(X_train, y_train)

print("Best Parameters:", gs_xgb_cs.best_params_)
print("Best Precision Score:", gs_xgb_cs.best_score_)

tuned_xgb_cs= gs_xgb_cs.best_estimator_
y_pred_tuned_xgb_cs= tuned_xgb_cs.predict(X_test)
y_pred_prob_tuned_xgb_cs= tuned_xgb_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_xgb_cs, labels= tuned_xgb_cs.classes_)
print(classification_report(y_test, y_pred_tuned_xgb_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_tuned_xgb_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_xgb_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_xgb_cs))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'learning_rate': 0.1}
Best Precision Score: 0.369148752042389
              precision    recall  f1-score   support

           0       0.64      0.62      0.63      1125
           1       0.35      0.37      0.36       628

    accuracy                           0.53      1753
   macro avg       0.49      0.49      0.49      1753
weighted avg       0.54      0.53      0.53      1753

Precision: 0.35135135135135137
Accuracy: 0.5288077581289219
AUC: 0.5082547770700636


--- LGBM ---

In [26]:
from lightgbm import LGBMClassifier
lgm_cs= LGBMClassifier(class_weight='balanced', random_state= 15)
lgm_cs.fit(X_train, y_train)
y_pred_lgm_cs= lgm_cs.predict(X_test)
y_pred_prob_lgm_cs= lgm_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_lgm_cs, labels= lgm_cs.classes_)
print(classification_report(y_test, y_pred_lgm_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_lgm_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_lgm_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_lgm_cs))

[LightGBM] [Info] Number of positive: 2511, number of negative: 4499
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1145
[LightGBM] [Info] Number of data points in the train set: 7010, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1125
           1       0.38      0.39      0.39       628

    accuracy                           0.55      1753
   macro avg       0.52      0.52      0.52      1753
weighted avg       0.55      0.55      0.55      1753

Precision: 0.37709923664122136
Accuracy: 0.5499144324015972
AUC: 0.519932059447983


In [27]:
params_LGB= {'learning_rate': [0.001, 0.01, 0.1, 1.0],
             'num_leaves': [31, 127],
             'reg_alpha': [0.1, 0.5],
             'min_data_in_leaf': [30, 50, 100, 300, 400]}

gs_lgm_cs= GridSearchCV(estimator= lgm_cs, param_grid= params_LGB, cv=5, scoring='precision', n_jobs=-1, verbose=2)
gs_lgm_cs.fit(X_train, y_train)

print("Best Parameters:", gs_lgm_cs.best_params_)
print("Best Precision Score:", gs_lgm_cs.best_score_)

tuned_lgm_cs= gs_lgm_cs.best_estimator_
y_pred_tuned_lgm_cs= tuned_lgm_cs.predict(X_test)
y_pred_prob_tuned_lgm_cs= tuned_lgm_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_lgm_cs, labels= tuned_lgm_cs.classes_)
print(classification_report(y_test, y_pred_tuned_lgm_cs, zero_division= 0))
print('Precision:', precision_score(y_test, y_pred_tuned_lgm_cs, zero_division= 0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_lgm_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_lgm_cs))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[LightGBM] [Info] Number of positive: 2511, number of negative: 4499
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1145
[LightGBM] [Info] Number of data points in the train set: 7010, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Best Parameters: {'learning_rate': 0.01, 'min_data_in_leaf': 30, 'num_leaves': 127, 'reg_alpha': 0.1}
Best Precision Score: 0.3723139794169571
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      1125
           1       0.38      0.42      0.40       628

    accuracy                           0.54      1753
   macro avg       0.5

--- CatBoost ---

In [28]:
from catboost import CatBoostClassifier
cat_cs= CatBoostClassifier(class_weights= [1, (y_train== 0).sum() / (y_train== 1).sum()], random_state=15, silent=True)
cat_cs.fit(X_train, y_train)
y_pred_cat_cs= cat_cs.predict(X_test)
y_pred_prob_cat_cs= cat_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_cat_cs, labels= cat_cs.classes_)
print(classification_report(y_test, y_pred_cat_cs, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_cat_cs, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_cat_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_cat_cs))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      1125
           1       0.38      0.36      0.37       628

    accuracy                           0.56      1753
   macro avg       0.51      0.51      0.51      1753
weighted avg       0.55      0.56      0.56      1753

Precision: 0.3770764119601329
Accuracy: 0.5573302909298345
AUC: 0.5218782731776362


In [29]:
params_CatBoost= {
                  'depth': [3,5,10],
                  'learning_rate' : [0.01,0.1,1],
                  'iterations' : [5,10,50,100]
                 }

gs_cat_cs= GridSearchCV(estimator= cat_cs, param_grid= params_CatBoost, cv= 5, scoring='precision', n_jobs= -1, verbose= 2)
gs_cat_cs.fit(X_train, y_train)

print("Best Parameters:", gs_cat_cs.best_params_)
print("Best Precision Score:", gs_cat_cs.best_score_)

tuned_cat_cs= gs_cat_cs.best_estimator_
y_pred_tuned_cat_cs= tuned_cat_cs.predict(X_test)
y_pred_prob_tuned_cat_cs= tuned_cat_cs.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_cat_cs, labels= tuned_cat_cs.classes_)
print(classification_report(y_test, y_pred_tuned_cat_cs, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_cat_cs, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_cat_cs))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_cat_cs))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'depth': 5, 'iterations': 50, 'learning_rate': 0.1}
Best Precision Score: 0.37420723677922696
              precision    recall  f1-score   support

           0       0.64      0.53      0.58      1125
           1       0.36      0.46      0.40       628

    accuracy                           0.51      1753
   macro avg       0.50      0.50      0.49      1753
weighted avg       0.54      0.51      0.52      1753

Precision: 0.3561811505507956
Accuracy: 0.5077010838562465
AUC: 0.5046043878273178


--- Visualisation ---

In [30]:
print('Logistic Regression Precision:', precision_score(y_test, y_pred_lr_cs, zero_division= 0))
print('Decision Tree Precision:', precision_score(y_test, y_pred_dt_cs, zero_division= 0))
print('Tuned Decision Tree Precision:', precision_score(y_test, y_pred_tuned_dt_cs, zero_division= 0))
print('SVM Precision:', precision_score(y_test, y_pred_svc_cs, zero_division= 0))
print('Random Forest Precision:', precision_score(y_test, y_pred_rf_cs, zero_division= 0))
print('Tuned Random Forest Precision:', precision_score(y_test, y_pred_tuned_rf_cs, zero_division= 0))
print('AdaBoost Precision:', precision_score(y_test, y_pred_ada_cs, zero_division= 0))
print('Tuned AdaBoost Precision:', precision_score(y_test, y_pred_tuned_ada_cs, zero_division= 0))
print('GradientBoosting Precision:', precision_score(y_test, y_pred_grb_cs, zero_division= 0))
print('Tuned GradientBoosting Precision:', precision_score(y_test, y_pred_tuned_grb_cs, zero_division= 0))
print('XGB Precision:', precision_score(y_test, y_pred_xgb_cs, zero_division= 0))
print('Tuned XGB Precision:', precision_score(y_test, y_pred_tuned_xgb_cs, zero_division= 0))
print('LGBM Precision:', precision_score(y_test, y_pred_lgm_cs, zero_division= 0))
print('Tuned LGBM Precision:', precision_score(y_test, y_pred_tuned_lgm_cs, zero_division= 0))
print('CatBoost Precision:', precision_score(y_test, y_pred_cat_cs, zero_division= 0))
print('Tuned CatBoost Precision:', precision_score(y_test, y_pred_tuned_cat_cs, zero_division= 0))

Logistic Regression Precision: 0.34650112866817157
Decision Tree Precision: 0.3501577287066246
Tuned Decision Tree Precision: 0.3563579277864992
SVM Precision: 0.30434782608695654
Random Forest Precision: 0.35714285714285715
Tuned Random Forest Precision: 0.0
AdaBoost Precision: 0.3622936576889661
Tuned AdaBoost Precision: 0.3477038425492034
GradientBoosting Precision: 0.36486486486486486
Tuned GradientBoosting Precision: 0.36486486486486486
XGB Precision: 0.36660929432013767
Tuned XGB Precision: 0.35135135135135137
LGBM Precision: 0.37709923664122136
Tuned LGBM Precision: 0.3764367816091954
CatBoost Precision: 0.3770764119601329
Tuned CatBoost Precision: 0.3561811505507956


In [31]:
precision_scores= {
                    'Logistic Regression Precision:': precision_score(y_test, y_pred_lr_cs, zero_division= 0),
                    'Decision Tree Precision:': precision_score(y_test, y_pred_dt_cs, zero_division= 0),
                    'Tuned Decision Tree Precision:': precision_score(y_test, y_pred_tuned_dt_cs, zero_division= 0),
                    'SVM Precision:': precision_score(y_test, y_pred_svc_cs, zero_division= 0),
                    'Random Forest Precision:': precision_score(y_test, y_pred_rf_cs, zero_division= 0),
                    'Tuned Random Forest Precision:': precision_score(y_test, y_pred_tuned_rf_cs, zero_division= 0),
                    'AdaBoost Precision:': precision_score(y_test, y_pred_ada_cs, zero_division= 0),
                    'Tuned AdaBoost Precision:': precision_score(y_test, y_pred_tuned_ada_cs, zero_division= 0),
                    'GradientBoosting Precision:': precision_score(y_test, y_pred_grb_cs, zero_division= 0),
                    'Tuned GradientBoosting Precision:': precision_score(y_test, y_pred_tuned_grb_cs, zero_division= 0),
                    'XGB Precision:': precision_score(y_test, y_pred_xgb_cs, zero_division= 0),
                    'Tuned XGB Precision:': precision_score(y_test, y_pred_tuned_xgb_cs, zero_division= 0),
                    'LGBM Precision:': precision_score(y_test, y_pred_lgm_cs, zero_division= 0),
                    'Tuned LGBM Precision:': precision_score(y_test, y_pred_tuned_lgm_cs, zero_division= 0),
                    'CatBoost Precision:': precision_score(y_test, y_pred_cat_cs, zero_division= 0),
                    'Tuned CatBoost Precision:': precision_score(y_test, y_pred_tuned_cat_cs, zero_division= 0)
                  }

cs_precision= pd.DataFrame(list(precision_scores.items()), columns= ['Model', 'Precision Score'])
cs_precision= cs_precision.sort_values(by= 'Precision Score', ascending=False)
print(cs_precision)

                                Model  Precision Score
12                    LGBM Precision:         0.377099
14                CatBoost Precision:         0.377076
13              Tuned LGBM Precision:         0.376437
10                     XGB Precision:         0.366609
8         GradientBoosting Precision:         0.364865
9   Tuned GradientBoosting Precision:         0.364865
6                 AdaBoost Precision:         0.362294
4            Random Forest Precision:         0.357143
2      Tuned Decision Tree Precision:         0.356358
15          Tuned CatBoost Precision:         0.356181
11               Tuned XGB Precision:         0.351351
1            Decision Tree Precision:         0.350158
7           Tuned AdaBoost Precision:         0.347704
0      Logistic Regression Precision:         0.346501
3                      SVM Precision:         0.304348
5      Tuned Random Forest Precision:         0.000000


In [32]:
accuracy_scores= {
                    'Logistic Regression Accuracy:': accuracy_score(y_test, y_pred_lr_cs),
                    'Decision Tree Accuracy:': accuracy_score(y_test, y_pred_dt_cs),
                    'Tuned Decision Tree Accuracy:': accuracy_score(y_test, y_pred_tuned_dt_cs),
                    'SVM Accuracy:': accuracy_score(y_test, y_pred_svc_cs),
                    'Random Forest Accuracy:': accuracy_score(y_test, y_pred_rf_cs),
                    'Tuned Random Forest Accuracy:': accuracy_score(y_test, y_pred_tuned_rf_cs),
                    'AdaBoost Accuracy:': accuracy_score(y_test, y_pred_ada_cs),
                    'Tuned AdaBoost Accuracy:': accuracy_score(y_test, y_pred_tuned_ada_cs),
                    'GradientBoosting Accuracy:': accuracy_score(y_test, y_pred_grb_cs),
                    'Tuned GradientBoosting Accuracy:': accuracy_score(y_test, y_pred_tuned_grb_cs),
                    'XGB Accuracy:': accuracy_score(y_test, y_pred_xgb_cs),
                    'Tuned XGB Accuracy:': accuracy_score(y_test, y_pred_tuned_xgb_cs),
                    'LGBM Accuracy:': accuracy_score(y_test, y_pred_lgm_cs),
                    'Tuned LGBM Accuracy:': accuracy_score(y_test, y_pred_tuned_lgm_cs),
                    'CatBoost Accuracy:': accuracy_score(y_test, y_pred_cat_cs),
                    'Tuned CatBoost Accuracy:': accuracy_score(y_test, y_pred_tuned_cat_cs)
                  }

cs_accuracy= pd.DataFrame(list(accuracy_scores.items()), columns= ['Model', 'Accuracy Score'])
cs_accuracy= cs_accuracy.sort_values(by= 'Accuracy Score', ascending=False)
print(cs_accuracy)

                               Model  Accuracy Score
5      Tuned Random Forest Accuracy:        0.640616
4            Random Forest Accuracy:        0.637193
3                      SVM Accuracy:        0.636623
14                CatBoost Accuracy:        0.557330
10                     XGB Accuracy:        0.553337
12                    LGBM Accuracy:        0.549914
13              Tuned LGBM Accuracy:        0.543639
2      Tuned Decision Tree Accuracy:        0.537365
1            Decision Tree Accuracy:        0.533371
11               Tuned XGB Accuracy:        0.528808
15          Tuned CatBoost Accuracy:        0.507701
0      Logistic Regression Accuracy:        0.486594
6                 AdaBoost Accuracy:        0.460924
8         GradientBoosting Accuracy:        0.459213
9   Tuned GradientBoosting Accuracy:        0.459213
7           Tuned AdaBoost Accuracy:        0.456361


In [33]:
auc_scores= {
                    'Logistic Regression AUC:': roc_auc_score(y_test, y_pred_prob_lr_cs),
                    'Decision Tree AUC:': roc_auc_score(y_test, y_pred_prob_dt_cs),
                    'Tuned Decision Tree AUC:': roc_auc_score(y_test, y_pred_prob_tuned_dt_cs),
                    'SVM AUC:': roc_auc_score(y_test, y_pred_prob_svc_cs),
                    'Random Forest AUC:': roc_auc_score(y_test, y_pred_prob_rf_cs),
                    'Tuned Random Forest AUC:': roc_auc_score(y_test, y_pred_prob_tuned_rf_cs),
                    'AdaBoost AUC:': roc_auc_score(y_test, y_pred_prob_ada_cs),
                    'Tuned AdaBoost AUC:': roc_auc_score(y_test, y_pred_prob_tuned_ada_cs),
                    'GradientBoosting AUC:': roc_auc_score(y_test, y_pred_prob_grb_cs),
                    'Tuned GradientBoosting AUC:': roc_auc_score(y_test, y_pred_prob_tuned_grb_cs),
                    'XGB AUC:': roc_auc_score(y_test, y_pred_prob_xgb_cs),
                    'Tuned XGB AUC:': roc_auc_score(y_test, y_pred_prob_tuned_xgb_cs),
                    'LGBM AUC:': roc_auc_score(y_test, y_pred_prob_lgm_cs),
                    'Tuned LGBM AUC:': roc_auc_score(y_test, y_pred_prob_tuned_lgm_cs),
                    'CatBoost AUC:': roc_auc_score(y_test, y_pred_prob_cat_cs),
                    'Tuned CatBoost AUC:': roc_auc_score(y_test, y_pred_prob_tuned_cat_cs)
                  }

cs_auc= pd.DataFrame(list(auc_scores.items()), columns= ['Model', 'AUC Score'])
cs_auc= cs_auc.sort_values(by= 'AUC Score', ascending=False)
print(cs_auc)

                          Model  AUC Score
5      Tuned Random Forest AUC:   0.534031
13              Tuned LGBM AUC:   0.527263
14                CatBoost AUC:   0.521878
4            Random Forest AUC:   0.520866
12                    LGBM AUC:   0.519932
11               Tuned XGB AUC:   0.508255
10                     XGB AUC:   0.504762
15          Tuned CatBoost AUC:   0.504604
8         GradientBoosting AUC:   0.503873
9   Tuned GradientBoosting AUC:   0.503873
2      Tuned Decision Tree AUC:   0.498510
6                 AdaBoost AUC:   0.497614
1            Decision Tree AUC:   0.493640
7           Tuned AdaBoost AUC:   0.493482
3                      SVM AUC:   0.489381
0      Logistic Regression AUC:   0.482713
