In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

df= pd.read_csv('C://Users//User//Desktop//MSc Westminster//Dissertation//DataSets//Heart_Disease_Indicators.csv')

In [2]:
from sklearn.model_selection import train_test_split

target_size = 50000

num_class_0 = int(target_size * df['HeartDiseaseorAttack'].value_counts(normalize=True)[0])
num_class_1 = target_size - num_class_0

df_class_0 = df[df['HeartDiseaseorAttack'] == 0]
df_class_1 = df[df['HeartDiseaseorAttack'] == 1]

df_class_0_sampled = df_class_0.sample(n=num_class_0, random_state=15)
df_class_1_sampled = df_class_1.sample(n=num_class_1, random_state=15)

df_sampled = pd.concat([df_class_0_sampled, df_class_1_sampled])

df_sampled = df_sampled.sample(frac=1, random_state=15).reset_index(drop=True)
df= df_sampled
print(df['HeartDiseaseorAttack'].value_counts(normalize=True))

0    0.90582
1    0.09418
Name: HeartDiseaseorAttack, dtype: float64


In [3]:
df.drop_duplicates(inplace= True)
df.shape

(48050, 22)

In [4]:
df= df[df['Diabetes'] != 1].copy()
df.loc[df['Diabetes'] == 2, 'Diabetes'] = 1
print(df['Diabetes'].value_counts())

0    40175
1     6968
Name: Diabetes, dtype: int64


In [5]:
categorical_columns= ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'Diabetes', 'PhysActivity',
                      'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk',
                      'Sex', 'Age', 'Education', 'Income'
                     ]
df[categorical_columns]= df[categorical_columns].astype(str)
df= pd.get_dummies(df, columns= categorical_columns, drop_first= True)

In [6]:
X= df.drop(['HeartDiseaseorAttack'], axis= 1)
y= df['HeartDiseaseorAttack']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 15, stratify= y)

continuous_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth']

from sklearn.preprocessing import StandardScaler
ss= StandardScaler()
X_train= ss.fit_transform(X_train)
X_test= ss.transform(X_test)

In [8]:
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from sklearn.cluster import KMeans

cc= ClusterCentroids(random_state= 15, estimator= KMeans(n_init= 10))
rus= RandomUnderSampler(random_state= 15)

X_cc, y_cc= cc.fit_resample(X_train, y_train)
X_cc_rus, y_cc_rus= rus.fit_resample(X_cc, y_cc)

--- LogisticRegression ---

In [9]:
from sklearn.metrics import confusion_matrix, classification_report, precision_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
lr_cc_rus= LogisticRegression(random_state=15)
lr_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_lr_cc_rus= lr_cc_rus.predict(X_test)
y_pred_prob_lr_cc_rus= lr_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_lr_cc_rus, labels= lr_cc_rus.classes_)
print(classification_report(y_test, y_pred_lr_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_lr_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_lr_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_lr_cc_rus))

              precision    recall  f1-score   support

           0       0.97      0.71      0.82      8517
           1       0.23      0.80      0.36       912

    accuracy                           0.72      9429
   macro avg       0.60      0.76      0.59      9429
weighted avg       0.90      0.72      0.78      9429

Precision: 0.23045397225725095
Accuracy: 0.7219217308304168
AUC: 0.7574659118295917


--- DecisionTreeClassifier ---

In [10]:
from sklearn.tree import DecisionTreeClassifier
dt_cc_rus= DecisionTreeClassifier(random_state=15)
dt_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_dt_cc_rus= dt_cc_rus.predict(X_test)
y_pred_prob_dt_cc_rus= dt_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_dt_cc_rus, labels= dt_cc_rus.classes_)
print(classification_report(y_test, y_pred_dt_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_dt_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_dt_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_dt_cc_rus))

              precision    recall  f1-score   support

           0       0.95      0.14      0.25      8517
           1       0.10      0.93      0.19       912

    accuracy                           0.22      9429
   macro avg       0.53      0.54      0.22      9429
weighted avg       0.87      0.22      0.24      9429

Precision: 0.10436060754532092
Accuracy: 0.2181567504507371
AUC: 0.5378461343566736


--- Tuned - DecisionTreeClassifier ---

In [11]:
from sklearn.model_selection import GridSearchCV
param_grid= {
             'max_depth': [None, 10, 20, 30, 40, 50],
             'min_samples_split': [2, 10, 20],
             'min_samples_leaf': [1, 5, 10],
             'max_features': [None, 'sqrt', 'log2'],
             'criterion': ['gini', 'entropy']
            }

gs_dt_cc_rus= GridSearchCV(estimator= dt_cc_rus, param_grid= param_grid, cv= 5, scoring= 'precision')
gs_dt_cc_rus.fit(X_cc_rus, y_cc_rus)

print("Best Parameters:", gs_dt_cc_rus.best_params_)
print("Best Precision Score:", gs_dt_cc_rus.best_score_)

Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best Precision Score: 0.9355440424229104


In [12]:
tuned_dt_cc_rus= gs_dt_cc_rus.best_estimator_
y_pred_tuned_dt_cc_rus= tuned_dt_cc_rus.predict(X_test)
y_pred_prob_tuned_dt_cc_rus= tuned_dt_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_dt_cc_rus, labels= tuned_dt_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_dt_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_dt_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_dt_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_dt_cc_rus))

              precision    recall  f1-score   support

           0       0.96      0.14      0.25      8517
           1       0.11      0.94      0.19       912

    accuracy                           0.22      9429
   macro avg       0.53      0.54      0.22      9429
weighted avg       0.88      0.22      0.24      9429

Precision: 0.1053019145802651
Accuracy: 0.22112631244034361
AUC: 0.5493717158047167


--- KNeighborsClassifier ---

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn_cc_rus= KNeighborsClassifier()
knn_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_knn_cc_rus= knn_cc_rus.predict(X_test)
y_pred_prob_knn_cc_rus= knn_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_knn_cc_rus, labels= knn_cc_rus.classes_)
print(classification_report(y_test, y_pred_knn_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_knn_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_knn_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_knn_cc_rus))

              precision    recall  f1-score   support

           0       0.93      0.83      0.88      8517
           1       0.22      0.45      0.30       912

    accuracy                           0.80      9429
   macro avg       0.58      0.64      0.59      9429
weighted avg       0.87      0.80      0.82      9429

Precision: 0.22490400438837083
Accuracy: 0.7969031710679818
AUC: 0.7363212172146933


--- Tuned - KNeighborsClassifier ---

In [14]:
param_grid= {
             'n_neighbors': np.arange(1,40),
             'weights': ['uniform', 'distance'],
             'metric': ['euclidean', 'manhattan', 'minkowski']
            }

gs_knn_cc_rus= GridSearchCV(estimator= knn_cc_rus, param_grid= param_grid, cv=5, scoring= 'precision')
gs_knn_cc_rus.fit(X_cc_rus, y_cc_rus)
print("Best Parameters:", gs_knn_cc_rus.best_params_)
print("Best Precision Score:", gs_knn_cc_rus.best_score_)

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 8, 'weights': 'uniform'}
Best Precision Score: 0.8385511032144926


In [15]:
tuned_knn_cc_rus= gs_knn_cc_rus.best_estimator_
y_pred_tuned_knn_cc_rus= tuned_knn_cc_rus.predict(X_test)
y_pred_prob_tuned_knn_cc_rus= tuned_knn_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_knn_cc_rus, labels= tuned_knn_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_knn_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_knn_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_knn_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_knn_cc_rus))

              precision    recall  f1-score   support

           0       0.97      0.60      0.74      8517
           1       0.19      0.85      0.31       912

    accuracy                           0.63      9429
   macro avg       0.58      0.73      0.52      9429
weighted avg       0.90      0.63      0.70      9429

Precision: 0.18631906242525711
Accuracy: 0.6250927988121752
AUC: 0.7908582345113694


--- GaussianNB ---

In [16]:
from sklearn.naive_bayes import GaussianNB
nb_cc_rus= GaussianNB()
nb_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_nb_cc_rus= nb_cc_rus.predict(X_test)
y_pred_prob_nb_cc_rus= nb_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_nb_cc_rus, labels= nb_cc_rus.classes_)
print(classification_report(y_test, y_pred_nb_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_nb_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_nb_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_nb_cc_rus))

              precision    recall  f1-score   support

           0       0.98      0.39      0.56      8517
           1       0.14      0.91      0.24       912

    accuracy                           0.44      9429
   macro avg       0.56      0.65      0.40      9429
weighted avg       0.90      0.44      0.53      9429

Precision: 0.139
Accuracy: 0.4438434616608336
AUC: 0.7935689508495907


--- SVM ---

In [17]:
from sklearn.svm import SVC
svc_cc_rus= SVC(kernel= 'rbf',probability= True, gamma= 1, random_state=15)
svc_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_svc_cc_rus= svc_cc_rus.predict(X_test)
y_pred_prob_svc_cc_rus= svc_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_svc_cc_rus, labels= svc_cc_rus.classes_)
print(classification_report(y_test, y_pred_svc_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_svc_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_svc_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_svc_cc_rus))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93      8517
           1       0.13      0.07      0.09       912

    accuracy                           0.86      9429
   macro avg       0.52      0.51      0.51      9429
weighted avg       0.83      0.86      0.85      9429

Precision: 0.1285140562248996
Accuracy: 0.864036483190158
AUC: 0.5991223177999007


--- Random Forest ---

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf_cc_rus= RandomForestClassifier(random_state=15)
rf_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_rf_cc_rus= rf_cc_rus.predict(X_test)
y_pred_prob_rf_cc_rus= rf_cc_rus.predict_proba(X_test)[:,1]

In [19]:
cm= confusion_matrix(y_test, y_pred_rf_cc_rus, labels= rf_cc_rus.classes_)
print(classification_report(y_test, y_pred_rf_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_rf_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_rf_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_rf_cc_rus))

              precision    recall  f1-score   support

           0       0.99      0.05      0.10      8517
           1       0.10      1.00      0.18       912

    accuracy                           0.14      9429
   macro avg       0.55      0.52      0.14      9429
weighted avg       0.90      0.14      0.11      9429

Precision: 0.10110232713506291
Accuracy: 0.1433874217838583
AUC: 0.6503394140511547


In [20]:
param_grid= {
             'n_estimators': [100, 200, 300],
             'max_depth': [None, 10, 20, 30],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4],
            }

gs_tuned_rf_cc_rus= GridSearchCV(estimator= rf_cc_rus, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_tuned_rf_cc_rus.fit(X_cc_rus, y_cc_rus)
print("Best Parameters:", gs_tuned_rf_cc_rus.best_params_)
print("Best Precision Score:", gs_tuned_rf_cc_rus.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Precision Score: 0.9390482751745527


In [21]:
tuned_rf_cc_rus= gs_tuned_rf_cc_rus.best_estimator_
y_pred_tuned_rf_cc_rus= tuned_rf_cc_rus.predict(X_test)
y_pred_prob_tuned_rf_cc_rus= tuned_rf_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_rf_cc_rus, labels= tuned_rf_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_rf_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_rf_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_rf_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_rf_cc_rus))

              precision    recall  f1-score   support

           0       0.99      0.05      0.10      8517
           1       0.10      1.00      0.18       912

    accuracy                           0.14      9429
   macro avg       0.55      0.52      0.14      9429
weighted avg       0.91      0.14      0.11      9429

Precision: 0.10121367331032179
Accuracy: 0.14359953335454448
AUC: 0.6631607785461069


--- AdaBoost ---

In [22]:
from sklearn.ensemble import AdaBoostClassifier
ada_cc_rus= AdaBoostClassifier(random_state=15)
ada_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_ada_cc_rus= ada_cc_rus.predict(X_test)
y_pred_prob_ada_cc_rus= ada_cc_rus.predict_proba(X_test)[:,1]

In [23]:
cm= confusion_matrix(y_test, y_pred_ada_cc_rus, labels= ada_cc_rus.classes_)
print(classification_report(y_test, y_pred_ada_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_ada_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_ada_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_ada_cc_rus))

              precision    recall  f1-score   support

           0       0.96      0.04      0.09      8517
           1       0.10      0.98      0.18       912

    accuracy                           0.14      9429
   macro avg       0.53      0.51      0.13      9429
weighted avg       0.88      0.14      0.09      9429

Precision: 0.09921381906765585
Accuracy: 0.13553929366846962
AUC: 0.6624162021673887


In [24]:
param_grid= {'n_estimators': [50, 100, 200]}

gs_ada_cc_rus= GridSearchCV(estimator= ada_cc_rus, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_ada_cc_rus.fit(X_cc_rus, y_cc_rus)
print("Best Parameters:", gs_ada_cc_rus.best_params_)
print("Best Precision Score:", gs_ada_cc_rus.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'n_estimators': 200}
Best Precision Score: 0.9562969539518557


In [25]:
tuned_ada_cc_rus= gs_ada_cc_rus.best_estimator_
y_pred_tuned_ada_cc_rus= tuned_ada_cc_rus.predict(X_test)
y_pred_prob_tuned_ada_cc_rus= tuned_ada_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_ada_cc_rus, labels= tuned_ada_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_ada_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_ada_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_ada_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_ada_cc_rus))

              precision    recall  f1-score   support

           0       0.96      0.03      0.06      8517
           1       0.10      0.99      0.18       912

    accuracy                           0.12      9429
   macro avg       0.53      0.51      0.12      9429
weighted avg       0.87      0.12      0.07      9429

Precision: 0.09838216003498032
Accuracy: 0.12397921306607275
AUC: 0.6492912652507162


--- GradientBoosting ---

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
grb_cc_rus= GradientBoostingClassifier(random_state=15)
grb_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_grb_cc_rus= grb_cc_rus.predict(X_test)
y_pred_prob_grb_cc_rus= grb_cc_rus.predict_proba(X_test)[:,1]


cm= confusion_matrix(y_test, y_pred_grb_cc_rus, labels= grb_cc_rus.classes_)
print(classification_report(y_test, y_pred_grb_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_grb_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_grb_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_grb_cc_rus))

              precision    recall  f1-score   support

           0       0.98      0.04      0.07      8517
           1       0.10      0.99      0.18       912

    accuracy                           0.13      9429
   macro avg       0.54      0.51      0.13      9429
weighted avg       0.89      0.13      0.08      9429

Precision: 0.09929701230228472
Accuracy: 0.12949411390391347
AUC: 0.6604017841509963


In [27]:
param_grid= {'learning_rate': [0.01, 0.1, 0.2]}

gs_grb_cc_rus= GridSearchCV(estimator= grb_cc_rus, param_grid= param_grid, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_grb_cc_rus.fit(X_cc_rus, y_cc_rus)

print("Best Parameters:", gs_grb_cc_rus.best_params_)
print("Best Precision Score:", gs_grb_cc_rus.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'learning_rate': 0.2}
Best Precision Score: 0.9530922344103455


In [28]:
tuned_grb_cc_rus= gs_grb_cc_rus.best_estimator_
y_pred_tuned_grb_cc_rus= tuned_grb_cc_rus.predict(X_test)
y_pred_prob_tuned_grb_cc_rus= tuned_grb_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_grb_cc_rus, labels= tuned_grb_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_grb_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_grb_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_grb_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_grb_cc_rus))

              precision    recall  f1-score   support

           0       0.95      0.03      0.06      8517
           1       0.10      0.98      0.18       912

    accuracy                           0.12      9429
   macro avg       0.52      0.51      0.12      9429
weighted avg       0.87      0.12      0.07      9429

Precision: 0.09814207650273224
Accuracy: 0.12334287835401421
AUC: 0.6441075537263965


--- XGB ---

In [29]:
from xgboost import XGBClassifier
xgb_cc_rus= XGBClassifier(random_state=15)
xgb_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_xgb_cc_rus= xgb_cc_rus.predict(X_test)
y_pred_prob_xgb_cc_rus= xgb_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_xgb_cc_rus, labels= xgb_cc_rus.classes_)
print(classification_report(y_test, y_pred_xgb_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_xgb_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_xgb_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_xgb_cc_rus))

              precision    recall  f1-score   support

           0       0.96      0.04      0.08      8517
           1       0.10      0.98      0.18       912

    accuracy                           0.13      9429
   macro avg       0.53      0.51      0.13      9429
weighted avg       0.88      0.13      0.09      9429

Precision: 0.09910507126284389
Accuracy: 0.133630289532294
AUC: 0.6502718891422522


In [30]:
params_XGBoost= {'learning_rate': [0.01, 0.1, 1.0]}

gs_xgb_cc_rus= GridSearchCV(estimator= xgb_cc_rus, param_grid= params_XGBoost, cv= 5, scoring= 'precision', n_jobs= -1, verbose= 2)
gs_xgb_cc_rus.fit(X_cc_rus, y_cc_rus)

print("Best Parameters:", gs_xgb_cc_rus.best_params_)
print("Best Precision Score:", gs_xgb_cc_rus.best_score_)

tuned_xgb_cc_rus= gs_xgb_cc_rus.best_estimator_
y_pred_tuned_xgb_cc_rus= tuned_xgb_cc_rus.predict(X_test)
y_pred_prob_tuned_xgb_cc_rus= tuned_xgb_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_xgb_cc_rus, labels= tuned_xgb_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_xgb_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_xgb_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_xgb_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_xgb_cc_rus))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'learning_rate': 1.0}
Best Precision Score: 0.9515698833493491
              precision    recall  f1-score   support

           0       0.96      0.05      0.09      8517
           1       0.10      0.98      0.18       912

    accuracy                           0.14      9429
   macro avg       0.53      0.52      0.14      9429
weighted avg       0.88      0.14      0.10      9429

Precision: 0.09965559382290856
Accuracy: 0.13893307879944852
AUC: 0.6333618238239722


--- LGBM ---

In [31]:
from lightgbm import LGBMClassifier
lgm_cc_rus= LGBMClassifier(random_state=15)
lgm_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_lgm_cc_rus= lgm_cc_rus.predict(X_test)
y_pred_prob_lgm_cc_rus= lgm_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_lgm_cc_rus, labels= lgm_cc_rus.classes_)
print(classification_report(y_test, y_pred_lgm_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_lgm_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_lgm_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_lgm_cc_rus))

[LightGBM] [Info] Number of positive: 3649, number of negative: 3649
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2604
[LightGBM] [Info] Number of data points in the train set: 7298, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.97      0.04      0.09      8517
           1       0.10      0.99      0.18       912

    accuracy                           0.14      9429
   macro avg       0.54      0.52      0.13      9429
weighted avg       0.89      0.14      0.09      9429

Precision: 0.09971226206285967
Accuracy: 0.13606957259518507
AUC: 0.6146718431042971


In [32]:
params_LGB= {'learning_rate': [0.001, 0.01, 0.1, 1.0],
             'num_leaves': [31, 127],
             'reg_alpha': [0.1, 0.5],
             'min_data_in_leaf': [30, 50, 100, 300, 400]}

gs_lgm_cc_rus= GridSearchCV(estimator= lgm_cc_rus, param_grid= params_LGB, cv=5, scoring='precision', n_jobs=-1, verbose=2)
gs_lgm_cc_rus.fit(X_cc_rus, y_cc_rus)

print("Best Parameters:", gs_lgm_cc_rus.best_params_)
print("Best Precision Score:", gs_lgm_cc_rus.best_score_)

tuned_lgm_cc_rus= gs_lgm_cc_rus.best_estimator_
y_pred_tuned_lgm_cc_rus= tuned_lgm_cc_rus.predict(X_test)
y_pred_prob_tuned_lgm_cc_rus= tuned_lgm_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_lgm_cc_rus, labels= tuned_lgm_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_lgm_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_lgm_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_lgm_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_lgm_cc_rus))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[LightGBM] [Info] Number of positive: 3649, number of negative: 3649
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2604
[LightGBM] [Info] Number of data points in the train set: 7298, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Best Parameters: {'learning_rate': 1.0, 'min_data_in_leaf': 100, 'num_leaves': 31, 'reg_alpha': 0.5}
Best Precision Score: 0.9595570890939322
              precision    recall  f1-score   support

           0       0.95      0.05      0.09      8517
           1       0.10      0.98      0.18       912

    accuracy                           0.14      9429
   macro avg       0.52      0.51      0.13      9429
weighted avg       0.87      0.14      0.10      9429

Precision: 0.0988681757656458
Accuracy: 0.13649379573655743
AUC: 0.6094283955309197


--- CatBoost ---

In [33]:
from catboost import CatBoostClassifier
cat_cc_rus= CatBoostClassifier(random_state=15)
cat_cc_rus.fit(X_cc_rus, y_cc_rus)
y_pred_cat_cc_rus= cat_cc_rus.predict(X_test)
y_pred_prob_cat_cc_rus= cat_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_cat_cc_rus, labels= cat_cc_rus.classes_)
print(classification_report(y_test, y_pred_cat_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_cat_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_cat_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_cat_cc_rus))

Learning rate set to 0.024073
0:	learn: 0.6586896	total: 223ms	remaining: 3m 42s
1:	learn: 0.6187813	total: 238ms	remaining: 1m 58s
2:	learn: 0.5842101	total: 256ms	remaining: 1m 25s
3:	learn: 0.5542255	total: 273ms	remaining: 1m 7s
4:	learn: 0.5222409	total: 294ms	remaining: 58.4s
5:	learn: 0.4994688	total: 310ms	remaining: 51.3s
6:	learn: 0.4742134	total: 329ms	remaining: 46.7s
7:	learn: 0.4538184	total: 360ms	remaining: 44.6s
8:	learn: 0.4341781	total: 381ms	remaining: 41.9s
9:	learn: 0.4145505	total: 398ms	remaining: 39.4s
10:	learn: 0.3965152	total: 417ms	remaining: 37.5s
11:	learn: 0.3800643	total: 435ms	remaining: 35.8s
12:	learn: 0.3636403	total: 478ms	remaining: 36.3s
13:	learn: 0.3494305	total: 493ms	remaining: 34.7s
14:	learn: 0.3360321	total: 510ms	remaining: 33.5s
15:	learn: 0.3251818	total: 530ms	remaining: 32.6s
16:	learn: 0.3156375	total: 547ms	remaining: 31.7s
17:	learn: 0.3044299	total: 568ms	remaining: 31s
18:	learn: 0.2958970	total: 582ms	remaining: 30s
19:	learn: 0

165:	learn: 0.0886773	total: 3.27s	remaining: 16.4s
166:	learn: 0.0885092	total: 3.29s	remaining: 16.4s
167:	learn: 0.0883367	total: 3.32s	remaining: 16.4s
168:	learn: 0.0881402	total: 3.33s	remaining: 16.4s
169:	learn: 0.0879544	total: 3.35s	remaining: 16.4s
170:	learn: 0.0877303	total: 3.36s	remaining: 16.3s
171:	learn: 0.0875428	total: 3.38s	remaining: 16.3s
172:	learn: 0.0873210	total: 3.39s	remaining: 16.2s
173:	learn: 0.0868076	total: 3.41s	remaining: 16.2s
174:	learn: 0.0864915	total: 3.43s	remaining: 16.2s
175:	learn: 0.0861304	total: 3.45s	remaining: 16.1s
176:	learn: 0.0858992	total: 3.46s	remaining: 16.1s
177:	learn: 0.0857720	total: 3.48s	remaining: 16.1s
178:	learn: 0.0855870	total: 3.5s	remaining: 16.1s
179:	learn: 0.0854340	total: 3.53s	remaining: 16.1s
180:	learn: 0.0852171	total: 3.55s	remaining: 16s
181:	learn: 0.0850756	total: 3.56s	remaining: 16s
182:	learn: 0.0849322	total: 3.58s	remaining: 16s
183:	learn: 0.0845990	total: 3.59s	remaining: 15.9s
184:	learn: 0.08434

335:	learn: 0.0618125	total: 6.24s	remaining: 12.3s
336:	learn: 0.0616661	total: 6.25s	remaining: 12.3s
337:	learn: 0.0615597	total: 6.26s	remaining: 12.3s
338:	learn: 0.0614777	total: 6.28s	remaining: 12.2s
339:	learn: 0.0612869	total: 6.29s	remaining: 12.2s
340:	learn: 0.0611922	total: 6.3s	remaining: 12.2s
341:	learn: 0.0608822	total: 6.32s	remaining: 12.2s
342:	learn: 0.0607812	total: 6.34s	remaining: 12.1s
343:	learn: 0.0605383	total: 6.35s	remaining: 12.1s
344:	learn: 0.0604877	total: 6.37s	remaining: 12.1s
345:	learn: 0.0603269	total: 6.38s	remaining: 12.1s
346:	learn: 0.0602110	total: 6.39s	remaining: 12s
347:	learn: 0.0601295	total: 6.41s	remaining: 12s
348:	learn: 0.0600355	total: 6.42s	remaining: 12s
349:	learn: 0.0599083	total: 6.44s	remaining: 12s
350:	learn: 0.0597800	total: 6.46s	remaining: 11.9s
351:	learn: 0.0596862	total: 6.48s	remaining: 11.9s
352:	learn: 0.0595237	total: 6.5s	remaining: 11.9s
353:	learn: 0.0594390	total: 6.52s	remaining: 11.9s
354:	learn: 0.0592334	

505:	learn: 0.0451672	total: 8.97s	remaining: 8.76s
506:	learn: 0.0451084	total: 8.98s	remaining: 8.73s
507:	learn: 0.0450523	total: 8.99s	remaining: 8.71s
508:	learn: 0.0450096	total: 9.01s	remaining: 8.69s
509:	learn: 0.0449312	total: 9.02s	remaining: 8.66s
510:	learn: 0.0448315	total: 9.04s	remaining: 8.65s
511:	learn: 0.0447762	total: 9.05s	remaining: 8.62s
512:	learn: 0.0446372	total: 9.06s	remaining: 8.6s
513:	learn: 0.0445081	total: 9.07s	remaining: 8.58s
514:	learn: 0.0444451	total: 9.08s	remaining: 8.55s
515:	learn: 0.0443218	total: 9.1s	remaining: 8.54s
516:	learn: 0.0442401	total: 9.11s	remaining: 8.51s
517:	learn: 0.0442015	total: 9.13s	remaining: 8.5s
518:	learn: 0.0440765	total: 9.17s	remaining: 8.5s
519:	learn: 0.0440054	total: 9.2s	remaining: 8.49s
520:	learn: 0.0439524	total: 9.21s	remaining: 8.47s
521:	learn: 0.0438758	total: 9.23s	remaining: 8.45s
522:	learn: 0.0438257	total: 9.26s	remaining: 8.45s
523:	learn: 0.0437107	total: 9.28s	remaining: 8.43s
524:	learn: 0.043

674:	learn: 0.0349790	total: 11.7s	remaining: 5.62s
675:	learn: 0.0349268	total: 11.7s	remaining: 5.6s
676:	learn: 0.0348869	total: 11.7s	remaining: 5.58s
677:	learn: 0.0348673	total: 11.7s	remaining: 5.56s
678:	learn: 0.0348160	total: 11.7s	remaining: 5.54s
679:	learn: 0.0347029	total: 11.7s	remaining: 5.52s
680:	learn: 0.0346572	total: 11.7s	remaining: 5.5s
681:	learn: 0.0345997	total: 11.8s	remaining: 5.48s
682:	learn: 0.0344690	total: 11.8s	remaining: 5.46s
683:	learn: 0.0343674	total: 11.8s	remaining: 5.45s
684:	learn: 0.0343303	total: 11.8s	remaining: 5.43s
685:	learn: 0.0342611	total: 11.8s	remaining: 5.41s
686:	learn: 0.0342154	total: 11.9s	remaining: 5.4s
687:	learn: 0.0341663	total: 11.9s	remaining: 5.38s
688:	learn: 0.0341306	total: 11.9s	remaining: 5.37s
689:	learn: 0.0340914	total: 11.9s	remaining: 5.35s
690:	learn: 0.0340767	total: 11.9s	remaining: 5.33s
691:	learn: 0.0340277	total: 11.9s	remaining: 5.31s
692:	learn: 0.0339740	total: 12s	remaining: 5.29s
693:	learn: 0.033

840:	learn: 0.0273064	total: 14.7s	remaining: 2.78s
841:	learn: 0.0272726	total: 14.7s	remaining: 2.76s
842:	learn: 0.0272346	total: 14.7s	remaining: 2.74s
843:	learn: 0.0271933	total: 14.7s	remaining: 2.72s
844:	learn: 0.0271608	total: 14.7s	remaining: 2.71s
845:	learn: 0.0271409	total: 14.8s	remaining: 2.69s
846:	learn: 0.0270692	total: 14.8s	remaining: 2.67s
847:	learn: 0.0270406	total: 14.8s	remaining: 2.65s
848:	learn: 0.0270021	total: 14.8s	remaining: 2.63s
849:	learn: 0.0269420	total: 14.8s	remaining: 2.61s
850:	learn: 0.0269165	total: 14.8s	remaining: 2.59s
851:	learn: 0.0268875	total: 14.8s	remaining: 2.58s
852:	learn: 0.0268492	total: 14.9s	remaining: 2.56s
853:	learn: 0.0268121	total: 14.9s	remaining: 2.54s
854:	learn: 0.0267803	total: 14.9s	remaining: 2.52s
855:	learn: 0.0267330	total: 14.9s	remaining: 2.51s
856:	learn: 0.0266990	total: 14.9s	remaining: 2.49s
857:	learn: 0.0266234	total: 14.9s	remaining: 2.47s
858:	learn: 0.0266027	total: 15s	remaining: 2.46s
859:	learn: 0.

              precision    recall  f1-score   support

           0       0.96      0.04      0.08      8517
           1       0.10      0.99      0.18       912

    accuracy                           0.13      9429
   macro avg       0.53      0.51      0.13      9429
weighted avg       0.88      0.13      0.09      9429

Precision: 0.0990633608815427
Accuracy: 0.13150917382543217
AUC: 0.6663835641410677


In [34]:
params_CatBoost= {
                  'depth': [3,5,10],
                  'learning_rate' : [0.01,0.1,1],
                  'iterations' : [5,10,50,100]
                 }

gs_cat_cc_rus= GridSearchCV(estimator= cat_cc_rus, param_grid= params_CatBoost, cv=5, scoring='precision', n_jobs=-1, verbose=2)
gs_cat_cc_rus.fit(X_cc_rus, y_cc_rus)

print("Best Parameters:", gs_cat_cc_rus.best_params_)
print("Best Precision Score:", gs_cat_cc_rus.best_score_)

tuned_cat_cc_rus= gs_cat_cc_rus.best_estimator_
y_pred_tuned_cat_cc_rus= tuned_cat_cc_rus.predict(X_test)
y_pred_prob_tuned_cat_cc_rus= tuned_cat_cc_rus.predict_proba(X_test)[:,1]

cm= confusion_matrix(y_test, y_pred_tuned_cat_cc_rus, labels= tuned_cat_cc_rus.classes_)
print(classification_report(y_test, y_pred_tuned_cat_cc_rus, zero_division=0))
print('Precision:', precision_score(y_test, y_pred_tuned_cat_cc_rus, zero_division=0))
print('Accuracy:', accuracy_score(y_test, y_pred_tuned_cat_cc_rus))
print('AUC:', roc_auc_score(y_test, y_pred_prob_tuned_cat_cc_rus))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
0:	learn: 0.3267610	total: 5.49ms	remaining: 269ms
1:	learn: 0.2291740	total: 12.1ms	remaining: 290ms
2:	learn: 0.1965238	total: 18.1ms	remaining: 283ms
3:	learn: 0.1739524	total: 24.8ms	remaining: 285ms
4:	learn: 0.1328024	total: 31.4ms	remaining: 283ms
5:	learn: 0.1277052	total: 37.1ms	remaining: 272ms
6:	learn: 0.1165292	total: 45.9ms	remaining: 282ms
7:	learn: 0.1014867	total: 51.6ms	remaining: 271ms
8:	learn: 0.0949461	total: 58.2ms	remaining: 265ms
9:	learn: 0.0901788	total: 64.9ms	remaining: 260ms
10:	learn: 0.0879831	total: 72.9ms	remaining: 259ms
11:	learn: 0.0832441	total: 81.2ms	remaining: 257ms
12:	learn: 0.0803785	total: 88.8ms	remaining: 253ms
13:	learn: 0.0780594	total: 94.3ms	remaining: 243ms
14:	learn: 0.0748637	total: 100ms	remaining: 234ms
15:	learn: 0.0736512	total: 107ms	remaining: 227ms
16:	learn: 0.0722386	total: 113ms	remaining: 219ms
17:	learn: 0.0708783	total: 119ms	remaining: 212ms
18:	learn: 0.069

--- Visualisation ---

In [35]:
print('Logistic Regression Precision:', precision_score(y_test, y_pred_lr_cc_rus, zero_division= 0))
print('Decision Tree Precision:', precision_score(y_test, y_pred_dt_cc_rus, zero_division= 0))
print('Tuned Decision Tree Precision:', precision_score(y_test, y_pred_tuned_dt_cc_rus, zero_division= 0))
print('KNeighborsClassifier Precision:', precision_score(y_test, y_pred_knn_cc_rus, zero_division= 0))
print('Tuned KNeighborsClassifier Precision:', precision_score(y_test, y_pred_tuned_knn_cc_rus, zero_division= 0))
print('GaussianNB Precision:', precision_score(y_test, y_pred_nb_cc_rus, zero_division= 0))
print('SVM Precision:', precision_score(y_test, y_pred_svc_cc_rus, zero_division= 0))
print('Random Forest Precision:', precision_score(y_test, y_pred_rf_cc_rus, zero_division= 0))
print('Tuned Random Forest Precision:', precision_score(y_test, y_pred_tuned_rf_cc_rus, zero_division= 0))
print('AdaBoost Precision:', precision_score(y_test, y_pred_ada_cc_rus, zero_division= 0))
print('Tuned AdaBoost Precision:', precision_score(y_test, y_pred_tuned_ada_cc_rus, zero_division= 0))
print('GradientBoosting Precision:', precision_score(y_test, y_pred_grb_cc_rus, zero_division= 0))
print('Tuned GradientBoosting Precision:', precision_score(y_test, y_pred_tuned_grb_cc_rus, zero_division= 0))
print('XGB Precision:', precision_score(y_test, y_pred_xgb_cc_rus, zero_division= 0))
print('Tuned XGB Precision:', precision_score(y_test, y_pred_tuned_xgb_cc_rus, zero_division= 0))
print('LGBM Precision:', precision_score(y_test, y_pred_lgm_cc_rus, zero_division= 0))
print('Tuned LGBM Precision:', precision_score(y_test, y_pred_tuned_lgm_cc_rus, zero_division= 0))
print('CatBoost Precision:', precision_score(y_test, y_pred_cat_cc_rus, zero_division= 0))
print('Tuned CatBoost Precision:', precision_score(y_test, y_pred_tuned_cat_cc_rus, zero_division= 0))

Logistic Regression Precision: 0.23045397225725095
Decision Tree Precision: 0.10436060754532092
Tuned Decision Tree Precision: 0.1053019145802651
KNeighborsClassifier Precision: 0.22490400438837083
Tuned KNeighborsClassifier Precision: 0.18631906242525711
GaussianNB Precision: 0.139
SVM Precision: 0.1285140562248996
Random Forest Precision: 0.10110232713506291
Tuned Random Forest Precision: 0.10121367331032179
AdaBoost Precision: 0.09921381906765585
Tuned AdaBoost Precision: 0.09838216003498032
GradientBoosting Precision: 0.09929701230228472
Tuned GradientBoosting Precision: 0.09814207650273224
XGB Precision: 0.09910507126284389
Tuned XGB Precision: 0.09965559382290856
LGBM Precision: 0.09971226206285967
Tuned LGBM Precision: 0.0988681757656458
CatBoost Precision: 0.0990633608815427
Tuned CatBoost Precision: 0.09993262968785088


In [36]:
precision_scores= {
                    'Logistic Regression Precision:': precision_score(y_test, y_pred_lr_cc_rus, zero_division= 0),
                    'Decision Tree Precision:': precision_score(y_test, y_pred_dt_cc_rus, zero_division= 0),
                    'Tuned Decision Tree Precision:': precision_score(y_test, y_pred_tuned_dt_cc_rus, zero_division= 0),
                    'KNeighborsClassifier Precision:': precision_score(y_test, y_pred_knn_cc_rus, zero_division= 0),
                    'Tuned KNeighborsClassifier Precision:': precision_score(y_test, y_pred_tuned_knn_cc_rus, zero_division= 0),
                    'GaussianNB Precision:': precision_score(y_test, y_pred_nb_cc_rus, zero_division= 0),
                    'SVM Precision:': precision_score(y_test, y_pred_svc_cc_rus, zero_division= 0),
                    'Random Forest Precision:': precision_score(y_test, y_pred_rf_cc_rus, zero_division= 0),
                    'Tuned Random Forest Precision:': precision_score(y_test, y_pred_tuned_rf_cc_rus, zero_division= 0),
                    'AdaBoost Precision:': precision_score(y_test, y_pred_ada_cc_rus, zero_division= 0),
                    'Tuned AdaBoost Precision:': precision_score(y_test, y_pred_tuned_ada_cc_rus, zero_division= 0),
                    'GradientBoosting Precision:': precision_score(y_test, y_pred_grb_cc_rus, zero_division= 0),
                    'Tuned GradientBoosting Precision:': precision_score(y_test, y_pred_tuned_grb_cc_rus, zero_division= 0),
                    'XGB Precision:': precision_score(y_test, y_pred_xgb_cc_rus, zero_division= 0),
                    'Tuned XGB Precision:': precision_score(y_test, y_pred_tuned_xgb_cc_rus, zero_division= 0),
                    'LGBM Precision:': precision_score(y_test, y_pred_lgm_cc_rus, zero_division= 0),
                    'Tuned LGBM Precision:': precision_score(y_test, y_pred_tuned_lgm_cc_rus, zero_division= 0),
                    'CatBoost Precision:': precision_score(y_test, y_pred_cat_cc_rus, zero_division= 0),
                    'Tuned CatBoost Precision:': precision_score(y_test, y_pred_tuned_cat_cc_rus, zero_division= 0)
                  }

cc_rus_precision= pd.DataFrame(list(precision_scores.items()), columns= ['Model', 'Precision Score'])
cc_rus_precision= cc_rus_precision.sort_values(by= 'Precision Score', ascending=False)
print(cc_rus_precision)

                                    Model  Precision Score
0          Logistic Regression Precision:         0.230454
3         KNeighborsClassifier Precision:         0.224904
4   Tuned KNeighborsClassifier Precision:         0.186319
5                   GaussianNB Precision:         0.139000
6                          SVM Precision:         0.128514
2          Tuned Decision Tree Precision:         0.105302
1                Decision Tree Precision:         0.104361
8          Tuned Random Forest Precision:         0.101214
7                Random Forest Precision:         0.101102
18              Tuned CatBoost Precision:         0.099933
15                        LGBM Precision:         0.099712
14                   Tuned XGB Precision:         0.099656
11            GradientBoosting Precision:         0.099297
9                     AdaBoost Precision:         0.099214
13                         XGB Precision:         0.099105
17                    CatBoost Precision:         0.0990

In [37]:
accuracy_scores= {
                    'Logistic Regression Accuracy:': accuracy_score(y_test, y_pred_lr_cc_rus),
                    'Decision Tree Accuracy:': accuracy_score(y_test, y_pred_dt_cc_rus),
                    'Tuned Decision Tree Accuracy:': accuracy_score(y_test, y_pred_tuned_dt_cc_rus),
                    'KNeighborsClassifier Accuracy:': accuracy_score(y_test, y_pred_knn_cc_rus),
                    'Tuned KNeighborsClassifier Accuracy:': accuracy_score(y_test, y_pred_tuned_knn_cc_rus),
                    'GaussianNB Accuracy:': accuracy_score(y_test, y_pred_nb_cc_rus),
                    'SVM Accuracy:': accuracy_score(y_test, y_pred_svc_cc_rus),
                    'Random Forest Accuracy:': accuracy_score(y_test, y_pred_rf_cc_rus),
                    'Tuned Random Forest Accuracy:': accuracy_score(y_test, y_pred_tuned_rf_cc_rus),
                    'AdaBoost Accuracy:': accuracy_score(y_test, y_pred_ada_cc_rus),
                    'Tuned AdaBoost Accuracy:': accuracy_score(y_test, y_pred_tuned_ada_cc_rus),
                    'GradientBoosting Accuracy:': accuracy_score(y_test, y_pred_grb_cc_rus),
                    'Tuned GradientBoosting Accuracy:': accuracy_score(y_test, y_pred_tuned_grb_cc_rus),
                    'XGB Accuracy:': accuracy_score(y_test, y_pred_xgb_cc_rus),
                    'Tuned XGB Accuracy:': accuracy_score(y_test, y_pred_tuned_xgb_cc_rus),
                    'LGBM Accuracy:': accuracy_score(y_test, y_pred_lgm_cc_rus),
                    'Tuned LGBM Accuracy:': accuracy_score(y_test, y_pred_tuned_lgm_cc_rus),
                    'CatBoost Accuracy:': accuracy_score(y_test, y_pred_cat_cc_rus),
                    'Tuned CatBoost Accuracy:': accuracy_score(y_test, y_pred_tuned_cat_cc_rus)
                  }

cc_rus_accuracy= pd.DataFrame(list(accuracy_scores.items()), columns= ['Model', 'Accuracy Score'])
cc_rus_accuracy= cc_rus_accuracy.sort_values(by= 'Accuracy Score', ascending=False)
print(cc_rus_accuracy)

                                   Model  Accuracy Score
6                          SVM Accuracy:        0.864036
3         KNeighborsClassifier Accuracy:        0.796903
0          Logistic Regression Accuracy:        0.721922
4   Tuned KNeighborsClassifier Accuracy:        0.625093
5                   GaussianNB Accuracy:        0.443843
2          Tuned Decision Tree Accuracy:        0.221126
1                Decision Tree Accuracy:        0.218157
18              Tuned CatBoost Accuracy:        0.147524
8          Tuned Random Forest Accuracy:        0.143600
7                Random Forest Accuracy:        0.143387
14                   Tuned XGB Accuracy:        0.138933
16                  Tuned LGBM Accuracy:        0.136494
15                        LGBM Accuracy:        0.136070
9                     AdaBoost Accuracy:        0.135539
13                         XGB Accuracy:        0.133630
17                    CatBoost Accuracy:        0.131509
11            GradientBoosting 

In [38]:
auc_scores= {
                    'Logistic Regression AUC:': roc_auc_score(y_test, y_pred_prob_lr_cc_rus),
                    'Decision Tree AUC:': roc_auc_score(y_test, y_pred_prob_dt_cc_rus),
                    'Tuned Decision Tree AUC:': roc_auc_score(y_test, y_pred_prob_tuned_dt_cc_rus),
                    'KNeighborsClassifier AUC:': roc_auc_score(y_test, y_pred_prob_knn_cc_rus),
                    'Tuned KNeighborsClassifier AUC:': roc_auc_score(y_test, y_pred_prob_tuned_knn_cc_rus),
                    'GaussianNB AUC:': roc_auc_score(y_test, y_pred_prob_nb_cc_rus),
                    'SVM AUC:': roc_auc_score(y_test, y_pred_prob_svc_cc_rus),
                    'Random Forest AUC:': roc_auc_score(y_test, y_pred_prob_rf_cc_rus),
                    'Tuned Random Forest AUC:': roc_auc_score(y_test, y_pred_prob_tuned_rf_cc_rus),
                    'AdaBoost AUC:': roc_auc_score(y_test, y_pred_prob_ada_cc_rus),
                    'Tuned AdaBoost AUC:': roc_auc_score(y_test, y_pred_prob_tuned_ada_cc_rus),
                    'GradientBoosting AUC:': roc_auc_score(y_test, y_pred_prob_grb_cc_rus),
                    'Tuned GradientBoosting AUC:': roc_auc_score(y_test, y_pred_prob_tuned_grb_cc_rus),
                    'XGB AUC:': roc_auc_score(y_test, y_pred_prob_xgb_cc_rus),
                    'Tuned XGB AUC:': roc_auc_score(y_test, y_pred_prob_tuned_xgb_cc_rus),
                    'LGBM AUC:': roc_auc_score(y_test, y_pred_prob_lgm_cc_rus),
                    'Tuned LGBM AUC:': roc_auc_score(y_test, y_pred_prob_tuned_lgm_cc_rus),
                    'CatBoost AUC:': roc_auc_score(y_test, y_pred_prob_cat_cc_rus),
                    'Tuned CatBoost AUC:': roc_auc_score(y_test, y_pred_prob_tuned_cat_cc_rus)
                  }

cc_rus_auc= pd.DataFrame(list(auc_scores.items()), columns= ['Model', 'AUC Score'])
cc_rus_auc= cc_rus_auc.sort_values(by= 'AUC Score', ascending=False)
print(cc_rus_auc)

                              Model  AUC Score
0          Logistic Regression AUC:   0.830642
5                   GaussianNB AUC:   0.793569
4   Tuned KNeighborsClassifier AUC:   0.790858
3         KNeighborsClassifier AUC:   0.736321
17                    CatBoost AUC:   0.666384
8          Tuned Random Forest AUC:   0.663161
9                     AdaBoost AUC:   0.662416
11            GradientBoosting AUC:   0.660402
7                Random Forest AUC:   0.650339
13                         XGB AUC:   0.650272
10              Tuned AdaBoost AUC:   0.649291
12      Tuned GradientBoosting AUC:   0.644108
18              Tuned CatBoost AUC:   0.641926
14                   Tuned XGB AUC:   0.633362
15                        LGBM AUC:   0.614672
16                  Tuned LGBM AUC:   0.609428
6                          SVM AUC:   0.599122
2          Tuned Decision Tree AUC:   0.549372
1                Decision Tree AUC:   0.537846
