# Import liberies

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from functions import *
from pipeline import *

from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split as split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler 


from xgboost import XGBClassifier


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

# The Data

In [36]:
heartdf=pd.read_csv('heart_2020_cleaned.csv')

Manipulation that we did in the EDA part

In [4]:
heartdf3 = heartdf[(heartdf.SleepTime>=3)&(heartdf.SleepTime<=15)].drop_duplicates()

In [23]:
heartdf3.shape

(299908, 18)

# Model

First of all like we said in the EDA, the target is not balanced so we will use some methods to get the target balance.

We will use the under/over sampling and penalizing method.

And we will see which one is better of course we will try differnts models as well.

So First lets see the imbalanced ratio we have in the data.

In [24]:
counter = Counter(heartdf3.HeartDisease)
print(counter)

Counter({'No': 272918, 'Yes': 26990})


In [25]:
ratio=(26990/272918)*100
print(f'Imbalanced Ratio of Target: {ratio:.3f}')

Imbalanced Ratio of Target: 9.889


* Spliting the data into Train and Test

In [35]:
X=heartdf3.drop('HeartDisease',axis=1)
y=heartdf3['HeartDisease'].replace({'Yes':1,'No':0})

X_train, X_test, y_train, y_test = split(X, y, test_size=0.3, 
                                            random_state=951357)

* Columns to use in the transform

In [6]:
col_to_index = ['Smoking','Stroke','DiffWalking','Sex',
                                'Race','Diabetic','GenHealth',
                                'PhysicalActivity','KidneyDisease','SkinCancer'
                                # ,'AlcoholDrinking','Asthma'
                                ]

list_col = ['AgeCategory']

age_dict = {'18-24':18,'25-29':25,'30-34':30,'35-39':35,'40-44':40,'45-49':45,'50-54':50,'55-59':55,'60-64':60,
            '65-69':65,'70-74':70,'75-79':75,'80 or older':80}
list_dict = [age_dict]

drop_cols = ['AlcoholDrinking','Asthma','MentalHealth']

## 1. Without any imbalance method

* Pipelines:

In [7]:
pipeRF = Pipeline([('DropColumns', DropColumns(drop_cols)),
                    ('columnAdd', AddColumnIndex(col_to_index)),
                    ('columnDropper', AddColumnGroup(list_col,list_dict)),
                    ('scaler', StandardScaler()),
                    ('RF', RandomForestClassifier())
])
pipeXGB = Pipeline([('DropColumns',DropColumns(drop_cols)),
                    ('columnAdd',AddColumnIndex(col_to_index)),
                    ('columnDropper',AddColumnGroup(list_col,list_dict)),
                    ('scaler',StandardScaler()),
                    ('XGB',XGBClassifier())
])
                    

* Parameters to Grid Search

In [8]:
param_range = [3, 4, 9]
param_range2 = [2, 3, 5]
n_estimators = [30,40]
param_weight = [{0: 0.8, 1:9}, {0: 0.5, 1:5}, {0: 1, 1:10}, {0: 1, 1:9}, "balanced"]
scale_weights = [5,9,10]

* Cross Validation

In [21]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

* Random Forest Model

In [31]:
parameters_RF08 = [{'RF__min_samples_leaf': param_range2,
                        'RF__max_depth': param_range,
                        'RF__n_estimators': n_estimators}]
modelRF08 = GridSearchCV(estimator=pipeRF,
                        param_grid=parameters_RF08,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelRF08.fit(X_train,y_train)

In [32]:
printreport(modelRF08.best_estimator_, X_train, y_train)

Confusion Matrix:
        0    1
0  190896  213
1   18138  688

Precision: 0.764
Recall: 0.037
ROC AUC: 0.842
PR Curve: 0.377


* XGBoost Model

In [33]:
parameters_XGB = [{'XGB__max_depth': param_range,
                        'XGB__min_child_weight': param_range2,
                        'XGB__n_estimators': n_estimators}]

modelXGB08 = GridSearchCV(estimator=pipeXGB,
                        param_grid=parameters_XGB,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelXGB08.fit(X_train,y_train)



In [34]:

printreport(modelXGB08.best_estimator_, X_train, y_train)

Confusion Matrix:
        0     1
0  190023  1086
1   17203  1623

Precision: 0.599
Recall: 0.086
ROC AUC: 0.844
PR Curve: 0.364


## 1. Penalizing

We will try differents weights, the balanced weight and we will give some weights by ourself

In [35]:
pipeLR = Pipeline([('DropColumns',DropColumns(drop_cols)),
                   ('columnAdd',AddColumnIndex(col_to_index)),
                   ('columnDropper',AddColumnGroup(list_col,list_dict)),
                   ('scaler',StandardScaler()),
                   ('LR',LogisticRegression())
])


* Logistic Regression Model

In [36]:
parameters_LR = [{'LR__class_weight': param_weight}]

modelLR = GridSearchCV(estimator=pipeLR,
                        param_grid=parameters_LR,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelLR.fit(X_train,y_train)

In [37]:
printreport(modelLR.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  134044  57065
1    3862  14964

Test Confusion Matrix:
       0      1
0  57589  24220
1   1767   6397

Train Precision: 0.208
Test Precision: 0.209

Train Recall: 0.795
Test Recall: 0.784

Train ROC AUC: 0.823
Test ROC AUC: 0.820

Train PR Curve: 0.324
Test PR Curve: 0.323


* Random Forest Model

In [38]:
parameters_RF = [{'RF__min_samples_leaf': param_range2,
                        'RF__max_depth': param_range,
                        'RF__n_estimators': n_estimators,
                        'RF__class_weight': param_weight}]
modelRF = GridSearchCV(estimator=pipeRF,
                        param_grid=parameters_RF,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelRF.fit(X_train,y_train)

In [39]:
printreport(modelRF.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  137175  53934
1    3457  15369

Test Confusion Matrix:
       0      1
0  58742  23067
1   1697   6467

Train Precision: 0.222
Test Precision: 0.219

Train Recall: 0.816
Test Recall: 0.792

Train ROC AUC: 0.843
Test ROC AUC: 0.830

Train PR Curve: 0.371
Test PR Curve: 0.341


* XGBoost Model

The model XGBClassifier doesn't have classes weights. 

So in this case we will try two ways, one is sample_weight in the fit and second one is parameter of XGBClassifier scale_pos_weight.

In [40]:
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)
modelXGB1 = GridSearchCV(estimator=pipeXGB,
                        param_grid=parameters_XGB,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelXGB1.fit(X_train,y_train, XGB__sample_weight=classes_weights)

In [41]:
printreport(modelXGB1.best_estimator_, X_train, y_train)

Confusion Matrix:
        0      1
0  137139  53970
1    3536  15290

Precision: 0.221
Recall: 0.812
ROC AUC: 0.841
PR Curve: 0.353


In [42]:
parameters_XGB2 = [{'XGB__max_depth': param_range,
                        'XGB__min_child_weight': param_range2,
                        'XGB__n_estimators': n_estimators,
                        'XGB__scale_pos_weight': scale_weights}]
modelXGB2 = GridSearchCV(estimator=pipeXGB,
                        param_grid=parameters_XGB2,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelXGB2.fit(X_train,y_train)

In [43]:
printreport(modelXGB2.best_estimator_, X_train, y_train)

Confusion Matrix:
        0      1
0  142517  48592
1    4066  14760

Precision: 0.233
Recall: 0.784
ROC AUC: 0.841
PR Curve: 0.352


In this first part that we used Penalizing method to balanced the target.

The best model that we got is Random Forest. 

With a Recall of 0.815, AUC 0.768 and PR 0.199.


Now we are going to do the second part that is under/over sampling. 

## 2. Under/over Sampling

* Pipelines

In [15]:
pipeDT = Pipeline([('DropColumns',DropColumns(drop_cols)),
                   ('columnAdd',AddColumnIndex(col_to_index)),
                   ('columnDropper',AddColumnGroup(list_col,list_dict)),
                   ('scaler',StandardScaler()),
                   ('over',SMOTE(sampling_strategy=0.1, k_neighbors=7)),
                   ('under',RandomUnderSampler(sampling_strategy=0.5)),
                   ('DT',DecisionTreeClassifier())
])

pipeLR2 = Pipeline([('DropColumns',DropColumns(drop_cols)),
                    ('columnAdd',AddColumnIndex(col_to_index)),
                    ('columnDropper',AddColumnGroup(list_col,list_dict)),
                    ('scaler',StandardScaler()),
                    ('over',SMOTE(sampling_strategy=0.1, k_neighbors=7)),
                    ('under',RandomUnderSampler(sampling_strategy=0.5)),
                    ('LR',LogisticRegression())
])

pipeXGB3 = Pipeline([('DropColumns',DropColumns(drop_cols)),
                     ('columnAdd',AddColumnIndex(col_to_index)),
                     ('columnDropper',AddColumnGroup(list_col,list_dict)),
                     ('over',SMOTE(sampling_strategy=0.1, k_neighbors=7)),
                     ('under',RandomUnderSampler(sampling_strategy=0.5)),
                     ('XGB',XGBClassifier())
])

* Parameters to GridSearch

In [12]:
criterion = ['gini','entropy']
k = [1, 2, 3, 4, 5, 6, 7]

In [46]:
parameters_DT = {'DT__criterion': criterion,
                'DT__max_depth': param_range,
                'DT__min_samples_split': param_range2,
                'DT__min_samples_leaf': param_range2,
                'over__k_neighbors': k
                }       

modelDT = GridSearchCV(estimator=pipeDT, 
                       param_grid=parameters_DT,
                       n_jobs=-1, 
                       cv=cv, 
                       scoring='roc_auc')

modelDT.fit(X_train, y_train)

In [47]:
printreport(modelDT.best_estimator_, X_train, y_train)

Confusion Matrix:
        0      1
0  163038  28071
1    7706  11120

Precision: 0.284
Recall: 0.591
ROC AUC: 0.835
PR Curve: 0.321


In [48]:
parameters_LR2 = [{'LR__class_weight': param_weight,
                    'over__k_neighbors': k}]

modelLR2 = GridSearchCV(estimator=pipeLR2,
                        param_grid=parameters_LR2,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelLR2.fit(X_train,y_train)

In [49]:
printreport(modelLR2.best_estimator_, X_train, y_train)

Confusion Matrix:
        0      1
0  138919  52190
1    4387  14439

Precision: 0.217
Recall: 0.767
ROC AUC: 0.823
PR Curve: 0.324


In [16]:
parameters_XGB3 = [{'XGB__max_depth': param_range,
                    'XGB__min_child_weight': param_range2,
                    'XGB__n_estimators': n_estimators,
                    'over__k_neighbors': k}]

modelXGB3 = GridSearchCV(estimator=pipeXGB3,
                        param_grid=parameters_XGB3,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')

modelXGB3.fit(X_train,y_train)

In [17]:
printreport(modelXGB3.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  163022  28087
1    7295  11531

Test Confusion Matrix:
       0      1
0  69835  11974
1   3263   4901

Train Precision: 0.291
Test Precision: 0.290

Train Recall: 0.613
Test Recall: 0.600

Train ROC AUC: 0.841
Test ROC AUC: 0.834

Train PR Curve: 0.352
Test PR Curve: 0.349


## Smoting with RandomOverSampler, with everything in Pipeline

In [37]:
from imblearn.over_sampling import RandomOverSampler
heartdf4 = heartdf.drop_duplicates()
x = heartdf4.drop('HeartDisease',axis=1)
y = heartdf4['HeartDisease'].replace({'Yes':1,'No':0})
X_train3, X_test3, y_train3, y_test3 = split(x, y, train_size=.75,random_state=456876)

In [56]:
pipeRFAllPipe = Pipeline([
                    # ('DropColumns',DropColumns(drop_cols)),
                     ('columnAdd',AddColumnIndex(col_to_index)),
                     ('columnDropper',AddColumnGroup(list_col,list_dict)),
                     ('over2',RandomOverSampler(random_state=100)),
                     ('RF',RandomForestClassifier())
])


pipeRFAllPipe.fit(X_train3,y_train3)

In [57]:
printreport(pipeRFAllPipe, X_train3, y_train3, X_test3, y_test3)

Train Confusion Matrix:
        0      1
0  205230    622
1       0  20435

Test Confusion Matrix:
       0     1
0  65412  3192
1   5471  1355

Train Precision: 0.970
Test Precision: 0.298

Train Recall: 1.000
Test Recall: 0.199

Train ROC AUC: 1.000
Test ROC AUC: 0.781

Train PR Curve: 1.000
Test PR Curve: 0.228


In [38]:
le = LabelEncoder()

heartdf5 = heartdf4.copy(deep = True)
 

col = list(heartdf4.columns)
categorical_features = []
numerical_features = []
for i in heartdf5.columns:
    if len(heartdf4[i].unique()) > 6:
        numerical_features.append(i)
    else:
        heartdf5[i]=le.fit_transform(heartdf5[i])
        categorical_features.append(i)

In [39]:
def convert_age_range_to_mean(age):
    if isinstance(age, int):
        return float(age)
    
    if '-' in age:
        age_min, age_max = age.split('-')
        return (float(age_min) + float(age_max)) / 2
    
    if ' or older' in age:
        age_min = age.replace(' or older', '')
        return float(age_min) + 10  # treats '80 or older' as 90

    return float(age)  # or any other default value you prefer

heartdf5['AgeCategory'] = heartdf5['AgeCategory'].apply(convert_age_range_to_mean)

## With add and drop outside of Pipeline

In [60]:
x2 = heartdf5.drop('HeartDisease',axis=1)
y2 = heartdf5['HeartDisease']
X_train23, X_test23, y_train23, y_test23 = split(x2, y2, train_size=.75,random_state=456876)

In [61]:
pipeRFHalfPipe = Pipeline([
                    # ('DropColumns',DropColumns(drop_cols)),
                    #  ('columnAdd',AddColumnIndex(col_to_index)),
                    #  ('columnDropper',AddColumnGroup(list_col,list_dict)),
                     ('over2',RandomOverSampler(random_state=100)),
                     ('RF',RandomForestClassifier())
])


pipeRFHalfPipe.fit(X_train23,y_train23)

In [62]:
printreport(pipeRFHalfPipe, X_train23, y_train23, X_test23, y_test23)

Train Confusion Matrix:
        0      1
0  205231    621
1       0  20435

Test Confusion Matrix:
       0     1
0  65432  3172
1   5439  1387

Train Precision: 0.971
Test Precision: 0.304

Train Recall: 1.000
Test Recall: 0.203

Train ROC AUC: 1.000
Test ROC AUC: 0.780

Train PR Curve: 1.000
Test PR Curve: 0.229


## Over also otside of Pipe

In [63]:
over = RandomOverSampler(random_state=100)
x_new , y_new = over.fit_resample(x2,y2)
X_trainNew3, X_testNew3, y_trainNew3, y_testNew3 = split(x_new, y_new, train_size=.75,random_state=42)

pipeRFjustmodel = Pipeline([
                    # ('DropColumns',DropColumns(drop_cols)),
                    #  ('columnAdd',AddColumnIndex(col_to_index)),
                    #  ('columnDropper',AddColumnGroup(list_col,list_dict)),
                    #  ('over2',RandomOverSampler(random_state=100)),
                     ('RF',RandomForestClassifier())
])


pipeRFjustmodel.fit(X_trainNew3,y_trainNew3)

In [64]:
printreport(pipeRFHalfPipe, X_trainNew3, y_trainNew3, X_testNew3, y_testNew3)

Train Confusion Matrix:
        0       1
0  203169    2851
1   41048  164616

Test Confusion Matrix:
       0      1
0  67494    942
1  13739  55053

Train Precision: 0.983
Test Precision: 0.983

Train Recall: 0.800
Test Recall: 0.800

Train ROC AUC: 0.963
Test ROC AUC: 0.963

Train PR Curve: 0.969
Test PR Curve: 0.969


## No Pipeline

In [65]:
modelRfnopipe = RandomForestClassifier()

modelRfnopipe.fit(X_trainNew3,y_trainNew3)

In [66]:
printreport(modelRfnopipe, X_trainNew3, y_trainNew3, X_testNew3, y_testNew3)

Train Confusion Matrix:
        0       1
0  205172     848
1       3  205661

Test Confusion Matrix:
       0      1
0  63898   4538
1     38  68754

Train Precision: 0.996
Test Precision: 0.938

Train Recall: 1.000
Test Recall: 0.999

Train ROC AUC: 1.000
Test ROC AUC: 0.997

Train PR Curve: 1.000
Test PR Curve: 0.994


In [68]:
over22 = SMOTE(random_state=100)
x_new , y_new = over22.fit_resample(x2,y2)
X_trainNew3, X_testNew3, y_trainNew3, y_testNew3 = split(x_new, y_new, train_size=.75,random_state=42)


In [69]:
modelRfnopipesmote = RandomForestClassifier()

modelRfnopipesmote.fit(X_trainNew3,y_trainNew3)

In [70]:
printreport(modelRfnopipesmote, X_trainNew3, y_trainNew3, X_testNew3, y_testNew3)

Train Confusion Matrix:
        0       1
0  205518     502
1     185  205479

Test Confusion Matrix:
       0      1
0  60940   7496
1   5827  62965

Train Precision: 0.998
Test Precision: 0.894

Train Recall: 0.999
Test Recall: 0.915

Train ROC AUC: 1.000
Test ROC AUC: 0.963

Train PR Curve: 1.000
Test PR Curve: 0.956


## With droping

Drop of Ashtma and MentalHealth leave the model a bit worst.
The AlcoholDrinking doesn't influence at all.
And the time SleepTime also influence just a bit to worst.

In [40]:
heartdf6 = heartdf.drop_duplicates().drop(['AlcoholDrinking'],axis=1)

In [41]:
le = LabelEncoder()

heartdf7 = heartdf6.copy(deep = True)
 

col = list(heartdf6.columns)
categorical_features = []
numerical_features = []
for i in heartdf7.columns:
    if len(heartdf6[i].unique()) > 6:
        numerical_features.append(i)
    else:
        heartdf7[i]=le.fit_transform(heartdf6[i])
        categorical_features.append(i)


def convert_age_range_to_mean(age):
    if isinstance(age, int):
        return float(age)
    
    if '-' in age:
        age_min, age_max = age.split('-')
        return (float(age_min) + float(age_max)) / 2
    
    if ' or older' in age:
        age_min = age.replace(' or older', '')
        return float(age_min) + 10  # treats '80 or older' as 90

    return float(age)  # or any other default value you prefer

heartdf7['AgeCategory'] = heartdf7['AgeCategory'].apply(convert_age_range_to_mean)

In [44]:
x3 = heartdf7.drop('HeartDisease',axis=1)
y3 = heartdf7['HeartDisease']
over22 = SMOTE(random_state=100)
X_trainNew3, X_testNew3, y_trainNew3, y_testNew3 = split(x3, y3, train_size=.75,random_state=42)
x_new , y_new = over22.fit_resample(X_trainNew3,y_trainNew3)


In [45]:
modelRfnopipesmotedrop = RandomForestClassifier()

modelRfnopipesmotedrop.fit(x_new,y_new)

RandomForestClassifier()

In [46]:
printreport(modelRfnopipesmotedrop, x_new,y_new, X_testNew3, y_testNew3)

Train Confusion Matrix:
        0       1
0  205340     489
1     215  205614

Test Confusion Matrix:
       0     1
0  62054  6573
1   4934  1869

Train Precision: 0.998
Test Precision: 0.221

Train Recall: 0.999
Test Recall: 0.275

Train ROC AUC: 1.000
Test ROC AUC: 0.756

Train PR Curve: 1.000
Test PR Curve: 0.188


In the second part that we used Under/Over Sampling method to balanced the target.

The best model that we got is XGBoost. 

With a Recall of 0.614, AUC 0.734 and PR 0.214


In [108]:
y_pred_proba = modelRF.predict_proba(X_test)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba).round(3)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.legend(loc=4)
plt.show()

NameError: name 'modelRF' is not defined