# Import liberies

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from functions import *
from pipeline import *

from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split as split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

# The Data

In [2]:
heartdf=pd.read_csv('heart_2020_cleaned.csv')

Manipulation that we did in the EDA part

In [3]:
heartdf3 = heartdf[(heartdf.SleepTime>=3)&(heartdf.SleepTime<=15)].drop_duplicates()

In [5]:
heartdf3.shape

(299908, 18)

# Model

First of all like we said in the EDA, the target is not balanced so we will use some methods to get the target balance.

We will use the under/over sampling and penalizing method.

And we will see which one is better of course we will try differnts models as well.

So First lets see the imbalanced ratio we have in the data.

In [6]:
counter = Counter(heartdf3.HeartDisease)
print(counter)

Counter({'No': 272918, 'Yes': 26990})


In [8]:
ratio=(26990/272918)*100
print(f'Imbalanced Ratio of Target: {ratio:.3f}')

Imbalanced Ratio of Target: 9.889


* Spliting the data into Train and Test

In [4]:
X=heartdf3.drop('HeartDisease',axis=1)
y=heartdf3['HeartDisease'].replace({'Yes':1,'No':0})

X_train, X_test, y_train, y_test = split(X, y, test_size=0.3, 
                                            random_state=951357)

* Columns to use in the transform

In [5]:
col_to_index = ['Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex',
                                'Race','Diabetic','GenHealth',
                                'PhysicalActivity','Asthma','KidneyDisease','SkinCancer']

list_col = ['AgeCategory']

age_dict = {'18-24':18,'25-29':25,'30-34':30,'35-39':35,'40-44':40,'45-49':45,'50-54':50,'55-59':55,'60-64':60,
            '65-69':65,'70-74':70,'75-79':75,'80 or older':80}
list_dict = [age_dict]

## 1. Penalizing

* Pipelines:

We will try differents weights, the balanced weight and we will give some weights by ourself

In [28]:
pipeLR = Pipeline([
    ('columnAdd',AddColumnIndex(col_to_index)),
    ('columnDropper',AddColumnGroup(list_col,list_dict)),
    ('LR',LogisticRegression())
])
pipeRF = Pipeline([
    ('columnAdd',AddColumnIndex(col_to_index)),
    ('columnDropper',AddColumnGroup(list_col,list_dict)),
    ('RF',RandomForestClassifier())
])
pipeXGB = Pipeline([
    ('columnAdd',AddColumnIndex(col_to_index)),
    ('columnDropper',AddColumnGroup(list_col,list_dict)),
    ('XGB',XGBClassifier())
])

* Parameters to Grid Search

In [6]:
param_range = [3, 4, 9]
param_range2 = [2, 3, 5]
n_estimators = [30,40]
param_weight = [{0: f, 1:t} for (f,t) in [(0.8,9),(0.5,5),(1,10),(1,9)]]
scale_weights = [5,9,10]

* Cross Validation

In [7]:
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)

* Logistic Regression Model

In [12]:
parameters_LR = [{'LR__class_weight': param_weight}]

modelLR = GridSearchCV(estimator=pipeLR,
        param_grid=parameters_LR,
        cv=cv,
        scoring='roc_auc')
modelLR.fit(X_train,y_train)

* Random Forest Model

In [18]:
parameters_RF = [{'RF__min_samples_leaf': param_range2,
                        'RF__max_depth': param_range,
                        'RF__n_estimators': n_estimators,
                        'RF__class_weight': param_weight}]
modelRF = GridSearchCV(estimator=pipeRF,
        param_grid=parameters_RF,
        cv=cv,
        scoring='roc_auc')
modelRF.fit(X_train,y_train)

* XGBoost Model

The model XGBClassifier doesn't have classes weights. 

So in this case we will try two ways, one is sample_weight in the fit and second one is parameter of XGBClassifier scale_pos_weight.

In [29]:
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)
parameters_XGB1 = [{'XGB__max_depth': param_range,
                        'XGB__min_child_weight': param_range2,
                        'XGB__n_estimators': n_estimators}]
modelXGB1 = GridSearchCV(estimator=pipeXGB,
        param_grid=parameters_XGB1,
        cv=cv,
        scoring='roc_auc')
modelXGB1.fit(X_train,y_train, XGB__sample_weight=classes_weights)

In [17]:
parameters_XGB2 = [{'XGB__max_depth': param_range,
                        'XGB__min_child_weight': param_range2,
                        'XGB__n_estimators': n_estimators,
                        'XGB__scale_pos_weight': scale_weights}]
modelXGB2 = GridSearchCV(estimator=pipeXGB,
        param_grid=parameters_XGB2,
        cv=cv,
        scoring='roc_auc')
modelXGB2.fit(X_train,y_train)

* Results:

In [13]:
printreport(modelLR.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  141708  49401
1    4832  13994

Test Confusion Matrix:
       0      1
0  60751  21058
1   2154   6010

Train Precision: 0.221
Test Precision: 0.222

Train Recall: 0.743
Test Recall: 0.736

Train ROC AUC: 0.742
Test ROC AUC: 0.739

Train PR Curve: 0.187
Test PR Curve: 0.187


In [20]:
printreport(modelRF.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  141825  49284
1    3874  14952

Test Confusion Matrix:
       0      1
0  60725  21084
1   1921   6243

Train Precision: 0.603
Test Precision: 0.599
Train Recall: 0.768
Test Recall: 0.753
Train ROC AUC: 0.768
Test ROC AUC: 0.753


In [30]:
printreport(modelXGB1.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  137112  53997
1    3474  15352

Test Confusion Matrix:
       0      1
0  58760  23049
1   1651   6513

Train Precision: 0.221
Test Precision: 0.220

Train Recall: 0.815
Test Recall: 0.798

Train ROC AUC: 0.766
Test ROC AUC: 0.758

Train PR Curve: 0.197
Test PR Curve: 0.194


In [22]:
printreport(modelXGB2.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  138662  52447
1    3665  15161

Test Confusion Matrix:
       0      1
0  59441  22368
1   1725   6439

Train Precision: 0.599
Test Precision: 0.598
Train Recall: 0.765
Test Recall: 0.758
Train ROC AUC: 0.765
Test ROC AUC: 0.758


In this first part that we used Penalizing method to balanced the target.

The best model that we got is _______ model. 

With a Recall of ______ and AUC ______.


Now we are going to do the second part that is under/over sampling. 

## 2. Under/over Sampling

* Pipelines

In [21]:
pipeDT = Pipeline([
    ('columnAdd',AddColumnIndex(col_to_index)),
    ('columnDropper',AddColumnGroup(list_col,list_dict)),
    ('over',SMOTE(sampling_strategy=0.1, k_neighbors=7)),
    ('under',RandomUnderSampler(sampling_strategy=0.5)),
    ('DT',DecisionTreeClassifier())
])

pipeLR2 = Pipeline([
    ('columnAdd',AddColumnIndex(col_to_index)),
    ('columnDropper',AddColumnGroup(list_col,list_dict)),
    ('over',SMOTE(sampling_strategy=0.1, k_neighbors=7)),
    ('under',RandomUnderSampler(sampling_strategy=0.5)),
    ('LR',LogisticRegression())
])

pipeXGB3 = Pipeline([
    ('columnAdd',AddColumnIndex(col_to_index)),
    ('columnDropper',AddColumnGroup(list_col,list_dict)),
    ('over',SMOTE(sampling_strategy=0.1, k_neighbors=7)),
    ('under',RandomUnderSampler(sampling_strategy=0.5)),
    ('XGB',XGBClassifier())
])

* Parameters to GridSearch

In [17]:
criterion = ['gini','entropy']
k = [1, 2, 3, 4, 5, 6, 7]

In [18]:
parameters_DT = {'DT__criterion': criterion,
                'DT__max_depth': param_range,
                'DT__min_samples_split': param_range2,
                'DT__min_samples_leaf': param_range2,
                'over__k_neighbors': k
                }       

modelDT = GridSearchCV(estimator=pipeDT, 
                       param_grid=parameters_DT,
                       n_jobs=-1, 
                       cv=cv, 
                       scoring='roc_auc')

modelDT.fit(X_train, y_train)

In [22]:
parameters_LR2 = [{'LR__class_weight': param_weight,
                    'over__k_neighbors': k}]

modelLR2 = GridSearchCV(estimator=pipeLR2,
                        param_grid=parameters_LR2,
                        n_jobs=-1, 
                        cv=cv,
                        scoring='roc_auc')
modelLR2.fit(X_train,y_train)

In [23]:
parameters_XGB3 = [{'XGB__max_depth': param_range,
                    'XGB__min_child_weight': param_range2,
                    'XGB__n_estimators': n_estimators,
                    'over__k_neighbors': k}]

modelXGB3 = GridSearchCV(estimator=pipeXGB3,
                        param_grid=parameters_XGB3,
                        cv=cv,
                        scoring='roc_auc')

modelXGB3.fit(X_train,y_train)

In [24]:
printreport(modelDT.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  161277  29832
1    7330  11496

Test Confusion Matrix:
       0      1
0  69061  12748
1   3402   4762

Train Precision: 0.278
Test Precision: 0.272

Train Recall: 0.611
Test Recall: 0.583

Train ROC AUC: 0.727
Test ROC AUC: 0.714

Train PR Curve: 0.205
Test PR Curve: 0.196


In [25]:
printreport(modelLR2.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
       0       1
0  65170  125939
1    540   18286

Test Confusion Matrix:
       0      1
0  27959  53850
1    270   7894

Train Precision: 0.127
Test Precision: 0.128

Train Recall: 0.971
Test Recall: 0.967

Train ROC AUC: 0.656
Test ROC AUC: 0.654

Train PR Curve: 0.126
Test PR Curve: 0.127


In [26]:
printreport(modelXGB3.best_estimator_, X_train, y_train, X_test, y_test)

Train Confusion Matrix:
        0      1
0  162891  28218
1    7231  11595

Test Confusion Matrix:
       0      1
0  69757  12052
1   3221   4943

Train Precision: 0.291
Test Precision: 0.291

Train Recall: 0.616
Test Recall: 0.605

Train ROC AUC: 0.734
Test ROC AUC: 0.729

Train PR Curve: 0.214
Test PR Curve: 0.212
