# 04. Modeling
___


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_curve, roc_auc_score

from sklearn.model_selection import GridSearchCV




In [2]:
heart22 = pd.read_csv('~/Desktop/capstone-project-Tasnimacj/data/cleaned_data/heart22_preprocessed.csv')

In [3]:
heart22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246013 entries, 0 to 246012
Data columns (total 43 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Unnamed: 0                     246013 non-null  int64  
 1   Female                         246013 non-null  int64  
 2   GeneralHealth                  246013 non-null  int64  
 3   PhysicalHealthDays             246013 non-null  float64
 4   MentalHealthDays               246013 non-null  float64
 5   LastCheckupTime                246013 non-null  int64  
 6   PhysicalActivities             246013 non-null  int64  
 7   SleepHours                     246013 non-null  float64
 8   RemovedTeeth                   246013 non-null  int64  
 9   HadHeartAttack                 246013 non-null  int64  
 10  HadAngina                      246013 non-null  int64  
 11  HadStroke                      246013 non-null  int64  
 12  HadAsthma                     

In [5]:
y = heart22['HadAngina'] # Target Variable
X = heart22.drop('HadAngina', axis=1) 



In [6]:
print('Shape of y:', y.shape)
print('Shape of X:', X.shape)

Shape of y: (246013,)
Shape of X: (246013, 42)


In [7]:
#1st split

X_rem, X_test, y_rem, y_test = train_test_split(X, y, test_size=0.2, random_state=25, stratify=y)

print(f'The remainder set has {len(X_rem)} data points.')
print(f'The test set has {len(X_test)} data points.')

The remainder set has 196810 data points.
The test set has 49203 data points.


Baseline Logistic Regression


In [8]:
ss = StandardScaler().fit(X_rem)
X_rem_ss = ss.transform(X_rem)
X_test_ss = ss.transform(X_test)

In [9]:
baseline_log_reg = LogisticRegression(random_state=25)
baseline_log_reg.fit(X_rem_ss, y_rem)

LogisticRegression(random_state=25)

In [10]:
print(f'Accuracy on remainder set: {baseline_log_reg.score(X_rem_ss, y_rem)}')
print(f'Accuracy on test set: {baseline_log_reg.score(X_test_ss, y_test)}')

Accuracy on remainder set: 0.9450586860423759
Accuracy on test set: 0.9446375221023109


In [11]:
# class distribution
display(y_test.value_counts())

HadAngina
0    46212
1     2991
Name: count, dtype: int64

In [12]:
# predict classification
y_test_pred = baseline_log_reg.predict(X_test_ss)

# confusion matrix
conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Unnamed: 0,predicted 0,predicted 1
true 0,45642,570
true 1,2154,837


Recall score: 27.98%
Precision score: 59.49%
F1 score: 38.06%


Tuning Logistic Regression

Pipeline

In [13]:
from tempfile import mkdtemp
cachedir = mkdtemp()

In [14]:
pipe = Pipeline([("scaler", StandardScaler()),
                 ("my_pca", PCA(n_components=20)),
                 ("model", LogisticRegression())], memory=cachedir)

Hyperparameter Optimisation

In [18]:

c_values = [.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]

# Parameters
log_reg_param = [

    {'scaler': [ StandardScaler()],
     'my_pca__n_components': [20],
     'model': [LogisticRegression(solver='saga',random_state=1, n_jobs=-1, max_iter=10000)], 
     'model__C': c_values,
     'model__penalty': ['l1', 'l2'],
    }
]

Grid

In [19]:
grid = GridSearchCV(estimator=pipe,param_grid=log_reg_param, cv=5,verbose=1,refit=True)

In [20]:
fittedgrid_lr = grid.fit(X_rem,y_rem)



Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [21]:
fittedgrid_lr.best_params_

{'model': LogisticRegression(C=0.001, max_iter=10000, n_jobs=-1, random_state=1,
                    solver='saga'),
 'model__C': 0.001,
 'model__penalty': 'l2',
 'my_pca__n_components': 20,
 'scaler': StandardScaler()}

In [22]:
fittedgrid_lr.best_estimator_

Pipeline(memory='/var/folders/r3/bz5mjtds4dvdw0hskxwvs9vc0000gp/T/tmphe1wkm0v',
         steps=[('scaler', StandardScaler()), ('my_pca', PCA(n_components=20)),
                ('model',
                 LogisticRegression(C=0.001, max_iter=10000, n_jobs=-1,
                                    random_state=1, solver='saga'))])

In [23]:
# Print the accuracies
print(f"Best accuracy on the remainder set: {fittedgrid_lr.score(X_rem, y_rem)}")
print(f"Best accuracy on the test set: {fittedgrid_lr.score(X_test, y_test)}")

Best accuracy on the remainder set: 0.9424876784716224
Best accuracy on the test set: 0.9429709570554641


In [24]:
# predict classification
y_test_pred = fittedgrid_lr.predict(X_test)

# confusion matrix
conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Unnamed: 0,predicted 0,predicted 1
true 0,45845,367
true 1,2439,552


Recall score: 18.46%
Precision score: 60.07%
F1 score: 28.24%


Tuning Decision Tree

Pipeline

In [32]:
pipe = Pipeline([("scaler", StandardScaler()),
                 ("my_pca", PCA(n_components=20)),
                 ("dt_model", DecisionTreeClassifier())], memory=cachedir)

Hyperparameter Optimisation

In [36]:

dt_param  = {"scaler":[StandardScaler(), None],
            "my_pca__n_components":[20],
            "dt_model__max_depth": [None, 2, 4, 6, 8,10],
            "dt_model__min_samples_leaf": [2, 5, 10] }

Grid

In [37]:
grid = GridSearchCV(estimator=pipe,param_grid=dt_param, cv=5,verbose=1,refit=True)

In [38]:
fittedgrid_dt = grid.fit(X_rem,y_rem)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [31]:
# pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'my_pca', 'dt_model', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'my_pca__copy', 'my_pca__iterated_power', 'my_pca__n_components', 'my_pca__random_state', 'my_pca__svd_solver', 'my_pca__tol', 'my_pca__whiten', 'dt_model__ccp_alpha', 'dt_model__class_weight', 'dt_model__criterion', 'dt_model__max_depth', 'dt_model__max_features', 'dt_model__max_leaf_nodes', 'dt_model__min_impurity_decrease', 'dt_model__min_impurity_split', 'dt_model__min_samples_leaf', 'dt_model__min_samples_split', 'dt_model__min_weight_fraction_leaf', 'dt_model__random_state', 'dt_model__splitter'])

In [39]:
fittedgrid_dt.best_params_

{'dt_model__max_depth': 6,
 'dt_model__min_samples_leaf': 10,
 'my_pca__n_components': 20,
 'scaler': StandardScaler()}

In [40]:
fittedgrid_dt.best_estimator_

Pipeline(memory='/var/folders/r3/bz5mjtds4dvdw0hskxwvs9vc0000gp/T/tmphe1wkm0v',
         steps=[('scaler', StandardScaler()), ('my_pca', PCA(n_components=20)),
                ('dt_model',
                 DecisionTreeClassifier(max_depth=6, min_samples_leaf=10))])

In [41]:
# Print the accuracies
print(f"Best accuracy on the remainder set: {fittedgrid_dt.score(X_rem, y_rem)}")
print(f"Best accuracy on the test set: {fittedgrid_dt.score(X_test, y_test)}")

Best accuracy on the remainder set: 0.9402977490981149
Best accuracy on the test set: 0.9395768550698128


In [42]:
# predict classification
y_test_pred = fittedgrid_dt.predict(X_test)

# confusion matrix
conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Unnamed: 0,predicted 0,predicted 1
true 0,45967,245
true 1,2728,263


Recall score: 8.79%
Precision score: 51.77%
F1 score: 15.03%


SMOTE

In [45]:
from imblearn.over_sampling import SMOTE
#non scaled
# train 
# scale after smote

In [46]:
X_rem_sm, y_rem_sm = SMOTE(random_state=1).fit_resample(X_rem, y_rem)

In [47]:

print('Original class distribution')
display(pd.Series(y_rem).value_counts().sort_index())

print('\nResampled class distribution')
display(pd.Series(y_rem_sm).value_counts().sort_index())

Original class distribution


HadAngina
0    184848
1     11962
Name: count, dtype: int64


Resampled class distribution


HadAngina
0    184848
1    184848
Name: count, dtype: int64

In [48]:
pipe = Pipeline([("scaler", StandardScaler()),
                 ("my_pca", PCA(n_components=20)),
                 ("model", LogisticRegression())], memory=cachedir)
log_reg_param = [

    {'scaler': [ StandardScaler()],
     'my_pca__n_components': [20],
     'model': [LogisticRegression(solver='saga',random_state=1, n_jobs=-1, max_iter=10000)], 
     'model__C': c_values,
     'model__penalty': ['l1', 'l2'],
    }
]

grid_sm = GridSearchCV(estimator=pipe,param_grid=log_reg_param, cv=5,verbose=1,refit=True)

In [49]:
fittedgrid_lr_sm = grid_sm.fit(X_rem_sm,y_rem_sm)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [59]:

print(f"Best accuracy on the remainder set: {fittedgrid_lr_sm.score(X_rem_sm, y_rem_sm)}")
print(f"Best accuracy on the test set: {fittedgrid_lr_sm.score(X_test, y_test)}")


# predict classification
y_test_pred = fittedgrid_lr_sm.predict(X_test)

# confusion matrix
conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Best accuracy on the remainder set: 0.8598605340604172
Best accuracy on the test set: 0.8297868016177875


Unnamed: 0,predicted 0,predicted 1
true 0,39380,6832
true 1,1543,1448


Recall score: 48.41%
Precision score: 17.49%
F1 score: 25.69%


In [56]:
pipe = Pipeline([("scaler", StandardScaler()),
                 ("my_pca", PCA(n_components=20)),
                 ("dt_model", DecisionTreeClassifier())], memory=cachedir)

dt_param  = {"scaler":[StandardScaler(), None],
            "my_pca__n_components":[20],
            "dt_model__max_depth": [None, 2, 4, 6, 8,10],
            "dt_model__min_samples_leaf": [2, 5, 10] }


grid_sm = GridSearchCV(estimator=pipe,param_grid=dt_param, cv=5,verbose=1,refit=True)

In [57]:
fittedgrid_dt_sm = grid_sm.fit(X_rem_sm,y_rem_sm)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [60]:
print(f"Best accuracy on the remainder set: {fittedgrid_dt_sm.score(X_rem_sm, y_rem_sm)}")
print(f"Best accuracy on the test set: {fittedgrid_dt_sm.score(X_test, y_test)}")

# predict classification
y_test_pred = fittedgrid_dt_sm.predict(X_test)

# confusion matrix
conmat = pd.DataFrame(
    data = confusion_matrix(y_test, y_test_pred),
    index = ['true 0', 'true 1'],
    columns = ['predicted 0', 'predicted 1']
)
display(conmat)

print(f'Recall score: {recall_score(y_test, y_test_pred)*100:0.2f}%')
print(f'Precision score: {precision_score(y_test, y_test_pred)*100:0.2f}%')
print(f'F1 score: {f1_score(y_test, y_test_pred)*100:0.2f}%')

Best accuracy on the remainder set: 0.863314723448455
Best accuracy on the test set: 0.8128975875454749


Unnamed: 0,predicted 0,predicted 1
true 0,38756,7456
true 1,1750,1241


Recall score: 41.49%
Precision score: 14.27%
F1 score: 21.24%


Create table that compares baseline,Logreg test, DT test, SMOTE logreg test, SMOTE DT test

In [62]:
data = {'F1 score' :[38.06,28.24,15.03,25.69,21.24],
     'Recall score':[27.98,18.46,8.79,48.41,41.49], 
     'Precision score':[59.49,60.07,51.77,17.49,14.27],
      'Accuracy':[94.46,94.30,93.96,82.98,81.29]}


scores = pd.DataFrame(
    data = data,
    index = ['Basline LogReg', 'Best LogReg', 'Best DT','Best SMOTE LogReg', 'Best SMOTE DT' ],
    columns = ['F1 score','Recall score', 'Precision score', 'Accuracy']
)

In [63]:
scores

Unnamed: 0,F1 score,Recall score,Precision score,Accuracy
Basline LogReg,38.06,27.98,59.49,94.46
Best LogReg,28.24,18.46,60.07,94.3
Best DT,15.03,8.79,51.77,93.96
Best SMOTE LogReg,25.69,48.41,17.49,82.98
Best SMOTE DT,21.24,41.49,14.27,81.29
