I will be using this dataset from Kaggle: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
*

I will model it with three algorithms:
- Decision Trees
- Random Forest
- XGBoost




In [1]:
# General imports. I'll do the other specific imports on top of each part.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Fixed seed, intended to be passed as `random_state` wherever randomness is involved.
RANDOM_STATE = 42

In [2]:
# Load the heart-failure dataset; "heart.csv" is read relative to the working directory.
df = pd.read_csv("heart.csv")
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Inspect column dtypes: the `object` columns are the categorical features that get encoded later.
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [4]:
# Separate the label column (y) from the remaining feature columns (X).
y = df["HeartDisease"]
X = df.drop(columns="HeartDisease")

In [5]:
# Only tree-based algorithms are used in this notebook and they don't need
# feature scaling, so the numerical columns are not separated out; just the
# categorical (object-dtype) columns are collected here.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [6]:
# One-hot encode every categorical column. get_dummies prefixes each new
# indicator column with its source column name by default, so no explicit
# prefix is needed.
X = pd.get_dummies(X, columns=cat_cols)

In [7]:
# Preview the feature frame after one-hot encoding.
X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,False,True,False,True,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,True,False,False,False,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,False,True,False,True,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,True,False,True,False,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,False,True,False,False,True,False,False,True,False,True,False,False,False,True


In [8]:
# Hold out 20% of the rows as the final test set. The split is seeded with the
# RANDOM_STATE constant from the setup cell rather than a hard-coded literal.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=RANDOM_STATE
)

In [9]:
# Report the split sizes. X_test is the held-out test set (it is only used for
# the final "Test accuracy" figures), so it is labelled as test, not validation.
print(f'train samples: {len(X_train)}')
print(f'test samples: {len(X_test)}')

train samples: 734
validation samples: 184


# Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Hyperparameter grid searched for the decision tree.
param_grid = dict(
    # Candidate values for the minimum number of samples required to split a node.
    min_samples_split=[2, 10, 30, 50, 100, 200, 300, 700],
    # Candidate depth limits; None means no explicit depth limit.
    max_depth=[1, 2, 3, 4, 8, 16, 32, 64, None],
)

In [12]:
# Base estimator, seeded with the RANDOM_STATE constant rather than a literal.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Exhaustive search over param_grid, scored by accuracy. cv=5 is spelled out
# (it is also the default) so the fold count is explicit, as in the other
# grid searches of this notebook.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
# best_score_ is the mean cross-validated accuracy of the best candidate,
# not the accuracy on the training set, so it is labelled accordingly.
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

Best parameters:  {'max_depth': 3, 'min_samples_split': 2}
Best training accuracy:  0.8269406392694064


In [None]:
# Score the best tree found by the grid search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=X_test)

print("Test Accuracy score: ", accuracy_score(y_true=y_test, y_pred=y_pred))

Test Accuracy score:  0.8478260869565217


# Random Forest

Again, I'll use grid search to tune hyperparameters such as the tree depth. You could do it manually by creating a list of candidate values and testing each one of them, but I'll use GridSearchCV from scikit-learn because it is more efficient, quicker, and has a lot of added functionality.

In [66]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyperparameter grid searched for the random forest.
param_grid = dict(
    # Number of trees in the forest.
    n_estimators=[10, 50, 100, 500],
    # Candidate depth limits per tree; None means no explicit depth limit.
    max_depth=[2, 4, 8, 16, 32, 64, None],
    # Candidate values for the minimum number of samples required to split a node.
    min_samples_split=[2, 10, 30, 50, 100, 200, 300, 500, 700],
)

In [69]:
# Base random forest, seeded with the RANDOM_STATE constant rather than a literal.
random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)

In [70]:
# 5-fold cross-validated accuracy search over the random forest grid (n_jobs=-1 runs the fits in parallel).
grid_search = GridSearchCV(
    random_forest_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


In [71]:
# Show the hyperparameter combination selected by the grid search.
print(grid_search.best_params_)
# print(grid_search.best_score_)  # uncomment to show the cross-validation score

{'max_depth': 16, 'min_samples_split': 10, 'n_estimators': 100}


In [72]:
# Compare the selected forest's accuracy on the data it was fit on with its
# accuracy on the held-out test split.
best_model = grid_search.best_estimator_

print("Best training accuracy: ", best_model.score(X=X_train, y=y_train))
print("Test accuracy (Final accuracy): ", best_model.score(X=X_test, y=y_test))

Best training accuracy:  0.9291553133514986
Test accuracy (Final accuracy):  0.875


# XGBoost

In [29]:
from xgboost import XGBClassifier

In [None]:
# Base XGBoost classifier, seeded with the RANDOM_STATE constant rather than a literal.
xgb_model = XGBClassifier(random_state=RANDOM_STATE, verbosity=1)

In [None]:
# Hyperparameter grid searched for the XGBoost classifier.
param_grid = dict(
    # Number of boosting rounds.
    n_estimators=[50, 100, 200, 500],
    # NOTE(review): None presumably falls back to XGBoost's default depth --
    # confirm it does not simply duplicate one of the explicit values.
    max_depth=[3, 4, 5, 6, None],
    learning_rate=[0.01, 0.1, 0.2],
    # Fraction of the training rows sampled for each tree.
    subsample=[0.8, 1.0],
)

In [59]:
# 5-fold cross-validated accuracy search over the XGBoost grid (n_jobs=-1 runs the fits in parallel).
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [60]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [61]:
# Show the hyperparameter combination selected by the grid search.
print("Best Params:", grid_search.best_params_)

Best Params: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}


In [65]:
# Compare the selected XGBoost model's accuracy on the data it was fit on with
# its accuracy on the held-out test split.
best_xgb_model = grid_search.best_estimator_

print("Best training accuracy: ", best_xgb_model.score(X=X_train, y=y_train))
print("Test accuracy (Final accuracy): ", best_xgb_model.score(X=X_test, y=y_test))

Best training accuracy:  0.9373297002724795
Test accuracy (Final accuracy):  0.8804347826086957
