<a href="https://colab.research.google.com/github/Prajwal-Luitel/Heart-Failure-Prediction/blob/main/Heart_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [91]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [92]:
# Remove warning after completion
# import warnings
# warnings.filterwarnings('ignore')

In [93]:
df = pd.read_csv("hf://datasets/aai530-group6/heart-failure-prediction-dataset/heart.csv")

In [94]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# Feature Engineering

## Train Test Split

In [95]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
print(f"X_train shape: {X_train.shape}\nX_test shape: {X_test.shape}")

X_train shape: (734, 11)
X_test shape: (184, 11)


## One Hot Encoding

In [97]:
from sklearn.preprocessing import OneHotEncoder
oe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first',dtype=np.int32)

In [98]:
categorical_columns =  X.select_dtypes( include=['object']).columns.to_list()
encoded_train  = oe.fit_transform(X_train[categorical_columns])
encoded_test  = oe.transform(X_test[categorical_columns])

In [99]:
encoded_df_train = pd.DataFrame(encoded_train,
                                columns=oe.get_feature_names_out(categorical_columns),
                                index = X_train.index )
encoded_df_test = pd.DataFrame(encoded_test,
                               columns=oe.get_feature_names_out(categorical_columns),
                               index = X_test.index  )

In [100]:
print(f"encoded_df_train shape: {encoded_df_train.shape}\nencoded_df_train shape: {encoded_df_train.shape}")

encoded_df_train shape: (734, 9)
encoded_df_train shape: (734, 9)


In [101]:
X_train = pd.concat([X_train.drop(columns=categorical_columns), encoded_df_train], axis=1)

X_test = pd.concat([X_test.drop(columns=categorical_columns), encoded_df_test], axis=1)

In [102]:
X_train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
795,42,120,240,1,194,0.8,1,0,1,0,1,0,0,0,0
25,36,130,209,0,178,0.0,1,0,1,0,1,0,0,0,1
84,56,150,213,1,125,1.0,1,0,0,0,1,0,1,1,0
10,37,130,211,0,142,0.0,0,0,1,0,1,0,0,0,1
344,51,120,0,1,104,0.0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,48,120,254,0,110,0.0,0,0,0,0,0,1,0,0,1
270,45,120,225,0,140,0.0,1,0,0,0,1,0,0,0,1
860,60,130,253,0,144,1.4,1,0,0,0,1,0,1,0,1
435,60,152,0,0,118,0.0,1,0,0,0,0,1,1,0,1


## Standard Scaler

In [103]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [104]:
numerical_columns = X.select_dtypes(exclude=['object']).columns.to_list()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [105]:
X_train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
795,-1.245067,-0.708985,0.372803,1.842609,2.284353,-0.097061,1,0,1,0,1,0,0,0,0
25,-1.886236,-0.166285,0.086146,-0.542709,1.652241,-0.836286,1,0,1,0,1,0,0,0,1
84,0.250993,0.919115,0.123134,1.842609,-0.441628,0.087745,1,0,0,0,1,0,1,1,0
10,-1.779375,-0.166285,0.104640,-0.542709,0.229991,-0.836286,0,0,1,0,1,0,0,0,1
344,-0.283314,-0.708985,-1.846478,1.842609,-1.271274,-0.836286,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.603898,-0.708985,0.502261,-0.542709,-1.034232,-0.836286,0,0,0,0,0,1,0,0,1
270,-0.924483,-0.708985,0.234098,-0.542709,0.150977,-0.836286,1,0,0,0,1,0,0,0,1
860,0.678439,-0.166285,0.493014,-0.542709,0.309005,0.457358,1,0,0,0,1,0,1,0,1
435,0.678439,1.027656,-1.846478,-0.542709,-0.718176,-0.836286,1,0,0,0,0,1,1,0,1


# Default Model

## Logistic Regression

### Model Training

In [106]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression(random_state=42)
logistic_regression.fit(X_train, y_train)

### Model Evaluation

In [107]:
from sklearn.metrics import classification_report
y_pred = logistic_regression.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184



## Decision Tree

### Model Training

In [108]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

### Model Evaluation

In [109]:
from sklearn.metrics import classification_report
y_pred = decision_tree.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.81      0.79        77
           1       0.86      0.84      0.85       107

    accuracy                           0.83       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.83      0.83      0.83       184



## XGBoost

### Model Training

In [110]:
from xgboost import XGBClassifier

xgboost = XGBClassifier(random_state=42)
xgboost.fit(X_train, y_train)

### Model Evaluation

In [111]:
from sklearn.metrics import classification_report
y_pred = xgboost.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.86      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184

