# Heart Disease Dataset Notebook

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('final.csv')
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
print(df.info())
print(df.isnull().sum())
print(df.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [4]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

(319795, 18)
(301717, 18)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

XT, xt, YT, yt = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

num_cols = XT.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = XT.select_dtypes(exclude=[np.number]).columns.tolist()

scaler = StandardScaler()
XT1 = XT.copy()
xt1 = xt.copy()
XT1[num_cols] = scaler.fit_transform(XT[num_cols])
xt1[num_cols] = scaler.transform(xt[num_cols])

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
XT2 = pd.DataFrame(encoder.fit_transform(XT[cat_cols]), columns=encoder.get_feature_names_out(cat_cols), index=XT.index)
xt2 = pd.DataFrame(encoder.transform(xt[cat_cols]), columns=encoder.get_feature_names_out(cat_cols), index=xt.index)

XTF = pd.concat([XT1[num_cols], XT2], axis=1)
xtf = pd.concat([xt1[num_cols], xt2], axis=1)

XTF.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,Stroke_Yes,...,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes
176113,0.474667,-0.439058,-0.137842,-0.056883,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
6984,-1.0671,-0.439058,-0.506526,0.625866,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
207892,0.539616,1.277402,-0.506526,0.625866,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
260064,-0.600086,-0.439058,-0.506526,-0.056883,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
45725,1.359212,0.051359,-0.506526,-0.056883,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
    'KNeighbors': KNeighborsClassifier()
}

results = []
for name, model in models.items():
    print('---', name)
    model.fit(XTF, YT)
    yp = model.predict(xtf)
    acc = accuracy_score(yt, yp)
    prec = precision_score(yt, yp, zero_division=0, pos_label='Yes')
    results.append({'model': name, 'accuracy': acc*100, 'precision': prec*100})
    print('model : ', name, ' | accuracy : ', acc*100, ' | precision : ', prec*100)

--- LogisticRegression
model :  LogisticRegression  | accuracy :  91.12256396659154  | precision :  54.3940795559667
--- RandomForest
model :  RandomForest  | accuracy :  89.55157099297362  | precision :  31.399912778020063
--- GradientBoosting
model :  GradientBoosting  | accuracy :  91.13582129126343  | precision :  55.21783181357649
--- KNeighbors
model :  KNeighbors  | accuracy :  89.99237703831366  | precision :  34.52820242488139
