In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Load the Dataset
df = pd.read_excel("heart_disease.xlsx", sheet_name="Heart_disease")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
0,63,Male,typical angina,145,233,True,lv hypertrophy,150,False,2.3,downsloping,fixed defect,0
1,41,Male,atypical angina,135,203,False,normal,132,False,0.0,flat,fixed defect,0
2,57,Male,asymptomatic,140,192,False,normal,148,False,0.4,flat,fixed defect,0
3,52,Male,typical angina,118,186,False,lv hypertrophy,190,False,0.0,flat,fixed defect,0
4,57,Male,asymptomatic,110,201,False,normal,126,True,1.5,flat,fixed defect,0


In [4]:
#Check Null Values
df.isnull().sum()

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalch       0
exang        0
oldpeak     62
slope        0
thal         0
num          0
dtype: int64

In [5]:
df.shape

(908, 13)

In [6]:
#Fill null values with mean
df["oldpeak"] = df["oldpeak"].fillna(df["oldpeak"].mean())

In [7]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
thal        0
num         0
dtype: int64

In [8]:

df.dtypes

age           int64
sex          object
cp           object
trestbps      int64
chol          int64
fbs            bool
restecg      object
thalch        int64
exang        object
oldpeak     float64
slope        object
thal         object
num           int64
dtype: object

In [9]:
df['num'] = df['num'].apply(lambda x: 1 if x > 0 else 0)

In [10]:
print(df['sex'].unique())
print(df['exang'].unique())

['Male' 'Female']
[False True 'FALSE' 'TURE']


In [11]:
df["exang"] = df["exang"].replace({
    'False' : 'FALSE',
    'TURE' : 'TRUE',
    'True' : 'TRUE'
})

In [12]:
df['exang'] = df['exang'].astype(str).str.lower().str.strip()

In [13]:
#Label Encoding
df['sex'] = df['sex'].map({'Male':1, 'Female':0})
df['exang'] = df['exang'].map({'false':1, 'true':0})

In [14]:
print(df['sex'].unique())
print(df['exang'].unique())

[1 0]
[1 0]


In [15]:
#One hot coding
df = pd.get_dummies(df, columns=["cp", "fbs", "restecg","slope","thal"], drop_first=True)

In [16]:
df.head(20)

Unnamed: 0,age,sex,trestbps,chol,thalch,exang,oldpeak,num,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,63,1,145,233,150,1,2.3,0,False,False,True,True,False,False,False,False,False,False
1,41,1,135,203,132,1,0.0,0,True,False,False,False,True,False,True,False,False,False
2,57,1,140,192,148,1,0.4,0,False,False,False,False,True,False,True,False,False,False
3,52,1,118,186,190,1,0.0,0,False,False,True,False,False,False,True,False,False,False
4,57,1,110,201,126,0,1.5,0,False,False,False,False,True,False,True,False,False,False
5,66,1,160,228,138,1,2.3,0,False,False,False,False,False,False,False,True,False,False
6,56,1,130,276,128,0,1.0,0,False,True,False,False,True,False,False,True,False,False
7,48,1,110,211,138,1,0.0,0,False,True,False,False,True,False,False,False,False,False
8,57,1,140,260,140,1,0.0,0,True,False,False,True,True,False,False,False,False,False
9,53,1,155,175,160,1,0.891253,0,False,True,False,True,False,True,False,True,False,False


In [17]:
df.dtypes

age                           int64
sex                           int64
trestbps                      int64
chol                          int64
thalch                        int64
exang                         int64
oldpeak                     float64
num                           int64
cp_atypical angina             bool
cp_non-anginal                 bool
cp_typical angina              bool
fbs_True                       bool
restecg_normal                 bool
restecg_st-t abnormality       bool
slope_flat                     bool
slope_upsloping                bool
thal_normal                    bool
thal_reversable defect         bool
dtype: object

In [18]:
x = df.drop('num', axis =1)
y = df['num']

In [19]:
print(type(df))
print(type(x))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [20]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908 entries, 0 to 907
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       908 non-null    int64  
 1   sex                       908 non-null    int64  
 2   trestbps                  908 non-null    int64  
 3   chol                      908 non-null    int64  
 4   thalch                    908 non-null    int64  
 5   exang                     908 non-null    int64  
 6   oldpeak                   908 non-null    float64
 7   cp_atypical angina        908 non-null    bool   
 8   cp_non-anginal            908 non-null    bool   
 9   cp_typical angina         908 non-null    bool   
 10  fbs_True                  908 non-null    bool   
 11  restecg_normal            908 non-null    bool   
 12  restecg_st-t abnormality  908 non-null    bool   
 13  slope_flat                908 non-null    bool   
 14  slope_upsl

In [21]:
#train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

In [22]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(726, 17)
(182, 17)
(726,)
(182,)


In [23]:
print(y_train.unique)
print(y_train.dtype)

<bound method Series.unique of 411    1
765    1
721    1
279    0
62     0
      ..
779    1
57     0
432    1
131    0
204    0
Name: num, Length: 726, dtype: int64>
int64


In [24]:
#For better accuracy use RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(x_train, y_train)

In [25]:
df['num'].value_counts()

num
1    509
0    399
Name: count, dtype: int64

In [26]:
#Import model DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion="entropy", max_depth=7, random_state=42, min_samples_split=15, min_samples_leaf=10)
model.fit(x_train, y_train)

In [27]:
#Generate prediction on Test data
y_pred = model.predict(x_test)

In [28]:
x.columns.tolist()

['age',
 'sex',
 'trestbps',
 'chol',
 'thalch',
 'exang',
 'oldpeak',
 'cp_atypical angina',
 'cp_non-anginal',
 'cp_typical angina',
 'fbs_True',
 'restecg_normal',
 'restecg_st-t abnormality',
 'slope_flat',
 'slope_upsloping',
 'thal_normal',
 'thal_reversable defect']

In [29]:
#Check Model Performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("classification_report", classification_report(y_test, y_pred))
print("confusion_matrix", confusion_matrix(y_test, y_pred))


Accuracy: 0.7417582417582418
classification_report               precision    recall  f1-score   support

           0       0.72      0.68      0.70        80
           1       0.76      0.79      0.78       102

    accuracy                           0.74       182
   macro avg       0.74      0.73      0.74       182
weighted avg       0.74      0.74      0.74       182

confusion_matrix [[54 26]
 [21 81]]


In [30]:
#Save the model
import pickle
with open("model.pkl", "wb") as p:
    pickle.dump(model , p)
