# **heart disease classification** #

# **import libraries** #

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from keras.layers import Dense
from keras.models import Sequential

# **load dataset** #

In [2]:
data=pd.read_csv("heart_disease_uci.csv")
data.head().style.background_gradient()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [4]:
data.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [6]:
data.nunique()

id          920
age          50
sex           2
dataset       4
cp            4
trestbps     61
chol        217
fbs           2
restecg       3
thalch      119
exang         2
oldpeak      53
slope         3
ca            4
thal          3
num           5
dtype: int64

In [5]:
data.duplicated().sum()

0

In [7]:
data.isna().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

# **data preprocessing** #

In [8]:
Columns_drop=["ca","thal","slope"]
# remove that columns
data=data.drop(Columns_drop,axis=1)


In [9]:
data=data.dropna()
data.isna().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
num         0
dtype: int64

In [10]:
print("the data after clearing=")
data.tail().style.background_gradient()

the data after clearing=


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,num
913,914,62,Male,VA Long Beach,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,1
914,915,46,Male,VA Long Beach,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,2
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,1
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,2
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,1


In [11]:
data=data.drop(["id"],axis=1)

In [12]:
# apply label encoder on gender ,exng
lb=LabelEncoder()
data["sex"]=lb.fit_transform(data["sex"])
data["exang"]=lb.fit_transform(data["exang"])
data["fbs"]=lb.fit_transform(data["fbs"])

In [13]:
data.columns

Index(['age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalch', 'exang', 'oldpeak', 'num'],
      dtype='object')

In [15]:
#apply one hot encoder on following columns

oh = OneHotEncoder()
Columns_onehot=["dataset","cp","restecg"]
for column in Columns_onehot:
# Reshape the "dataset" column before applying OneHotEncoder
    data[column] = oh.fit_transform(data[[column]]).toarray()



In [16]:
print("the data after encoding=")
data.head().style.background_gradient(cmap="Greens")

the data after encoding=


Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,num
0,63,1,1.0,0.0,145.0,233.0,1,1.0,150.0,0,2.3,0
1,67,1,1.0,1.0,160.0,286.0,0,1.0,108.0,1,1.5,2
2,67,1,1.0,1.0,120.0,229.0,0,1.0,129.0,1,2.6,1
3,37,1,1.0,0.0,130.0,250.0,0,0.0,187.0,0,3.5,0
4,41,0,1.0,0.0,130.0,204.0,0,1.0,172.0,0,1.4,0


In [17]:
data.shape

(740, 12)

# **apply keras algorithm** #

In [18]:
from sklearn.metrics import accuracy_score

x = data.drop(["num"], axis=1)
y = data["num"]

# split data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=40)

model = Sequential([
    Dense(12, activation="LeakyReLU", input_shape=(11,)),  # Fix input_shape to (12,)
    Dense(12, activation="LeakyReLU"),
    Dense(12, activation="LeakyReLU"),
    Dense(5, activation="softmax"),
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=20, validation_split=0.2,verbose=1) 
 # Use validation_split instead of validation_batch_size

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.4932432472705841


# **apply random forest algorithm** #

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(n_estimators=5)
model2 = rfc.fit(X_train, y_train)
y_predict = model2.predict(X_test)

# Assuming y_test and y_predict are multioutput, use accuracy_score with multioutput parameter
acc = accuracy_score(y_test, y_predict)

print(f"The accuracy from RandomForestClassifier: {acc}")


The accuracy from RandomForestClassifier: 0.5405405405405406
