In [41]:
# Import all libraries
import numpy as np
import pandas as pd

In [42]:
# Import the dataset
heart_disease = pd.read_csv("/kaggle/input/heart-disease-data/heart_disease_uci.csv")

# View the dataset
heart_disease.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [43]:
# Drop unused/unaffecting features
heart_disease.drop(['id','dataset'], axis=1, inplace=True)
heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


Let's check the datatypes of the columns

In [44]:
heart_disease.dtypes

age           int64
sex          object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object

Check how many unique values are in num and their number

In [45]:
heart_disease["num"].value_counts()

0    411
1    265
2    109
3    107
4     28
Name: num, dtype: int64

In [46]:
heart_disease.isna().sum()

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

There's no missing targets, so split features and targets

In [47]:
x = heart_disease.drop("num", axis=1)
y = heart_disease["num"]
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal


In [48]:
# Fill up categorical & numerical data of the features and make a new filled feature df

x["trestbps"].fillna(x["trestbps"].mean(), inplace=True)
x["chol"].fillna(x["chol"].mean(), inplace=True)
x["fbs"].fillna(x["fbs"].mode()[0], inplace=True)
x["restecg"].fillna(x["restecg"].mode()[0], inplace=True)
x["thalch"].fillna(x["thalch"].mean(), inplace=True)
x["exang"].fillna(x["exang"].mode()[0], inplace=True)
x["oldpeak"].fillna(x["oldpeak"].mean(), inplace=True)
x["slope"].fillna(x["slope"].mode()[0], inplace=True)
x["ca"].fillna(x["ca"].mean(), inplace=True)
x["thal"].fillna(x["thal"].mode()[0], inplace=True)

In [49]:
x.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64

No missing feature data or target data, proceed to turning text into numbers

In [50]:
x.dtypes

age           int64
sex          object
cp           object
trestbps    float64
chol        float64
fbs            bool
restecg      object
thalch      float64
exang          bool
oldpeak     float64
slope        object
ca          float64
thal         object
dtype: object

In [51]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

cat_features = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
one_hot = OneHotEncoder()

transformer = ColumnTransformer(
    [("one_hot", one_hot, cat_features)],
    remainder="passthrough"
)

transformed_x = transformer.fit_transform(x)

Now that all our data as been converted into numbers, train models

In [52]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

<hr>

### Linear Model



In [53]:
from sklearn.linear_model import RidgeClassifier

lin_model = RidgeClassifier()
lin_model.fit(x_train, y_train)

RidgeClassifier()

In [54]:
y_preds = lin_model.predict(x_test)

In [55]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.77      0.90      0.83        91
           1       0.53      0.56      0.55        57
           2       0.12      0.06      0.08        16
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         7

    accuracy                           0.62       184
   macro avg       0.28      0.31      0.29       184
weighted avg       0.56      0.62      0.59       184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
confusion_matrix(y_test, y_preds)

array([[82,  8,  0,  1,  0],
       [16, 32,  5,  4,  0],
       [ 5,  8,  1,  2,  0],
       [ 4,  8,  1,  0,  0],
       [ 0,  4,  1,  2,  0]])

In [57]:
accuracy_score(y_test, y_preds)

0.625

<hr>

### Support Vector Machines (Classification)

In [58]:
from sklearn.svm import SVC

svc_model = SVC(decision_function_shape='ovo')
svc_model.fit(x_train, y_train)

SVC(decision_function_shape='ovo')

In [59]:
y_preds = svc_model.predict(x_test)

In [60]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.56      0.88      0.68        91
           1       0.44      0.32      0.37        57
           2       0.00      0.00      0.00        16
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         7

    accuracy                           0.53       184
   macro avg       0.20      0.24      0.21       184
weighted avg       0.41      0.53      0.45       184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
confusion_matrix(y_test, y_preds)

array([[80, 11,  0,  0,  0],
       [39, 18,  0,  0,  0],
       [11,  5,  0,  0,  0],
       [ 8,  5,  0,  0,  0],
       [ 5,  2,  0,  0,  0]])

In [62]:
accuracy_score(y_test, y_preds)

0.532608695652174

<hr>

### Nearest Neighbors (Nearest Neighbors Classification)

In [63]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)
y_preds = knn_model.predict(x_test)

In [64]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.62      0.74      0.67        91
           1       0.36      0.35      0.36        57
           2       0.23      0.19      0.21        16
           3       0.25      0.15      0.19        13
           4       0.00      0.00      0.00         7

    accuracy                           0.50       184
   macro avg       0.29      0.29      0.29       184
weighted avg       0.46      0.50      0.48       184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
confusion_matrix(y_test, y_preds)

array([[67, 20,  2,  2,  0],
       [30, 20,  5,  2,  0],
       [ 6,  6,  3,  1,  0],
       [ 4,  5,  2,  2,  0],
       [ 1,  4,  1,  1,  0]])

In [66]:
accuracy_score(y_test, y_preds)

0.5

<hr>

### Naive Bayes (Gaussian Naive Bayes)

In [67]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
y_preds = nb_model.predict(x_test)

In [68]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.86      0.77      0.81        91
           1       0.57      0.37      0.45        57
           2       0.07      0.06      0.07        16
           3       0.06      0.08      0.07        13
           4       0.08      0.43      0.14         7

    accuracy                           0.52       184
   macro avg       0.33      0.34      0.31       184
weighted avg       0.62      0.52      0.56       184



In [69]:
confusion_matrix(y_test, y_preds)

array([[70,  9,  1,  3,  8],
       [ 6, 21,  7,  8, 15],
       [ 3,  4,  1,  1,  7],
       [ 2,  3,  4,  1,  3],
       [ 0,  0,  1,  3,  3]])

In [70]:
accuracy_score(y_test, y_preds)

0.5217391304347826

<hr>

### Decision Trees

In [71]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(x_train,y_train)
y_preds = dt_model.predict(x_test)

In [72]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.74      0.66      0.70        91
           1       0.38      0.32      0.34        57
           2       0.08      0.12      0.10        16
           3       0.13      0.23      0.17        13
           4       0.17      0.14      0.15         7

    accuracy                           0.46       184
   macro avg       0.30      0.29      0.29       184
weighted avg       0.50      0.46      0.48       184



In [73]:
confusion_matrix(y_test, y_preds)

array([[60, 19,  7,  4,  1],
       [15, 18, 10, 11,  3],
       [ 3,  8,  2,  2,  1],
       [ 3,  3,  4,  3,  0],
       [ 0,  0,  3,  3,  1]])

In [74]:
accuracy_score(y_test, y_preds)

0.45652173913043476

<hr>

### Random Forest

In [75]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
rfc_model.fit(x_train,y_train)
y_preds = rfc_model.predict(x_test)

In [76]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        91
           1       0.62      0.56      0.59        57
           2       0.24      0.25      0.24        16
           3       0.31      0.38      0.34        13
           4       1.00      0.14      0.25         7

    accuracy                           0.66       184
   macro avg       0.60      0.44      0.45       184
weighted avg       0.67      0.66      0.66       184



In [77]:
confusion_matrix(y_test, y_preds)

array([[80,  7,  1,  3,  0],
       [13, 32,  8,  4,  0],
       [ 3,  6,  4,  3,  0],
       [ 2,  5,  1,  5,  0],
       [ 0,  2,  3,  1,  1]])

In [78]:
accuracy_score(y_test, y_preds)

0.6630434782608695

In [79]:
rfc_model.score(x_test,y_test)

0.6630434782608695

<br>
<br>
<br>
<hr>

# Performance Evaluation

In [80]:
heart_disease["num"].value_counts()

0    411
1    265
2    109
3    107
4     28
Name: num, dtype: int64

Our target contains a selection of 5 values. So this is a **multi-class classification problem**. Hence, our use of classifiers.

Here, we can see that the **Random Forest (RandomForestClassifier)** is the best model for this case having an accuracy score of **66%**.<br>