In [210]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

In [211]:
# Load the raw dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="heart_disease_uci.csv")
df.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [212]:
# Frequency of each `ca` value in the raw frame (missing values are not counted).
df["ca"].value_counts(dropna=True)

ca
0.0    181
1.0     67
2.0     41
3.0     20
Name: count, dtype: int64

In [213]:
# Number of rows per value of the `dataset` column in the raw frame.
df.loc[:, "dataset"].value_counts()

dataset
Cleveland        304
Hungary          293
VA Long Beach    200
Switzerland      123
Name: count, dtype: int64

In [214]:
# Keep only fully populated rows (a row is removed if any column is missing),
# then summarise the remaining columns and their dtypes.
df = df.dropna(axis="index", how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299 entries, 0 to 748
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        299 non-null    int64  
 1   age       299 non-null    int64  
 2   sex       299 non-null    object 
 3   dataset   299 non-null    object 
 4   cp        299 non-null    object 
 5   trestbps  299 non-null    float64
 6   chol      299 non-null    float64
 7   fbs       299 non-null    object 
 8   restecg   299 non-null    object 
 9   thalch    299 non-null    float64
 10  exang     299 non-null    object 
 11  oldpeak   299 non-null    float64
 12  slope     299 non-null    object 
 13  ca        299 non-null    float64
 14  thal      299 non-null    object 
 15  num       299 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 39.7+ KB


In [215]:
# Frequency of each `exang` value after the incomplete rows were removed.
df.loc[:, "exang"].value_counts()

exang
False    200
True      99
Name: count, dtype: int64

In [216]:
# Remove the `id` column so it is not used as a feature.
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, makes the
# cell safe to re-run: a second execution no longer raises KeyError because `id`
# has already been removed.
df = df.drop(columns=["id"], errors="ignore")

In [217]:
df_cols = df.columns
df_index = df.index

# Feature matrix: every column except the target `num`.
X = df.drop(columns=["num"])

# Collapse the target to binary: 0 stays 0, every positive value (1-4) becomes 1.
# A comparison is used instead of a lookup table so that an unexpected value
# cannot silently turn into NaN and break model fitting later on.
df["num"] = (df["num"] > 0).astype("int64")
y = df["num"]

# Cast the two boolean flag columns to strings so the one-hot encoder sees
# plain "True"/"False" categories.
X["fbs"] = X["fbs"].astype(str)
X["exang"] = X["exang"].astype(str)

# Split the features by dtype: non-numeric columns are one-hot encoded,
# numeric columns are standardised.
categorical_cols = X.select_dtypes(exclude="number").columns
numerical_cols = X.select_dtypes(include="number").columns
print(categorical_cols)
print(numerical_cols)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        # drop="first" removes one indicator per feature.
        ("cat", OneHotEncoder(drop="first"), categorical_cols),
    ]
)

# NOTE(review): the preprocessor is fitted on ALL rows here, before the
# train/test split performed in a later cell, so the scaler statistics and the
# encoder categories are learned from rows that later end up in the test set.
X_processed = preprocessor.fit_transform(X)

Index(['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], dtype='object')
Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca'], dtype='object')


In [218]:
# Rows per `dataset` value that remain in the feature matrix.
X.loc[:, "dataset"].value_counts()

dataset
Cleveland        297
Hungary            1
VA Long Beach      1
Name: count, dtype: int64

In [219]:
# Print the Python type of every feature value in the first row of X.
first_row = X.iloc[0]
for column_name in X.columns:
    column_value = first_row[column_name]
    print(type(column_value))

<class 'numpy.int64'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'str'>
<class 'numpy.float64'>
<class 'str'>
<class 'numpy.float64'>
<class 'str'>


In [220]:
# Frequency of each `exang` value in the feature matrix (now stored as strings).
X.loc[:, "exang"].value_counts()

exang
False    200
True      99
Name: count, dtype: int64

In [221]:
# Hand-built record for one patient, used to check that the fitted
# `preprocessor` can transform new data.
# The values mirror the second row shown by `df.head()`, except that `dataset`
# is set to "Hungary" instead of "Cleveland".
# `fbs` and `exang` are passed as the strings "False"/"True" because the
# training columns were cast with `.astype(str)` before the encoder was fitted;
# presumably real booleans would not match the fitted categories.
data_to_pd = {
    "age": 67,
    "sex": "Male",
    "dataset": "Hungary",
    "cp": "asymptomatic",
    "trestbps": 160,
    "chol": 286,
    "fbs": "False",
    "restecg": "lv hypertrophy",
    "thalch": 108,
    "exang": "True",
    "oldpeak": 1.5,
    "slope": "flat",
    "ca": 3,
    "thal": "normal",
}

# All values are scalars, so an explicit one-element index is required.
data_df = pd.DataFrame(data_to_pd, index=[0])
data_df_proc = preprocessor.transform(data_df)
data_df_proc

array([[ 1.38414338,  1.59635425,  0.74772238, -1.79044733,  0.380309  ,
         2.48726932,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  1.        ,  0.        ]])

In [222]:
# Show which columns are one-hot encoded and which are standardised.
print(categorical_cols, numerical_cols, sep="\n")

# Feature column names written out by hand; displayed as the cell's result.
[
    "age",
    "sex",
    "dataset",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalch",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
]

Index(['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], dtype='object')
Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca'], dtype='object')


['age',
 'sex',
 'dataset',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalch',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [223]:
# Frequency of each `slope` category in the feature matrix.
X.loc[:, "slope"].value_counts()

slope
flat           139
upsloping      139
downsloping     21
Name: count, dtype: int64

In [224]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

In [225]:
# L1-penalised logistic regression fitted on the training split.
# `random_state` is fixed because the liblinear solver shuffles the data
# internally; without a seed the fitted coefficients can vary between runs.
lgr = LogisticRegression(penalty="l1", solver="liblinear", C=0.5, random_state=42)
lgr.fit(X_train, y_train)
y_preds = lgr.predict(X_test)

# accuracy_score is declared as (y_true, y_pred); pass the arguments in that
# order, consistent with the decision-tree cell.
acc1 = accuracy_score(y_test, y_preds)
acc1

0.9

In [226]:
# Depth-limited decision tree fitted on the training split.
# `random_state` is fixed because the tree permutes features at each split;
# without a seed the reported accuracy can change from run to run.
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc.fit(X_train, y_train)
y_preds = dtc.predict(X_test)
acc2 = accuracy_score(y_test, y_preds)
acc2

0.8166666666666667

In [227]:
# 7-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_preds = knn.predict(X_test)

# accuracy_score is declared as (y_true, y_pred); pass the arguments in that
# order, consistent with the decision-tree cell.
acc3 = accuracy_score(y_test, y_preds)
acc3

0.8666666666666667

In [228]:
# Support-vector classifier with a linear kernel fitted on the training split.
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
y_preds = svc.predict(X_test)

# accuracy_score is declared as (y_true, y_pred); pass the arguments in that
# order, consistent with the decision-tree cell.
acc4 = accuracy_score(y_test, y_preds)
acc4

0.9166666666666666

In [229]:
# Gather the four test accuracies into one table for side-by-side comparison.
models = ["Logistic Regression", "Decision Tree", "KNN", "SVM"]
accs = [acc1, acc2, acc3, acc4]
data = dict(Model=models, Accuracy=accs)
accuracy_df = pd.DataFrame.from_dict(data)
accuracy_df

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.9
1,Decision Tree,0.816667
2,KNN,0.866667
3,SVM,0.916667


In [230]:
# Bar chart of test accuracy per model.
# The y-axis is still zoomed so small gaps are visible, but its limits are now
# derived from the data (with a little padding, clamped to [0, 1]) instead of
# being hard-coded, so no bar is clipped or hidden when the scores change.
# Each bar is labelled with its value because a zoomed axis visually
# exaggerates the difference between bars.
y_lower = max(0.0, accuracy_df["Accuracy"].min() - 0.05)
y_upper = min(1.0, accuracy_df["Accuracy"].max() + 0.02)

fig = px.bar(accuracy_df, x="Model", y="Accuracy", title="Test accuracy by model")
fig.update_traces(texttemplate="%{y:.3f}", textposition="inside")
fig.update_yaxes(range=[y_lower, y_upper])
fig.show()