In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [3]:
#original data
df=pd.read_csv("heart_2020_cleaned.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [4]:
#replace yes/no with 1/0
column_yesno = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "Diabetic", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]

df =  df[df.columns].replace({'Yes':1, 'No':0, 'Male':1,'Female':0,'No, borderline diabetes':'0','Yes (during pregnancy)':'1' })
df['Diabetic'] = df['Diabetic'].astype(int)

In [5]:
#replace range in AgeCategory with number
encode_AgeCategory = {'55-59':57.0, '80 or older':80.0, '65-69':67.0,
                      '75-79':77.0,'40-44':42.0,'70-74':72,'60-64':62.0,
                      '50-54':52.0,'45-49':47.0,'18-24':21,'35-39':37.0,
                      '30-34':32.0,'25-29':27.0}
df['AgeCategory'] = df['AgeCategory'].apply(lambda x: encode_AgeCategory[x])

In [6]:
#replace range in AgeCategory with number
encode_GenHealth = {'Poor':0.0, 'Fair':1.0, 'Good':2.0,'Very good':3.0,'Excellent':4.0}
df['GenHealth'] = df['GenHealth'].apply(lambda x: encode_GenHealth[x])

In [7]:
#normalize continuous values to 0-1 scale
column_int = ['PhysicalHealth', 'MentalHealth', 'SleepTime']
column_cont = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'AgeCategory', 'GenHealth']
df[column_int] = df[column_int].astype(float)
df['BMI'] = df['BMI'].astype(float)
df[column_cont] = df[column_cont].apply(lambda x: x/x.max())
df.pop('Race')
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,0.175013,1,0,0,0.1,1.0,0,0,0.7125,1,1,0.75,0.208333,1,0,1
1,0,0.214444,0,0,1,0.0,0.0,0,0,1.0,0,1,0.75,0.291667,0,0,0
2,0,0.280232,1,0,0,0.666667,1.0,0,1,0.8375,1,1,0.25,0.333333,1,0,0
3,0,0.255245,0,0,0,0.0,0.0,0,0,0.9625,0,0,0.5,0.25,0,0,1
4,0,0.249974,0,0,0,0.933333,0.0,1,0,0.525,0,1,0.75,0.333333,0,0,0


df['Race_Black'] = df['Race']
df['Race_Asian'] = df['Race']
df['Race_Hispanic'] = df['Race']
df['Race_White'] = df['Race']
df['Race_Other'] = df['Race']
df.pop('Race')
df

df =  df['Race_Asian'].replace({'Asian':1,'Black':0,'Hispanic':0,'White':0,'Other':0})
df['Race_Black'] =  df['Race_Black'].replace({'Asian':0,'Black':1,'Hispanic':0,'White':0,'Other':0})
df =  df['Race_Hispanic'].replace({'Asian':0,'Black':0,'Hispanic':1,'White':0,'Other':0})
df =  df['Race_White'].replace({'Asian':0,'Black':0,'Hispanic':0,'White':1,'Other':0})
df =  df['Race_Other'].replace({'Asian':0,'Black':0,'Hispanic':0,'White':0,'Other':1})
df

In [8]:
y = df.pop('HeartDisease')
X = df
X.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0.175013,1,0,0,0.1,1.0,0,0,0.7125,1,1,0.75,0.208333,1,0,1
1,0.214444,0,0,1,0.0,0.0,0,0,1.0,0,1,0.75,0.291667,0,0,0
2,0.280232,1,0,0,0.666667,1.0,0,1,0.8375,1,1,0.25,0.333333,1,0,0
3,0.255245,0,0,0,0.0,0.0,0,0,0.9625,0,0,0.5,0.25,0,0,1
4,0.249974,0,0,0,0.933333,0.0,1,0,0.525,0,1,0.75,0.333333,0,0,0


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
X_train.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
148821,0.264628,0,0,0,0.0,0.0,0,1,0.65,0,1,0.5,0.291667,0,0,0
113787,0.242804,1,0,0,0.0,0.0,0,0,0.65,0,1,1.0,0.291667,0,0,0
114251,0.286452,1,0,0,0.0,0.0,0,1,0.7125,0,1,1.0,0.208333,1,0,0
311048,0.245967,0,0,0,0.0,0.0,0,0,0.8375,0,1,0.5,0.291667,0,0,1
50914,0.304586,0,0,0,0.0,0.0,0,0,1.0,1,0,0.5,0.333333,0,0,0


In [17]:
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9120170108089515


In [16]:
knn.predict(X_test.iloc[3])

AttributeError: 'Series' object has no attribute 'reshape'