## Import libraries

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import joblib

## Preprocessing data

In [23]:
# Load the heart-disease dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv("data/heart_cleaned.csv")

In [24]:
# Rewrite the open-ended '80 or older' age bucket as '80-80'
# (presumably so it matches a 'low-high' format used by the other buckets — confirm against the data).
age_bucket_fix = {'80 or older': '80-80'}
df['AgeCategory'] = df['AgeCategory'].replace(age_bucket_fix)

In [25]:
# Collapse the two qualified diabetes answers into plain 'No' / 'Yes'.
df["Diabetic"] = df["Diabetic"].replace({
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes',
})

In [26]:
# Separate the target column from the remaining (candidate feature) columns.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

In [27]:
# Encoder used for the target labels (see the fit_transform call on `y`).
LabelEnc = LabelEncoder()

# Dense one-hot encoder. scikit-learn 1.2 renamed the `sparse` keyword to
# `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on installations that predate the rename.
try:
    OHEnc = OneHotEncoder(sparse_output=False)
except TypeError:
    OHEnc = OneHotEncoder(sparse=False)

In [28]:
# Fit the label encoder on the target column and encode it in one step.
y_label = LabelEnc.fit_transform(y)

In [29]:
# Display the encoded target array as the cell output.
y_label

array([0, 0, 0, ..., 0, 0, 0])

Features on which we train our models:
1. Smoking : Have you smoked at least 100 cigarettes in your entire life? (Yes or No)
2. AlcoholDrinking : Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week) (Yes or No)
3. Stroke : (Ever told) (you had) a stroke? (Yes or No)
4. PhysicalHealth : Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? (0-30 days, float).
5. MentalHealth : Thinking about your mental health, for how many days during the past 30 days was your mental health not good? (0-30 days, float).
6. DiffWalking : Do you have serious difficulty walking or climbing stairs? (Yes or No)
7. Sex (Male or Female)
8. Diabetic : (Ever told) (you had) diabetes? (Yes or No)
9. PhysicalActivity : Adults who reported doing physical activity or exercise during the past 30 days other than your regular job. (Yes or No)
10. SleepTime : On average, how many hours of sleep do you get in a 24-hour period?
11. Asthma : (Ever told) (you had) asthma? (Yes or No)
12. KidneyDisease : Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease? (Yes or No)
13. SkinCancer : (Ever told) (you had) skin cancer? (Yes or No)

In [30]:
# Feature columns fed to the models, in the order listed in the description above.
train_cols = [
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'PhysicalHealth',
    'MentalHealth',
    'DiffWalking',
    'Sex',
    'Diabetic',
    'PhysicalActivity',
    'SleepTime',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]

In [31]:
# Keep only the training features. Take an explicit copy so the per-column
# assignments performed on X afterwards write to an independent frame rather
# than to a slice (avoids pandas' SettingWithCopyWarning).
X = X[train_cols].copy()

In [38]:
# Categorical feature columns that are converted to integer codes.
categorical_cols = [
    'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex',
    'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer',
]

# One LabelEncoder per column, keyed by column name; each is fitted on its
# column and the column is overwritten with the encoded values.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    X[col] = encoder.fit_transform(X[col])

In [39]:
# Persist the dict of fitted per-column encoders to disk.
# NOTE(review): presumably the 'encoders/' directory must already exist — confirm.
joblib.dump(label_encoders, 'encoders/label_encoders.joblib')

['encoders/label_encoders.joblib']

In [40]:
# Display the encoded feature frame as the cell output.
X

Unnamed: 0,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,1,0,0,3.0,30.0,0,0,1,1,5.0,1,0,1
1,0,0,1,0.0,0.0,0,0,0,1,7.0,0,0,0
2,1,0,0,20.0,30.0,0,1,1,1,8.0,1,0,0
3,0,0,0,0.0,0.0,0,0,0,0,6.0,0,0,1
4,0,0,0,28.0,0.0,1,0,0,1,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,0,0,7.0,0.0,1,1,1,0,6.0,1,0,0
319791,1,0,0,0.0,0.0,0,1,0,1,5.0,1,0,0
319792,0,0,0,0.0,0.0,0,0,0,1,6.0,0,0,0
319793,0,0,0,0.0,0.0,0,0,0,0,12.0,0,0,0


In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_label,
    test_size=0.2,
    random_state=42,
)

In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [43]:
def get_preds(model, X_train, X_test, y_train):
    """Train a model and return its predictions for the evaluation features.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict``.
        y_train: Training labels passed to ``fit``.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

def get_accuracy(preds, y_valid):
    """Print accuracy (normalized and unnormalized) and F1 for a set of predictions.

    Args:
        preds: Labels predicted by the model.
        y_valid: Ground-truth labels for the same samples.

    Returns:
        None. Three lines are printed: the accuracy ratio, the F1 score, and
        the number of correct predictions.
    """
    # scikit-learn metrics are declared as metric(y_true, y_pred); pass the
    # ground truth first so the calls match the documented signature.
    print("ratio: ", accuracy_score(y_valid, preds))
    print("f1", f1_score(y_valid, preds))
    # normalize=False returns the count of correct predictions instead of the ratio.
    print("number of correct predictions", accuracy_score(y_valid, preds, normalize=False))

In [49]:
# Define the candidate models: a 5-nearest-neighbours classifier and a
# 500-tree random forest with a fixed seed.
KNclass = KNeighborsClassifier(n_neighbors=5)
RFclass = RandomForestClassifier(
    n_estimators=500,
    random_state=0,
)
classifiers = [KNclass, RFclass]

In [51]:
# Evaluate the accuracy of each model: fit it, print its metrics on the test
# split, then save the fitted estimator.
for classifier in classifiers:
  print(classifier)
  prediction = get_preds(classifier, X_train, X_test, y_train)
  get_accuracy(prediction, y_test)
  # NOTE(review): the file name is built from str(classifier), i.e. the same text
  # printed above, so it contains parentheses (and spaces/commas when parameters
  # are shown). Confirm that whatever loads these files expects those names.
  joblib.dump(classifier, "models/"+str(classifier)+".joblib")
  print("-"*30, '\n')

KNeighborsClassifier()
ratio:  0.906549508278741
f1 0.11595917763644431
number of correct predictions 57982
------------------------------ 

RandomForestClassifier(n_estimators=500, random_state=0)
ratio:  0.9048452915148767
f1 0.15706371191135732
number of correct predictions 57873
------------------------------ 

