In [1]:
from preprocessing import preprocess_raw
import pandas as pd
import warnings
warnings.filterwarnings("ignore") # ignore pandas warnings, they are really annoying

In [2]:
# Read the raw CSV, then let the project's preprocess_raw helper turn it into
# the four train/test pieces unpacked below.
raw_data = pd.read_csv(filepath_or_buffer="./data/heart_2022_no_nans.csv")
(X_train, X_test,
 y_train, y_test) = preprocess_raw(raw_data)

In [3]:
# Show (rows, columns) of the train and test feature matrices.
# NOTE(review): the recorded output has far fewer training rows than test rows;
# confirm that split ratio in preprocess_raw is intentional.
tuple(features.shape for features in (X_train, X_test))

((49204, 94), (196818, 94))

In [4]:
# Show (rows, columns) of the train and test label objects.
tuple(labels.shape for labels in (y_train, y_test))

((49204, 1), (196818, 1))

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, make_scorer

In [6]:

# --- Disabled hyper-parameter search (kept for reference; does not run) ---
# When enabled, this grid-searches KNeighborsClassifier over n_neighbors, weights
# and p on (X_train, y_train), records five metrics, and refits on best accuracy.
# NOTE(review): make_scorer(roc_auc_score) scores the output of predict() by
# default, i.e. hard labels rather than probabilities, so the "AUC" column is not
# a ranking-based AUC. The built-in "roc_auc" scoring string would use scores;
# confirm before re-enabling.
# NOTE(review): the KNN hyper-parameters hard-coded further down presumably come
# from a past run of this search — confirm.
# metrics = []
# knn = KNeighborsClassifier()
# gs = GridSearchCV(
#     estimator=knn,
#     param_grid= {
#         'n_neighbors': [3,5,7,9,11],
#         'weights': ['uniform', 'distance'],
#         'p': [1,2] # manhattan distance vs euclidean distance
#     },
#     scoring = {
#         "accuracy" : make_scorer(accuracy_score),
#         "precision": make_scorer(precision_score),
#         "recall" : make_scorer(recall_score),
#         "f1_score" : make_scorer(f1_score),
#         "AUC" : make_scorer(roc_auc_score)
#     },
#     n_jobs=1,
#     verbose=4,
#     refit="accuracy"
# )
# gs.fit(X_train, y_train)

In [7]:
# gs.best_estimator_

In [8]:
# gs.best_params_

In [9]:
# gs.best_score_

In [10]:
# cv_result = pd.DataFrame(gs.cv_results_)

In [11]:
# gs.best_index_

In [12]:
# cv_result.iloc[gs.best_index_]

In [13]:
# Oversample the minority class with SMOTE so the classifier trains on balanced
# classes. Only the training split is resampled; the test split is left untouched.
from imblearn.over_sampling import SMOTE

# SMOTE synthesises samples randomly; fixing the seed makes the resampled set
# (and every metric computed from the model trained on it) reproducible on re-run.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [14]:
# Class balance of the training labels before resampling; the recorded output
# shows the positive class is a small minority.
y_train.value_counts()

HadHeartAttack_Yes
0                     46524
1                      2680
Name: count, dtype: int64

In [15]:
# Class balance after SMOTE; the recorded output shows both classes at the same
# count as the original majority class.
y_resampled.value_counts()

HadHeartAttack_Yes
0                     46524
1                     46524
Name: count, dtype: int64

In [16]:
# Show (rows, columns) of the resampled features and labels; row counts must match.
tuple(resampled.shape for resampled in (X_resampled, y_resampled))

((93048, 94), (93048, 1))

In [17]:
# Fit a KNN classifier on the SMOTE-balanced training data.
# Settings: 11 neighbours, p=1 (Manhattan distance), unweighted votes, and the
# neighbour search spread over 4 parallel jobs.
# NOTE(review): these values presumably came from the disabled grid search — confirm.
# NOTE(review): the feature preview in this notebook shows columns on very
# different scales (e.g. weight in kg next to 0/1 indicators); distance-based
# KNN is sensitive to that, so consider scaling — confirm against preprocess_raw.
oversampled_knn = KNeighborsClassifier(n_neighbors=11, p=1, weights='uniform', n_jobs=4)

# y_resampled is a single-column (n, 1) object; scikit-learn expects a 1-D target
# and otherwise raises a DataConversionWarning (hidden by the global warning
# filter). Flattening it gives the same fitted model without the warning.
oversampled_knn.fit(X_resampled, y_resampled.to_numpy().ravel())

In [18]:
# Preview the first 100 test rows (pandas truncates the rendered table).
X_test.head(100)

Unnamed: 0,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,SleepHours,RemovedTeeth,SmokerStatus,ECigaretteUsage,AgeCategory,WeightInKilograms,...,PneumoVaxEver_Yes,"TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap",HighRiskLastYear_No,HighRiskLastYear_Yes,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes
172414,3.0,20.0,10.0,3.0,8.0,2.0,0.0,3.0,4.0,86.18,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
106325,3.0,0.0,0.0,3.0,6.0,3.0,3.0,3.0,0.0,68.04,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
215652,0.0,2.0,1.0,3.0,8.0,2.0,3.0,3.0,3.0,91.17,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
75392,0.0,0.0,0.0,3.0,6.0,3.0,3.0,3.0,10.0,81.65,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
55931,3.0,0.0,0.0,3.0,8.0,2.0,2.0,3.0,0.0,72.57,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108807,2.0,15.0,20.0,2.0,6.0,2.0,2.0,3.0,7.0,76.20,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
92917,0.0,0.0,0.0,3.0,8.0,3.0,3.0,3.0,4.0,85.28,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
78301,3.0,7.0,30.0,3.0,8.0,3.0,1.0,1.0,9.0,127.01,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
219300,4.0,0.0,0.0,3.0,7.0,3.0,3.0,3.0,7.0,65.77,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [25]:
# Score the oversampled KNN on a prefix of the test split.
# KNN prediction is very slow here, so only the first `n_predictions` rows are
# evaluated; the metrics below therefore describe that subset, not the full test set.
n_predictions = 10000
X_eval = X_test[:n_predictions]
y_eval = y_test[:n_predictions]

# Run the expensive neighbour search once. Calling predict() and predict_proba()
# separately repeats it; with uniform weights the predicted label is simply the
# class with the largest neighbour-vote share, so it can be read off the
# probabilities directly.
class_proba = oversampled_knn.predict_proba(X_eval)
y_pred = oversampled_knn.classes_[class_proba.argmax(axis=1)]

print(f"accuracy: {accuracy_score(y_eval, y_pred)}")
print(f"precision: {precision_score(y_eval, y_pred)}")
print(f"recall: {recall_score(y_eval, y_pred)}")
print(f"f1: {f1_score(y_eval, y_pred)}")
# AUC is computed from the positive-class probability (column 1), not hard labels.
print(f"AUC: {roc_auc_score(y_eval, class_proba[:, 1])}")
# y_pred

accuracy: 0.8653
precision: 0.19239720713731576
recall: 0.44765342960288806
f1: 0.2691264243081932
AUC: 0.7542067927822295
