In [2]:
import numpy as np
import pandas as pd
import category_encoders as ce

from sklearn.model_selection import train_test_split, KFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

In [3]:
def handle_data(csv):
    df = pd.read_csv(csv)
    
    # Encode gender
    df = df[df.gender != "Other"]
    one_hot = ce.OneHotEncoder()
    gender_encoded = one_hot.fit_transform(df.gender)
    df = df.join(gender_encoded)
    del df["gender"]
    
    # Encode work_type, Residence_type, smoking_status features
    target_encoder = ce.TargetEncoder(cols=["work_type", "Residence_type", "smoking_status"])
    df = target_encoder.fit_transform(df, df.stroke)
    
    # fill None values with mean
    df.bmi = df.bmi.fillna(df.bmi.mean())
    
    # Encode ever_marries feature
    df.ever_married = df.ever_married.replace(["Yes", "No"], [1, 0])
    
    # Scaling
    cols_to_scale = ["age", "avg_glucose_level", "bmi"]
    scaler = StandardScaler()
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    return df

In [4]:
def split_data(df):
    features = df.loc[:, df.columns != "stroke"]
    target = df.stroke
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
    return x_train, x_test, y_train, y_test

In [5]:
def balance_splitting(df, rows_to_use):
    zero_stroke = df[df.stroke == 0].sample(rows_to_use)
    new_df = df[df.stroke == 1].append(zero_stroke, ignore_index=True)
    features = new_df.loc[:, new_df.columns != "stroke"]
    target = new_df.stroke
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
    return x_train, x_test, y_train, y_test

In [10]:
path = "healthcare.csv"
data = handle_data(path)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [11]:
x_train, x_test, y_train, y_test = split_data(data)

In [12]:
x_train_balanced, x_test_balanced, y_train_balanced, y_test_balanced = balance_splitting(data, 400)

In [14]:
knn = KNeighborsClassifier()
knn

KNeighborsClassifier()

In [15]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [16]:
predict = knn.predict(x_test)

In [17]:
accuracy_score(y_test, predict)

0.9374021909233177

In [18]:
confusion_matrix(y_test, predict)

array([[1198,    0],
       [  80,    0]], dtype=int64)

In [21]:
tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
(tn, fp, fn, tp)

(1198, 0, 80, 0)

In [22]:
y_test.value_counts()

0    1198
1      80
Name: stroke, dtype: int64

In [23]:
knn.fit(x_train_balanced, y_train_balanced)

KNeighborsClassifier()

In [24]:
predict = knn.predict(x_test_balanced)

In [25]:
accuracy_score(y_test_balanced, predict)

0.5214723926380368

In [26]:
confusion_matrix(y_test_balanced, predict)

array([[71, 26],
       [52, 14]], dtype=int64)

In [27]:
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
(tn, fp, fn, tp)

(71, 26, 52, 14)

In [28]:
y_test_balanced.value_counts()

0    97
1    66
Name: stroke, dtype: int64

In [29]:
predict_2 = knn.predict(x_test)

In [30]:
accuracy_score(y_test, predict_2)

0.7323943661971831

In [33]:
tn, fp, fn, tp = confusion_matrix(y_test, predict_2).ravel()
(tn, fp, fn, tp) # We guessed right 32 'ones' out of 80 and 904 'zeros' out of 1198

(904, 294, 48, 32)

In [34]:
y_test.value_counts()

0    1198
1      80
Name: stroke, dtype: int64

In [75]:
knn = KNeighborsClassifier(3)

In [76]:
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [77]:
predict = knn.predict(x_test)

In [78]:
accuracy_score(y_test, predict)

0.931924882629108

In [79]:
tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
(tn, fp, fn, tp)

(1191, 7, 80, 0)

In [94]:
knn = KNeighborsClassifier(10)
knn.fit(x_train_balanced, y_train_balanced)
predict = knn.predict(x_test_balanced)
print(accuracy_score(y_test_balanced, predict))
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
print(tn, fp, fn, tp)

0.6012269938650306
90 7 58 8


In [101]:
knn = KNeighborsClassifier()
knn.fit(x_train_balanced, y_train_balanced)
predict = knn.predict(x_test_balanced)
print(accuracy_score(y_test_balanced, predict))
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
print(tn, fp, fn, tp)

0.5214723926380368
71 26 52 14


In [115]:
#function for ease of use
def knn_classifier(neighbors, distance_metric):
    knn = KNeighborsClassifier(neighbors, metric=distance_metric, weights="distance")
    knn.fit(x_train_balanced, y_train_balanced)
    predict = knn.predict(x_test_balanced)
    tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
    return accuracy_score(y_test_balanced, predict).ravel(), (tn, fp, fn, tp)

In [116]:
accuracy, confusion = knn_classifier(10, "manhattan")
print(accuracy)
print(confusion)

[0.52147239]
(69, 28, 50, 16)


In [117]:
accuracy, confusion = knn_classifier(8, "manhattan")
print(accuracy)
print(confusion)

[0.52147239]
(69, 28, 50, 16)


In [120]:
accuracy, confusion = knn_classifier(20, "manhattan")
print(accuracy)
print(confusion)

[0.53374233]
(71, 26, 50, 16)


In [118]:
accuracy, confusion = knn_classifier(3, "manhattan")
print(accuracy)
print(confusion)

[0.50920245]
(63, 34, 46, 20)


In [121]:
accuracy, confusion = knn_classifier(3, "chebyshev")
print(accuracy)
print(confusion)

[0.50920245]
(63, 34, 46, 20)


In [135]:
# Cross validation
kf = KFold()
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [141]:
features = np.array(data.loc[:, data.columns != "stroke"])
target = np.array(data.stroke)

for train_index, test_index in kf.split(features):
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]
    knn = KNeighborsClassifier(3, metric="manhattan", weights="distance")
    knn.fit(x_train, y_train)
    predict = knn.predict(x_test)
    print(accuracy_score(y_test, predict))
    tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
    print(tn, fp, fn, tp)

0.7563600782778865
773 0 249 0
0.974559686888454
996 26 0 0
0.9735812133072407
995 27 0 0
0.9794520547945206
1001 21 0 0
0.9666993143976493
987 34 0 0


In [148]:
rkf = RepeatedKFold(n_repeats=2)
rkf

RepeatedKFold(n_repeats=2, n_splits=5, random_state=None)

In [149]:
features = np.array(data.loc[:, data.columns != "stroke"])
target = np.array(data.stroke)

for train_index, test_index in rkf.split(features):
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]
    knn = KNeighborsClassifier(3, metric="manhattan", weights="distance")
    knn.fit(x_train, y_train)
    predict = knn.predict(x_test)
    print(accuracy_score(y_test, predict))
    tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
    print(tn, fp, fn, tp)

0.9324853228962818
953 19 50 0
0.9295499021526419
950 19 53 0
0.9275929549902152
947 26 48 1
0.9285714285714286
948 17 56 1
0.9451518119490695
963 18 38 2
0.9344422700587084
955 20 47 0
0.9246575342465754
944 22 55 1
0.9383561643835616
959 16 47 0
0.9256360078277887
945 30 46 1
0.9353574926542605
953 16 50 2
