In [3]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# The code below imports the CSV dataset, converts infinite values to NaN, and fills every missing value with the placeholder 999 (no rows are removed)

In [4]:
# Read the raw CSV, map +/-inf to NaN, then fill every missing cell with 999.
# NOTE(review): 999 stays in the data as an ordinary value (see the bmi column
# in the output below) - confirm that a sentinel is really what is wanted here.
dataset = (
    pd.read_csv('stroke.csv')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(999)
)
dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,999.0,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,999.0,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# The code below selects the attributes (features) used for classification


In [5]:
# Positional slice: keeps columns 1..9 of `dataset` (gender through bmi in the
# frame displayed above), so `id` (position 0) and the last two columns
# (smoking_status, stroke) are not part of the feature frame.
# NOTE(review): smoking_status is left out of the features - confirm intended.
x = dataset.iloc[:,1:10]


# The code below one-hot encodes the string (categorical) columns into 0/1 indicator columns

In [6]:
# One-hot encode every string column into indicator columns.
# The dtype is pinned to uint8 (the pre-2.0 pandas default) so the indicators
# are 0/1 on every pandas version; pandas >= 2.0 would otherwise emit
# True/False columns.
x = pd.get_dummies(x, dtype=np.uint8)

In [7]:
# Show the encoded feature frame as this cell's output.
x

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,67.0,0,1,228.69,36.6,0,1,0,0,1,0,0,1,0,0,0,1
1,61.0,0,0,202.21,999.0,1,0,0,0,1,0,0,0,1,0,1,0
2,80.0,0,1,105.92,32.5,0,1,0,0,1,0,0,1,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,0,1,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,1,0,83.75,999.0,1,0,0,0,1,0,0,1,0,0,0,1
5106,81.0,0,0,125.20,40.0,1,0,0,0,1,0,0,0,1,0,0,1
5107,35.0,0,0,82.99,30.6,1,0,0,0,1,0,0,0,1,0,1,0
5108,51.0,0,0,166.29,25.6,0,1,0,0,1,0,0,1,0,0,1,0


# The code below extracts the class attribute (the target label, taken from the last column)

In [8]:
# Target labels: the last column of `dataset`, taken positionally, as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()



# The code below splits the data into a training set (75%) and a test set (25%)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# the below code is to build the KNN algorithm

In [11]:
from math import sqrt
class KNN():
  """Minimal k-nearest-neighbours classifier using Euclidean distance.

  Training samples are stored as-is by fit(); predict() compares every test
  sample against every stored sample in pure Python and returns the label
  that occurs most often among the k closest ones.
  """
  def __init__(self,k):
    """Store the number of neighbours to vote with.

    Args:
      k: number of nearest neighbours consulted per prediction (>= 1).

    Raises:
      ValueError: if k is smaller than 1, since no vote would be possible.
    """
    if k < 1:
      raise ValueError("k must be at least 1, got %r" % (k,))
    self.k=k
    # Echo the configured k when the model is created.
    print(self.k)
  def fit(self,X_train,y_train):
    """Remember the training samples and their labels (no computation).

    Args:
      X_train: indexable sequence of samples, each a sequence of numbers.
      y_train: labels, one per sample, in the same order as X_train.

    Raises:
      ValueError: if X_train and y_train have different lengths.
    """
    if len(X_train)!=len(y_train):
      raise ValueError(
        "X_train and y_train must have the same length, got %d and %d"
        % (len(X_train),len(y_train)))
    self.x_train=X_train
    self.y_train=y_train
  def calculate_euclidean(self,sample1,sample2):
    """Return the Euclidean distance between two equally long samples."""
    squared_sum=0.0
    for position,value in enumerate(sample1):
      squared_sum+=(value-sample2[position])**2
    return sqrt(squared_sum)
  def nearest_neighbors(self,test_sample):
    """Return the labels of the closest training samples, nearest first.

    At most self.k labels are returned; when fewer than k training samples
    are stored, all of them are returned instead of raising IndexError.
    """
    scored=[
      (label,self.calculate_euclidean(train_sample,test_sample))
      for train_sample,label in zip(self.x_train,self.y_train)
    ]
    # list.sort is stable, so equally distant samples keep training order.
    scored.sort(key=lambda pair:pair[1])
    return [label for label,_ in scored[:self.k]]
  def predict(self,test_set):
    """Predict a label for every sample in test_set.

    Returns:
      A list with one label per test sample. On a tied vote, the tied label
      that appears first among the nearest neighbours wins.
    """
    predictions=[]
    for test_sample in test_set:
      neighbors=self.nearest_neighbors(test_sample)
      # max() returns the first maximal element, which gives the tie rule above.
      predictions.append(max(neighbors,key=neighbors.count))
    return predictions

In [12]:
# Build a 5-neighbour classifier and hand it the scaled training data.
model = KNN(k=5)
model.fit(X_train, y_train)

5


# the code below is to predict

In [13]:
# Classify every test row with the hand-written KNN above; predict() returns
# a plain Python list with one label per row. Each test row is compared
# against every training row in pure Python, so this cell can be slow.
y_pred = model.predict(X_test)


In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Overall fraction of correctly classified test rows (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1203   10]
 [  64    1]]


0.9420970266040689

In [15]:

# Per-class precision / recall / f1 - read the row for class 1 separately,
# because the overall accuracy is dominated by the far larger class 0.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1213
           1       0.09      0.02      0.03        65

    accuracy                           0.94      1278
   macro avg       0.52      0.50      0.50      1278
weighted avg       0.91      0.94      0.92      1278



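# The above report shows an overall accuracy of 94%. For class 0 (no stroke) the f1-score is 97%, recall is 99% and precision is 95%.
# For class 1 (stroke), however, precision is only 9%, recall 2% and f1-score 3% (1 of 65 stroke cases detected), so the high accuracy mostly reflects the class imbalance (1213 vs. 65 test samples) rather than a useful stroke detector.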