# ML stroke prediction - Logistic Regression
dataset link: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download

## Classification of stroke or no-stroke

For the first implementation of Machine Learning on this dataset we are using Logistic Regression

In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np
np.set_printoptions(suppress=True)

import logistic_regression as lr


import csv
import sys
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# SKlearn for F1 score calculation
from sklearn.metrics import f1_score

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# we use pandas to import a comma-seperated values dataset
import pandas as pd

# tells matplotlib to embed plots within the notebook
%matplotlib inline

### First importing the dataset and small conversions

For the first implementation we're just using a limited set of features for the model.

In [2]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
#data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
#data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
#data.drop("smoking_status", axis = 1, inplace = True)

print(data)

       age  hypertension  heart_disease ever_married Residence_type  \
0     67.0             0              1          Yes          Urban   
1     61.0             0              0          Yes          Rural   
2     80.0             0              1          Yes          Rural   
3     49.0             0              0          Yes          Urban   
4     79.0             1              0          Yes          Rural   
...    ...           ...            ...          ...            ...   
5105  80.0             1              0          Yes          Urban   
5106  81.0             0              0          Yes          Urban   
5107  35.0             0              0          Yes          Rural   
5108  51.0             0              0          Yes          Rural   
5109  44.0             0              0          Yes          Urban   

      avg_glucose_level   smoking_status  stroke  
0                228.69  formerly smoked       1  
1                202.21     never smoked     

In [3]:
data = data.dropna(axis = 0)

## Making dummy classes for the enumertor classes

In [4]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [5]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 8)


Unnamed: 0,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,0,1,0,1,0,0
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,0,1,0,0,0,1
4,0,1,1,0,0,0,1,0


In [6]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 13)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,1,0,1,0,1,0,1,0,0
1,61.0,0,0,202.21,1,0,1,1,0,0,0,1,0
2,80.0,0,1,105.92,1,0,1,1,0,0,0,1,0
3,49.0,0,0,171.23,1,0,1,0,1,0,0,0,1
4,79.0,1,0,174.12,1,0,1,1,0,0,0,1,0


In [7]:
data_new_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             5110 non-null   float64
 1   hypertension                    5110 non-null   int64  
 2   heart_disease                   5110 non-null   int64  
 3   avg_glucose_level               5110 non-null   float64
 4   stroke                          5110 non-null   int64  
 5   ever_married_No                 5110 non-null   uint8  
 6   ever_married_Yes                5110 non-null   uint8  
 7   Residence_type_Rural            5110 non-null   uint8  
 8   Residence_type_Urban            5110 non-null   uint8  
 9   smoking_status_Unknown          5110 non-null   uint8  
 10  smoking_status_formerly smoked  5110 non-null   uint8  
 11  smoking_status_never smoked     5110 non-null   uint8  
 12  smoking_status_smokes           51

# Downsampling of the dataset before split

In [8]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [9]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

# Splitting in train en test set

### preparation of the features
Adding a theta 0 equal to 1

In [10]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

print(X.shape)
#print(X)

(498, 12)
(498, 13)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 13)


# Hierboven maar 1x inladen !!!!

In [12]:
_lambda = 1

### Creating the empty array for theta

In [13]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

### Optimizing the parameters (learning the model)

In [14]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.456
theta:
	[-5.058, 0.076, 1.080, -0.017, 0.006]


In [15]:
p_train = lr.predict(theta, X_train, 0.5)
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 79.40 %


In [16]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)

print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])

max f1 score:  0.8196721311475411
for decision boundary:  0.46


# F1 score op test_set

In [17]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_test = lr.predict(theta, X_test, (i/100))
    f1_scores[j] = f1_score(y_test,p_test,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)

print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])

max f1 score:  0.8598130841121495
for decision boundary:  0.37


In [18]:
#for i in range(30, 50):
#    p_test = lr.predict(theta, X_test, (i/100))
#    print("zeker: ", i/100)
#    print(f1_score(y_test,p_test,average='binary'))

## Beste score tot nu toe op de test set:
## 85.98 voor een decision boundary van 0.37
### met de features: age, heart_disease, hypertension, avg_glucose, smoking_status, ever_maried, residence_type