# ML stroke prediction - Logistic Regression
dataset link: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download

## Classification of stroke or no-stroke

For the first implementation of Machine Learning on this dataset we are using Logistic Regression

In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np
np.set_printoptions(suppress=True)

import logistic_regression as lr

import csv
import sys
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# SKlearn for F1 score calculation
from sklearn.metrics import f1_score

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# we use pandas to import a comma-seperated values dataset
import pandas as pd

# tells matplotlib to embed plots within the notebook
%matplotlib inline

### First importing the dataset and small conversions

For the first implementation we're just using a limited set of features for the model.

In [2]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

print(data)

       age  hypertension  heart_disease  avg_glucose_level  stroke
0     67.0             0              1             228.69       1
1     61.0             0              0             202.21       1
2     80.0             0              1             105.92       1
3     49.0             0              0             171.23       1
4     79.0             1              0             174.12       1
...    ...           ...            ...                ...     ...
5105  80.0             1              0              83.75       0
5106  81.0             0              0             125.20       0
5107  35.0             0              0              82.99       0
5108  51.0             0              0             166.29       0
5109  44.0             0              0              85.28       0

[5110 rows x 5 columns]


# Downsampling of the dataset before split

In [3]:
negative = data[data.stroke==0]
positive = data[data.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [4]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

# Splitting in train en test set

### preparation of the features
Adding a theta 0 equal to 1

In [5]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

print(X.shape)
#print(X)

(498, 4)
(498, 5)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 5)


# Hierboven maar 1x inladen !!!!

In [7]:
_lambda = 1

### Creating the empty array for theta

In [8]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrcostFunctionReg(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

### Optimizing the parameters (learning the model)

In [9]:
theta, cost, grad = lr.lrOptimization(lr.lrcostFunctionReg, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.464
theta:
	[-4.920, 0.072, 0.973, 0.116, 0.006]


In [10]:
p_train = lr.predict(theta, X_train, 0.39)
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 78.89 %


In [11]:
print("op de training set")
print(f1_score(y_train,p_train,average='binary'))

op de training set
0.8099547511312218


In [12]:
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    print("zeker: ", i/100)
    print(f1_score(y,p,average='binary'))

zeker:  0.3
0.8034482758620689
zeker:  0.31
0.8034782608695652
zeker:  0.32
0.8105263157894735
zeker:  0.33
0.8134991119005329
zeker:  0.34
0.8128342245989305
zeker:  0.35
0.8144144144144144
zeker:  0.36
0.8158844765342961
zeker:  0.37
0.8152173913043479
zeker:  0.38
0.8130671506352087
zeker:  0.39
0.8138686131386861
zeker:  0.4
0.8044280442804428
zeker:  0.41
0.8059149722735675
zeker:  0.42
0.7992565055762081
zeker:  0.43
0.7947761194029851
zeker:  0.44
0.7954971857410882
zeker:  0.45
0.7947269303201507
zeker:  0.46
0.7916666666666667
zeker:  0.47
0.7900763358778626
zeker:  0.48
0.7892720306513409
zeker:  0.49
0.7846153846153846


# F1 score op test_set

In [13]:
p_test = lr.predict(theta, X_test, 0.35)
print("op de test set")
print(f1_score(y_test,p_test,average='binary'))

op de test set
0.8333333333333334


In [14]:
#for i in range(30, 50):
#    p_test = lr.predict(theta, X_test, (i/100))
#    print("zeker: ", i/100)
#    print(f1_score(y_test,p_test,average='binary'))

## Beste score tot nu toe op de test set:
## 83.3 voor een decision boundary van 0.35