# ML stroke prediction - Logistic Regression
dataset link: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download

## Classification of stroke or no-stroke

For the first implementation of Machine Learning on this dataset we are using Logistic Regression

In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np
np.set_printoptions(suppress=True)

import logistic_regression as lr

import csv
import sys
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# SKlearn for F1 score calculation
from sklearn.metrics import f1_score

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# we use pandas to import a comma-seperated values dataset
import pandas as pd

# tells matplotlib to embed plots within the notebook
%matplotlib inline

### First importing the dataset and small conversions

For the first implementation we're just using a limited set of features for the model.

In [2]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

print(data)

       age  hypertension  heart_disease  avg_glucose_level  stroke
0     67.0             0              1             228.69       1
1     61.0             0              0             202.21       1
2     80.0             0              1             105.92       1
3     49.0             0              0             171.23       1
4     79.0             1              0             174.12       1
...    ...           ...            ...                ...     ...
5105  80.0             1              0              83.75       0
5106  81.0             0              0             125.20       0
5107  35.0             0              0              82.99       0
5108  51.0             0              0             166.29       0
5109  44.0             0              0              85.28       0

[5110 rows x 5 columns]


The sigmoid function copied from excersise 2. Could be copied later on to a 'library'.

In [3]:
counter_stroke = 0
counter_non_stroke = 0
data_balanced = np.zeros((398,5))
#print(data_balanced.shape)
data_balanced[0:199] = data[0:199]
data_balanced[199:398] = data[249:448]
print(data_balanced[195:205])
for i in range(398):
    if data_balanced[i, 4] == 1:
        counter_stroke += 1
    else:
        counter_non_stroke += 1
        
print(counter_stroke)
print(counter_non_stroke)

[[ 59.     0.     0.   200.62   1.  ]
 [ 70.     1.     0.   242.52   1.  ]
 [ 69.     0.     0.    93.81   1.  ]
 [ 79.     0.     0.   114.77   1.  ]
 [  3.     0.     0.    95.12   0.  ]
 [ 58.     1.     0.    87.96   0.  ]
 [  8.     0.     0.   110.89   0.  ]
 [ 70.     0.     0.    69.04   0.  ]
 [ 14.     0.     0.   161.28   0.  ]
 [ 47.     0.     0.   210.95   0.  ]]
199
199


In [4]:
X = data.drop(['stroke'], axis=1).values
y = data['stroke'].values

print("Xshape: ")
print(X.shape)
print("Yshape: ")
print(y.shape)

print("X: ")
print(X)

print("y: ")
print(y)

Xshape: 
(5110, 4)
Yshape: 
(5110,)
X: 
[[ 67.     0.     1.   228.69]
 [ 61.     0.     0.   202.21]
 [ 80.     0.     1.   105.92]
 ...
 [ 35.     0.     0.    82.99]
 [ 51.     0.     0.   166.29]
 [ 44.     0.     0.    85.28]]
y: 
[1 1 1 ... 0 0 0]


In [5]:
negative = data[data.stroke==0]
positive = data[data.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# upsample minority
pos_upsampled = resample(positive,
 replace=True, # sample with replacement
 n_samples=len(negative), # match number in majority class
 random_state=27) # reproducible results

upsampled = pd.concat([negative, pos_upsampled])

print(upsampled.stroke.value_counts())

0    4861
1    4861
Name: stroke, dtype: int64


In [6]:
X = upsampled.drop(['stroke'], axis=1).values
y = upsampled['stroke'].values

In [7]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

print(X.shape)
#print(X)

(9722, 4)
(9722, 5)


In [8]:
_lambda = 2

In [9]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X, y, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

print('Cost at test theta: {:.3f}'.format(cost))

print('Gradient at test theta:')
print('\t[{:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}]'.format(*grad))

Cost at test theta: 0.693
Gradient at test theta:
	[0.000, -6.372, -0.042, -0.036, -6.820]


In [10]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X, y, _lambda)

Cost at theta found by optimize.minimize: 0.486
theta:
	[-4.651, 0.071, 0.431, 0.422, 0.004]


In [11]:
p = lr.predict(theta, X, 0.35)
print('Train Accuracy: {:.2f} %'.format(np.mean(p == y) * 100))

Train Accuracy: 76.26 %


In [12]:
print(f1_score(y,p,average='binary'))

0.7952085181898846


In [13]:
for i in range(30, 40):
    p = lr.predict(theta, X, (i/100))
    print("zeker: ", i/100)
    print(f1_score(y,p,average='binary'))

zeker:  0.3
0.7842456840461525
zeker:  0.31
0.7871136264693077
zeker:  0.32
0.7893128437963853
zeker:  0.33
0.7905832747716093
zeker:  0.34
0.7919059821507467
zeker:  0.35
0.7952085181898846
zeker:  0.36
0.7937444146559428
zeker:  0.37
0.793016558675306
zeker:  0.38
0.793947082276187
zeker:  0.39
0.7917770671539515


# Oversampling werkt!!!!

### Dit is zonder train test split

### Hieronder met undersampling

In [14]:
negative = data[data.stroke==0]
positive = data[data.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [15]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [16]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

print(X.shape)
#print(X)

(498, 4)
(498, 5)


In [17]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X, y, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

print('Cost at test theta: {:.3f}'.format(cost))

print('Gradient at test theta:')
print('\t[{:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}]'.format(*grad))

Cost at test theta: 0.693
Gradient at test theta:
	[0.000, -6.855, -0.052, -0.036, -7.120]


In [18]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X, y, _lambda)

Cost at theta found by optimize.minimize: 0.460
theta:
	[-4.907, 0.074, 0.768, 0.165, 0.005]


In [19]:
p = lr.predict(theta, X, 0.39)
print('Train Accuracy: {:.2f} %'.format(np.mean(p == y) * 100))

Train Accuracy: 78.92 %


In [20]:
print(f1_score(y,p,average='binary'))

0.8134991119005329


In [22]:
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    print("zeker: ", i/100)
    print(f1_score(y,p,average='binary'))

zeker:  0.3
0.7986230636833047
zeker:  0.31
0.8
zeker:  0.32
0.8027681660899654
zeker:  0.33
0.8076923076923077
zeker:  0.34
0.8141592920353982
zeker:  0.35
0.8134991119005329
zeker:  0.36
0.8142857142857142
zeker:  0.37
0.8144144144144144
zeker:  0.38
0.8144144144144144
zeker:  0.39
0.8152173913043479
zeker:  0.4
0.8087431693989071
zeker:  0.41
0.8073394495412842
zeker:  0.42
0.8081180811808116
zeker:  0.43
0.8037037037037037
zeker:  0.44
0.8
zeker:  0.45
0.8
zeker:  0.46
0.799249530956848
zeker:  0.47
0.7954545454545455
zeker:  0.48
0.793168880455408
zeker:  0.49
0.7892720306513409


# Undersampling werkt!!!
### Dit heeft een beter resultaat als oversampling
### Op de trainingset, the real test is met de testset