# ML stroke prediction - Logistic Regression
dataset link: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download

## Classification of stroke or no-stroke

For the first implementation of Machine Learning on this dataset we are using Logistic Regression

In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np
np.set_printoptions(suppress=True)

import logistic_regression as lr

import csv
import sys
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# SKlearn for F1 score calculation
from sklearn.metrics import f1_score

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# we use pandas to import a comma-seperated values dataset
import pandas as pd

# tells matplotlib to embed plots within the notebook
%matplotlib inline

In [2]:
f1_scores_train_types = []
f1_scores_train_data = np.zeros(7)
f1_scores_test_types = []
f1_scores_test_data = np.zeros(7)

# Baseline features

In [3]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [4]:
negative = data[data.stroke==0]
positive = data[data.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [5]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [6]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 5)


In [8]:
_lambda = 1

In [9]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [10]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.464
theta:
	[-4.920, 0.072, 0.973, 0.116, 0.006]


In [11]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [12]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('baseline')
f1_scores_train_data[0] = f1_scores[index]

max f1 score:  0.8116591928251121
for decision boundary:  0.36


In [13]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 78.89 %


In [14]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('baseline')
f1_scores_test_data[0] = f1_scores[index]

max f1 score:  0.8158844765342961
for decision boundary:  0.36


# Feature: gender

In [15]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
#data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [16]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [17]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 3)


Unnamed: 0,gender_Female,gender_Male,gender_Other
0,0,1,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0


In [18]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 8)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,gender_Female,gender_Male,gender_Other
0,67.0,0,1,228.69,1,0,1,0
1,61.0,0,0,202.21,1,1,0,0
2,80.0,0,1,105.92,1,0,1,0
3,49.0,0,0,171.23,1,1,0,0
4,79.0,1,0,174.12,1,1,0,0


In [19]:
data_new_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                5110 non-null   float64
 1   hypertension       5110 non-null   int64  
 2   heart_disease      5110 non-null   int64  
 3   avg_glucose_level  5110 non-null   float64
 4   stroke             5110 non-null   int64  
 5   gender_Female      5110 non-null   uint8  
 6   gender_Male        5110 non-null   uint8  
 7   gender_Other       5110 non-null   uint8  
dtypes: float64(2), int64(3), uint8(3)
memory usage: 214.7 KB


In [20]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [21]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [22]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 8)


In [24]:
_lambda = 1

In [25]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [26]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.460
theta:
	[-5.099, 0.073, 0.940, 0.233, 0.006]


In [27]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [28]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('gender')
f1_scores_train_data[1] = f1_scores[index]

max f1 score:  0.8096280087527352
for decision boundary:  0.33


In [29]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 78.14 %


In [30]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('gender')
f1_scores_test_data[1] = f1_scores[index]

max f1 score:  0.8106194690265487
for decision boundary:  0.33


# Feature: ever married

In [31]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
#data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [32]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [33]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 2)


Unnamed: 0,ever_married_No,ever_married_Yes
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [34]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 7)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,ever_married_No,ever_married_Yes
0,67.0,0,1,228.69,1,0,1
1,61.0,0,0,202.21,1,0,1
2,80.0,0,1,105.92,1,0,1
3,49.0,0,0,171.23,1,0,1
4,79.0,1,0,174.12,1,0,1


In [35]:
data_new_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                5110 non-null   float64
 1   hypertension       5110 non-null   int64  
 2   heart_disease      5110 non-null   int64  
 3   avg_glucose_level  5110 non-null   float64
 4   stroke             5110 non-null   int64  
 5   ever_married_No    5110 non-null   uint8  
 6   ever_married_Yes   5110 non-null   uint8  
dtypes: float64(2), int64(3), uint8(2)
memory usage: 209.7 KB


In [36]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [37]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [38]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 7)


In [40]:
_lambda = 1

In [41]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [42]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.464
theta:
	[-4.928, 0.072, 0.974, 0.114, 0.006]


In [43]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [44]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('married')
f1_scores_train_data[2] = f1_scores[index]

max f1 score:  0.8116591928251121
for decision boundary:  0.36


In [45]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 78.89 %


In [46]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('married')
f1_scores_test_data[2] = f1_scores[index]

max f1 score:  0.8158844765342961
for decision boundary:  0.36


# Feature: work type

In [47]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
#data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [48]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [49]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 5)


Unnamed: 0,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,1,0


In [50]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 10)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,67.0,0,1,228.69,1,0,0,1,0,0
1,61.0,0,0,202.21,1,0,0,0,1,0
2,80.0,0,1,105.92,1,0,0,1,0,0
3,49.0,0,0,171.23,1,0,0,1,0,0
4,79.0,1,0,174.12,1,0,0,0,1,0


In [51]:
data_new_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      5110 non-null   float64
 1   hypertension             5110 non-null   int64  
 2   heart_disease            5110 non-null   int64  
 3   avg_glucose_level        5110 non-null   float64
 4   stroke                   5110 non-null   int64  
 5   work_type_Govt_job       5110 non-null   uint8  
 6   work_type_Never_worked   5110 non-null   uint8  
 7   work_type_Private        5110 non-null   uint8  
 8   work_type_Self-employed  5110 non-null   uint8  
 9   work_type_children       5110 non-null   uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 224.7 KB


In [52]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [53]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [54]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 10)


In [56]:
_lambda = 1

In [57]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [58]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.463
theta:
	[-4.919, 0.074, 0.990, 0.104, 0.006]


In [59]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [60]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('work')
f1_scores_train_data[3] = f1_scores[index]

max f1 score:  0.811529933481153
for decision boundary:  0.36


In [61]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 78.64 %


In [62]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('work')
f1_scores_test_data[3] = f1_scores[index]

max f1 score:  0.8148148148148148
for decision boundary:  0.33


# Feature: residence type

In [63]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
#data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [64]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [65]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 2)


Unnamed: 0,Residence_type_Rural,Residence_type_Urban
0,0,1
1,1,0
2,1,0
3,0,1
4,1,0


In [66]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 7)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,Residence_type_Rural,Residence_type_Urban
0,67.0,0,1,228.69,1,0,1
1,61.0,0,0,202.21,1,1,0
2,80.0,0,1,105.92,1,1,0
3,49.0,0,0,171.23,1,0,1
4,79.0,1,0,174.12,1,1,0


In [67]:
data_new_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   5110 non-null   float64
 1   hypertension          5110 non-null   int64  
 2   heart_disease         5110 non-null   int64  
 3   avg_glucose_level     5110 non-null   float64
 4   stroke                5110 non-null   int64  
 5   Residence_type_Rural  5110 non-null   uint8  
 6   Residence_type_Urban  5110 non-null   uint8  
dtypes: float64(2), int64(3), uint8(2)
memory usage: 209.7 KB


In [68]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [69]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [70]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 7)


In [72]:
_lambda = 1

In [73]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [74]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.464
theta:
	[-4.919, 0.072, 0.974, 0.125, 0.006]


In [75]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [76]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('residence')
f1_scores_train_data[4] = f1_scores[index]

max f1 score:  0.8151447661469934
for decision boundary:  0.35


In [77]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 79.15 %


In [78]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('residence')
f1_scores_test_data[4] = f1_scores[index]

max f1 score:  0.8186714542190305
for decision boundary:  0.35


# Feature: smoking

In [79]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
#data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [80]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [81]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 4)


Unnamed: 0,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,0,1
4,0,0,1,0


In [82]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 9)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,1,0,1,0,0
1,61.0,0,0,202.21,1,0,0,1,0
2,80.0,0,1,105.92,1,0,0,1,0
3,49.0,0,0,171.23,1,0,0,0,1
4,79.0,1,0,174.12,1,0,0,1,0


In [83]:
data_new_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             5110 non-null   float64
 1   hypertension                    5110 non-null   int64  
 2   heart_disease                   5110 non-null   int64  
 3   avg_glucose_level               5110 non-null   float64
 4   stroke                          5110 non-null   int64  
 5   smoking_status_Unknown          5110 non-null   uint8  
 6   smoking_status_formerly smoked  5110 non-null   uint8  
 7   smoking_status_never smoked     5110 non-null   uint8  
 8   smoking_status_smokes           5110 non-null   uint8  
dtypes: float64(2), int64(3), uint8(4)
memory usage: 219.7 KB


In [84]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [85]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [86]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(398, 9)


In [88]:
_lambda = 1

In [89]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [90]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.457
theta:
	[-5.058, 0.076, 1.078, -0.011, 0.006]


In [91]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [92]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('smoking')
f1_scores_train_data[5] = f1_scores[index]

max f1 score:  0.814814814814815
for decision boundary:  0.45


In [93]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 79.90 %


In [94]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('smoking')
f1_scores_test_data[5] = f1_scores[index]

max f1 score:  0.8205128205128205
for decision boundary:  0.38


# Feature: bmi

In [95]:
#  training data stored in arrays X, y
#df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))
#X = pd.DataFrame(df, columns=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']).to_numpy()
#X = np.around(X, 5)
#print("X: ")
#print(X)

#y = pd.DataFrame(df, columns=['stroke']).to_numpy()
#y = np.around(y, 5)
#y.reshape(5110)
#print("y: ")
#print(y)
 
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
data.drop("Residence_type", axis = 1, inplace = True)
#data.drop("bmi", axis = 1, inplace = True)
data.drop("smoking_status", axis = 1, inplace = True)

#print(data)

In [96]:
data.isna().sum()   #sum of missing values per variable.

age                    0
hypertension           0
heart_disease          0
avg_glucose_level      0
bmi                  201
stroke                 0
dtype: int64

In [97]:
data = data.dropna(axis = 0)

In [98]:
data.isna().sum() 

age                  0
hypertension         0
heart_disease        0
avg_glucose_level    0
bmi                  0
stroke               0
dtype: int64

In [99]:
negative = data[data.stroke==0]
positive = data[data.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    209
0    209
Name: stroke, dtype: int64

In [100]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [101]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

#print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

#print(X.shape)
#print(X)

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)
print(X_train.shape)

(334, 6)


In [103]:
_lambda = 1

In [104]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)
cost, grad = lr.lrCostFunction(initial_theta, X_train, y_train, _lambda)
#print(cost)
#print(grad)

#print(cost.shape)
#print(grad.shape)

In [105]:
theta, cost, grad = lr.lrOptimization(lr.lrCostFunction, initial_theta, X_train, y_train, _lambda)

Cost at theta found by optimize.minimize: 0.492
theta:
	[-4.418, 0.068, 0.393, -0.065, 0.006]


In [106]:
#print("op de training set")
#print(f1_score(y_train,p_train,average='binary'))

In [107]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p_train = lr.predict(theta, X_train, (i/100))
    f1_scores[j] = f1_score(y_train,p_train,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_train_types.append('bmi')
f1_scores_train_data[6] = f1_scores[index]

max f1 score:  0.7846153846153846
for decision boundary:  0.36


In [108]:
p_train = lr.predict(theta, X_train, decision_boundary[index])
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))

Train Accuracy: 74.85 %


In [109]:
j = 0
f1_scores = np.zeros(20)
decision_boundary = np.zeros(20)
for i in range(30, 50):
    p = lr.predict(theta, X, (i/100))
    f1_scores[j] = f1_score(y,p,average='binary')
    decision_boundary[j] = (i/100)
    j += 1
    #print("zeker: ", i/100)
    #print(f1_score(y,p,average='binary'))
    
index = np.argmax(f1_scores)
#print(f1_scores)
print("max f1 score: ", f1_scores[index])
print("for decision boundary: ", decision_boundary[index])
#print(f1_scores([np.argmax(f1_scores)]))

f1_scores_test_types.append('bmi')
f1_scores_test_data[6] = f1_scores[index]

max f1 score:  0.7933194154488517
for decision boundary:  0.36


In [110]:
print("train type: ", f1_scores_train_types)
print("train data: ", f1_scores_train_data)
print("test type: ", f1_scores_test_types)
print("test data: ", f1_scores_test_data)

train type:  ['baseline', 'gender', 'married', 'work', 'residence', 'smoking', 'bmi']
train data:  [0.81165919 0.80962801 0.81165919 0.81152993 0.81514477 0.81481481
 0.78461538]
test type:  ['baseline', 'gender', 'married', 'work', 'residence', 'smoking', 'bmi']
test data:  [0.81588448 0.81061947 0.81588448 0.81481481 0.81867145 0.82051282
 0.79331942]
