# SKLearn implementations of logistic regression and SVM
## This is done to compare the implementation of our models to the standard libraries
We fully expect that these results will be better than our results, especially for SVM. Our SVM implementation uses a simplified version of the learning algorithm.
The SKlearn package also does some intelligent processing and setting optimalisation underneath.

## Module imports

In [17]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# SKlearn for F1 score calculation
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# we use pandas to import a comma-seperated values dataset
import pandas as pd

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import svm

import logistic_regression as lr

# tells matplotlib to embed plots within the notebook
%matplotlib inline

## Reading the dataset and selecting features

In [18]:
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
#data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
#data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
#data.drop("smoking_status", axis = 1, inplace = True)

#indexAge = data[((data['age'] < 20) & (data['stroke'] == 1)) ].index  #find the indexes of outliers
#data.drop(indexAge , inplace=True) # drop the outliers
#indexGlucose = data[((data['avg_glucose_level'] > 220) & (data['stroke'] == 0)) ].index
#data.drop(indexGlucose , inplace=True)

Removing all the empty datarows, only necessary when using the BMI feature.

In [19]:
data = data.dropna(axis = 0)

## Making dummy classes for the enumertor classes

In [20]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [21]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

(5110, 8)


Unnamed: 0,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,0,1,0,1,0,0
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,0,1,0,0,0,1
4,0,1,1,0,0,0,1,0


In [22]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

(5110, 13)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,1,0,1,0,1,0,1,0,0
1,61.0,0,0,202.21,1,0,1,1,0,0,0,1,0
2,80.0,0,1,105.92,1,0,1,1,0,0,0,1,0
3,49.0,0,0,171.23,1,0,1,0,1,0,0,0,1
4,79.0,1,0,174.12,1,0,1,1,0,0,0,1,0


## Downsample and train test split

In [23]:
negative = data_new_final[data_new_final.stroke==0]
positive = data_new_final[data_new_final.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [24]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [25]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)

(498, 12)


## Implementation of the logistic regression

In [26]:
LRclassifier = LogisticRegression(max_iter=1000, random_state=1, solver='liblinear', penalty='l1')
LRclassifier.fit(X_train, y_train)

y_pred_LR = LRclassifier.predict(X_test)
p = LRclassifier.predict_proba(X_test)

m,n = p.shape

p_stroke = np.zeros(m)

p_stroke[:] = [i[1] for i in p]

#print(p_stroke)
#print(p)

In [27]:
def prediction_to_classifier(pred, percentile):
    pred [pred >= percentile] = 1
    pred [pred < percentile]  = 0
    return pred

In [28]:
p_test_data = []

for i in range(30, 50):
    #print("decision: ", i/100)
    x = prediction_to_classifier(p_stroke, i/100)
    f1 = f1_score(y_test,x,average='binary')
    p_test_data.append(f1)
    #print(f1)
    
#print(p_test_data)
index = np.argmax(p_test_data)
print(p_test_data[index])

0.8288288288288288


In [29]:
p_stroke = prediction_to_classifier(p_stroke, 0.35)
print(p_stroke)
f1_score(y_test,p_stroke,average='binary')

[1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1.
 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1.
 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1.
 0. 1. 0. 1.]


0.8288288288288288

In [30]:
#LRAcc = accuracy_score(y_pred_LR, y_test)
#print('.:. Logistic Regression Accuracy:'+'\033[1m {:.2f}%'.format(LRAcc*100)+' .:.')

## Implementation of the SVM

In [32]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)
p_test = model.predict(X_test)
print('Test F1-score: {:.2f} %' .format(f1_score(y_test,p_test,average='binary') * 100))

Test F1-score: 82.69 %


.:. Support Vector Machine Accuracy:[1m 62.31% .:.


.:. Support Vector Machine Accuracy:[1m 59.00% .:.
