In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# SKlearn for F1 score calculation
from sklearn.metrics import f1_score

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# we use pandas to import a comma-seperated values dataset
import pandas as pd

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import svm

import logistic_regression as lr

# tells matplotlib to embed plots within the notebook
%matplotlib inline

In [2]:
data = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

data.head(10)

data.drop("id", axis = 1, inplace = True)
data.drop("gender", axis = 1, inplace = True)
#data.drop("ever_married", axis = 1, inplace = True)
data.drop("work_type", axis = 1, inplace = True)
#data.drop("Residence_type", axis = 1, inplace = True)
data.drop("bmi", axis = 1, inplace = True)
#data.drop("smoking_status", axis = 1, inplace = True)

#indexAge = data[((data['age'] < 20) & (data['stroke'] == 1)) ].index  #find the indexes of outliers
#data.drop(indexAge , inplace=True) # drop the outliers
#indexGlucose = data[((data['avg_glucose_level'] > 220) & (data['stroke'] == 0)) ].index
#data.drop(indexGlucose , inplace=True)

In [None]:
data = data.dropna(axis = 0)

## Making dummy classes for the enumertor classes

In [None]:
num_cols = data.select_dtypes(include = [np.number]).columns.tolist()
obj_cols = data.select_dtypes(exclude = [np.number]).columns.tolist()

In [None]:
# Numerical columns data
data_new_num = data[num_cols]

# Categorical columns data
data_new_cat = data[obj_cols]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

In [None]:
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

In [10]:
negative = data[data.stroke==0]
positive = data[data.stroke==1]

#print(data.stroke.value_counts())

#print(negative.stroke.value_counts())
#print(positive.stroke.value_counts())

# downsample majority
neg_downsampled = resample(negative,
 replace=True, # sample with replacement
 n_samples=len(positive), # match number in minority class
 random_state=27) # reproducible results
# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])
# check new class counts
downsampled.stroke.value_counts()

1    247
0    247
Name: stroke, dtype: int64

In [11]:
X = downsampled.drop(['stroke'], axis=1).values
y = downsampled['stroke'].values

In [12]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape

print(X.shape)

# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0,shuffle=True, stratify=y)

(494, 4)


In [13]:
def gaussianKernel(x1, x2, sigma):
    """
    Computes the radial basis function
    Returns a radial basis function kernel between x1 and x2.
    
    Parameters
    ----------
    x1 :  numpy ndarray
        A vector of size (n, ), representing the first datapoint.
    
    x2 : numpy ndarray
        A vector of size (n, ), representing the second datapoint.
    
    sigma : float
        The bandwidth parameter for the Gaussian kernel.

    Returns
    -------
    sim : float
        The computed RBF between the two provided data points.
    
    Instructions
    ------------
    Fill in this function to return the similarity between `x1` and `x2`
    computed using a Gaussian kernel with bandwidth `sigma`.
    """
    sim = 0
    # ====================== YOUR CODE HERE ======================
    sim = np.exp(-np.power(np.linalg.norm(x1 - x2),2)/(2 * np.power(sigma,2)))


    # =============================================================
    return sim

In [103]:
# SVM Parameters
C = 0.8
sigma = 2.4
model = svm.svmTrain(X_train, y_train, C, gaussianKernel, args=(sigma,))
#results = np.zeros(shape=(22, 22))
#for x in range(21):
#    results[0,x+1] = C
#     C += 0.01
# for x in range(21):
#     results[x+1,0] = sigma
#     sigma += 0.01
# x = 1
# y = 1
# for C in np.arange(0.7,0.91,0.01):
#     for sigma in np.arange(2.3,2.51,0.01):
#         #print(f'for C: {C} and sigma: {sigma}')
#         model = svm.svmTrain(X_train, y_train, C, gaussianKernel, args=(sigma,))
#         p_test = svm.svmPredict(model, X_test)
#         results[x,y] = f1_score(y_test,p_test,average='binary') * 100
#         #print('Test Accuracy: {:.2f} %'.format(np.mean(p_test == y_test) * 100))
#         #print('Test F1-score: {:.2f} %' .format(f1_score(y_test,p_test,average='binary') * 100))
#         x += 1
#     x = 1
#     y += 1
# print(results)

In [104]:
p_train = svm.svmPredict(model, X_train)
print('Train Accuracy: {:.2f} %'.format(np.mean(p_train == y_train) * 100))
print('Train F1-score: {:.2f} %' .format(f1_score(y_train,p_train,average='binary') * 100))

Train Accuracy: 93.42 %
Train F1-score: 93.56 %


In [105]:
p_test = svm.svmPredict(model, X_test)
print('Test Accuracy: {:.2f} %'.format(np.mean(p_test == y_test) * 100))
print('Test F1-score: {:.2f} %' .format(f1_score(y_test,p_test,average='binary') * 100))

Test Accuracy: 81.82 %
Test F1-score: 80.85 %
