# Import package

In [17]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [18]:
df = pd.read_csv("./Dataset/processed_train.csv")

In [19]:
df

Unnamed: 0,hypertension,heart_disease,ever_married,stroke,gender_Female,gender_Male,gender_Other,blood_A,blood_AB,blood_B,...,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,avg_glucose_level,bmi,age
0,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,-0.454391,-1.632301,0.000364
1,0,0,1,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,-0.377299,0.528038,0.535819
2,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,1,0,0,-0.162540,-1.649074,-0.047722
3,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,-0.325215,0.450906,-0.227273
4,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,1,0,0,-0.185025,-0.357567,-0.855701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,-0.909377,0.528038,0.715369
3905,0,0,1,0,1,0,0,0,0,1,...,1,0,0,0,1,0,0,-0.833202,0.528038,0.401155
3906,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,1,0,0,-0.140513,-1.062025,-1.214803
3907,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,-0.695536,-0.558841,0.176717


In [20]:
x = df.copy()
x = x.drop(columns='stroke')
y = df['stroke']

In [21]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x, y)

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size= 0.2, random_state= 42)

In [23]:
print(len(x_train))
print(len(y_train))
print(len(x_test))
print(len(y_test))

6052
6052
1514
1514


## Training Model

In [24]:
df.corr()['stroke'].sort_values(ascending=False)

stroke                            1.000000
blood_O                           0.206690
age                               0.192354
Residence_type_Urban              0.116430
avg_glucose_level                 0.102858
heart_disease                     0.095742
ever_married                      0.087278
Residence_type_Rural              0.070189
hypertension                      0.064028
smoking_status_formerly smoked    0.051671
bmi                               0.037239
work_type_Self-employed           0.024138
work_type_Private                 0.022925
work_type_Govt_job                0.017460
gender_Male                       0.012944
smoking_status_smokes             0.006403
gender_Other                      0.001244
smoking_status_never smoked      -0.003192
work_type_Never_worked           -0.012062
gender_Female                    -0.013280
blood_A                          -0.037011
smoking_status_Unknown           -0.043779
blood_B                          -0.047782
work_type_c

## Train Svm without Hyper parameter tuning

In [46]:
Svm = SVC()
Svm.fit(x_train,y_train)

y_Svm_pred = Svm.predict(x_test)
y_Svm_pred

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [47]:
cm = confusion_matrix(y_test, y_Svm_pred)
cm

array([[755,  16],
       [ 10, 733]], dtype=int64)

In [48]:
tn = cm[0,0]
fp = cm[0,1]
tp = cm[1,1]
fn = cm[1,0]
accuracy  = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall    = tp / (tp + fn)
f1score  = 2 * precision * recall / (precision + recall)
print(f1score)
print(accuracy)

0.982573726541555
0.9828269484808454


## Hyper parameter tuning with GridSearchCV

In [49]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform
param_distributions = {"C": uniform(0.001, 1)}
rnd_search_cv = RandomizedSearchCV(Svm, param_distributions, n_iter=50, verbose=2, cv=3)


In [50]:
rnd_search_cv.fit(x_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END ..............................C=0.14456586068757293; total time=   0.6s
[CV] END ..............................C=0.14456586068757293; total time=   0.6s
[CV] END ..............................C=0.14456586068757293; total time=   0.6s
[CV] END ...............................C=0.5576157385554026; total time=   0.4s
[CV] END ...............................C=0.5576157385554026; total time=   0.3s
[CV] END ...............................C=0.5576157385554026; total time=   0.3s
[CV] END ...............................C=0.7013218059111903; total time=   0.3s
[CV] END ...............................C=0.7013218059111903; total time=   0.3s
[CV] END ...............................C=0.7013218059111903; total time=   0.3s
[CV] END ..............................C=0.22552370781927955; total time=   0.5s
[CV] END ..............................C=0.22552370781927955; total time=   0.5s
[CV] END ..............................C=0.2255

RandomizedSearchCV(cv=3, estimator=SVC(), n_iter=50,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002251A7E4BB0>},
                   verbose=2)

In [51]:
print(rnd_search_cv.best_params_)
print(rnd_search_cv.best_estimator_)

{'C': 0.9143286769475959}
SVC(C=0.9143286769475959)


In [52]:
rnd_search_cv

RandomizedSearchCV(cv=3, estimator=SVC(), n_iter=50,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002251A7E4BB0>},
                   verbose=2)

In [53]:
y_Svm_pred = rnd_search_cv.best_estimator_.predict(x_test)
y_Svm_pred

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [54]:
cm = confusion_matrix(y_test, y_Svm_pred)
cm

array([[754,  17],
       [ 10, 733]], dtype=int64)

In [55]:
tn = cm[0,0]
fp = cm[0,1]
tp = cm[1,1]
fn = cm[1,0]
accuracy  = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall    = tp / (tp + fn)
f1score  = 2 * precision * recall / (precision + recall)
print(f1score)
print(accuracy)

0.9819156061620898
0.9821664464993395


## Lower value
# Let's use defaul C and gamma value