In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import seaborn as sns


In [30]:
df = pd.read_csv("C:/Users/DELL/Downloads/healthcare-dataset-stroke-data.csv")
df= df.dropna()
df.isnull().sum()
df.bmi

0       36.6
2       32.5
3       34.4
4       24.0
5       29.0
        ... 
5104    18.6
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 4909, dtype: float64

In [31]:
from sklearn.preprocessing import LabelEncoder
# transforming smoking status
df.smoking_status.unique()
df.head()
le = LabelEncoder()
df['smoking_status'] = le.fit_transform(df['smoking_status'])
df['gender'] = le.fit_transform(df['gender'])
df['ever_married'] = le.fit_transform(df['ever_married'])
df['Residence_type'] = le.fit_transform(df['Residence_type'])
df['work_type'] = le.fit_transform(df['work_type'])
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4909 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4909 non-null   int64  
 1   gender             4909 non-null   int64  
 2   age                4909 non-null   float64
 3   hypertension       4909 non-null   int64  
 4   heart_disease      4909 non-null   int64  
 5   ever_married       4909 non-null   int64  
 6   work_type          4909 non-null   int64  
 7   Residence_type     4909 non-null   int64  
 8   avg_glucose_level  4909 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     4909 non-null   int64  
 11  stroke             4909 non-null   int64  
dtypes: float64(3), int64(9)
memory usage: 498.6 KB


In [32]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,4909.0,37064.313506,20995.098457,77.0,18605.0,37608.0,55220.0,72940.0
gender,4909.0,0.410063,0.492309,0.0,0.0,0.0,1.0,2.0
age,4909.0,42.865374,22.555115,0.08,25.0,44.0,60.0,82.0
hypertension,4909.0,0.091872,0.288875,0.0,0.0,0.0,0.0,1.0
heart_disease,4909.0,0.049501,0.216934,0.0,0.0,0.0,0.0,1.0
ever_married,4909.0,0.652679,0.476167,0.0,0.0,1.0,1.0,1.0
work_type,4909.0,2.170096,1.092593,0.0,2.0,2.0,3.0,4.0
Residence_type,4909.0,0.507232,0.499999,0.0,0.0,1.0,1.0,1.0
avg_glucose_level,4909.0,105.30515,44.424341,55.12,77.07,91.68,113.57,271.74
bmi,4909.0,28.893237,7.854067,10.3,23.5,28.1,33.1,97.6


In [33]:
# Since standard deviations for bmi, glucose level and age are very high, this will cause bias in our model, so we standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(df[['age', 'avg_glucose_level', 'bmi']] )
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,4909.0,37064.31,20995.098457,77.0,18605.0,37608.0,55220.0,72940.0
gender,4909.0,0.4100631,0.492309,0.0,0.0,0.0,1.0,2.0
age,4909.0,1.679017e-16,1.000102,-1.897119,-0.792157,0.05031,0.759755,1.735243
hypertension,4909.0,0.09187207,0.288875,0.0,0.0,0.0,0.0,1.0
heart_disease,4909.0,0.04950092,0.216934,0.0,0.0,0.0,0.0,1.0
ever_married,4909.0,0.6526788,0.476167,0.0,0.0,1.0,1.0,1.0
work_type,4909.0,2.170096,1.092593,0.0,2.0,2.0,3.0,4.0
Residence_type,4909.0,0.5072316,0.499999,0.0,0.0,1.0,1.0,1.0
avg_glucose_level,4909.0,4.6317720000000004e-17,1.000102,-1.129792,-0.635643,-0.306736,0.186062,3.74686
bmi,4909.0,2.894857e-18,1.000102,-2.36758,-0.686751,-0.101007,0.53567,8.748813


In [34]:
# model
x = df.drop(['stroke'], axis='columns')
y = df['stroke']


x_train,x_test, y_train,y_test = train_test_split(x,y, test_size=0.2)

model = SVC(class_weight='balanced')
model.fit(x_train,y_train)
model.score(x_test, y_test)


0.7077393075356415

In [35]:
# We fixed the imbalance so our model score dropped, no we improve the model
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)
for kernel in ['linear', 'poly','rbf','sigmoid']:
    model = SVC(kernel=kernel, class_weight='balanced')
    model.fit(x_train_resampled,y_train_resampled)
    model.score(x_test,y_test)

In [36]:
# our model is good , we now check for overffiting

train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test,y_test)
print(f'Train score:{train_accuracy}')
print(f'Test score:{test_accuracy}')

Train score:0.5057295645530939
Test score:0.5142566191446029


In [37]:
# confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm
'''Diagonal values (TP & TN) → Correct predictions.
Off-diagonal values (FP & FN) → Misclassifications.
c'''

'Diagonal values (TP & TN) → Correct predictions.\nOff-diagonal values (FP & FN) → Misclassifications.\nc'

In [38]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.51      0.67       935
           1       0.05      0.53      0.09        47

    accuracy                           0.51       982
   macro avg       0.50      0.52      0.38       982
weighted avg       0.91      0.51      0.64       982



In [39]:
# for imbalance data
from sklearn.metrics import roc_auc_score
roc = roc_auc_score(y_test,y_pred)
print(roc)


0.5226419387871203


In [40]:
# we check for imbalance
import numpy as np

unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique,counts)))


{np.int64(0): np.int64(3765), np.int64(1): np.int64(162)}


In [41]:
print('hkei')

hkei
