In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('Datasets/healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Data Preparation

In [9]:
df.columns = df.columns.str.lower().str.replace(' ', '-')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [15]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [19]:
for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(' ', '-')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,male,67.0,0,1,yes,private,urban,228.69,36.6,formerly-smoked,1
1,51676,female,61.0,0,0,yes,self-employed,rural,202.21,,never-smoked,1
2,31112,male,80.0,0,1,yes,private,rural,105.92,32.5,never-smoked,1
3,60182,female,49.0,0,0,yes,private,urban,171.23,34.4,smokes,1
4,1665,female,79.0,1,0,yes,self-employed,rural,174.12,24.0,never-smoked,1


In [20]:
target_columns = ['stroke']
numerical_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [26]:
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,male,67.0,0,1,yes,private,urban,228.69,36.6,formerly-smoked,1
1,female,61.0,0,0,yes,self-employed,rural,202.21,,never-smoked,1
2,male,80.0,0,1,yes,private,rural,105.92,32.5,never-smoked,1
3,female,49.0,0,0,yes,private,urban,171.23,34.4,smokes,1
4,female,79.0,1,0,yes,self-employed,rural,174.12,24.0,never-smoked,1


# Validation Framework

In [21]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 1)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [22]:
len(df_train), len(df_val), len(df_test)

(3066, 1022, 1022)

# Filling Missing values

In [28]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [30]:
np.mean(df[df.gender =='male'].bmi), np.mean(df[df.gender =='female'].bmi), np.mean(df.bmi)

(28.647936350074616, 29.065757680359038, 28.893236911794673)

In [58]:
np.median(df[(df.gender =='male') & (~df.bmi.isnull())].bmi), np.median(df[(df.gender =='female') & (~df.bmi.isnull())].bmi), np.median(df[(~df.bmi.isnull())].bmi)

(28.4, 27.8, 28.1)

In [60]:
df.bmi = df.bmi.fillna(np.median(df[(~df.bmi.isnull())].bmi))

In [61]:
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [88]:
df.stroke.value_counts(normalize=True)

0    0.951272
1    0.048728
Name: stroke, dtype: float64

# One Hot Encoding

In [83]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

In [84]:
X_dicts = df[categorical_columns+numerical_columns].to_dict(orient='records')
X = dv.fit_transform(X_dicts)
y = df[target_columns]

# Handling imbalance

In [63]:
df.stroke.value_counts(normalize=True)

0    0.951272
1    0.048728
Name: stroke, dtype: float64

Using over sampling so as not to lose data 

In [65]:
!pip install imblearn

Collecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
     -------------------------------------- 199.3/199.3 kB 6.1 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.9.1 imblearn-0.0


In [79]:
from imblearn.combine import SMOTETomek

In [80]:
smk = SMOTETomek(random_state=42)

In [81]:
X = df[categorical_columns+numerical_columns]
y = df[target_columns]

In [86]:
X_res,y_res=smk.fit_resample(X,y)

In [105]:
df_oversampled = pd.DataFrame(data=np.column_stack((X_res, y_res)), columns = dv.get_feature_names() + target_columns) 



In [106]:
df_oversampled.head()

Unnamed: 0,age,avg_glucose_level,bmi,ever_married=no,ever_married=yes,gender=female,gender=male,gender=other,heart_disease,hypertension,...,smoking_status=formerly-smoked,smoking_status=never-smoked,smoking_status=smokes,smoking_status=unknown,work_type=children,work_type=govt_job,work_type=never_worked,work_type=private,work_type=self-employed,stroke
0,67.0,228.69,36.6,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,61.0,202.21,28.1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,80.0,105.92,32.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,49.0,171.23,34.4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,79.0,174.12,24.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [91]:
type(X_res)

numpy.ndarray

# Validation Framework

In [111]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df_oversampled, test_size = 0.2, random_state = 1)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [112]:
len(df_train), len(df_val), len(df_test)

(5784, 1929, 1929)

# EDA

## Feature Importance

In [115]:
columns = dv.get_feature_names()



In [121]:
df_full_train[columns].corrwith(df_full_train.stroke).sort_values(ascending=False)

age                               0.600369
ever_married=yes                  0.301572
hypertension                      0.251677
avg_glucose_level                 0.250547
heart_disease                     0.249366
smoking_status=formerly-smoked    0.154712
work_type=self-employed           0.134947
bmi                               0.089689
residence_type=urban              0.049142
gender=male                       0.048978
work_type=private                 0.043963
smoking_status=smokes             0.012054
work_type=govt_job                0.001800
gender=other                     -0.011466
smoking_status=never-smoked      -0.017661
work_type=never_worked           -0.047323
gender=female                    -0.048691
residence_type=rural             -0.049142
smoking_status=unknown           -0.133631
work_type=children               -0.264982
ever_married=no                  -0.301572
dtype: float64

##### If I have to remove some columns then we can remove gender it doesn't seem to be contributing much. 
##### Even residence_type can be removed and also smoking_status 
##### Columns to not consider
    - residence_type
    - gender

In [124]:
list(df_full_train[columns].corrwith(df_full_train.stroke).index)

['age',
 'avg_glucose_level',
 'bmi',
 'ever_married=no',
 'ever_married=yes',
 'gender=female',
 'gender=male',
 'gender=other',
 'heart_disease',
 'hypertension',
 'residence_type=rural',
 'residence_type=urban',
 'smoking_status=formerly-smoked',
 'smoking_status=never-smoked',
 'smoking_status=smokes',
 'smoking_status=unknown',
 'work_type=children',
 'work_type=govt_job',
 'work_type=never_worked',
 'work_type=private',
 'work_type=self-employed']

In [125]:
columns_to_consider = ['age',
 'avg_glucose_level',
 'bmi',
 'ever_married=no',
 'ever_married=yes',
 'gender=other',
 'heart_disease',
 'hypertension',
 'smoking_status=formerly-smoked',
 'smoking_status=never-smoked',
 'smoking_status=smokes',
 'smoking_status=unknown',
 'work_type=children',
 'work_type=govt_job',
 'work_type=never_worked',
 'work_type=private',
 'work_type=self-employed']

# Model Training

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score

In [128]:
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

In [148]:
def train(X, y, C=1.0):    
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X, y)
    
    return model

In [149]:
def predict(X, model):
    y_pred = model.predict_proba(X)[:, 1]
    return y_pred

In [153]:
c_scores = []

for C in tqdm([0.01, 0.1, 1, 10]):
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    auc_scores = []
    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]
        
        X_train = df_train[columns_to_consider].values
        y_train = df_train[target_columns].values
        
        X_val = df_val[columns_to_consider]
        y_val = df_val[target_columns].values

        model = train(X_train, y_train, C)
        y_pred = predict(X_val,  model)


        auc = roc_auc_score(y_val, y_pred)
        auc_scores.append(auc)
    c_scores.append((C, round(np.mean(auc_scores), 3), round(np.std(auc_scores), 3)))
    
c_scores_columns = ['C', 'AUC_mean', 'AUC_std']
df_c_scores = pd.DataFrame(c_scores, columns = c_scores_columns)
df_c_scores

  0%|          | 0/4 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


Unnamed: 0,C,AUC_mean,AUC_std
0,0.01,0.844,0.004
1,0.1,0.857,0.004
2,1.0,0.857,0.004
3,10.0,0.857,0.004
