In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

****Objective****

Predict whether a patient will have a stroke, based on the given attributes. The evaluation metric is the AUC-ROC score.

****Understanding Data****
Here are the definitions of the columns in the data:

id-Patient ID

gender-Gender of Patient

age-Age of Patient

hypertension-0 - no hypertension, 1 - suffering from hypertension

heart_disease-0 - no heart disease, 1 - suffering from heart disease

ever_married-Yes/No

work_type-Type of occupation

Residence_type-Area type of residence (Urban/ Rural)

avg_glucose_level-Average Glucose level (measured after meal)

bmi-Body mass index

smoking_status-patient’s smoking status

stroke-0 - no stroke, 1 - suffered stroke

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Read the stroke dataset from the Kaggle input directory and preview its first rows.
STROKE_CSV = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(STROKE_CSV)
df.head()

***DATA PRE-PROCESSING***

In [None]:
# Number of rows per gender category (displayed as the cell output).
gender_counts = df['gender'].value_counts()
gender_counts

In [None]:
# Keep only the rows whose gender is not 'Other'.
df = df.loc[df['gender'] != 'Other']

In [None]:
# Number of patients per gender.
ax = sns.countplot(x='gender', data=df, palette="Set3")
# Axis labels have to be set on the Axes object; bare `xlabel = ...` / `ylabel = ...`
# assignments only create unused variables and never reach the figure.
ax.set(xlabel='gender', ylabel='count');

In [None]:
# Age distribution for each smoking-status category.
ax = sns.boxplot(x='smoking_status', y='age', data=df, palette="Set3")
# Axis labels have to be set on the Axes object; bare `xlabel = ...` / `ylabel = ...`
# assignments only create unused variables and never reach the figure.
ax.set(xlabel='smoking_status', ylabel='age');

In [None]:
# BMI distribution (horizontal boxes) for each work type.
ax = sns.boxplot(x='bmi', y='work_type', data=df, palette="Set3")
# Axis labels have to be set on the Axes object; bare `xlabel = ...` / `ylabel = ...`
# assignments only create unused variables and never reach the figure.
ax.set(xlabel='bmi', ylabel='work_type');

In [None]:
# Patient counts per gender, coloured by work type, with one panel per stroke outcome.
# The call is written without pasted "..." REPL continuation prompts so that the cell is
# valid Python everywhere (e.g. when exported to a .py script), not only under IPython's
# prompt stripping.
g = sns.catplot(
    x="gender", hue="work_type", col="stroke",
    data=df, kind="count",
    height=4, aspect=.7, palette="Set2",
);

In [None]:
# Patient counts per gender, coloured by marital status, with one panel per stroke outcome.
# The call is written without pasted "..." REPL continuation prompts so that the cell is
# valid Python everywhere (e.g. when exported to a .py script), not only under IPython's
# prompt stripping.
g = sns.catplot(
    x="gender", hue="ever_married", col="stroke",
    data=df, kind="count",
    height=4, aspect=.7, palette="Set2",
);


In [None]:
# Patient counts per gender, coloured by heart-disease flag, with one panel per stroke outcome.
# The call is written without pasted "..." REPL continuation prompts so that the cell is
# valid Python everywhere (e.g. when exported to a .py script), not only under IPython's
# prompt stripping.
g = sns.catplot(
    x="gender", hue="heart_disease", col="stroke",
    data=df, kind="count",
    height=4, aspect=.7, palette="husl",
)


In [None]:
# Patient counts per gender, coloured by residence type, with one panel per stroke outcome.
# The call is written without pasted "..." REPL continuation prompts so that the cell is
# valid Python everywhere (e.g. when exported to a .py script), not only under IPython's
# prompt stripping.
g = sns.catplot(
    x="gender", hue="Residence_type", col="stroke",
    data=df, kind="count",
    height=4, aspect=.7, palette="Set3",
)

In [None]:
# Number of patients per smoking status, split by gender.
ax = sns.countplot(x='smoking_status', data=df, hue='gender', palette="Set3")
# Axis labels have to be set on the Axes object; bare `xlabel = ...` / `ylabel = ...`
# assignments only create unused variables and never reach the figure.
ax.set(xlabel='Smoking status', ylabel='count');

In [None]:
# Patient counts per gender, coloured by smoking status, with one panel per stroke outcome.
# The call is written without pasted "..." REPL continuation prompts so that the cell is
# valid Python everywhere (e.g. when exported to a .py script), not only under IPython's
# prompt stripping.
g = sns.catplot(
    x="gender", hue="smoking_status", col="stroke",
    data=df, kind="count",
    height=4, aspect=.7, palette="Set3",
)

In [None]:
# Compare the avg_glucose_level distribution of patients without a stroke (green)
# and with a stroke (red).
plt.figure(figsize=(12,10))
# sns.distplot is deprecated; histplot with stat='density' and a KDE overlay is its
# documented replacement. cut=3 and alpha=.4 mirror distplot's former defaults.
hist_style = dict(kde=True, stat='density', kde_kws=dict(cut=3), alpha=.4, edgecolor=(1, 1, 1, .4))
sns.histplot(df.loc[df['stroke'] == 0, 'avg_glucose_level'], color='green', label='No stroke', **hist_style)
sns.histplot(df.loc[df['stroke'] == 1, 'avg_glucose_level'], color='red', label='Stroke', **hist_style)
plt.title('No Stroke vs Stroke by Avg Glucose Level',fontsize=15)
plt.xlim([30,330])
# Without a legend the two colours cannot be told apart by a reader.
plt.legend();

In [None]:
# Names of the columns stored with the generic `object` dtype.
object_cols = df.select_dtypes(include=['object']).columns
object_cols

In [None]:
# One-hot encode the categorical columns (dropping the first level of each), append the
# indicator columns to the frame, then remove the original categorical columns and `id`.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
df = (
    pd.concat([df, dummies], axis=1)
    .drop(columns=categorical_cols)
    .drop(columns=['id'])
)

****CHECKING FOR NULL VALUES AND USING KNN****

In [None]:
# Missing-value count per column before imputation.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
from sklearn.impute import KNNImputer

# Fill missing values with the mean of the 5 nearest neighbours.
imputer = KNNImputer(n_neighbors=5)

# Impute from the predictor columns only. Feeding the target `stroke` to the imputer
# would let the label influence the imputed feature values (target leakage).
feature_cols = df.columns.drop('stroke')
imputed = pd.DataFrame(imputer.fit_transform(df[feature_cols]), columns=feature_cols)

# Re-attach the untouched target positionally: `imputed` has a fresh 0..n-1 index, so
# to_numpy() avoids aligning on df's index. Cast to float so the frame keeps the
# all-float dtype that fit_transform on the whole frame would have produced.
imputed['stroke'] = df['stroke'].to_numpy(dtype=float)

# Restore the original column order.
dataset = imputed[df.columns]

In [None]:
# Missing-value count per column after imputation.
remaining_missing = dataset.isna().sum()
remaining_missing

In [None]:
# Annotated correlation heatmap of the imputed dataset.
plt.figure(figsize=(12,8))
ax = sns.heatmap(dataset.corr(), annot=True)
# Axis labels have to be set on the Axes object; bare `xlabel = ''` / `ylabel = ''`
# assignments only create unused variables and never reach the figure.
ax.set(xlabel='', ylabel='');

In [None]:
# Correlation matrix of df.
corr_matrix = df.corr()

# Boolean mask that is True on and above the diagonal, so each pair is drawn once
# in the lower triangle.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Figure and axes the heatmap is drawn on.
fig, heat_ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap.
diverging_cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and square cells.
sns.heatmap(
    corr_matrix,
    mask=upper_mask,
    cmap=diverging_cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={'shrink': .5},
    ax=heat_ax,
)

****OVERSAMPLING MY DATASET BECAUSE THE CLASSES ARE IMBALANCED****

In [None]:
# Separate the imputed frame into the feature matrix X and the target vector y (NumPy arrays).
feature_frame = dataset.drop(columns='stroke')
X = feature_frame.to_numpy()
y = dataset['stroke'].to_numpy()

In [None]:
from imblearn import under_sampling, over_sampling
# SMOTEENN combines SMOTE over-sampling with Edited Nearest Neighbours cleaning.
from imblearn.combine import SMOTEENN

# Class balance before resampling.
positives_before = sum(y == 1)
negatives_before = sum(y == 0)
print("Before OverSampling, counts of label '1': {}".format(positives_before))
print("Before OverSampling, counts of label '0': {} \n".format(negatives_before))

# NOTE(review): this resamples the full dataset; anything evaluated on data derived from
# X_over / y_over has seen resampled (partly synthetic) points - confirm this is intended.
sm = SMOTEENN(random_state=10)
X_over, y_over = sm.fit_resample(X, y)

# Shape and class balance after resampling.
print('After OverSampling, the shape of train_X: {}'.format(X_over.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_over.shape))

positives_after = sum(y_over == 1)
negatives_after = sum(y_over == 0)
print("After OverSampling, counts of label '1': {}".format(positives_after))
print("After OverSampling, counts of label '0': {}".format(negatives_after))

****BEFORE STARTING THE TRAINING I WILL NORMALIZE MY DATA****

Applying Model
Dividing the data into Train and validate (80:20) 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Hold out 20% of the ORIGINAL (un-resampled) data first. Splitting after resampling
# puts synthetic points derived from test rows into the training set and evaluates on a
# resampled test set, which inflates every score. `stratify=y` keeps the rare stroke
# class represented in both parts.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.20, random_state=7, stratify=y
)

# Rebalance the training portion only; the test set keeps the real class distribution.
X_train, y_train = SMOTEENN(random_state=10).fit_resample(X_train_raw, y_train_raw)

In [None]:
from sklearn import preprocessing

# preprocessing.normalize rescales every ROW (sample) to unit norm, so the values of
# unrelated features are divided by each other. Feature scaling is done per COLUMN:
# fit the scaler on the training data only and apply the same transform to the test
# data so no test-set statistics leak into training.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

****ALGORITHMS****

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# Seed for the cross-validation test harness. It is passed to the estimators that use
# randomness so that repeated runs of the notebook give the same scores.
seed = 7

# Candidate models as (short name, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('RF', RandomForestClassifier(random_state=seed)),
]

In [None]:
# 10-fold cross-validated accuracy of every candidate model on the training data.
scoring = 'accuracy'
# The unshuffled splitter holds no state, so one instance serves every model.
kfold = model_selection.KFold(n_splits=10)

results, names = [], []
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train_norm, y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    # Mean and standard deviation of the fold scores.
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Box plot of the per-fold scores, one box per model.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

****Decision Tree****

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training data and predict the held-out test set.
# random_state is fixed (the `seed` defined with the model list) because tree building
# is randomised; without it the reported metrics change from run to run.
dtree = DecisionTreeClassifier(random_state=seed)
dtree.fit(X_train_norm,y_train)
predictions = dtree.predict(X_test_norm)

In [None]:
# Per-class precision / recall / F1, followed by the confusion matrix, for the test set.
report_text = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(report_text, matrix, sep='\n')

In [None]:
# plot_roc_curve and plot_precision_recall_curve are not imported: they were removed
# from scikit-learn, and importing them makes this whole cell fail with ImportError.
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,auc,precision_recall_curve,average_precision_score
from sklearn.model_selection import cross_val_score



# ROC curve
# Rank the test samples by the predicted probability of the positive class (column 1 of
# predict_proba). Feeding hard 0/1 predictions to roc_curve collapses the curve to a
# single operating point and misreports the AUC.
stroke_scores = dtree.predict_proba(X_test_norm)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, stroke_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8, 8))
plt.plot(false_positive_rate,true_positive_rate, color = '#b01717', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle = '--', color = '#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

****Prediction and Result****

After that, I used the models to predict the values for the test set provided for the competition and submitted the result. The submission achieved an AUC-ROC of 94%.


****Conclusion****

Overall, we used a decision tree to predict whether a patient will have a stroke or not. We had to deal with imbalanced data, which is common in such healthcare problems. To improve the model, we could try other ways of dealing with imbalanced data, such as SMOTE.

We could also have dealt with the missing smoking-status data in other ways; for example, patients younger than 10 or 15 years could have been tagged as never_smoked.

Finally, one thought on why the two models were so different: one reason could be the age distribution of the two datasets. The median age of the Smoke dataset was 48, while that of the Non-smoke dataset was 21. These are some ways the logistic model could have been improved.

Please share your thoughts on how else this model could have been improved, or on other ML techniques we could use for such datasets.