This is my very first project in machine learning using Python. 

The purpose was to find the best model to predict stroke.

I would like to thank the mentor who helped me with the oversampling method, as well as the other authors whose work inspired some of the code 🙏

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score

In [None]:
# Load data

df = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

## Exploratory analysis and data preparation

In [None]:
# First look at the dataset

df.head()

In [None]:
# Total number of rows and columns

df.shape

In [None]:
df.info()

The dataset consists of 10 metrics for a total of 5110 patients. We have demographic data (gender, age, marital status, type of work and residence) as well as health data including hypertension, heart disease, average glucose level, body mass index (BMI), smoking status and whether the patient has experienced a stroke.

In [None]:
# Data spread between people having experienced of a stroke or not

df['stroke'].value_counts()

In [None]:
# Bar chart of the target classes, each bar labelled with its share of all rows

ncount = len(df['stroke'])
ax = sns.countplot(x=df['stroke'])
for bar in ax.patches:
    # corners is indexed as [[x0, y0], [x1, y1]]; the top edge of the bar is y1
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    bar_centre = corners[:, 0].mean()
    share = f'{100. * bar_top / ncount:.1f}%'
    # centre the label horizontally and rest it on top of the bar
    ax.annotate(share, (bar_centre, bar_top), ha='center', va='bottom')
plt.savefig('stroke_count.png')

In [None]:
# Share of patients in this dataset who had a stroke, in percent

stroke_rows = df[df['stroke'] == 1]
len(stroke_rows) / len(df) * 100

The dataset is very imbalanced => important to keep it in mind when cleaning and training, as well as when choosing the metrics.

### Handling missing values

In [None]:
df.isna().sum()

There are 201 null values in the 'bmi' column.

In [None]:
# Dropping the missing values

df.dropna(inplace = True)

In [None]:
# Checking 

df.isna().sum()

In [None]:
df.info()

After removing the null values, we are left with 4909 entries. 

### Drop the id column
The ID column was useful to identify the patients but it will not have any impact on the models, so we can drop it.

In [None]:
df.drop(columns=['id'], inplace=True)

In [None]:
# Checking

df.head()

### Exploring each variable

#### Gender

In [None]:
#Data spread between male and female

df['gender'].value_counts()

There is 1 row with "Other", we can drop it. 

In [None]:
#Dropping 'Other' by keeping only the rows where gender != 'Other'.
#.copy() makes df an independent frame, so the column assignments made
#further down do not raise pandas' SettingWithCopyWarning.

df = df.loc[df["gender"] != 'Other'].copy()

In [None]:
#Visualise stroke counts gender wise

sns.countplot(x=df["stroke"], hue=df["gender"])
plt.savefig('stroke_gender.png')

#### Age

In [None]:
#Visualise the distribution (kernel density estimate) of age for each stroke class

fig = sns.FacetGrid(data=df, hue="stroke", aspect=4)
# `fill` is the supported replacement for kdeplot's deprecated `shade` argument
fig.map(sns.kdeplot, "age", fill=True)
fig.add_legend()
plt.savefig('stroke_age.png')

It's not surprising to see that the risk of stroke is higher as patients get older.

#### Hypertension

In [None]:
#Count hypertension

df['hypertension'].value_counts()

In [None]:
#Percentage of stroke / no-stroke patients inside each hypertension group

df_hypertension = df.groupby(['hypertension', 'stroke'])['hypertension'].count()
df_hypertension_total = df.groupby(['hypertension'])['hypertension'].count()
# per-(group, class) counts over per-group totals, then one column per stroke class
df_hypertension_fig = (df_hypertension / df_hypertension_total * 100).unstack()
df_hypertension_fig.plot(kind='bar', stacked=True, figsize=(6, 6), width=0.5)
plt.savefig('stroke_hypertension.png')

In proportion, there are more people experiencing stroke in the group with hypertension.

#### Heart disease

In [None]:
#Count heart disease

df['heart_disease'].value_counts()

In [None]:
#Percentage of stroke / no-stroke patients inside each heart-disease group

df_heart = df.groupby(['heart_disease', 'stroke'])['heart_disease'].count()
df_heart_total = df.groupby(['heart_disease'])['heart_disease'].count()
# per-(group, class) counts over per-group totals, then one column per stroke class
df_heart_fig = (df_heart / df_heart_total * 100).unstack()
df_heart_fig.plot(kind='bar', stacked=True, figsize=(6, 6), width=0.5)
plt.savefig('stroke_heart.png')

Same observation as for the group with hypertension: a larger proportion of people with heart disease experience a stroke.

#### Marital status

In [None]:
#Count ever_married

df['ever_married'].value_counts()

In [None]:
#Stacked bars: percentage of stroke / no-stroke patients per marital status

df_married = df.groupby(['ever_married', 'stroke'])['ever_married'].count()
df_married_total = df.groupby(['ever_married'])['ever_married'].count()
# per-(group, class) counts over per-group totals, then one column per stroke class
df_married_fig = (df_married / df_married_total * 100).unstack()
df_married_fig.plot(kind='bar', stacked=True, figsize=(6, 6), width=0.5)
plt.savefig('stroke_married.png')

The larger proportion of people experiencing stroke for this population can be correlated with what we have seen for age.

#### Work type

In [None]:
#Count work_type

df['work_type'].value_counts()

In [None]:
#Stacked bars: percentage of stroke / no-stroke patients per work type

df_work = df.groupby(['work_type', 'stroke'])['work_type'].count()
df_work_total = df.groupby(['work_type'])['work_type'].count()
# per-(group, class) counts over per-group totals, then one column per stroke class
df_work_fig = (df_work / df_work_total * 100).unstack()
df_work_fig.plot(kind='bar', stacked=True, figsize=(7, 7), width=0.75)
plt.savefig('stroke_work.png')

#### Residence type

In [None]:
#Count residence_type

df['Residence_type'].value_counts()

In [None]:
#Stacked bars: percentage of stroke / no-stroke patients per residence type

df_residence = df.groupby(['Residence_type', 'stroke'])['Residence_type'].count()
df_residence_total = df.groupby(['Residence_type'])['Residence_type'].count()
# per-(group, class) counts over per-group totals, then one column per stroke class
df_residence_fig = (df_residence / df_residence_total * 100).unstack()
df_residence_fig.plot(kind='bar', stacked=True, figsize=(6, 6), width=0.5)
plt.savefig('stroke_residence.png')

Environmental factors can be a risk factor for stroke but there is no difference in this dataset. 

#### Glucose level

In [None]:
#Distribution (kernel density estimate) of avg_glucose_level for each stroke class

fig = sns.FacetGrid(data=df, hue="stroke", aspect=4)
# `fill` is the supported replacement for kdeplot's deprecated `shade` argument
fig.map(sns.kdeplot, "avg_glucose_level", fill=True)
fig.add_legend()

In [None]:
sns.violinplot(x="stroke", y="avg_glucose_level", data=df)
plt.savefig('stroke_glucose.png')

The distributions of the average glucose level in the two classes are very similar. There is only a slight difference for average glucose levels above 150, where more people experience a stroke.

#### BMI

In [None]:
#Distribution (kernel density estimate) of bmi for each stroke class

fig = sns.FacetGrid(data=df, hue="stroke", aspect=4)
# `fill` is the supported replacement for kdeplot's deprecated `shade` argument
fig.map(sns.kdeplot, "bmi", fill=True)
fig.add_legend()
plt.savefig('stroke_bmi.png')

There is no real difference between the two groups in terms of BMI.

#### Smoking status

In [None]:
#Count smoking_status

df['smoking_status'].value_counts()

In [None]:
#Stacked bars: percentage of stroke / no-stroke patients per smoking status

df_smoking = df.groupby(['smoking_status', 'stroke'])['smoking_status'].count()
df_smoking_total = df.groupby(['smoking_status'])['smoking_status'].count()
# per-(group, class) counts over per-group totals, then one column per stroke class
df_smoking_fig = (df_smoking / df_smoking_total * 100).unstack()
df_smoking_fig.plot(kind='bar', stacked=True, figsize=(7, 7), width=0.5)
plt.savefig('stroke_smoking.png')

The graph confirms that smoking is a risk factor for stroke. 

### Encoding categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
enc=LabelEncoder()

In [None]:
#Encoding gender variable

df['gender']=enc.fit_transform(df['gender'])

In [None]:
df.head()

In [None]:
#Encoding marital status

df['ever_married']=enc.fit_transform(df['ever_married'])

In [None]:
df.head()

In [None]:
# One-hot encode the columns that are still object-dtype (variables with more than 2 classes)

object_cols = [col for col in df.columns if df[col].dtypes == 'object']
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [None]:
#Check

df.head()

In [None]:
df.info()

We now have 4908 entries for 16 variables, and all our data are in numerical format so that we can perform the training later.


### Further exploratory analysis and visualisation

In [None]:
df.describe()

In [None]:
sns.pairplot(df)
plt.savefig('stroke_pairplot.png')

In [None]:
df.corr()

The variables that have the highest correlation score with stroke are: age, heart disease, glucose level and hypertension, which is what we suspected.

In [None]:
plt.figure(figsize=(15,15))
sns.heatmap(df.corr(),annot=True)
plt.savefig('stroke_corr_heat.png')

Nevertheless, the coefficients are very low (between .14 and .2)

## Training

### 1. Set the independent (X) and the dependent variable (y)

In [None]:
# Target as a 1-D NumPy array; Series.ravel() is deprecated in recent pandas
y = df['stroke'].to_numpy()

In [None]:
y

In [None]:
X=df.drop('stroke', axis=1)

In [None]:
X

In [None]:
#Scaling X 

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
X_scale=scaler.fit_transform(X)

In [None]:
X_scale[:5]

### 2. Split the data into training and testing sets 

In [None]:
# Split the raw features first, then fit the scaler on the training rows only,
# so the min/max of the test rows do not leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, shuffle=True, random_state=42)
train_scaler = MinMaxScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
plt.hist(y_train)

In [None]:
y_test

In [None]:
plt.hist(y_test)

### 3. Creating models

#### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr=LogisticRegression(random_state=42)

In [None]:
lr.fit(X_train, y_train)

In [None]:
y_pred_lr=lr.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_lr)

In [None]:
print(classification_report(y_test,y_pred_lr))

1. With imbalanced data, accuracy is not a metric we can rely on because it is dominated by the larger class of the target. In other words, this model is very accurate at predicting when a person is not having a stroke, which is obviously not what we need...
2. The poor result for class 1 of the target is expected because of the imbalanced dataset as well as the limited correlation among the variables. 

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf=RandomForestClassifier(random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred_rf=rf.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred_rf))

In [None]:
confusion_matrix(y_test, y_pred_rf)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt=DecisionTreeClassifier(random_state=42)

In [None]:
dt.fit(X_train, y_train)

In [None]:
y_pred_dt=dt.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred_dt))

In [None]:
confusion_matrix(y_test, y_pred_dt)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn=KNeighborsClassifier()

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred_knn=knn.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred_knn))

In [None]:
confusion_matrix(y_test, y_pred_knn)

### Handling imbalanced data with sampling

In [None]:
#Using over-sampling method

from imblearn.over_sampling import SMOTE

In [None]:
# Seed SMOTE so the synthetic minority samples are the same on every run,
# in line with the random_state=42 used for the splits and models.
sm = SMOTE(random_state=42)
X_oversampled, y_oversampled = sm.fit_resample(X, y)

In [None]:
#Data after oversampling

# y_oversampled has more rows than df, so it is plotted on its own rather than
# together with data=df (seaborn rejects vectors whose length differs from `data`)
sns.countplot(x = y_oversampled)
plt.savefig('stroke_oversampled.png')

In [None]:
# Train again with the new data.
# The test set is held out from the original rows first and only the training
# part is oversampled: resampling before the split lets synthetic points built
# from test rows end up in training (and vice versa), and scores the models on
# synthetic data, which makes the metrics overly optimistic.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)
X_train, y_train = SMOTE(random_state = 42).fit_resample(X_train, y_train)

In [None]:
#Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred); the reverse order swaps
# precision with recall and transposes the confusion matrix
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

In [None]:
#Decision Tree
# seeded like the other models so the tree is reproducible
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred); the reverse order swaps
# precision with recall and transposes the confusion matrix
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

In [None]:
#KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred); the reverse order swaps
# precision with recall and transposes the confusion matrix
print(confusion_matrix(y_test, knn_pred))
print(classification_report(y_test, knn_pred))

In [None]:
#Random forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred); the reverse order swaps
# precision with recall and transposes the confusion matrix
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

In [None]:
# confusion_matrix(y_true, y_pred) has actual classes as rows and predicted
# classes as columns; the transpose drawn below therefore shows actuals on the
# x axis and predictions on the y axis, matching the axis labels.
conf_mat = confusion_matrix(y_test, rf_pred)
sns.heatmap(conf_mat.T, annot=True, fmt='d', cbar=False,
          xticklabels=['No','Yes'],
          yticklabels=['No','Yes'] )
plt.xlabel('Actuals')
plt.ylabel('Predicted')
plt.savefig('stroke_over_rf_cm.png')

In [None]:
# Pair every feature column with the importance the random forest assigned to it

feature_names = np.array(X.columns)
feature_importance = np.array(rf.feature_importances_)

feat_imp = pd.DataFrame({
    'feature_names': feature_names,
    'feature_importance': feature_importance,
})

In [None]:
# Horizontal bars: one per feature, bar length = importance
plt.figure(figsize=(10, 8))
sns.barplot(data=feat_imp, x='feature_importance', y='feature_names')
plt.savefig('stroke_feature_imp.png')

In [None]:
from sklearn import tree

# Label the splits with the columns the forest was trained on: X rather than
# df, because df still contains the 'stroke' target and would shift the labels.
# Class names are listed in ascending class order (0 = no stroke, 1 = stroke).
fn = list(X.columns)
cn = ["No","Yes"]

fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (40,15))

# Draw only the first tree of the forest as an illustration
tree.plot_tree(rf.estimators_[0],
               feature_names = fn, 
               class_names=cn,
               filled = True);
plt.savefig('stroke_over_tree.png')

In [None]:
pred_prob1 = lr.predict_proba(X_test)
pred_prob2 = dt.predict_proba(X_test)
pred_prob3 = knn.predict_proba(X_test)
pred_prob4 = rf.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_curve

# ROC points per model, each scored on its predicted probability of class 1
positive_scores = [probs[:, 1] for probs in (pred_prob1, pred_prob2, pred_prob3, pred_prob4)]
(
    (fpr1, tpr1, thresh1),
    (fpr2, tpr2, thresh2),
    (fpr3, tpr3, thresh3),
    (fpr4, tpr4, thresh4),
) = (roc_curve(y_test, scores, pos_label=1) for scores in positive_scores)

# A constant score for every sample gives the tpr = fpr reference line
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve per model, in the order: LR, decision tree, KNN, random forest
auc_score1, auc_score2, auc_score3, auc_score4 = (
    roc_auc_score(y_test, probs[:, 1])
    for probs in (pred_prob1, pred_prob2, pred_prob3, pred_prob4)
)

for auc_value in (auc_score1, auc_score2, auc_score3, auc_score4):
    print(auc_value)

In [None]:
# Draw on a dedicated figure and save through it: the notebook-level `fig`
# still points at an earlier plot, so fig.savefig() would not write this chart.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
roc_ax.plot(fpr2, tpr2, linestyle='--',color='green', label='Decision Tree')
roc_ax.plot(fpr3, tpr3, linestyle='--',color='yellow', label='KNN')
roc_ax.plot(fpr4, tpr4, linestyle='--',color='red', label='Random Forest')
# reference line for tpr = fpr
roc_ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
roc_ax.set_title('ROC curve')
# x label
roc_ax.set_xlabel('False Positive Rate')
# y label
roc_ax.set_ylabel('True Positive rate')

roc_ax.legend(loc='best')
# Write both output files before plt.show(), which may release the figure
roc_fig.savefig('ROC',dpi=300)
roc_fig.savefig('multiple_roc_curve.png')
plt.show()

After sampling, random forest leads to the best results in terms of metrics, as we can see with the ROC curve and an F1 score of .96 