# Stroke 🧠 Analysis and Predictions 

#### If you like my work, it would be really great if you upvoted this notebook!
#### If not, leaving a comment on what I need to work on and improve would be really helpful!

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.simplefilter("ignore")

### Loading up the data

In [None]:
# Read the stroke dataset CSV from the relative input directory into `df`.
csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

# Preview the first rows (the notebook renders the last expression).
df.head()

In [None]:
# Shape of the data as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Print a concise per-column summary of `df` (dtype and non-null count)
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the columns
df.describe()

In [None]:
# Looking for missing values in the dataset, if any:
# isna() flags each missing cell and sum() counts them per column
df.isna().sum()

In [None]:
# Remove every row that contains at least one missing value
# (axis=0 / how="any" are the dropna defaults, spelled out for clarity).
df = df.dropna(axis=0, how="any")

In [None]:
# Re-check the per-column missing-value counts after the dropna() above
df.isna().sum()

In [None]:
# Discard the `id` column so it is not carried into the plots or the model.
df = df.drop(columns="id")

In [None]:
# Count each outcome once and derive the slice label from the class value
# itself, so the label/count pairing no longer depends on value_counts()
# returning the classes in a particular frequency order.
# Assumes `stroke` is coded 0/1 (TODO confirm); any other value keeps its
# raw label because rename() leaves unmapped index entries untouched.
stroke_counts = df['stroke'].value_counts().rename(
    index={0: "Did not get a Stroke", 1: "Got a Stroke"}
)

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=stroke_counts.tolist(), names=stroke_counts.index.tolist(),
       title="Stroke Distribution Pie Chart",
       color_discrete_sequence=["seagreen", "firebrick"])

In [None]:
# Count the gender categories once and keep only the two most frequent ones.
top_two_genders = df['gender'].value_counts().head(2)
labels = top_two_genders.index.tolist()
values = top_two_genders.tolist()

# Pie chart of the two retained categories.
px.pie(df, names=labels, values=values,
       title="Gender Distribution Pie Chart",
       color_discrete_sequence=["pink", "skyblue"])

In [None]:
# NOTE(review): the "seaborn-*" style names were renamed to "seaborn-v0_8-*"
# in newer Matplotlib releases — confirm against the installed version.
plt.style.use("seaborn-darkgrid")
fig, (hist_ax, kde_ax) = plt.subplots(nrows=2, ncols=1, figsize=(20, 15))

# Top panel: overall age histogram (30 bins) with a KDE overlay.
sns.histplot(data=df, x="age", bins=30, kde=True, color="darkgreen", ax=hist_ax)
hist_ax.set_xlabel("age", fontsize=15)

# Bottom panel: age density per stroke class, each class normalised on its own.
sns.kdeplot(
    data=df, x="age", hue="stroke",
    fill=True, common_norm=False, alpha=0.5,
    palette="magma", linewidth=0, ax=kde_ax,
)
kde_ax.set_xlabel("age", fontsize=15);

In [None]:
# Label each slice from the class value rather than from its position in
# value_counts(), which is ordered by frequency and not by class.
# Assumes `heart_disease` is coded 0/1 (TODO confirm); unmapped values keep
# their raw label.
heart_counts = df['heart_disease'].value_counts().rename(
    index={0: "Non-Heart Disease Patient", 1: "Heart Disease Patient"}
)

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=heart_counts.tolist(), names=heart_counts.index.tolist(),
       title="Heart Patient Distribution Pie Chart",
       color_discrete_sequence=["crimson", "firebrick"])

In [None]:
# Label each slice from the class value rather than from its position in
# value_counts(), which is ordered by frequency and not by class.
# Assumes `hypertension` is coded 0/1 (TODO confirm); unmapped values keep
# their raw label.
hypertension_counts = df['hypertension'].value_counts().rename(
    index={0: "Hypertension Not Present", 1: "Hypertension Present"}
)

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=hypertension_counts.tolist(),
       names=hypertension_counts.index.tolist(),
       title="Hypertension Distribution Pie Chart")

In [None]:
# Label each slice from the category value rather than from its position in
# value_counts(), which is ordered by frequency.
# Assumes `ever_married` holds "Yes"/"No" (TODO confirm); unmapped values
# keep their raw label.
married_counts = df['ever_married'].value_counts().rename(
    index={"Yes": "Married", "No": "Not Married"}
)

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=married_counts.tolist(), names=married_counts.index.tolist(),
       title="Marital Status Distribution Pie Chart",
       color_discrete_sequence=["darkorchid", "orchid"])

In [None]:
# Map raw category values to display labels so each count is paired with the
# right name regardless of the frequency order value_counts() returns.
# The keys are the presumed raw values of `work_type` (TODO confirm); any
# value not listed keeps its raw label instead of being mislabelled.
# NOTE(review): "Have Children" is kept from the original labels — verify
# that it matches what the "children" category actually means.
work_type_names = {
    "Private": "Private Job",
    "Self-employed": "Self-employed",
    "children": "Have Children",
    "Govt_job": "Government Job",  # spelling fixed (was "Goverment Job")
    "Never_worked": "Never Worked Before",
}
work_type_counts = df['work_type'].value_counts().rename(index=work_type_names)

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=work_type_counts.tolist(), names=work_type_counts.index.tolist(),
       title="Work Type Distribution Pie Chart")

In [None]:
# Map raw category values to display labels so each count is paired with the
# right name regardless of the frequency order value_counts() returns.
# The keys are the presumed raw values of `smoking_status` (TODO confirm);
# any value not listed keeps its raw label instead of being mislabelled.
smoking_names = {
    "never smoked": "Never Smoked Before",
    "Unknown": "Unknown",
    "formerly smoked": "Ex-Smoker",
    "smokes": "Currently Smokes",
}
smoking_counts = df['smoking_status'].value_counts().rename(index=smoking_names)

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=smoking_counts.tolist(), names=smoking_counts.index.tolist(),
       title="Smoker Type Distribution Pie Chart")

In [None]:
# Take the slice names straight from the data instead of a hard-coded list,
# so each count is always shown next to the category it belongs to.
# Presumably the raw values are "Urban"/"Rural", as the previous hard-coded
# labels suggest — the chart now shows whatever the column contains.
residence_counts = df['Residence_type'].value_counts()

# values/names are passed explicitly, so no data_frame argument is required.
px.pie(values=residence_counts.tolist(), names=residence_counts.index.tolist(),
       title="Residence Type Distribution Pie Chart",
       color_discrete_sequence=["slategray", "yellowgreen"])

In [None]:
# NOTE(review): the "seaborn-*" style names were renamed to "seaborn-v0_8-*"
# in newer Matplotlib releases — confirm against the installed version.
plt.style.use("seaborn-darkgrid")
fig, (hist_ax, kde_ax) = plt.subplots(nrows=2, ncols=1, figsize=(20, 15))

# Top panel: overall glucose-level histogram (30 bins) with a KDE overlay.
sns.histplot(data=df, x="avg_glucose_level", bins=30, kde=True,
             color="darkorange", ax=hist_ax)
hist_ax.set_xlabel("avg_glucose_level", fontsize=15)

# Bottom panel: glucose density per stroke class, normalised independently.
sns.kdeplot(
    data=df, x="avg_glucose_level", hue="stroke",
    fill=True, common_norm=False, alpha=0.5,
    palette="Spectral", linewidth=0, ax=kde_ax,
)
kde_ax.set_xlabel("avg_glucose_level", fontsize=15);

In [None]:
# Two stacked panels describing the `bmi` column.
plt.style.use("seaborn")
fig, ax = plt.subplots(2, 1, figsize=(20, 15))

# Top panel: overall bmi histogram (30 bins) with a KDE overlay.
sns.histplot(x=df["bmi"], kde=True, color="darkorchid", ax=ax[0], bins=30)
ax[0].set_xlabel("bmi", fontsize=15)

# Bottom panel: bmi density per stroke class. This previously plotted
# avg_glucose_level under a "bmi" axis label (copy-paste slip).
sns.kdeplot(x=df["bmi"], hue=df["stroke"], fill=True, common_norm=False,
            palette="magma", alpha=0.5, linewidth=0, ax=ax[1])
ax[1].set_xlabel("bmi", fontsize=15);

## Correlation with `age`

In [None]:
plt.style.use("seaborn")
fig, (bmi_ax, glucose_ax) = plt.subplots(nrows=2, ncols=1, figsize=(20, 15))

# Top panel: bmi (x) against age (y).
sns.scatterplot(data=df, x="bmi", y="age", color="darkorchid", ax=bmi_ax)
bmi_ax.set_xlabel("bmi", fontsize=15)

# Bottom panel: average glucose level (x) against age (y).
sns.scatterplot(data=df, x="avg_glucose_level", y="age", color="darkorange",
                ax=glucose_ax)
glucose_ax.set_xlabel("avg_glucose_level", fontsize=15);

## Correlation with `avg_glucose_level`

In [None]:
plt.style.use("seaborn")
fig, (bmi_ax, age_ax) = plt.subplots(nrows=2, ncols=1, figsize=(20, 15))

# Top panel: bmi (x) against average glucose level (y).
sns.scatterplot(data=df, x="bmi", y="avg_glucose_level", color="deepskyblue",
                ax=bmi_ax)
bmi_ax.set_xlabel("bmi", fontsize=15)

# Bottom panel: age (x) against average glucose level (y).
sns.scatterplot(data=df, x="age", y="avg_glucose_level", color="darkorange",
                ax=age_ax)
age_ax.set_xlabel("age", fontsize=15);

## Stroke Or No Stroke Pair Grid

In [None]:
# Pairwise relationship grid of the columns in `df`, coloured by the `stroke`
# class; the trailing semicolon suppresses the notebook's text output.
sns.pairplot(df, hue='stroke', palette="viridis");

## Heatmap Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
fig, ax = plt.subplots(figsize=(8,6))

# `df` still holds string columns at this point (they are only encoded in a
# later cell), and corr() on such a frame raises on pandas >= 2.0, so the
# correlation is restricted to the numeric columns explicitly.
numeric_df = df.select_dtypes(include="number")

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(numeric_df.corr(), annot=True, fmt='.1g', cmap="viridis",
            cbar=False, ax=ax);

In [None]:
# Creating dummy variables for the non-numeric data

gender_dum = pd.get_dummies(df['gender'])
residence_type_dum = pd.get_dummies(df['Residence_type'])
smoking_status_dum = pd.get_dummies(df['smoking_status'])
work_type_dum = pd.get_dummies(df['work_type'])

In [None]:
# Append the indicator columns to `df`, then remove the original text columns.
dummy_frames = [gender_dum, residence_type_dum, smoking_status_dum, work_type_dum]
text_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# NOTE(review): `ever_married` is removed here although no dummy frame was
# built for it, so that feature never reaches the model — confirm intended.
df = pd.concat([df, *dummy_frames], axis=1).drop(columns=text_columns)
df.head()

## Splitting the data into training and test datasets
Here, we are trying to predict the chance of a person having a stroke using the given data. Hence, the `stroke` column will be the target (`y`), and the rest of the columns will be the input features (`X`).

In [None]:
# X data
X = df.drop("stroke", axis=1)
X.head()

In [None]:
# y data
y = df["stroke"]
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the stroke / no-stroke proportions the same in both
# splits; stroke cases are presumably a small minority, and an unstratified
# split could leave the test set with too few of them to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Number of rows in the training and test feature sets.
X_train.shape[0], X_test.shape[0]

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest of 100 trees. random_state is fixed (same seed as the
# train/test split) so the baseline score is reproducible across runs;
# without it every execution trains a different forest.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
rfc.fit(X_train,y_train)

In [None]:
# Mean accuracy of the baseline forest on the held-out test split,
# reported as a percentage.
RandomForestClassifierScore = rfc.score(X_test, y_test)
print(f"Accuracy obtained by Random Forest Classifier model: {RandomForestClassifierScore*100}")

In [None]:
# Per-class probability estimates of the baseline forest for each test row
rfc.predict_proba(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the random forest. The grid holds
# 1 * 4 * 2 * 3 * 3 * 2 * 4 = 576 combinations, each evaluated with 3-fold
# cross-validation below.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    criterion=['gini', 'entropy'],
    n_estimators=[100, 200, 300, 1000],
)

# Exhaustive search over the grid using all available cores (n_jobs=-1),
# with progress messages enabled (verbose=2).
grid_search_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only (the test split stays unseen)
grid_search_rfc.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that achieved the best cross-validation score
grid_search_rfc.best_params_

In [None]:
# Mean cross-validated score of the best combination (measured on the
# training folds, not on the held-out test split)
grid_search_rfc.best_score_

In [None]:
# Predict the test-split labels with the fitted grid search
grid_search_rfc_predict = grid_search_rfc.predict(X_test)

In [None]:
# Compare like with like: the baseline score was measured on the held-out
# test split, so the tuned model is scored on that same split. The earlier
# version compared against best_score_, a cross-validation score computed on
# the training folds, which is not comparable.
tuned_test_score = grid_search_rfc.score(X_test, y_test)

# Relative change of the tuned model over the baseline, in percent.
print('Improvement in Random Forest Classifier after GridSearchCV: {:0.2f}%.'.format(100 * (tuned_test_score - RandomForestClassifierScore) / RandomForestClassifierScore))

In [None]:
# Comparing the results after the improvement in Random Forest Classifier

plt.style.use("seaborn")

x = ["Random Forest Classifier",  
     "GridSearch-RandomForestClassifier"]

y = [RandomForestClassifierScore,  
     grid_search_rfc.best_score_]

fig, ax = plt.subplots(figsize=(6,6))
sns.barplot(x=x,y=y, palette="crest");
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.title("Random Forest Classifier  vs  GridSearched Random Forest Classifier", fontsize=14, fontname="Helvetica", y=1.03);

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the grid-searched model's
# predictions on the test split.
tuned_report = classification_report(y_test, grid_search_rfc_predict)
print(tuned_report)