# Importing Libraries and Data 📚

In [None]:
from IPython.display import HTML

# Read the notebook's custom stylesheet. The context manager closes the file
# handle deterministically; a bare open(...).read() leaves it open until the
# object happens to be garbage-collected.
with open("../input/notebookassets/blue.css") as css_file:
    f = css_file.read()

# Last expression of the cell: renders a <style> tag so the CSS is applied to
# the notebook output.
HTML(f"<style>{f}</style>")

In [None]:
# Install dabl and rich, which are imported further down; -q keeps pip quiet.
! pip install -q dabl rich

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings

import plotly
import plotly.figure_factory as ff

import rich
import dabl

# Shared palette for the pie charts: 15 RGBA colours sampled at evenly spaced
# positions across the 'Spectral' colormap.
cmap = plt.get_cmap('Spectral')
colors = list(map(cmap, np.linspace(0, 1, 15)))

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
# Load the stroke-prediction CSV into a DataFrame and preview the first rows.
data = pd.read_csv(
    "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)
data.head()

# Exploratory Data Analysis 📊

In [None]:
# Report the (rows, columns) tuple of the loaded DataFrame.
print("Shape of data is: {}".format(data.shape))

In [None]:
# Print each column's dtype and non-null count.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the DataFrame.
data.describe()

Let's check the Null values in the dataset and their proportions.

In [None]:
# Number of missing (NaN) values in each column.
data.isna().sum()

We can see from the list above that the only column with Null values is `bmi`.

Let's see what percentage of the values in the `bmi` column are Null.

In [None]:
# Percentage of rows whose BMI is missing: the mean of the boolean isna() mask
# is the fraction of True values.
percent_null_bmi = data['bmi'].isna().mean() * 100

# \033[2;37;31m switches the number to (dim) red; \033[0m then resets all
# terminal attributes. The previous trailing sequence, \033[0;37;31m, reset and
# immediately re-applied red, so the colour leaked into subsequent output.
print(f"Percentage of Null values in BMI column: \033[2;37;31m{percent_null_bmi:.2f}%\033[0m")

In [None]:
# Discard every row that contains at least one missing value
# (axis=0 / how="any" are dropna's defaults, spelled out for clarity).
data = data.dropna(axis=0, how="any")

## Age 📟

In [None]:
plt.style.use("classic")

# Compute the mean once; it positions both the marker line and its label.
mean_age = data['age'].mean()

# Density-normalised histogram with a KDE overlay. sns.distplot is deprecated
# in seaborn and slated for removal; histplot(stat="density", kde=True,
# kde_kws=dict(cut=3)) is the replacement seaborn documents for it.
sns.histplot(data['age'], color='magenta', kde=True, stat="density", kde_kws=dict(cut=3))

# Vertical line at the mean, annotated just to its right near the top of the axes.
plt.axvline(mean_age, color='orange', linestyle='-', linewidth=0.8)
min_ylim, max_ylim = plt.ylim()
plt.text(mean_age*1.05, max_ylim*0.95, f'Mean (μ): {mean_age:.2f}')

plt.xlabel("Age (in years)")
plt.title("Distribution of Ages")
plt.show()

## Gender ♀♂

In [None]:
sns.set(style="whitegrid")

# Only the two most frequent gender categories are charted; count them once
# and derive both the labels and the slice sizes from the same Series.
top_two = data['gender'].value_counts().head(2)
labels = top_two.index.tolist()
values = top_two.tolist()

plt.pie(
    x=values,
    labels=labels,
    colors=colors,
    explode=[0, 0.005],
    autopct="%1.2f%%",
    shadow=True,
)
plt.title("Gender Distribution Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Hypertension 🛑

In [None]:
sns.set(style="whitegrid")

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position (which mislabels slices if the
# category frequencies ever swap). Categories missing from the mapping keep
# their raw value as the label.
# NOTE(review): assumes hypertension is encoded 0 = absent, 1 = present — confirm.
label_for = {0: "Not Present", 1: "Present"}
counts = data['hypertension'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

# colors[::-1] walks the shared palette from the opposite end.
plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors[::-1], shadow=True, explode=[0, 0.2])
plt.title("Hypertension Distribution Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Presence of Heart Disease ❤ 

In [None]:
sns.set(style="whitegrid")
# Shuffles the shared palette in place, so slice colours vary between runs.
random.shuffle(colors)

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position. Categories missing from the
# mapping keep their raw value as the label.
# NOTE(review): assumes heart_disease is encoded 0 = absent, 1 = present — confirm.
label_for = {0: "Heart Disease Absent", 1: "Heart Disease Present"}
counts = data['heart_disease'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors, shadow=True, explode=[0, 0.2])
plt.title("Heart Disease Distribution Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Ever Married? 💍

In [None]:
sns.set(style="whitegrid")
# Shuffles the shared palette in place, so slice colours vary between runs.
random.shuffle(colors)

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position. Categories missing from the
# mapping keep their raw value as the label.
# NOTE(review): assumes ever_married holds "Yes" / "No" — confirm.
label_for = {"Yes": "Married", "No": "Never Married"}
counts = data['ever_married'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors, shadow=True)
plt.title("Marital Status Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Work Type 👨‍🏭

In [None]:
sns.set(style="whitegrid")
# Shuffles the shared palette in place, so slice colours vary between runs.
random.shuffle(colors)

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position. Categories missing from the
# mapping keep their raw value as the label.
# NOTE(review): raw category names below are assumed from the dataset — confirm.
# NOTE(review): the "children" category presumably marks patients who are
# children rather than patients who have children, hence the neutral label.
label_for = {
    "Private": "Private Job",
    "Self-employed": "Self-employed",
    "children": "Children",
    "Govt_job": "Government Job",
    "Never_worked": "Never Worked Before",
}
counts = data['work_type'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

# The explode list has one entry per slice (five categories expected); the
# last, least frequent slice is pulled out slightly further than the rest.
plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors, shadow=True, explode=[0.1, 0.1, 0.1, 0.1, 0.2])
plt.title("Work Type Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Residence Type 🏡

In [None]:
sns.set(style="whitegrid")
# Shuffles the shared palette in place, so slice colours vary between runs.
random.shuffle(colors)

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position: the two residence groups can be
# close in size, so positional labels could silently swap after any change in
# filtering. Categories missing from the mapping keep their raw value.
label_for = {"Urban": "Urban Residence", "Rural": "Rural Residence"}
counts = data['Residence_type'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors, shadow=True)
plt.title("Residence Type Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Average Glucose level of the Patient 💉

In [None]:
plt.style.use("classic")
plt.figure(figsize=(10, 7))

# Compute the mean once; it positions both the marker line and its label.
mean_glucose = data['avg_glucose_level'].mean()

# Density-normalised histogram with a KDE overlay. sns.distplot is deprecated
# in seaborn and slated for removal; histplot(stat="density", kde=True,
# kde_kws=dict(cut=3)) is the replacement seaborn documents for it.
sns.histplot(data['avg_glucose_level'], color='darkblue', kde=True, stat="density", kde_kws=dict(cut=3))

# Vertical line at the mean, annotated just to its right near the top of the axes.
plt.axvline(mean_glucose, color='red', linestyle='-', linewidth=0.8)
min_ylim, max_ylim = plt.ylim()
plt.text(mean_glucose*1.05, max_ylim*0.9, f'Mean (μ): {mean_glucose:.2f}')

plt.xlabel("Average Glucose Level")
plt.title("Distribution of Average Glucose Level")
plt.show()

## Body Mass Index of Patient (`BMI`) 🦴

In [None]:
plt.style.use("classic")
plt.figure(figsize=(10, 7))

# Compute the mean once; it positions both the marker line and its label.
mean_bmi = data['bmi'].mean()

# Density-normalised histogram with a KDE overlay. sns.distplot is deprecated
# in seaborn and slated for removal; histplot(stat="density", kde=True,
# kde_kws=dict(cut=3)) is the replacement seaborn documents for it.
sns.histplot(data['bmi'], color='darkred', kde=True, stat="density", kde_kws=dict(cut=3))

# Vertical line at the mean, annotated just to its right near the top of the axes.
plt.axvline(mean_bmi, color='darkgreen', linestyle='-', linewidth=1)
min_ylim, max_ylim = plt.ylim()
plt.text(mean_bmi*1.05, max_ylim*0.9, f'Mean (μ): {mean_bmi:.2f}')

plt.xlabel("Body Mass Index")
plt.title("Distribution of Body Mass Index")
plt.show()

## Smoking Status 🚬

In [None]:
sns.set(style="whitegrid")
# Shuffles the shared palette in place, so slice colours vary between runs.
random.shuffle(colors)

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position. Categories missing from the
# mapping keep their raw value as the label.
# NOTE(review): raw category names below are assumed from the dataset — confirm.
label_for = {
    "never smoked": "Never Smoked Before",
    "Unknown": "Unknown",
    "formerly smoked": "Ex-Smoker",
    "smokes": "Currently Smokes",
}
counts = data['smoking_status'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors, shadow=True)
plt.title("Smoking Status Pie Chart", fontdict={'fontsize': 14})
plt.show()

## Stroke (Target Variable) 🌡

In [None]:
sns.set(style="whitegrid")
# Shuffles the shared palette in place, so slice colours vary between runs.
random.shuffle(colors)

# Label each slice from its own category instead of pairing a hard-coded list
# with frequency-sorted counts by position. Categories missing from the
# mapping keep their raw value as the label.
# NOTE(review): assumes stroke is encoded 0 = no stroke, 1 = stroke — confirm.
label_for = {0: "Did not get a Stroke", 1: "Did get a Stroke"}
counts = data['stroke'].value_counts().rename(index=label_for)
labels = counts.index.tolist()
values = counts.tolist()

plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=colors, shadow=True, explode=[0, 0.1])
plt.title("Stroke Pie Chart", fontdict={'fontsize': 14})
plt.show()

### Relationship of Gender and Age

In [None]:
# Select each group's ages once and reuse them for the means and the curves.
male_ages = data.loc[data['gender'] == 'Male', 'age']
female_ages = data.loc[data['gender'] == 'Female', 'age']
avg_male_age = male_ages.mean()
avg_female_age = female_ages.mean()

plt.style.use("classic")
plt.figure(figsize=(16, 6))
# fill=True shades the area under each curve; it replaces the deprecated
# shade=True keyword of kdeplot.
sns.kdeplot(male_ages, label='Male', fill=True, color='darkred')
sns.kdeplot(female_ages, label='Female', fill=True, color='darkblue')

# Read the y-limits once; both annotations are placed at the same height.
min_ylim, max_ylim = plt.ylim()

# Male mean: marker line, with the label placed to the left of it.
plt.axvline(avg_male_age, color='orange', linestyle='-', linewidth=1)
plt.text(avg_male_age-20, max_ylim*0.93, 'Avg. Male Age: {} yrs'.format(int(avg_male_age)))

# Female mean: marker line, with the label placed to the right of it.
plt.axvline(avg_female_age, color='magenta', linestyle='-', linewidth=1)
plt.text(avg_female_age*1.05, max_ylim*0.93, 'Avg. Female Age: {} yrs'.format(int(avg_female_age)))

plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Distribution of Ages for Male and Female Patients')
plt.legend()
plt.show()

### Relationship of Age and Location

In [None]:
plt.style.use("classic")
plt.figure(figsize=(16, 6))

# One age density curve per residence type. fill=True shades the area under
# each curve; it replaces the deprecated shade=True keyword of kdeplot.
sns.kdeplot(data.loc[data['Residence_type'] == 'Urban', 'age'], label='Urban', fill=True, color='yellow')
sns.kdeplot(data.loc[data['Residence_type'] == 'Rural', 'age'], label='Rural', fill=True, color='green')

plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Distribution of Ages for Urban and Rural Patients')
plt.legend()
plt.show()

### Relationship between Age and Average Glucose Level

In [None]:
# 2D density plot of age (x-axis) against average glucose level (y-axis).
density_colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]
fig = ff.create_2d_density(
    x=data['age'],
    y=data['avg_glucose_level'],
    colorscale=density_colorscale,
    title="Age-Average Glucose Level Density Plot",
)

fig.show()

### Relationship between Average Glucose Level and Body Mass Index

In [None]:
# 2D density plot of average glucose level (x-axis) against BMI (y-axis).
density_colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]
fig = ff.create_2d_density(
    x=data['avg_glucose_level'],
    y=data['bmi'],
    colorscale=density_colorscale,
    title="Average Glucose Level-BMI Density Plot",
)

fig.show()

### Relationship between Average Glucose Level and Stroke

In [None]:
# 2D density plot of average glucose level (x-axis) against the stroke
# target column (y-axis).
density_colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]
fig = ff.create_2d_density(
    x=data['avg_glucose_level'],
    y=data['stroke'],
    colorscale=density_colorscale,
    title="Average Glucose Level-Stroke Density Plot",
)

fig.show()

### Relationship Between BMI and Stroke

In [None]:
# 2D density plot of BMI (x-axis) against the stroke target column (y-axis).
density_colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]
fig = ff.create_2d_density(
    x=data['bmi'],
    y=data['stroke'],
    colorscale=density_colorscale,
    title="BMI-Stroke Density Plot",
)

fig.show()

### DABL Plot

Let's use DABL to see if we can come across any further patterns between the features and the target variable `stroke`.

In [None]:
# Let dabl generate its automatic overview plots of the features against the
# target. The target column name is passed via target_col: dabl.plot's `y`
# parameter is meant for a Series of target values, and passing a column name
# through it relies on a deprecated fallback.
dabl.plot(data, target_col='stroke')