In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("Datasets/healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df.shape

(5110, 12)

### Displaying those rows that have heart disease

In [4]:
# pd.set_option('display.max_rows', 800)

filt = (df['heart_disease']==1)

print("\nNumber of People having Heart-Disease: ", df[filt].shape[0])
print("\nDetails of people having Heart-Disease are: ")

df[filt]


Number of People having Heart-Disease:  276

Details of people having Heart-Disease are: 


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
11,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1
13,8213,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4855,42563,Female,57.0,1,1,Yes,Private,Rural,231.72,45.7,formerly smoked,0
4869,72715,Female,50.0,0,1,Yes,Private,Urban,193.80,26.4,never smoked,0
4921,4062,Male,72.0,0,1,Yes,Private,Rural,238.27,,smokes,0
4986,53697,Male,58.0,0,1,Yes,Private,Rural,225.35,26.5,smokes,0


## Displaying those rows that have hypertension and no heart disease

In [5]:
filt = (df['hypertension']==1) & (df['heart_disease']==0)


print("\nNumber of People having Hypertension but not Heart-Disease: ", df[filt].shape[0])
print("\nDetails of above people are: ")

df[filt]


Number of People having Hypertension but not Heart-Disease:  434

Details of above people are: 


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
10,12109,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
15,58202,Female,50.0,1,0,Yes,Self-employed,Rural,167.41,30.9,never smoked,1
17,34120,Male,75.0,1,0,Yes,Private,Urban,221.29,25.8,smokes,1
21,13861,Female,52.0,1,0,Yes,Self-employed,Urban,233.29,48.9,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5088,22190,Female,64.0,1,0,Yes,Self-employed,Urban,76.89,30.2,Unknown,0
5091,6369,Male,59.0,1,0,Yes,Private,Rural,95.05,30.9,never smoked,0
5093,32235,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5100,68398,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,0


## Find Missing values from the dataset and clean the data.

In [6]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### Rows having Missing values

In [7]:
filt = df.isnull().any(axis=1)
df[filt]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
13,8213,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,25226,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
27,61843,Male,58.0,0,0,Yes,Private,Rural,189.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5039,42007,Male,41.0,0,0,No,Private,Rural,70.15,,formerly smoked,0
5048,28788,Male,40.0,0,0,Yes,Private,Urban,191.15,,smokes,0
5093,32235,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5099,7293,Male,40.0,0,0,Yes,Private,Rural,83.94,,smokes,0


In [8]:
print("Mean of bmi is: ", df['bmi'].mean())
print("Median of bmi is: ", df['bmi'].median())

Mean of bmi is:  28.893236911794666
Median of bmi is:  28.1


In [9]:
# Replacing missing values with mean
mean = df['bmi'].mean()
df['bmi'].fillna(mean, inplace=True)

In [10]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

### Checking for duplicate rows

In [11]:
df.duplicated().sum()

0

### Convert Categorical Variables to Numerical

In [12]:
df = pd.get_dummies(df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], drop_first=True)
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,True,False,True,False,True,False,False,True,True,False,False
1,51676,61.0,0,0,202.21,28.893237,1,False,False,True,False,False,True,False,False,False,True,False
2,31112,80.0,0,1,105.92,32.5,1,True,False,True,False,True,False,False,False,False,True,False
3,60182,49.0,0,0,171.23,34.4,1,False,False,True,False,True,False,False,True,False,False,True
4,1665,79.0,1,0,174.12,24.0,1,False,False,True,False,False,True,False,False,False,True,False


### Handling Outliers

In [13]:
numerical_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
df[numerical_features].head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
0,67.0,0,1,228.69,36.6
1,61.0,0,0,202.21,28.893237
2,80.0,0,1,105.92,32.5
3,49.0,0,0,171.23,34.4
4,79.0,1,0,174.12,24.0


In [14]:
Q1 = df[numerical_features].quantile(0.25)
Q3 = df[numerical_features].quantile(0.75)
IQR = Q3 - Q1

In [15]:
filt = ~((df[numerical_features] < (Q1 - 1.5 * IQR)) | (df[numerical_features] > (Q3 + 1.5 * IQR))).any(axis=1)
df = df[filt]

## Cleaned Dataset

In [16]:
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
7,10434,69.0,0,0,94.39,22.8,1,False,False,False,False,True,False,False,True,False,True,False
8,27419,59.0,0,0,76.15,28.893237,1,False,False,True,False,True,False,False,False,False,False,False
9,60491,78.0,0,0,58.57,24.2,1,False,False,True,False,True,False,False,True,False,False,False
12,12175,54.0,0,0,104.51,27.3,1,False,False,True,False,True,False,False,True,False,False,True
18,27458,60.0,0,0,89.22,37.8,1,False,False,False,False,True,False,False,True,False,True,False
