# Stroke Prediction

## Import libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

## Import dataset

In [3]:
# Read the stroke dataset from its relative CSV path and preview the first rows.
data_path = 'Dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(5110, 12)

In [5]:
# Per-column dtypes as inferred by read_csv; text columns show up as `object`.
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [6]:
# Summarise the three float-valued columns with the same five statistics each.
summary_stats = ["min", "max", "median", "mean", "std"]
df.agg({column: summary_stats for column in ('age', 'avg_glucose_level', 'bmi')})

Unnamed: 0,age,avg_glucose_level,bmi
min,0.08,55.12,10.3
max,82.0,271.74,97.6
median,45.0,91.885,28.1
mean,43.226614,106.147677,28.893237
std,22.612647,45.28356,7.854067


## Data Preprocessing

### change column format

#### from string to categorical

In [7]:
# Cast every text-valued column to the categorical dtype in a single astype call,
# then display the dtypes to confirm the conversion.
text_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df = df.astype({name: 'category' for name in text_columns})
df.dtypes

id                      int64
gender               category
age                   float64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
dtype: object

#### from int to categorical

The values in these columns (*hypertension*, *heart_disease*, and *stroke*) are actually binary categorical data, so the int data type may not be appropriate.

In [8]:
# The 0/1 indicator columns are flags rather than quantities, so store them
# as categoricals too; display the dtypes to confirm.
binary_columns = ['hypertension', 'heart_disease', 'stroke']
df = df.astype(dict.fromkeys(binary_columns, 'category'))
df.dtypes

id                      int64
gender               category
age                   float64
hypertension         category
heart_disease        category
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke               category
dtype: object

### Deal with NA

#### find if there are NA in any column

In [9]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### fill NA in *bmi* column with mean of *bmi*

Replace each missing *bmi* value with the mean of the observed *bmi* values (≈ 28.89).

In [10]:
# Impute missing bmi entries with the mean of the observed ones
# (Series.mean skips NaN by default), leaving every other column untouched.
bmi_mean = df['bmi'].mean()
df = df.assign(bmi=df['bmi'].fillna(bmi_mean))
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### inspect unique values in some categorical columns

In [11]:
# Print the distinct values of each text-derived categorical column,
# one separator-terminated section per column.
col_names = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in col_names:
    print(f'name: {col}', df[col].unique(), '------------------------------------', sep='\n')

name: gender
['Male', 'Female', 'Other']
Categories (3, object): ['Female', 'Male', 'Other']
------------------------------------
name: ever_married
['Yes', 'No']
Categories (2, object): ['No', 'Yes']
------------------------------------
name: work_type
['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']
Categories (5, object): ['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children']
------------------------------------
name: Residence_type
['Urban', 'Rural']
Categories (2, object): ['Rural', 'Urban']
------------------------------------
name: smoking_status
['formerly smoked', 'never smoked', 'smokes', 'Unknown']
Categories (4, object): ['Unknown', 'formerly smoked', 'never smoked', 'smokes']
------------------------------------


There are 2 columns with interesting values
- *gender*: 'Other'
- *smoking_status*: 'Unknown'<br>

How many of these interesting values does each column have?

#### count value in *gender* column

In [12]:
# Tally rows per gender category to see how rare 'Other' is.
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

Only 1 entry is 'Other'. Dropping this row should not affect the model's performance much.

#### count value in *smoking_status* column

In [13]:
# Tally rows per smoking_status category to size the 'Unknown' group.
df['smoking_status'].value_counts()

never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

Hmm... 1544 of the 5110 entries are 'Unknown'.<br>
That is about 30.2% of this column!<br>
This is quite a big number.<br>
If these 'Unknown' values are left as they are, they may result in poor model performance.