In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw stroke dataset from the project's Data directory.
stroke_data = pd.read_csv(filepath_or_buffer="../Data/train_2v.csv")

# We will start by previewing our data and determining what kind of problem we have.

In [3]:
# Preview the first five rows of the raw data.
stroke_data.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468.0,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523.0,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543.0,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136.0,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [4]:
# Show the dtype pandas assigned to each column.
stroke_data.dtypes

id                    object
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [5]:
# (number of rows, number of columns) of the raw data.
stroke_data.shape

(43400, 12)

In [6]:
# Count the individuals whose stroke flag equals 1.
had_stroke_mask = stroke_data["stroke"] == 1
number_of_people_who_had_a_stroke = int(had_stroke_mask.sum())
number_of_people_who_had_a_stroke

783

In [7]:
# Express the stroke count as a percentage of all rows in the dataset.
percentage_of_people_who_had_a_stroke = (
    number_of_people_who_had_a_stroke / stroke_data.shape[0]
) * 100
percentage_of_people_who_had_a_stroke

1.804147465437788

 # After previewing our data, we see that we have a classification problem.  We will determine what the most appropriate features are, and use these to predict whether an individual will have a stroke.
 
 # We also notice that only about 1.8% of the individuals in our dataset had a stroke.  We will have to find a way to deal with imbalanced data.

# 1. Prework / Data Cleansing

In [8]:
# Unique identifiers are not something we want to work with as a feature,
# so the id column is removed here.

stroke_data = stroke_data.drop(columns=["id"])
stroke_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [9]:
# Number of missing values in each column.
stroke_data.isnull().sum()

gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [10]:
# Number of distinct smoking_status categories (NaN is not counted).
stroke_data["smoking_status"].nunique(dropna=True)

3

In [11]:
# List the distinct smoking_status values, including NaN if present.
stroke_data['smoking_status'].unique()

array([nan, 'never smoked', 'formerly smoked', 'smokes'], dtype=object)

In [12]:
# Number of distinct bmi values (NaN is not counted).
stroke_data["bmi"].nunique(dropna=True)

555

### We are working with missing (NaN) values.  We have a few options for what to do.

1. Fill the NaNs with zeros
2. Fill the NaNs with the mode
3. Label encode the three categories

In [13]:
# Handle the NaN values in the bmi and smoking_status columns in two steps:
# 1. Fill the NaN values in the bmi column with the mean bmi value
#    (the mean is computed before any rows are dropped).
# 2. Drop every row that still contains a NaN, i.e. the rows with no smoking_status.
# The second step could also help alleviate the imbalanced data issue.

# Assign the filled column back rather than calling fillna(..., inplace=True) on
# stroke_data["bmi"]: that in-place call operates on an intermediate Series, is
# deprecated, and does not update the frame when pandas Copy-on-Write is active.
stroke_data["bmi"] = stroke_data["bmi"].fillna(stroke_data["bmi"].mean())

stroke_data = stroke_data.dropna(axis=0)

stroke_data.shape

(30108, 11)

In [14]:
# Confirm that no missing values remain in any column.
stroke_data.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [15]:
# Recount the individuals who had a stroke now that rows with NaN values
# in the smoking_status column have been dropped.
had_stroke_mask = stroke_data["stroke"] == 1
number_of_people_who_had_a_stroke = int(had_stroke_mask.sum())
number_of_people_who_had_a_stroke

638

In [16]:
# Recompute the stroke percentage against the reduced row count that remains
# after dropping NaN values in the smoking_status column.
percentage_of_people_who_had_a_stroke = (
    number_of_people_who_had_a_stroke / stroke_data.shape[0]
) * 100
percentage_of_people_who_had_a_stroke

2.1190381294008236

In [38]:
# Split the cleaned data into the label (stroke) and the feature columns.
y = stroke_data["stroke"]
X = stroke_data.drop(columns=["stroke"])

In [41]:
# One-hot encode the listed categorical feature columns.
# The features are rebuilt from stroke_data instead of from X so that this cell
# can be re-run safely: calling get_dummies on an already-encoded X raises a
# KeyError because the listed columns no longer exist in it.
# NOTE(review): gender and ever_married are not in the list and stay as string
# columns; confirm whether they should be encoded before modelling.
X = pd.get_dummies(
    stroke_data.drop(columns=["stroke"]),
    columns=[
        "hypertension",
        "heart_disease",
        "work_type",
        "Residence_type",
        "smoking_status",
    ],
)
X.head()

Unnamed: 0,gender,age,ever_married,avg_glucose_level,bmi,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
1,Male,58.0,Yes,87.96,39.2,0,1,1,0,0,0,1,0,0,0,1,0,1,0
3,Female,70.0,Yes,69.04,35.9,1,0,1,0,0,0,1,0,0,1,0,1,0,0
6,Female,52.0,Yes,77.59,17.7,1,0,1,0,0,0,1,0,0,0,1,1,0,0
7,Female,75.0,Yes,243.53,27.0,1,0,0,1,0,0,0,1,0,1,0,0,1,0
8,Female,32.0,Yes,77.67,32.3,1,0,1,0,0,0,1,0,0,1,0,0,0,1


## 2. Data Exploration