# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load and Explore Data

In [2]:
# Load the stroke dataset; index_col=0 uses the first CSV column (the `id` column) as the row index.
df = pd.read_csv('Data/stroke.csv', index_col=0)
df

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5110 entries, 9046 to 44679
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 479.1+ KB


## Summary

* What I need to do to prepare the data for machine learning:

  -- Impute the missing values in the bmi column with SimpleImputer(strategy='mean').

  -- One-hot encode (OneHotEncoder) the categorical columns: gender, ever_married, work_type, Residence_type, smoking_status.
  
  -- Standardize (StandardScaler) the numeric columns: age, avg_glucose_level, bmi.
  


# Validation Split

In [4]:
## Separate the label column (y) from the feature columns (X)
target = 'stroke'
y = df[target].copy()
X = df.drop(columns=target).copy()

## Hold out a test set using the default split proportions; the seed is fixed for reproducibility.
## NOTE(review): if the stroke classes are imbalanced, consider passing stratify=y — confirm first.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
35217,Female,60.00,1,0,Yes,Private,Urban,234.50,43.7,never smoked
3531,Male,41.00,0,0,Yes,Private,Rural,83.97,28.5,formerly smoked
58761,Male,52.00,0,0,Yes,Private,Urban,87.51,30.5,formerly smoked
66110,Female,55.00,0,0,Yes,Private,Rural,63.47,27.8,Unknown
29158,Female,55.00,0,0,Yes,Private,Rural,111.19,39.7,formerly smoked
...,...,...,...,...,...,...,...,...,...,...
13846,Male,43.00,0,0,Yes,Govt_job,Rural,88.00,30.6,never smoked
1307,Female,61.00,1,0,Yes,Private,Rural,170.05,60.2,smokes
31481,Female,1.16,0,0,No,children,Urban,97.28,17.8,Unknown
61827,Male,80.00,0,0,Yes,Self-employed,Rural,196.08,31.0,formerly smoked


# Get the ColumnTransformer

In [5]:
## Instantiate the column selectors. Select only the columns that need to be transformed.
cat_selector = make_column_selector(dtype_include='object')
scal_cols = ['age', 'avg_glucose_level', 'bmi']



In [6]:
# Instantiate the transformers
scaler = StandardScaler()  # standardizes the numeric columns listed in scal_cols
mean_imputer = SimpleImputer(strategy='mean')  # fills the missing values in the bmi column with the column mean
# One-hot encoder for the categorical columns; handle_unknown='ignore' encodes
# categories not seen during fit as all zeros instead of raising.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed `sparse`,
# so try the new keyword first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [7]:
## Create a pipeline that chains mean_imputer and scaler, for the columns that need missing-value imputation followed by standard scaling.
num_pipe = make_pipeline(mean_imputer, scaler)

In [8]:
## Pair each transformer with the columns it applies to, as (transformer, columns) tuples.
num_tuple = (num_pipe, scal_cols)
cat_tuple = (ohe, cat_selector)

In [9]:
## Finally, build the ColumnTransformer from both tuples.
## remainder='passthrough' keeps the columns not matched by either tuple instead of dropping them.
preprocessor = make_column_transformer(cat_tuple, num_tuple, remainder='passthrough')
preprocessor

## Check the Transformed Result

In [10]:
## Fit the preprocessor on the training split only, then apply the fitted transformer to both splits.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_train_transformed.shape

(3832, 20)

In [11]:
# Convert the transformed training array to a DataFrame for inspection (columns get default integer labels).
X_train_df = pd.DataFrame(X_train_transformed)
X_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.737217,2.823301,1.926307,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.106549,-0.489814,-0.05082,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.381947,-0.411899,0.209329,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.515173,-0.941012,-0.141871,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.515173,0.109289,1.406011,0.0,0.0


# Machine Learning: Finding the Best Model.