### Pre-Processing And Training Data Development
Creating dummy variables for the categorical features,
splitting the data into training and testing datasets.

In [18]:
#import the required libraries
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model, preprocessing
from scipy.stats import spearmanr

#### Load Data

In [19]:
# Load stroke_data_explored.csv into the DataFrame stroke_data,
# then preview its first five rows
csv_path = '../data/stroke_data_explored.csv'
stroke_data = pd.read_csv(csv_path)
stroke_data.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [20]:
# Number of (rows, columns) in the loaded DataFrame
stroke_data.shape

(5109, 11)

In [21]:
# Print each column's dtype and non-null count, to see which columns are
# categorical (object) and whether any values are missing
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5109 non-null   object 
 1   age                5109 non-null   float64
 2   hypertension       5109 non-null   int64  
 3   heart_disease      5109 non-null   int64  
 4   ever_married       5109 non-null   object 
 5   work_type          5109 non-null   object 
 6   Residence_type     5109 non-null   object 
 7   avg_glucose_level  5109 non-null   float64
 8   bmi                5109 non-null   float64
 9   smoking_status     5109 non-null   object 
 10  stroke             5109 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.2+ KB


In [22]:
# Gather every distinct value that occurs anywhere in the object (string) columns
object_columns = stroke_data.select_dtypes(include='object')
flattened_values = object_columns.values.ravel('K')
pd.unique(flattened_values)

array(['Male', 'Female', 'Yes', 'No', 'Private', 'Self-employed',
       'Govt_job', 'children', 'Never_worked', 'Urban', 'Rural',
       'formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

#### Observation:
##### gender -> 'Male', 'Female'
##### ever_married -> 'Yes', 'No'
##### work_type -> 'Private', 'Self-employed','Govt_job', 'children', 'Never_worked'
##### Residence_type -> 'Urban', 'Rural'
##### smoking_status ->  'formerly smoked', 'never smoked', 'smokes', 'Unknown'

### Creating Dummy Features For Categorical Variables

In [23]:
# Convert categorical data into dummy (one-hot) features.
# pd.get_dummies is called with its defaults, so every level of each category
# gets its own indicator column (e.g. both gender_Female and gender_Male).
# NOTE(review): each such group of indicator columns is perfectly collinear;
# consider drop_first=True if a linear model is fitted later — confirm against
# the modelling step.
stroke_data = pd.get_dummies(stroke_data)
stroke_data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
1,61.0,0,0,202.21,28.9,1,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0


In [24]:
# Reorder the DataFrame columns so the target 'stroke' comes last;
# the relative order of the feature columns is unchanged
reorder = [column for column in stroke_data.columns if column != 'stroke'] + ['stroke']
stroke_data = stroke_data[reorder]
stroke_data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,...,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,67.0,0,1,228.69,36.6,0,1,0,1,0,...,1,0,0,0,1,0,1,0,0,1
1,61.0,0,0,202.21,28.9,1,0,0,1,0,...,0,1,0,1,0,0,0,1,0,1
2,80.0,0,1,105.92,32.5,0,1,0,1,0,...,1,0,0,1,0,0,0,1,0,1
3,49.0,0,0,171.23,34.4,1,0,0,1,0,...,1,0,0,0,1,0,0,0,1,1
4,79.0,1,0,174.12,24.0,1,0,0,1,0,...,0,1,0,1,0,0,0,1,0,1


### Handling Class Imbalance

In [25]:
# Checking for class imbalance: count the rows where stroke == 0
(stroke_data['stroke'] == 0).sum()

4860

Out of a total of 5109 rows, 4860 rows correspond to stroke=0 and 249 rows correspond to stroke=1, so the classes are heavily imbalanced. If a model is fit on such imbalanced data, it will be biased toward the majority class and will most likely predict too few positive (stroke=1) cases.

In [26]:
# Imbalance handling: upsample the minority class (stroke == 1) with replacement
from sklearn.utils import resample

without_stroke = stroke_data[stroke_data.stroke == 0]
with_stroke = stroke_data[stroke_data.stroke == 1]

# Draw as many stroke rows as there are non-stroke rows so both classes end up
# the same size. The count is derived from the data instead of being hard-coded
# (it was 4860), so the classes stay balanced if the input file changes.
# NOTE(review): this runs before the train/test split, so copies of the same
# stroke row can land in both the training and the test set; consider splitting
# first and upsampling only the training portion.
with_stroke_upsampled = resample(with_stroke,
                                 replace=True,
                                 n_samples=len(without_stroke),
                                 random_state=0)

stroke_data_upsampled = pd.concat([without_stroke, with_stroke_upsampled])

# Confirm that both classes now have the same number of rows
stroke_data_upsampled.stroke.value_counts()

1    4860
0    4860
Name: stroke, dtype: int64

In [27]:
stroke_data_upsampled.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,...,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
249,3.0,0,0,95.12,18.0,0,1,1,0,0,...,0,0,1,1,0,1,0,0,0,0
250,58.0,1,0,87.96,39.2,0,1,0,1,0,...,1,0,0,0,1,0,0,1,0,0
251,8.0,0,0,110.89,17.6,1,0,1,0,0,...,1,0,0,0,1,1,0,0,0,0
252,70.0,0,0,69.04,35.9,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,0
253,14.0,0,0,161.28,19.1,0,1,1,0,0,...,0,0,0,1,0,1,0,0,0,0


### Train/Test Split

In [28]:
# Separate the independent features (X) from the dependent target (y)
X = stroke_data_upsampled.drop(columns=['stroke'])
y = stroke_data_upsampled['stroke']

In [34]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs
import sklearn.model_selection

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Data Scaling
Scaling the features so that they are all on a consistent scale.

In [36]:
# Standardise the features with StandardScaler.
# The scaler is fitted on the training set only and then applied to both sets,
# so the test set does not influence the scaling parameters.
# (MinMaxScaler is imported but not used in this cell; the import is kept as is.)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Fix: transform the test set itself. X_train was passed here before, which made
# X_test_scaled a second copy of the scaled training data.
X_test_scaled = scaler.transform(X_test)

# Sanity check: each scaled array keeps the row count of its source
assert X_train_scaled.shape[0] == X_train.shape[0]
assert X_test_scaled.shape[0] == X_test.shape[0]

### Conclusion


After creating the dummy features for the categorical variables and upsampling the minority class to balance the target, the data is split into training and test sets in the ratio 80/20. The training and test data are then scaled so that all feature values are on the same scale.