In [45]:
#Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle 

In [4]:
data_with_adult2= pd.read_csv('data_with_adult2.csv')
data_with_adult2

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5629,Male,79.0,0,0,Yes,Private,Rural,114.77,27.2,formerly smoked,1
5630,Male,74.0,0,0,Yes,Private,Urban,167.13,28.7,Unknown,1
5631,Female,76.0,1,1,Yes,Self-employed,Urban,199.86,31.7,smokes,1
5632,Male,74.0,0,0,Yes,Self-employed,Rural,60.98,28.1,never smoked,1


Using ColumnTransformer to create a pipeline:

In [8]:
#determin the x and y value
X = data_with_adult2.drop(columns='stroke')
y = data_with_adult2['stroke']

In [9]:
# Using stratify kFold
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 20, stratify = y)

In [13]:
#Linear regression model
numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')), 
                              ('scaling', StandardScaler())])
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')), 
                                  ('one-hot-encode', OneHotEncoder(sparse=False))])

# (name, transformer, list of column names)
preprocessing = ColumnTransformer([('numeric', numeric_transform, ['age', 'avg_glucose_level', 'hypertension', 'heart_disease', 'bmi']), 
                                        ('categorical', categorical_transform, ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])])

pipeline = Pipeline([('preprocessing', preprocessing), 
                          ('model', LinearRegression())])
pipeline.fit(X_train, y_train)

r2 = pipeline.score(X_test, y_test)
print(f'Test set r^2: {r2}')

Test set r^2: 0.2646703392311074




In [14]:
from sklearn import set_config
set_config(display='diagram')

pipeline

In [22]:
#Decision tree model
numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')), 
                              ('scaling', StandardScaler())])
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')), 
                                  ('one-hot-encode', OneHotEncoder(sparse=False))])

# (name, transformer, list of column names)
preprocessing = ColumnTransformer([('numeric', numeric_transform, ['age', 'avg_glucose_level', 'hypertension', 'heart_disease', 'bmi']), 
                                        ('categorical', categorical_transform, ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])])

pipeline = Pipeline([('preprocessing', preprocessing), 
                          ('model', DecisionTreeClassifier())])
pipeline.fit(X_train, y_train)

r2 = pipeline.score(X_test, y_test)
y_pred = pipeline.predict(X_test)
print(f'Test set r^2: {r2}')
print('Training set score: {:.2f}'.format(pipeline.score(X_train, y_train)))
print('Test set score: {:.2f}'.format(pipeline.score(X_test, y_test)))
print ("DecisionTreeClassifier F1 score: {: .2f}".format(f1_score(y_pred, y_test)))
print('Model precision score with DecisionTreeClassifier index: {0:0.2f}'. format(precision_score(y_pred, y_test)))

Test set r^2: 0.9467770549970431
Training set score: 1.00
Test set score: 0.95
DecisionTreeClassifier F1 score:  0.92
Model precision score with DecisionTreeClassifier index: 1.00




In [16]:
from sklearn import set_config
set_config(display='diagram')

pipeline

In [29]:
# Random forest model
numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')), 
                              ('scaling', StandardScaler())])
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')), 
                                  ('one-hot-encode', OneHotEncoder(sparse=False))])

# (name, transformer, list of column names)
preprocessing = ColumnTransformer([('numeric', numeric_transform, ['age', 'avg_glucose_level', 'hypertension', 'heart_disease', 'bmi']), 
                                        ('categorical', categorical_transform, ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])])

pipeline = Pipeline([('preprocessing', preprocessing), 
                          ('model', RandomForestClassifier())])
pipeline.fit(X_train, y_train)

r2 = pipeline.score(X_test, y_test)
y_pred = pipeline.predict(X_test)
print(f'Test set r^2: {r2}')
print('Training set score: {:.2f}'.format(pipeline.score(X_train, y_train)))
print('Test set score: {:.2f}'.format(pipeline.score(X_test, y_test)))
print ("RandomForestClassifier F1 score: {: .2f}".format(f1_score(y_pred, y_test)))
print('Model precision score with RandomForestClassifier index: {0:0.2f}'. format(precision_score(y_pred, y_test)))



Test set r^2: 0.984033116499113
Training set score: 1.00
Test set score: 0.98
RandomForestClassifier F1 score:  0.97
Model precision score with RandomForestClassifier index: 1.00


In [30]:
# Random forest model
set_config(display='diagram')

pipeline

In [32]:
# BaggingClassifier
pipeline = Pipeline([('preprocessing', preprocessing), 
                          ('model', BaggingClassifier())])
pipeline.fit(X_train, y_train)
r2 = pipeline.score(X_test, y_test)
y_pred = pipeline.predict(X_test)
print(f'Test set r^2: {r2}')
print('Training set score: {:.2f}'.format(pipeline.score(X_train, y_train)))
print('Test set score: {:.2f}'.format(pipeline.score(X_test, y_test)))
print ("BaggingClassifier F1 score: {: .2f}".format(f1_score(y_pred, y_test)))
print('Model precision score with BaggingClassifier index: {0:0.2f}'. format(precision_score(y_pred, y_test)))



Test set r^2: 0.97397989355411
Training set score: 1.00
Test set score: 0.97
BaggingClassifier F1 score:  0.96
Model precision score with BaggingClassifier index: 1.00


In [33]:
set_config(display='diagram')

pipeline

In [35]:
pipeline = Pipeline([('preprocessing', preprocessing), 
                          ('model', GaussianNB())])
pipeline.fit(X_train, y_train)

r2 = pipeline.score(X_test, y_test)
y_pred = pipeline.predict(X_test)
print(f'Test set r^2: {r2}')
print('Training set score: {:.2f}'.format(pipeline.score(X_train, y_train)))
print('Test set score: {:.2f}'.format(pipeline.score(X_test, y_test)))
print ("GaussianNB F1 score: {: .2f}".format(f1_score(y_pred, y_test)))
print('Model precision score with GaussianNB index: {0:0.2f}'. format(precision_score(y_pred, y_test)))

Test set r^2: 0.7321111768184506
Training set score: 0.73
Test set score: 0.73
GaussianNB F1 score:  0.57
Model precision score with GaussianNB index: 0.58




In [36]:
set_config(display='diagram')

pipeline

Grid search and feature union:

In [37]:
data_with_adult_2 = data_with_adult2

In [38]:
lableen=LabelEncoder()
data_with_adult_2["gender"]=lableen.fit_transform(data_with_adult_2["gender"])
data_with_adult_2["ever_married"]=lableen.fit_transform(data_with_adult_2["ever_married"])
data_with_adult_2["work_type"]=lableen.fit_transform(data_with_adult_2["work_type"])
data_with_adult_2["Residence_type"]=lableen.fit_transform(data_with_adult_2["Residence_type"])
data_with_adult_2["smoking_status"]=lableen.fit_transform(data_with_adult_2["smoking_status"])
data_with_adult_2

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,1,1,228.69,36.6,1,1
1,1,80.0,0,1,1,1,0,105.92,32.5,2,1
2,0,49.0,0,0,1,1,1,171.23,34.4,3,1
3,0,79.0,1,0,1,2,0,174.12,24.0,2,1
4,1,81.0,0,0,1,1,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5629,1,79.0,0,0,1,1,0,114.77,27.2,1,1
5630,1,74.0,0,0,1,1,1,167.13,28.7,0,1
5631,0,76.0,1,1,1,2,1,199.86,31.7,3,1
5632,1,74.0,0,0,1,2,0,60.98,28.1,2,1


In [39]:
#assigning the X and Y
X = data_with_adult_2.drop(columns='stroke')
y = data_with_adult_2['stroke']
# Using stratify kFold
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 20, stratify = y)

In [43]:
pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                           ('features', feature_union),
                           ('classifier', DecisionTreeClassifier())])

# Find the best hyperparameters and model using GridSearchCV on the train set
param_grid = {'classifier': [GaussianNB(), SVC(), BaggingClassifier(), RandomForestClassifier(), 
                             DecisionTreeClassifier()], 
              'features__pca__n_components': [3, 5],
              'features__select_best__k': [1, 3, 6]}
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
y_pred = grid.predict(X_test)
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')

Best test set accuracy: 0.9822590183323477
Achieved with hyperparameters: {'classifier': RandomForestClassifier(), 'features__pca__n_components': 5, 'features__select_best__k': 6}


The Final model for this project using feature union:

In [47]:
feature_union = FeatureUnion([('pca', PCA(5)), 
                              ('select_best', SelectKBest(k=6))])

In [48]:
pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                           ('features', feature_union),
                           ('classifier', RandomForestClassifier())])
pipeline.fit(X_train, y_train)

In [49]:
y_pred = pipeline.predict(X_test)

In [50]:
print('Training set score: {:.2f}'.format(pipeline.score(X_train, y_train)))
print('Test set score: {:.2f}'.format(pipeline.score(X_test, y_test)))
print('Model F1 score with RandomForestClassifier: {0:0.2f}'. format(f1_score(y_test, y_pred)))
print('Model accuracy score with RandomForestClassifier: {0:0.2f}'. format(accuracy_score(y_test, y_pred)))
print('Model precision score with RandomForestClassifier: {0:0.2f}'.format(precision_score(y_test, y_pred, average='macro')))
print('Model roc_auc_score with RandomForestClassifier: {0:0.2f}'. format(roc_auc_score(y_test, y_pred)))

Training set score: 1.00
Test set score: 0.98
Model F1 score with RandomForestClassifier: 0.97
Model accuracy score with RandomForestClassifier: 0.98
Model precision score with RandomForestClassifier: 0.97
Model roc_auc_score with RandomForestClassifier: 0.99


In [53]:
# Save the model
with open('pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)