In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder #for converting to numerical
from imblearn.over_sampling import SMOTE # for balancing
from sklearn.preprocessing import MinMaxScaler #for normalization

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score # Cross-validation for assess the ML algorithms

from sklearn.model_selection import GridSearchCV # for hyperparameter tuning 

import time

In [3]:
df = pd.read_csv("E:\Modules\Teaching Block 2\Big Data Applications-Hamidreza\Assessment\Mini Project\healthcare-dataset-stroke-data.csv")

In [4]:
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
df.tail(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [6]:
# the data set has no duplicated data
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [7]:
# Only the bmi field in the data set has 201 null values
null_values = df.isnull().sum()
null_values

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [8]:
# Fill the BMI field for Females
df_female = df[df["gender"] == "Female"]
df_female["bmi"].fillna(df_female["bmi"].mean(),inplace=True)  # df_female["bmi"].mean() =  29.065757680358995

df_female.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_female["bmi"].fillna(df_female["bmi"].mean(),inplace=True)  # df_female["bmi"].mean() =  29.065757680358995


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,29.065758,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
# Fill the BMI field for Males
df_male = df[df["gender"] == "Male"]
df_male["bmi"].fillna(df_male["bmi"].mean(),inplace=True)
df_male.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_male["bmi"].fillna(df_male["bmi"].mean(),inplace=True)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [10]:
df_2 = pd.concat([df_male,df_female])

# checking for null values once more:
null_values = df_2.isnull().sum()
null_values

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [11]:
# imbalanced data set 

num_stroke_1= df_2['stroke'].value_counts()[1]
print("the number of records with stroke value 1 out of 5110 records is: ", num_stroke_1)

the number of records with stroke value 1 out of 5110 records is:  249


In [12]:
#  Using OneHotEncoder, convert categorical fields to numerical fields in preparation for oversampling.
cat_cols = ['gender', 'ever_married', 'work_type','Residence_type','smoking_status']
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df_2[cat_cols]).toarray()
df_2_encoded = pd.DataFrame(encoded, columns=encoder.get_feature_names(cat_cols))
df_3 = pd.concat([df_2, df_2_encoded], axis=1)
df_3.drop(cat_cols, axis=1, inplace=True)
df_3.dropna(inplace=True)  #to remove any records with null values
df_3.head()
# After this step,
# we have 22 columns instead of 12.For example, in the gender column, we have gender_Female and gender_Male.



Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046.0,67.0,0.0,1.0,228.69,36.6,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,31112.0,80.0,0.0,1.0,105.92,32.5,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,56669.0,81.0,0.0,0.0,186.21,29.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6,53882.0,74.0,1.0,1.0,70.09,27.4,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
13,8213.0,78.0,0.0,1.0,219.84,28.647936,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [13]:
df_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5108 entries, 0 to 5107
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              5108 non-null   float64
 1   age                             5108 non-null   float64
 2   hypertension                    5108 non-null   float64
 3   heart_disease                   5108 non-null   float64
 4   avg_glucose_level               5108 non-null   float64
 5   bmi                             5108 non-null   float64
 6   stroke                          5108 non-null   float64
 7   gender_Female                   5108 non-null   float64
 8   gender_Male                     5108 non-null   float64
 9   ever_married_No                 5108 non-null   float64
 10  ever_married_Yes                5108 non-null   float64
 11  work_type_Govt_job              5108 non-null   float64
 12  work_type_Never_worked          51

In [14]:
# Random Forest Classification before oversampling
X_rf = df_3.drop('stroke', axis=1)  #all columns except stroke
y_rf = df_3['stroke']
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size=0.3, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_rf_train, y_rf_train)
y_rf_pred = rf.predict(X_rf_test)

cm_rf_b = confusion_matrix(y_rf_test, y_rf_pred)
accuracy_rf_b= accuracy_score(y_rf_test, y_rf_pred)

print("Confusion Matrix of Random Forest:\n", cm_rf_b)
print("\nAccuracy of Random Forest:", round(accuracy_rf_b*100,2))


Confusion Matrix of Random Forest:
 [[1444    8]
 [  71   10]]

Accuracy of Random Forest: 94.85


In [15]:
# implementing oversampling
df_3= df_3.drop('id',axis=1) # I don't want this field to have any effect on oversampling.
features = df_3.drop('stroke', axis=1)
target = df_3['stroke']
smote = SMOTE(sampling_strategy='minority', random_state=42, k_neighbors=3)
oversampled_features, oversampled_target = smote.fit_resample(features, target)
df_4 = pd.concat([pd.DataFrame(oversampled_features), pd.DataFrame(oversampled_target)], axis=1)
df_4.columns = features.columns.tolist() + ['stroke'] # name of the columns
# df_4 is our oversampled data set

In [16]:
# balanced data set 

number_stroke_1= df_4['stroke'].value_counts()[1]
print("the number of records with stroke value 1 out of 9718 records is: ", number_stroke_1)
# We now have the same number of records in each class.

the number of records with stroke value 1 out of 9718 records is:  4859


In [17]:
# df_4.to_csv("balanced_stroke.csv")

In [18]:
# normalization
scaler = MinMaxScaler()
numerical_features = ['age', 'avg_glucose_level', 'bmi']
scaled_features = scaler.fit_transform(df_4[numerical_features])
df_4[numerical_features] = scaled_features

In [19]:
df_4.head(5)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,...,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.816895,0.0,1.0,0.801265,0.30126,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.975586,0.0,1.0,0.234512,0.254296,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.987793,0.0,0.0,0.605161,0.214204,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,0.902344,1.0,1.0,0.069107,0.195876,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,0.951172,0.0,1.0,0.76041,0.210171,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [20]:
# Random Forest 
X_rf = df_4.drop('stroke', axis=1)  #all columns except stroke
y_rf = df_4['stroke']
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size=0.3, random_state=42)
rf = RandomForestClassifier(max_depth= None, min_samples_leaf=2, min_samples_split= 10, n_estimators= 200, random_state=42)

rf.fit(X_rf_train, y_rf_train)
y_rf_pred = rf.predict(X_rf_test)

cm_rf = confusion_matrix(y_rf_test, y_rf_pred)
accuracy_rf = accuracy_score(y_rf_test, y_rf_pred)


print("Confusion Matrix of Random Forest :\n", cm_rf)
print("\nAccuracy of Random Forest:", round(accuracy_rf*100,2))

Confusion Matrix of Random Forest :
 [[1435   17]
 [  66 1398]]

Accuracy of Random Forest: 97.15


In [21]:
# Cross-validation to assess the performance of Random Forest algorthim
cv_scores_rf = cross_val_score(rf, X_rf, y_rf, cv=10)
print("Mean CV Accuracy:", round(cv_scores_rf.mean()*100,2))
print("Std CV Accuracy:", round(cv_scores_rf.std(),2))

Mean CV Accuracy: 97.13
Std CV Accuracy: 0.06


In [22]:
#  XGradient Boosting 
X_gb = df_4.drop('stroke', axis=1)  #all columns except stroke
y_gb = df_4['stroke']
X_gb_train, X_gb_test, y_gb_train, y_gb_test = train_test_split(X_gb, y_gb, test_size=0.3, random_state=42)


gb_clf = xgb.XGBClassifier(learning_rate= 0.1, max_depth= 6, n_estimators= 100)
gb_clf.fit(X_gb_train, y_gb_train)


y_gb_pred = gb_clf.predict(X_gb_test)

cm_gb = confusion_matrix(y_gb_test, y_gb_pred)
accuracy_gb = accuracy_score(y_gb_test, y_gb_pred)

print("Confusion Matrix of XGBoost :\n", cm_gb)
print("\nAccuracy of XGBoost:", round(accuracy_gb*100,2))

Confusion Matrix of XGBoost :
 [[1424   28]
 [  56 1408]]

Accuracy of XGBoost: 97.12


In [23]:
# Cross-validation to assess the performance of XGBoost algorthim
cv_scores_gb = cross_val_score(gb_clf, X_gb, y_gb, cv=10)
print("Mean CV Accuracy:", round(cv_scores_gb.mean()*100,2))
print("Std CV Accuracy:", round(cv_scores_gb.std(),2))

Mean CV Accuracy: 96.77
Std CV Accuracy: 0.07


In [24]:
# SVM (Support Vector Machines)
X_svm = df_4.drop('stroke', axis=1)  #all columns except stroke
y_svm = df_4['stroke']
X_svm_train, X_svm_test, y_svm_train, y_svm_test = train_test_split(X_svm, y_svm, test_size=0.3, random_state=42)

svm = SVC(C= 100, kernel='rbf')
svm.fit(X_svm_train, y_svm_train)

y_svm_pred = svm.predict(X_svm_test)

cm_svm = confusion_matrix(y_svm_test, y_svm_pred)
accuracy_svm = accuracy_score(y_svm_test, y_svm_pred)

print("Confusion Matrix of SVM:\n", cm_svm)
print("\nAccuracy of SVM:", round(accuracy_svm*100,2))

Confusion Matrix of SVM:
 [[1401   51]
 [  71 1393]]

Accuracy of SVM: 95.82


In [25]:
# Cross-validation to assess the performance of SVM algorthim
cv_scores_svm = cross_val_score(svm, X_gb, y_gb, cv=10)
print("Mean CV Accuracy:", round(cv_scores_svm.mean()*100,2))
print("Std CV Accuracy:", round(cv_scores_svm.std(),2))

Mean CV Accuracy: 95.64
Std CV Accuracy: 0.06


In [30]:
# Multilayer Perceptron (MLP)
X_MLP = df_4.drop('stroke', axis=1)  #all columns except stroke
y_MLP = df_4['stroke']
X_MLP_train, X_MLP_test, y_MLP_train, y_MLP_test = train_test_split(X_MLP, y_MLP, test_size=0.3, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), activation='relu', solver='lbfgs', random_state=42)

mlp.fit(X_MLP_train, y_MLP_train)
y_MLP_pred = mlp.predict(X_MLP_test)

cm_MLP = confusion_matrix(y_MLP_test, y_MLP_pred)
accuracy_MLP = accuracy_score(y_MLP_test, y_MLP_pred)


print("Confusion Matrix of Multilayer Perceptron :\n", cm_MLP)
print("\nAccuracy of Multilayer Perceptron:", round(accuracy_MLP*100,2))

Confusion Matrix of Multilayer Perceptron :
 [[1400   52]
 [  66 1398]]

Accuracy of Multilayer Perceptron: 95.95


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [27]:
# Cross-validation to assess the performance of Multilayer Perceptron (MLP) algorthim
cv_scores_MLP = cross_val_score(mlp, X_MLP, y_MLP, cv=10)
print("Mean CV Accuracy:", round(cv_scores_MLP.mean()*100,2))
print("Std CV Accuracy:", round(cv_scores_MLP.std(),2))

Mean CV Accuracy: 95.85
Std CV Accuracy: 0.06


In [53]:
#Grid search for random forest:
start_time = time.time()

param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_rf_train, y_rf_train)

print(grid_search.best_params_)
print("\nAccuracy for Random Forest:",round(grid_search.best_score_*100,2))

end_time =time.time()
elapsed_time = end_time - start_time
minutes, seconds = divmod(elapsed_time, 60)
print(f"'\nElapsed time for Random Forest is : {minutes} minutes and {round(seconds,0)} seconds'")

{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}

Accuracy for Random Forest: 97.81
'
Elapsed time for Random Forest is : 5.0 minutes and 45.0 seconds'


In [54]:
#Grid search for XGBoost:
start_time_2 = time.time()

param_grid_2 = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5, 6]
#    ,'min_samples_split': [2, 3, 4],
#     'min_samples_leaf': [1, 2, 3]
}

grid_search_2 = GridSearchCV(estimator=gb_clf, param_grid=param_grid_2, cv=5, scoring='accuracy')
grid_search_2.fit(X_gb_train, y_gb_train)

print(grid_search_2.best_params_)
print("\nAccuracy for XGBoost:",round(grid_search_2.best_score_*100,2))

end_time_2 =time.time()
elapsed_time_2 = end_time_2 - start_time_2
minutes_2, seconds_2 = divmod(elapsed_time_2, 60)
print(f"'\nElapsed time for XGBoost is: {minutes_2} minutes and {round(seconds_2,0)} seconds'")

{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}

Accuracy for XGBoost: 97.59
'
Elapsed time for XGBoost is: 1.0 minutes and 24.0 seconds'


In [55]:
#Grid search for SVM:
start_time_3 = time.time()

param_grid_3 = {
    'C': [0.1, 1, 10, 100],
#     'gamma': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly']
}

grid_search_3= GridSearchCV(estimator=svm, param_grid=param_grid_3, cv=5, scoring='accuracy')
grid_search_3.fit(X_svm_train, y_svm_train)

print(grid_search_3.best_params_)
print("\nAccuracy for SVM:",round(grid_search_3.best_score_*100,2))

end_time_3 =time.time()
elapsed_time_3 = end_time_3 - start_time_3
minutes_3, seconds_3 = divmod(elapsed_time_3, 60)
print(f"'\nElapsed time for SVM is: {minutes_3} minutes and {round(seconds_3,0)} seconds'")

{'C': 100, 'kernel': 'rbf'}

Accuracy for SVM: 96.46
'
Elapsed time for SVM is: 0.0 minutes and 27.0 seconds'


In [29]:
#Grid search for MLP:
start_time_4 = time.time()

param_grid_4 = {
    'hidden_layer_sizes': [(50,50), (100,100)],
    'activation': ['relu', 'logistic'],
    'solver': ['adam', 'lbfgs']
#     ,'alpha': [0.0001, 0.001, 0.01],
}

grid_search_4= GridSearchCV(estimator=mlp, param_grid=param_grid_4, cv=5, scoring='accuracy')
grid_search_4.fit(X_MLP_train, y_MLP_train)

print(grid_search_4.best_params_)
print("\nAccuracy for MLP:",round(grid_search_4.best_score_*100,2))

end_time_4 =time.time()
elapsed_time_4 = end_time_4 - start_time_4
minutes_4, seconds_4 = divmod(elapsed_time_4, 60)
print(f"'\nElapsed time for MLP is: {minutes_4} minutes and {round(seconds_4,0)} seconds'")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

{'activation': 'relu', 'hidden_layer_sizes': (100, 100), 'solver': 'lbfgs'}

Accuracy for MLP: 96.47
'
Elapsed time for MLP is: 7.0 minutes and 35.0 seconds'


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
