## Import libraries & Load Dataset

In [4]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from model_utils import save_model, load_model

In [6]:
# load dataset
df = pd.read_csv("./Dataset/processed_train.csv") 
x = df.copy()
x = x.drop(columns='stroke')
y = df['stroke']
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size= 0.2, random_state= 42)
print(len(x_train))
print(len(y_train))
print(len(x_test))
print(len(y_test))

6052
6052
1514
1514


## Load Model 
XGB, XGB with GridSearch, Random Forest

In [7]:
xgb = load_model('./models/xgb')
random_forest = load_model('./models/random_forest')
xgb_gs = load_model('./models/xgb_gridsearch')

## Ensemble Learning 

In [9]:
# Hard voting 
eclf1 = VotingClassifier(estimators=[('xgb', xgb),
                                    ('xgb_gridsearch', xgb_gs), 
                            ('random_forest', random_forest)], 
                                    voting='hard')
eclf1 = eclf1.fit(x_train, y_train)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [10]:
y_pred1 = eclf1.predict(x_test)
f1_score(y_test, y_pred1)

0.988567585743107

In [12]:
# Soft voting 
eclf2 = VotingClassifier(estimators=[('xgb', xgb),
                                    ('xgb_gridsearch', xgb_gs), 
                            ('random_forest', random_forest)], 
                                    voting='soft')
eclf2 = eclf1.fit(x_test, y_test)
y_pred2 = eclf2.predict(x_test)
f1_score(y_test, y_pred2)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




1.0

## Generate submission

In [11]:
# from . import generate_submission
from generate_submission import generate_submission
test_df = pd.read_csv("./Dataset/processed_test.csv")
generate_submission(eclf2, test_df, 'ensemble_soft')