# 💰 Multi-Class Classification-Accuracy-Poverty Level
This notebook demonstrates commonly used Machine Learning algorithms on a multi-class classification task. Feature generation and selection are kept deliberately simple, because the objective of this notebook is to serve as a cheat sheet.

Ten Machine Learning algorithms are trained, with accuracy as the scoring metric. Each algorithm is tuned with a hyperparameter search to find the configuration that gives the best evaluation result. The hyperparameter-tuning methods are GridSearchCV and Bayesian Optimization (using the bayes_opt or hyperopt packages), both with 5-fold cross-validation.

The optimum hyperparameters are then used to fit each model on the training dataset and to predict the unseen validation dataset. The model is evaluated using accuracy, followed by the confusion matrix and the classification report. Useful attributes of the models, such as the coefficients or feature importances, are also displayed.

In [None]:
# Import packages
# Basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation and bayesian optimization
from math import floor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from hyperopt import hp, fmin, tpe
from bayes_opt import BayesianOptimization

import warnings
# Notebook-wide settings: print every column of wide DataFrames and silence
# all warning categories for the rest of the session.
pd.set_option("display.max_columns", None)
warnings.simplefilter('ignore')

In [None]:
# Make scorer: accuracy
# Wraps accuracy_score into a scorer object; it is the `scoring=` argument of
# the GridSearchCV and cross_val_score calls used for tuning in this notebook.
accuracy = make_scorer(accuracy_score)

In [None]:
# Load dataset: the train, test and sample-submission CSV files
train_csv = '../input/costa-rican-household-poverty-prediction/train.csv'
test_csv = '../input/costa-rican-household-poverty-prediction/test.csv'
submission_csv = '../input/costa-rican-household-poverty-prediction/sample_submission.csv'

trainSet = pd.read_csv(train_csv)
testSet = pd.read_csv(test_csv)
submitSet = pd.read_csv(submission_csv)

# Preview the first rows of the training data
trainSet.head()

The task is to predict which poverty class each household is in. There are 4 classes of poverty level: 1 = extreme poverty, 2 = moderate poverty, 3 = vulnerable households, and 4 = non vulnerable households.

In [None]:
# Missing values: take the 'count' row of describe() (non-null entries per
# described column) and sort ascending so the sparsest columns come first.
desc = trainSet.describe().loc['count'].to_frame().sort_values('count')
print(desc.head(10))

# Data structure: keep only the columns whose dtype is object
obj = pd.DataFrame(trainSet.dtypes == object)
obj = obj[obj[0]]
print(obj)

In [None]:
# Columns removed before modelling (columns with lacking data)
dropped_columns = ['Id','idhogar','rez_esc', 'v18q1', 'v2a1', 'dependency', 'edjefe', 'edjefa']

# Remove those columns, then discard every row that still has a missing value
train = trainSet.drop(columns=dropped_columns).dropna(axis=0)

print(train.shape)
train.head()

In [None]:
# Stratified 80/20 train/validation split over all remaining columns; the
# X_train0 / X_val0 frames feed the feature-selection model.
features0 = train.drop(columns=['Target'])
target0 = train['Target']
X_train0, X_val0, y_train, y_val = train_test_split(
    features0, target0,
    test_size=0.2, random_state=123,
    stratify=target0)

In [None]:
import xgboost as xgb

# Feature selection with XGBoost: fit a classifier with default
# hyperparameters on every feature and report its validation accuracy.
# NOTE(review): recent xgboost releases expect class labels starting at 0 —
# confirm the installed version accepts the 1-4 Target values as they are.
selection = xgb.XGBClassifier(nthread=-1, random_state=123)
selection.fit(X_train0, y_train)
sele_pred = selection.predict(X_val0)
selection_accuracy = accuracy_score(y_val, sele_pred)
print('Accuracy: ' + str(selection_accuracy))

In [None]:
# Classification report and confusion matrix of the selection model
selection_report = classification_report(y_val, sele_pred)
selection_cm = confusion_matrix(y_val, sele_pred)
print(selection_report)
print(selection_cm)

In [None]:
# Feature importances of the selection model, sorted ascending and re-indexed
selection_importance = list(selection.feature_importances_)
Feature_sel = pd.DataFrame({'feature': X_train0.columns,
                            'importance': selection_importance})
Feature_sel = Feature_sel.sort_values('importance').reset_index(drop=True)

# Bar chart with one bar per feature; labels rotated to stay legible
plt.figure(figsize=(16, 4))
sns.barplot(data=Feature_sel, x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Select features with high importance
# NOTE(review): this list is hard-coded rather than computed from Feature_sel;
# presumably it was copied from an earlier run of the importance ranking —
# confirm it is still valid whenever the data or the selection model changes.
selected = ['tipovivi5', 'hogar_mayor', 'abastaguano', 'epared2', 'area1',
       'tipovivi1', 'elimbasu2', 'refrig', 'mobilephone', 'energcocinar2',
       'pisocemento', 'pareddes', 'elimbasu1', 'etecho2', 'lugar4',
       'paredmad', 'paredzinc', 'lugar1', 'tamviv', 'lugar3',
       'television', 'rooms', 'epared1', 'tipovivi3', 'etecho1',
       'SQBedjefe', 'epared3', 'parentesco9', 'bedrooms', 'r4m2',
       'overcrowding', 'sanitario5', 'paredzocalo', 'eviv1', 'paredpreb',
       'etecho3', 'abastaguadentro', 'r4m3', 'techozinc', 'pisomadera',
       'sanitario3', 'eviv2', 'tamhog', 'v14a', 'r4t2', 'public',
       'lugar5', 'elimbasu3', 'r4m1', 'hacdor', 'r4t3', 'energcocinar4',
       'r4h1', 'sanitario2', 'hogar_adul', 'r4h2', 'cielorazo',
       'qmobilephone', 'tipovivi4', 'pisomoscer', 'meaneduc', 'tipovivi2',
       'paredblolad', 'computer', 'r4t1', 'pisonotiene', 'SQBdependency',
       'eviv3', 'hacapo', 'hogar_nin', 'v18q']

In [None]:
# Stratified 80/20 train/validation split restricted to the selected features
target = train['Target']
X_train, X_val, y_train, y_val = train_test_split(
    train[selected], target,
    test_size=0.2, random_state=123,
    stratify=target)

In [None]:
# Scaling: learn the min-max range of each feature from the training split
# only, then apply that same transformation to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_trainS = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_valS = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, hyperopt
# Search space of the logistic regression. C is sampled log-uniformly between
# 0.01 and 1000; the other three entries are categorical choices.
# NOTE(review): newer scikit-learn releases spell the 'none' penalty as None —
# confirm against the installed version.
space_log = {
    'penalty': hp.choice('penalty', ['l2', 'none']),
    'C': hp.loguniform('C', np.log(0.01), np.log(1000)),
    'fit_intercept': hp.choice('fit_intercept',[True, False]),
    'solver':hp.choice('solver', ['newton-cg', 'lbfgs', 'sag', 'saga']),
}

def log_cl_bo(params_log):
    """Hyperopt objective for the logistic regression.

    params_log is one sampled point of the search space. The function
    returns 1 minus the mean 5-fold cross-validated accuracy on the scaled
    training data, so minimising this value maximises the accuracy.
    """
    tuned_keys = ('penalty', 'C', 'fit_intercept', 'solver')
    candidate = {key: params_log[key] for key in tuned_keys}

    model = LogisticRegression(random_state=123, **candidate)
    fold_scores = cross_val_score(model, X_trainS, y_train, scoring=accuracy, cv=5)
    return 1 - fold_scores.mean()

# Minimise the objective with 20 TPE evaluations and a fixed random state.
# NOTE(review): newer hyperopt releases expect a numpy Generator for rstate —
# confirm RandomState is accepted by the installed version.
log_best_param = fmin(
    fn=log_cl_bo,
    space=space_log,
    algo=tpe.suggest,
    max_evals=20,
    rstate=np.random.RandomState(42),
)

In [None]:
# Best hyperparameters
# fmin reports every hp.choice entry as the index of the chosen option.
# space_eval maps those indices back to the real values using the search space
# itself, so the option lists are not repeated by hand. It also returns a new
# dict instead of mutating log_best_param, so this cell can be re-run safely.
from hyperopt import space_eval

params_log = space_eval(space_log, log_best_param)
params_log

In [None]:
# Train the tuned logistic regression on the scaled training split
log_hyp = LogisticRegression(random_state=123, **params_log)
log_hyp.fit(X_trainS, y_train)

# Predict the scaled validation split and report the accuracy
pred_log = log_hyp.predict(X_valS)
acc_log = accuracy_score(y_val, pred_log)
print('Accuracy: ' + str(acc_log))

In [None]:
# Prediction result: confusion matrix labelled with the four classes,
# followed by the per-class classification report
class_labels = [1, 2, 3, 4]
cm_log = pd.DataFrame(confusion_matrix(y_val, pred_log), index=class_labels, columns=class_labels)
report_log = classification_report(y_val, pred_log)
print('confusion_matrix', cm_log, '', 'Classification Report', report_log, sep='\n')

In [None]:
# Feature coefficients, sorted ascending.
# Only the first row of coef_ is plotted (presumably the coefficients of the
# first class — confirm against the scikit-learn version in use).
first_row_coef = list(log_hyp.coef_[0])
Feature_log = pd.DataFrame({'features': list(X_trainS.columns), 'coefficient': first_row_coef})
Feature_log = Feature_log.sort_values('coefficient')

plt.figure(figsize=(16, 4))
sns.barplot(data=Feature_log, x='features', y='coefficient')
plt.xticks(rotation=90)
plt.show()

# 2a. Naive Bayes (Gaussian)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Hyperparameter-tuning: Grid Search over var_smoothing with 5-fold CV
var_smoothing = [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]
param_nb = {'var_smoothing': var_smoothing}

nb_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_nb, scoring=accuracy, cv=5)
nb_grid.fit(X_train, y_train)

best_score_nb = nb_grid.best_score_
print('Best score: ' + str(best_score_nb))
print('Best parameter {}'.format(nb_grid.best_params_))

In [None]:
# Train Gaussian Naive Bayes with the smoothing value found by the grid search
best_smoothing = nb_grid.best_params_['var_smoothing']
nb_hyp = GaussianNB(var_smoothing=best_smoothing)
nb_hyp.fit(X_train, y_train)

# Predict the validation split and report the accuracy
pred_nb = nb_hyp.predict(X_val)
acc_nb = accuracy_score(y_val, pred_nb)
print('Accuracy: ' + str(acc_nb))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_nb = pd.DataFrame(confusion_matrix(y_val, pred_nb), index=class_labels, columns=class_labels)
report_nb = classification_report(y_val, pred_nb)
print('confusion_matrix', cm_nb, '', 'Classification Report', report_nb, sep='\n')

# 2b. Naive Bayes (Bernoulli)

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Hyperparameter-tuning: Grid Search over the smoothing strength (alpha) and
# whether class priors are learned, with 5-fold CV
param_nbBer = {
    'alpha': [0.2,0.4,0.6,0.8,1],
    'fit_prior': [True, False],
}

nbBer_grid = GridSearchCV(estimator=BernoulliNB(), param_grid=param_nbBer, scoring=accuracy, cv=5)
nbBer_grid.fit(X_train, y_train)

best_score_nbBer = nbBer_grid.best_score_
print('Best score: ' + str(best_score_nbBer))
print('Best parameter {}'.format(nbBer_grid.best_params_))

In [None]:
# Fit the training data with the best Bernoulli Naive Bayes hyperparameters
nbBer_hyp = BernoulliNB(alpha=nbBer_grid.best_params_['alpha'], fit_prior=nbBer_grid.best_params_['fit_prior'])
nbBer_hyp.fit(X_train, y_train)

# Predict the validation data
pred_nbBer = nbBer_hyp.predict(X_val)

# Compute the accuracy of the Bernoulli model itself (the previous version
# scored pred_nb, i.e. the Gaussian model's predictions)
print('Accuracy: ' + str(accuracy_score(y_val, pred_nbBer)))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_nbBer = pd.DataFrame(confusion_matrix(y_val, pred_nbBer), index=class_labels, columns=class_labels)
report_nbBer = classification_report(y_val, pred_nbBer)
print('confusion_matrix', cm_nbBer, '', 'Classification Report', report_nbBer, sep='\n')

# 3. K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def knn_cl_bo(n_neighbors, weights, p):
    """Objective for tuning K Nearest Neighbors.

    All three arguments arrive as continuous values: n_neighbors and p are
    rounded to integers, and weights is rounded to an index into
    ('uniform', 'distance'). Returns the mean 5-fold cross-validated accuracy
    on the scaled training data.
    """
    weight_options = ('uniform', 'distance')
    model = KNeighborsClassifier(
        n_neighbors=round(n_neighbors),
        weights=weight_options[round(weights)],
        p=round(p),
    )
    fold_scores = cross_val_score(model, X_trainS, y_train, cv=5, scoring=accuracy)
    return fold_scores.mean()

# Search bounds: every dimension is a continuous (lower, upper) interval
params_knn = {
    'n_neighbors': (3, 20),
    'weights': (0, 1),
    'p': (1, 2),
}

# Bayesian Optimization: 4 initial points, then 25 optimization iterations
knn_bo = BayesianOptimization(f=knn_cl_bo, pbounds=params_knn, random_state=111)
knn_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters: turn the continuous optimum into valid KNN arguments
weightsL = ['uniform', 'distance']
params_knn = knn_bo.max['params']
params_knn.update(
    n_neighbors=round(params_knn['n_neighbors']),
    weights=weightsL[round(params_knn['weights'])],
    p=round(params_knn['p']),
)
params_knn

In [None]:
# Train the tuned KNN on the scaled training split
knn_hyp = KNeighborsClassifier(**params_knn)
knn_hyp.fit(X_trainS, y_train)

# Predict the scaled validation split and report the accuracy
pred_knn = knn_hyp.predict(X_valS)
acc_knn = accuracy_score(y_val, pred_knn)
print('Accuracy: ' + str(acc_knn))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_knn = pd.DataFrame(confusion_matrix(y_val, pred_knn), index=class_labels, columns=class_labels)
report_knn = classification_report(y_val, pred_knn)
print('confusion_matrix', cm_knn, '', 'Classification Report', report_knn, sep='\n')

# 4. Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Hyperparameter-tuning: Grid Search over the regularisation strength C
params_svm = {'C': [0.01, 0.1, 1, 10]}
svm_grid = GridSearchCV(estimator=SVC(), param_grid=params_svm,
                        scoring=accuracy, cv=5)

# Only the first 100 training rows are used, for simplicity:
# SVM takes a very long time when there are many observations.
X_svm = X_trainS.iloc[:100]
y_svm = y_train.iloc[:100]
svm_grid.fit(X_svm, y_svm)

best_score_svm = svm_grid.best_score_
print('Best score: ' + str(best_score_svm))
print('Best parameter {}'.format(svm_grid.best_params_))

In [None]:
# Train the tuned SVM on the same first 100 rows used by the grid search
svm_hyp = SVC(random_state=123, **svm_grid.best_params_)
svm_hyp.fit(X_trainS.iloc[:100], y_train.iloc[:100])

# Predict the full scaled validation split and report the accuracy
pred_svm = svm_hyp.predict(X_valS)
acc_svm = accuracy_score(y_val, pred_svm)
print('Accuracy: ' + str(acc_svm))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report.
# The SVM was fitted on too few observations, so the model is not yet
# well trained.
class_labels = [1, 2, 3, 4]
cm_svm = pd.DataFrame(confusion_matrix(y_val, pred_svm), index=class_labels, columns=class_labels)
report_svm = classification_report(y_val, pred_svm)
print('confusion_matrix', cm_svm, '', 'Classification Report', report_svm, sep='\n')

# 5. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def dt_cl_bo(criterion, splitter, max_depth, min_samples_split, min_samples_leaf):
    """Objective for tuning the Decision Tree.

    criterion and splitter are rounded to indices into their option lists;
    the three size parameters are rounded to integers. Returns the mean
    5-fold cross-validated accuracy on the training data.
    """
    criterion_options = ('gini', 'entropy')
    splitter_options = ('best', 'random')

    model = DecisionTreeClassifier(
        random_state=123,
        criterion=criterion_options[round(criterion)],
        splitter=splitter_options[round(splitter)],
        max_depth=round(max_depth),
        min_samples_split=round(min_samples_split),
        min_samples_leaf=round(min_samples_leaf),
    )
    fold_scores = cross_val_score(model, X_train, y_train, scoring=accuracy, cv=5)
    return fold_scores.mean()

# Search bounds: every dimension is a continuous (lower, upper) interval;
# the two categorical parameters are encoded as the interval (0, 1)
params_dt = {
    'criterion': (0, 1),
    'splitter': (0, 1),
    'max_depth': (4, 15),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (2, 10),
}

# Bayesian Optimization: 4 initial points, then 25 optimization iterations
dt_bo = BayesianOptimization(f=dt_cl_bo, pbounds=params_dt, random_state=123)
dt_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters: turn the continuous optimum into valid tree arguments
criterionL = ['gini', 'entropy']
splitterL = ['best', 'random']

params_dt = dt_bo.max['params']
params_dt.update(
    criterion=criterionL[int(round(params_dt['criterion']))],
    splitter=splitterL[int(round(params_dt['splitter']))],
    max_depth=round(params_dt['max_depth']),
    min_samples_split=round(params_dt['min_samples_split']),
    min_samples_leaf=round(params_dt['min_samples_leaf']),
)
params_dt

In [None]:
# Train the tuned Decision Tree on the training split
dt_hyp = DecisionTreeClassifier(random_state=123, **params_dt)
dt_hyp.fit(X_train, y_train)

# Predict the validation split and report the accuracy
pred_dt = dt_hyp.predict(X_val)
acc_dt = accuracy_score(y_val, pred_dt)
print('Accuracy: ' + str(acc_dt))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_dt = pd.DataFrame(confusion_matrix(y_val, pred_dt), index=class_labels, columns=class_labels)
report_dt = classification_report(y_val, pred_dt)
print('confusion_matrix', cm_dt, '', 'Classification Report', report_dt, sep='\n')

In [None]:
# Feature importances of the tuned Decision Tree, sorted ascending
importance_dt = list(dt_hyp.feature_importances_)
Feature_dt = pd.DataFrame({'feature': X_train.columns, 'importance': importance_dt})
Feature_dt = Feature_dt.sort_values('importance')

plt.figure(figsize=(16, 4))
sns.barplot(data=Feature_dt, x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

All tree-based algorithms create tree-like structures. A Decision Tree consists of a single tree, whereas the other tree-based algorithms build more than one tree. In this notebook, only the Decision Tree is plotted.

In [None]:
# Example tree limited to max_depth=3 so that the plot stays readable.
# The hyperparameters below are fixed by hand for this illustration; they are
# not read from the tuned params_dt, whose optimum depth may be larger.
# plot_tree is imported here because the notebook only imported
# DecisionTreeClassifier from sklearn.tree, so the call below raised NameError.
from sklearn.tree import plot_tree

params_dt2 = {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 2,
              'min_samples_split': 2, 'splitter': 'best'}

dt_hyp2 =  DecisionTreeClassifier(**params_dt2, random_state=123)
dt_hyp2.fit(X_train, y_train)

plt.figure(figsize=(16,8))
plot_tree(dt_hyp2, fontsize=10)
plt.show()

# 6. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def rf_cl_bo(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf):
    """Objective for tuning the Random Forest.

    criterion is rounded to an index into ('gini', 'entropy'); the other
    arguments are rounded to integers. Returns the mean 5-fold
    cross-validated accuracy on the training data.
    """
    criterion_options = ('gini', 'entropy')

    model = RandomForestClassifier(
        random_state=123,
        n_estimators=round(n_estimators),
        criterion=criterion_options[round(criterion)],
        max_depth=round(max_depth),
        min_samples_split=round(min_samples_split),
        min_samples_leaf=round(min_samples_leaf),
    )
    fold_scores = cross_val_score(model, X_train, y_train, scoring=accuracy, cv=5)
    return fold_scores.mean()

# Search bounds: every dimension is a continuous (lower, upper) interval;
# the categorical criterion is encoded as the interval (0, 1)
params_rf = {
    'n_estimators': (70, 150),
    'criterion': (0, 1),
    'max_depth': (4, 20),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (2, 10),
}

# Bayesian Optimization: 4 initial points, then 25 optimization iterations
rf_bo = BayesianOptimization(f=rf_cl_bo, pbounds=params_rf, random_state=111)
rf_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters: turn the continuous optimum into valid forest arguments
criterionL = ['gini', 'entropy']

params_rf = rf_bo.max['params']
params_rf.update(
    n_estimators=round(params_rf['n_estimators']),
    criterion=criterionL[int(round(params_rf['criterion']))],
    max_depth=round(params_rf['max_depth']),
    min_samples_split=round(params_rf['min_samples_split']),
    min_samples_leaf=round(params_rf['min_samples_leaf']),
)
params_rf

In [None]:
# Train the tuned Random Forest on the training split
rf_hyp = RandomForestClassifier(random_state=123, **params_rf)
rf_hyp.fit(X_train, y_train)

# Predict the validation split and report the accuracy
pred_rf = rf_hyp.predict(X_val)
acc_rf = accuracy_score(y_val, pred_rf)
print('Accuracy: ' + str(acc_rf))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_rf = pd.DataFrame(confusion_matrix(y_val, pred_rf), index=class_labels, columns=class_labels)
report_rf = classification_report(y_val, pred_rf)
print('Confusion Matrix', cm_rf, '', 'Classification Report', report_rf, sep='\n')

In [None]:
# Feature importances of the tuned Random Forest, sorted ascending
importance_rf = list(rf_hyp.feature_importances_)
Feature_rf = pd.DataFrame({'feature': X_train.columns, 'importance': importance_rf})
Feature_rf = Feature_rf.sort_values('importance')

plt.figure(figsize=(16, 4))
sns.barplot(data=Feature_rf, x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

# 7. Gradient Boosting Machine

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def gbm_cl_bo(max_depth, max_features, learning_rate, n_estimators, subsample):
    """Objective for tuning the Gradient Boosting Machine.

    max_depth and n_estimators are rounded to integers; max_features,
    learning_rate and subsample are passed through unchanged. Returns the
    mean 5-fold cross-validated accuracy on the training data.
    """
    model = GradientBoostingClassifier(
        random_state=123,
        max_depth=round(max_depth),
        max_features=max_features,
        learning_rate=learning_rate,
        n_estimators=round(n_estimators),
        subsample=subsample,
    )
    fold_scores = cross_val_score(model, X_train, y_train, scoring=accuracy, cv=5)
    return fold_scores.mean()

# Search bounds: every dimension is a continuous (lower, upper) interval
params_gbm = {
    'max_depth': (3, 10),
    'max_features': (0.8, 1),
    'learning_rate': (0.01, 1),
    'n_estimators': (80, 150),
    'subsample': (0.8, 1),
}

# Bayesian Optimization: 4 initial points, then 25 optimization iterations
gbm_bo = BayesianOptimization(f=gbm_cl_bo, pbounds=params_gbm, random_state=111)
gbm_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters: round the two integer-valued parameters of the optimum
params_gbm = gbm_bo.max['params']
params_gbm.update(
    max_depth=round(params_gbm['max_depth']),
    n_estimators=round(params_gbm['n_estimators']),
)
params_gbm

In [None]:
# Train the tuned Gradient Boosting Machine on the training split
gbm_hyp = GradientBoostingClassifier(random_state=123, **params_gbm)
gbm_hyp.fit(X_train, y_train)

# Predict the validation split and report the accuracy
pred_gbm = gbm_hyp.predict(X_val)
acc_gbm = accuracy_score(y_val, pred_gbm)
print('Accuracy: ' + str(acc_gbm))

In [None]:
# Prediction Result
# The confusion matrix is built from pred_gbm, the same predictions as the
# classification report (the previous version used pred_nbBer, i.e. the
# Bernoulli Naive Bayes predictions).
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_val, pred_gbm), index=[1,2,3,4], columns=[1,2,3,4]))
print('')
print('Classification Report')
print(classification_report(y_val, pred_gbm))

In [None]:
# Feature importances of the tuned Gradient Boosting Machine, sorted ascending
importance_gbm = list(gbm_hyp.feature_importances_)
Feature_gbm = pd.DataFrame({'feature': X_train.columns, 'importance': importance_gbm})
Feature_gbm = Feature_gbm.sort_values('importance')

plt.figure(figsize=(16, 4))
sns.barplot(data=Feature_gbm, x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

# 8. LightGBM

In [None]:
import lightgbm

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def lgbm_cl_bo(max_depth, subsample, colsample_bytree,min_child_weight, learning_rate, num_leaves_percentage):
    """Objective for tuning LightGBM.

    max_depth is rounded to an integer. num_leaves is derived from it as
    2**max_depth scaled by num_leaves_percentage and rounded. The remaining
    arguments are passed through unchanged. Returns the mean 5-fold
    cross-validated accuracy on the training data.
    """
    depth = round(max_depth)
    leaves = round((2**depth)*num_leaves_percentage)

    model = lightgbm.LGBMClassifier(
        random_state=123,
        objective='multiclass',
        max_depth=depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        learning_rate=learning_rate,
        num_leaves=leaves,
    )
    fold_scores = cross_val_score(model, X_train, y_train, scoring=accuracy, cv=5)
    return fold_scores.mean()

# Search bounds: every dimension is a continuous (lower, upper) interval.
# num_leaves_percentage is a helper dimension: the objective converts it,
# together with max_depth, into LightGBM's num_leaves.
params_lgbm = {
    'min_child_weight': (1e-5, 1e-1),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'max_depth': (3, 15),
    'learning_rate': (0.01, 0.5),
    'num_leaves_percentage': (0.5,0.9),
}

# Bayesian Optimization: 4 initial points, then 25 optimization iterations
lgbm_bo = BayesianOptimization(f=lgbm_cl_bo, pbounds=params_lgbm, random_state=111)
lgbm_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters
# Decode the optimum exactly as the objective function does: max_depth is
# ROUNDED (the previous int() truncated it, so e.g. 7.8 became 7 here but 8
# during tuning), and num_leaves is derived from that rounded depth.
params_lgbm = lgbm_bo.max['params']
params_lgbm['objective'] = 'multiclass'
params_lgbm['max_depth'] = int(round(params_lgbm['max_depth']))
params_lgbm['num_leaves'] = int(round((2**params_lgbm['max_depth'])*params_lgbm['num_leaves_percentage']))
# num_leaves_percentage only exists in the search space; it is not a LightGBM argument
del params_lgbm["num_leaves_percentage"]
params_lgbm

In [None]:
# Train the tuned LightGBM classifier on the training split
lgbm_hyp = lightgbm.LGBMClassifier(random_state=123, n_jobs=-1, **params_lgbm)
lgbm_hyp.fit(X_train, y_train)

# Predict the validation split and report the accuracy
pred_lgbm = lgbm_hyp.predict(X_val)
acc_lgbm = accuracy_score(y_val, pred_lgbm)
print('Accuracy: ' + str(acc_lgbm))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_lgbm = pd.DataFrame(confusion_matrix(y_val, pred_lgbm), index=class_labels, columns=class_labels)
report_lgbm = classification_report(y_val, pred_lgbm)
print('Confusion Matrix', cm_lgbm, '', 'Classification Report', report_lgbm, sep='\n')

In [None]:
# Feature importances
# Read the importances from the LightGBM model (lgbm_hyp). The previous
# version read gbm_hyp, so this chart repeated the Gradient Boosting plot.
FeatureLgbm = pd.DataFrame({'feature':X_train.columns, 'importance':list(lgbm_hyp.feature_importances_)}).sort_values('importance')
plt.figure(figsize=(16,4))
sns.barplot(data=FeatureLgbm, x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

# 9. XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Hyperparameter tuning: Bayesian Optimization
def xgb_cl_bo(n_estimators, max_depth, learning_rate, gamma, min_child_weight, subsample, colsample_bytree):
    """Objective for tuning XGBoost.

    n_estimators, max_depth and min_child_weight are rounded to integers; the
    other arguments are passed through unchanged. The objective, evaluation
    metric and thread count are fixed. Returns the mean 5-fold
    cross-validated accuracy on the training data.
    """
    model = XGBClassifier(
        random_state=123,
        objective='multi:softmax',
        eval_metric='mlogloss',
        nthread=-1,
        n_estimators=round(n_estimators),
        max_depth=round(max_depth),
        learning_rate=learning_rate,
        gamma=gamma,
        min_child_weight=round(min_child_weight),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )
    fold_scores = cross_val_score(model, X_train, y_train, scoring=accuracy, cv=5)
    return fold_scores.mean()

# Search bounds: every dimension is a continuous (lower, upper) interval
params_xgb = {
    'n_estimators': (80, 150),
    'max_depth': (3, 15),
    'learning_rate': (0.01, 0.5),
    'gamma': (0, 10),
    'min_child_weight': (3, 20),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.1, 1),
}

# Bayesian Optimization: 4 initial points, then 25 optimization iterations
xgb_bo = BayesianOptimization(f=xgb_cl_bo, pbounds=params_xgb, random_state=111)
xgb_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters: add the fixed settings, then round the three
# integer-valued parameters of the optimum
params_xgb = xgb_bo.max['params']
params_xgb.update(
    objective='multi:softmax',
    n_jobs=-1,
    n_estimators=round(params_xgb['n_estimators']),
    max_depth=round(params_xgb['max_depth']),
    min_child_weight=round(params_xgb['min_child_weight']),
)
params_xgb

In [None]:
# Train the tuned XGBoost classifier on the training split
xgb_hyp = XGBClassifier(random_state=123, nthread=-1, **params_xgb)
xgb_hyp.fit(X_train, y_train)

# Predict the validation split and report the accuracy
pred_xgb = xgb_hyp.predict(X_val)
acc_xgb = accuracy_score(y_val, pred_xgb)
print('Accuracy: ' + str(acc_xgb))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_xgb = pd.DataFrame(confusion_matrix(y_val, pred_xgb), index=class_labels, columns=class_labels)
report_xgb = classification_report(y_val, pred_xgb)
print('Confusion Matrix', cm_xgb, '', 'Classification Report', report_xgb, sep='\n')

In [None]:
# Feature importances of the tuned XGBoost classifier, sorted ascending
importance_xgb = list(xgb_hyp.feature_importances_)
FeatureXgb = pd.DataFrame({'feature': X_train.columns, 'importance': importance_xgb})
FeatureXgb = FeatureXgb.sort_values('importance')

plt.figure(figsize=(16, 4))
sns.barplot(data=FeatureXgb, x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

# 10. Neural Network (Deep Learning)

In [None]:
# Deep Learning packages
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import LeakyReLU
# Rebinds the name LeakyReLU from the imported layer class to a single layer
# instance with alpha=0.1; the activation lists in the cells below refer to
# this instance. The import above must run again before this line is re-run.
LeakyReLU = LeakyReLU(alpha=0.1)

In [None]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def nn_cl_bo(neurons, activation, optimizer, learning_rate, batch_size, epochs,
              layers1, layers2, normalization, dropout, dropout_rate):
    """Objective for tuning the Keras neural network.

    All arguments arrive as continuous values:
    - neurons, batch_size, epochs, layers1, layers2 are rounded to integers;
    - activation and optimizer are floored to an index into their option
      lists (the last option is repeated so the upper bound stays in range);
    - normalization and dropout act as switches that are on above 0.5;
    - learning_rate and dropout_rate are used as given.

    Returns the mean accuracy over 5 shuffled, stratified folds of the
    training data, with early stopping on the training accuracy.
    """
    optimizerL = ['Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl','SGD', 'SGD']
    # Map names to optimizer classes, not instances: a new optimizer is created
    # for every model built, so no optimizer state is shared between the CV folds.
    optimizer_classes = {'Adam': Adam, 'SGD': SGD,
                         'RMSprop': RMSprop, 'Adadelta': Adadelta,
                         'Adagrad': Adagrad, 'Adamax': Adamax,
                         'Nadam': Nadam, 'Ftrl': Ftrl}
    activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu',
               'elu', 'exponential', LeakyReLU, LeakyReLU]

    neurons = round(neurons)
    activation = activationL[floor(activation)]
    optimizer_cls = optimizer_classes[optimizerL[floor(optimizer)]]
    batch_size = round(batch_size)
    epochs = round(epochs)
    layers1 = round(layers1)
    layers2 = round(layers2)

    def nn_cl_fun():
        """Build and compile one network for the sampled hyperparameters."""
        nn = Sequential()
        nn.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
        if normalization > 0.5:
            nn.add(BatchNormalization())
        for i in range(layers1):
            nn.add(Dense(neurons, activation=activation))
        if dropout > 0.5:
            nn.add(Dropout(dropout_rate, seed=123))
        for i in range(layers2):
            nn.add(Dense(neurons, activation=activation))
        nn.add(Dense(4, activation='softmax'))
        nn.compile(loss='categorical_crossentropy',
                   optimizer=optimizer_cls(lr=learning_rate),
                   metrics=['accuracy'])
        return nn

    es = EarlyStopping(monitor='accuracy', mode='max', verbose=0, patience=20)
    nn = KerasClassifier(build_fn=nn_cl_fun, epochs=epochs, batch_size=batch_size, verbose=0)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    score = cross_val_score(nn, X_train, y_train, scoring=accuracy, cv=kfold, fit_params={'callbacks':[es]}).mean()

    return score

# Set hyperparameters spaces
# activation and optimizer are decoded with floor() into option lists whose
# last entry is repeated, so the upper bound must be the last list index:
# 9 for the 10-entry activation list and 8 for the 9-entry optimizer list.
# (With the previous optimizer bound of 7, SGD was only reachable at exactly 7.0.)
params_nn ={
    'neurons': (10, 100),
    'activation':(0, 9),
    'optimizer':(0, 8),
    'learning_rate':(0.01, 1),
    'batch_size':(500, 1000),
    'epochs':(200, 1000),
    'layers1':(1,3),
    'layers2':(1,3),
    'normalization':(0,1),
    'dropout':(0,1),
    'dropout_rate':(0,0.3)
}

# Run Bayesian Optimization: 4 initial points, then 25 optimization iterations
nn_bo = BayesianOptimization(nn_cl_bo, params_nn, random_state=123)
nn_bo.maximize(init_points=4, n_iter=25)

In [None]:
# Best hyperparameters
# Decode the continuous optimum exactly as the objective function does:
# activation and optimizer are FLOORED to list indices (the previous round()
# could select a different option than the one that was actually evaluated);
# the integer-valued parameters are rounded.
params_nn = nn_bo.max['params']

learning_rate = params_nn['learning_rate']
optimizerL = ['Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl','SGD', 'SGD']
optimizerD= {'Adam':Adam(lr=learning_rate), 'SGD':SGD(lr=learning_rate),
             'RMSprop':RMSprop(lr=learning_rate), 'Adadelta':Adadelta(lr=learning_rate),
             'Adagrad':Adagrad(lr=learning_rate), 'Adamax':Adamax(lr=learning_rate),
             'Nadam':Nadam(lr=learning_rate), 'Ftrl':Ftrl(lr=learning_rate)}
activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu',
               'elu', 'exponential', LeakyReLU, LeakyReLU]
params_nn['activation'] = activationL[floor(params_nn['activation'])]
params_nn['batch_size'] = round(params_nn['batch_size'])
params_nn['epochs'] = round(params_nn['epochs'])
params_nn['layers1'] = round(params_nn['layers1'])
params_nn['layers2'] = round(params_nn['layers2'])
params_nn['neurons'] = round(params_nn['neurons'])
params_nn['optimizer'] = optimizerD[optimizerL[floor(params_nn['optimizer'])]]

params_nn

In [None]:
# Fitting the training data
def nn_cl_fun():
    """Build and compile the network described by the decoded params_nn."""
    nn = Sequential()
    nn.add(Dense(params_nn['neurons'], input_dim=X_train.shape[1], activation=params_nn['activation']))
    if params_nn['normalization'] > 0.5:
        nn.add(BatchNormalization())
    for i in range(params_nn['layers1']):
        nn.add(Dense(params_nn['neurons'], activation=params_nn['activation']))
    if params_nn['dropout'] > 0.5:
        nn.add(Dropout(params_nn['dropout_rate'], seed=123))
    for i in range(params_nn['layers2']):
        nn.add(Dense(params_nn['neurons'], activation=params_nn['activation']))
    nn.add(Dense(4, activation='softmax'))
    nn.compile(loss='categorical_crossentropy', optimizer=params_nn['optimizer'], metrics=['accuracy'])
    return nn

es = EarlyStopping(monitor='accuracy', mode='max', verbose=0, patience=20)
nn_hyp = KerasClassifier(build_fn=nn_cl_fun, epochs=params_nn['epochs'], batch_size=params_nn['batch_size'], verbose=0)

# Pass the early-stopping callback to fit, as was done during tuning; it was
# previously created but never used, so the final model always ran all epochs.
# NOTE(review): the network is trained on the unscaled X_train, as during
# tuning — confirm this is intended rather than the scaled X_trainS.
nn_hyp.fit(X_train, y_train, verbose=0, callbacks=[es])

# Predict the validation data
pred_nn = nn_hyp.predict(X_val)

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(y_val, pred_nn)))

In [None]:
# Prediction result: labelled confusion matrix, then the classification report
class_labels = [1, 2, 3, 4]
cm_nn = pd.DataFrame(confusion_matrix(y_val, pred_nn), index=class_labels, columns=class_labels)
report_nn = classification_report(y_val, pred_nn)
print('Confusion Matrix', cm_nn, '', 'Classification Report', report_nn, sep='\n')