In [0]:
! pip install h2o

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn import tree

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Mount Google Drive under /content/drive; the dataset is read from that mount below.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Initialize H2O: connect the Python client to an H2O cluster (every frame and
# model created below lives in that cluster)
h2o.init()

In [0]:
# Load the credit-card CSV stored on the mounted Google Drive into an H2O frame
credit_card_csv = "/content/drive/My Drive/Colab Notebooks/UCI_Credit_Card.csv"
df_creditcarddata = h2o.import_file(credit_card_csv)

In [0]:
# Confirm what kind of object h2o.import_file returned
type(df_creditcarddata)

In [0]:
# Preview the first rows of the frame
df_creditcarddata.head()

In [0]:
# Check the dimensions of the data
df_creditcarddata.shape

In [0]:
# List the column names of the frame
df_creditcarddata.columns

In [0]:
# Show the type H2O assigned to each column when the file was imported
df_creditcarddata.types

In [0]:
# Frequency of each value of the response variable (shows the class balance)
df_creditcarddata['default.payment.next.month'].table()

In [0]:
# Drop the ID column; it is not among the predictors used for modelling.
# The membership check keeps this cell re-runnable: once ID has been removed,
# an unguarded second drop() would raise because the column no longer exists.
if "ID" in df_creditcarddata.columns:
    df_creditcarddata = df_creditcarddata.drop(["ID"], axis = 1)
df_creditcarddata.head()

In [0]:
# Histograms of the numeric features. The selected columns are converted to a
# pandas DataFrame (as_data_frame) because the plotting is done by pandas and
# matplotlib rather than by H2O.
hist_columns = ['AGE','BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6', 'LIMIT_BAL']
df_creditcarddata[hist_columns].as_data_frame().hist(figsize=(20,20))
# Uses the pyplot module imported at the top of the notebook instead of a
# separate mid-notebook pylab import.
plt.show()

In [0]:
def _count_defaults_by(frame, category):
    """Group `frame` by the default flag and `category`, counting the rows in each group.

    Returns the H2O group-by object; call get_frame() on it to obtain the table.
    """
    return frame.group_by(by=["default.payment.next.month", category]).count(na="all")


# Defaulters by gender
default_by_gender = _count_defaults_by(df_creditcarddata, "SEX")
print(default_by_gender.get_frame())

# Defaulters by education
default_by_education = _count_defaults_by(df_creditcarddata, "EDUCATION")
print(default_by_education.get_frame())

# Defaulters by marital status
default_by_marriage = _count_defaults_by(df_creditcarddata, "MARRIAGE")
print(default_by_marriage.get_frame())

In [0]:
# Convert the categorical variables into factors so H2O treats their values
# as category labels rather than as numbers
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE',
                       'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for column_name in categorical_columns:
    df_creditcarddata[column_name] = df_creditcarddata[column_name].asfactor()

In [0]:
# Re-check the column types after the factor conversion
df_creditcarddata.types


In [0]:
# The response must also be a factor so the models treat the task as
# classification; the last line displays the resulting levels.
response_column = 'default.payment.next.month'
df_creditcarddata[response_column] = df_creditcarddata[response_column].asfactor()
df_creditcarddata[response_column].levels()

In [0]:
# Predictors, listed by hand and grouped by meaning; the concatenation keeps the
# original column order.
profile_columns = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
repayment_status_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
bill_amount_columns = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
payment_amount_columns = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
                          'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

predictors = (profile_columns + repayment_status_columns
              + bill_amount_columns + payment_amount_columns)

# Response column the models are trained to predict
target = 'default.payment.next.month'


In [0]:
# Split the H2O frame into two parts with a 0.7 ratio: the first part is used
# for training, the remainder for test evaluation. The seed fixes the split.
splits = df_creditcarddata.split_frame(ratios=[0.7], seed=1)
train, test = splits


**GENERALIZED LINEAR MODEL (Default Settings)**

STANDARDIZATION is enabled by default

GLM with default setting
GLM using lambda search
GLM using Grid search
GLM WITH DEFAULT SETTINGS

Logistic Regression (Binomial Family)

H2O's GLM has a "family" argument; it should be set to 'binomial' when the response is categorical with two levels/classes or is binary (Enum or Int).

In [0]:
# Baseline logistic regression (binomial GLM) with H2O's default settings,
# evaluated with 10-fold cross-validation. Folds are assigned with the Modulo
# scheme and the cross-validation predictions are kept.
GLM_default_settings = H2OGeneralizedLinearEstimator(
    family='binomial',
    model_id='GLM_default',
    nfolds=10,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

GLM_default_settings.train(x=predictors, y=target, training_frame=train)

### **GLM WITH LAMBDA SEARCH**

The model parameter lambda controls the amount of regularization in a GLM model.
Setting lambda_search = True makes H2O search for the optimal lambda value, i.e. the best regularization strength.

In [0]:
# Same binomial GLM as the baseline, but with lambda_search enabled so the
# regularization strength is searched for instead of being left at its default.
GLM_regularized = H2OGeneralizedLinearEstimator(
    family='binomial',
    model_id='GLM',
    lambda_search=True,
    nfolds=10,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

GLM_regularized.train(x=predictors, y=target, training_frame=train)

### **GLM WITH GRID SEARCH**

GLM needs to find the optimal values of the regularization parameters α and λ
lambda: controls the amount of regularization; setting it to 0 disables regularization.

alpha : controls the distribution between lasso & ridge regression penalties.

random grid search: H2O supports two types of grid search, Cartesian and random. We use random search as the search strategy for faster computation.

Stopping metric: we specify the metric used for early stopping. AUTO takes log loss as default

source: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/lambda.html



In [0]:
# Search space: 4 alpha values x 3 lambda values = 12 candidate GLMs.
hyper_parameters = { 'alpha': [0.0001, 0.001, 0.01, 0.1],
                     'lambda': [0.001, 0.01, 0.1] }

# RandomDiscrete visits the combinations in random order and, because
# stopping_rounds is set, may stop before all of them have been tried. Which
# models get built therefore depends on the random order, so the seed is fixed
# to make the search repeatable.
search_criteria = { 'strategy': "RandomDiscrete",
                    'stopping_metric': "AUTO",
                    'stopping_rounds': 5,
                    'seed': 1 }

# Every candidate shares the settings of the estimator passed in: binomial
# family, 10-fold CV with Modulo fold assignment, CV predictions kept.
GLM_grid_search = H2OGridSearch(
    H2OGeneralizedLinearEstimator(family='binomial',
                                  nfolds=10,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True),
    hyper_parameters,
    grid_id="GLM_grid",
    search_criteria=search_criteria,
)

GLM_grid_search.train(x=predictors, y=target, training_frame=train)


### Get the grid results, sorted by cross-validated AUC
  

In [0]:
# Get the grid results, sorted by AUC in descending order (best model first).
# No validation frame was passed to train(); the models were built with 10-fold CV.
GLM_grid_sorted = GLM_grid_search.get_grid(sort_by='auc', decreasing=True)
GLM_grid_sorted

In [0]:
# The grid is sorted best-first, so the first id belongs to the top-AUC GLM
best_glm_model_id = GLM_grid_sorted.model_ids[0]

# Fetch the model object for that id and print its report
Best_GLM_model_from_Grid = h2o.get_model(best_glm_model_id)
print(Best_GLM_model_from_Grid)

### RF WITH DEFAULT SETTINGS

  

In [0]:
# Build a RF model with default settings.
# A random forest draws random samples while building its trees, so the seed
# is fixed (same value as the train/test split) to make repeated runs comparable.
RF_default_settings = H2ORandomForestEstimator(
    model_id='RF_D',
    nfolds=10,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)

# Use train() to build the model
RF_default_settings.train(x=predictors, y=target, training_frame=train)

In [0]:
# Inspect the summary of the RF model that was built with the default settings
RF_default_settings.summary()

### RF with GRID SEARCH to extract the best model

  

In [0]:
# Random-forest search space: 2 x 2 x 3 x 3 = 36 parameter combinations
hyper_params = dict(
    sample_rate=[0.7, 0.9],
    col_sample_rate_per_tree=[0.8, 0.9],
    max_depth=[3, 5, 9],
    ntrees=[200, 300, 400],
)

In [0]:
# Grid over the RF hyperparameters defined in hyper_params. No search_criteria
# is given, so every combination in the grid is built.
# The estimator passed in carries the settings shared by all candidates:
# 10-fold CV with Modulo folds, CV predictions kept, early stopping on AUC
# after 5 rounds, and a fixed seed so the randomly sampled forests (and hence
# the grid ranking) are repeatable.
RF_grid_search = H2OGridSearch(
    H2ORandomForestEstimator(nfolds=10,
                             fold_assignment="Modulo",
                             keep_cross_validation_predictions=True,
                             stopping_metric='AUC',
                             stopping_rounds=5,
                             seed=1),
    hyper_params=hyper_params,
    grid_id='RF_gridsearch',
)

# Use train() to start the grid search
RF_grid_search.train(x=predictors, y=target, training_frame=train)

In [0]:
# Sort the grid models by AUC, best first, and print the resulting table
RF_grid_sorted = RF_grid_search.get_grid(sort_by='auc', decreasing=True)
print(RF_grid_sorted)

In [0]:
# The RF grid is sorted best-first, so the first id belongs to the top-AUC forest
best_rf_model_id = RF_grid_sorted.model_ids[0]

# Fetch the model object for that id and print its report
Best_RF_model_from_Grid = h2o.get_model(best_rf_model_id)
print(Best_RF_model_from_Grid)

In [0]:
# Gradient boosting machine with default settings, cross-validated the same way
# as the other base learners (10 folds, Modulo assignment, CV predictions kept)
GBM_default_settings = H2OGradientBoostingEstimator(
    model_id='GBM_default',
    nfolds=10,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

# Use train() to build the model
GBM_default_settings.train(x=predictors, y=target, training_frame=train)

In [0]:
# GBM search space: 3 x 2 x 3 x 3 x 3 = 162 parameter combinations.
# This rebinds hyper_params, which previously held the random-forest grid.
hyper_params = dict(
    learn_rate=[0.001, 0.01, 0.1],
    sample_rate=[0.8, 0.9],
    col_sample_rate=[0.2, 0.5, 1],
    max_depth=[3, 5, 9],
    ntrees=[100, 200, 300],
)

In [0]:
# Grid over the GBM hyperparameters defined in hyper_params. No search_criteria
# is given, so every combination in the grid is built.
# The grid uses sample_rate and col_sample_rate values below 1, i.e. rows and
# columns are sampled at random, so the seed is fixed to keep the models and
# the resulting ranking repeatable. The other shared settings match the rest of
# the notebook: 10-fold CV, Modulo folds, CV predictions kept, early stopping
# on AUC after 5 rounds.
GBM_grid_search = H2OGridSearch(
    H2OGradientBoostingEstimator(nfolds=10,
                                 fold_assignment="Modulo",
                                 keep_cross_validation_predictions=True,
                                 stopping_metric='AUC',
                                 stopping_rounds=5,
                                 seed=1),
    hyper_params=hyper_params,
    grid_id='GBM_Grid',
)

# Use train() to start the grid search
GBM_grid_search.train(x=predictors, y=target, training_frame=train)

In [0]:
# Sort the GBM grid models by AUC, best first, and print the resulting table
GBM_grid_sorted = GBM_grid_search.get_grid(sort_by='auc', decreasing=True)
print(GBM_grid_sorted)

In [0]:
# The GBM grid is sorted best-first, so the first id belongs to the top-AUC GBM
best_gbm_model_id = GBM_grid_sorted.model_ids[0]

# Fetch the model object for that id and print its report
Best_GBM_model_from_Grid = h2o.get_model(best_gbm_model_id)
print(Best_GBM_model_from_Grid)

### STACKED ENSEMBLE

In [0]:
# Base learners for the ensemble: the best model from each of the GLM, RF and GBM grids
all_models = [Best_GLM_model_from_Grid, Best_RF_model_from_Grid, Best_GBM_model_from_Grid]

In [0]:
# Set up the Stacked Ensemble on top of the three base learners.
# The metalearner is explicitly set to a deep-learning model here; H2O's GLM
# metalearner would only be used if metalearner_algorithm were left unset.
# The seed is handed to the metalearner so its random initialisation does not
# change from run to run.
ensemble = H2OStackedEnsembleEstimator(
    model_id="ensemble",
    base_models=all_models,
    metalearner_algorithm="deeplearning",
    seed=1,
)

# Only the response and the training frame are passed; no predictor list is given.
ensemble.train(y=target, training_frame=train)

### Checking model performance of all base learners

In [0]:
# Score each of the three GLMs (default, lambda search, best of grid) on the
# held-out test frame
(model_perf_GLM_default,
 model_perf_GLM_regularized,
 model_perf_Best_GLM_model_from_Grid) = [
    glm_model.model_performance(test)
    for glm_model in (GLM_default_settings, GLM_regularized, Best_GLM_model_from_Grid)
]

In [0]:
# Score both random forests (default settings, best of grid) on the held-out
# test frame
model_perf_RF_default_settings, model_perf_Best_RF_model_from_Grid = [
    rf_model.model_performance(test)
    for rf_model in (RF_default_settings, Best_RF_model_from_Grid)
]

In [0]:
# Score both GBMs (default settings, best of grid) on the held-out test frame
model_perf_GBM_default_settings, model_perf_Best_GBM_model_from_Grid = [
    gbm_model.model_performance(test)
    for gbm_model in (GBM_default_settings, Best_GBM_model_from_Grid)
]

### Best AUC from the base learners

In [0]:
# Test-set AUC of every individual (non-ensemble) model built above
base_learner_aucs = [
    model_perf_GLM_default.auc(),
    model_perf_GLM_regularized.auc(),
    model_perf_Best_GLM_model_from_Grid.auc(),
    model_perf_RF_default_settings.auc(),
    model_perf_Best_RF_model_from_Grid.auc(),
    model_perf_GBM_default_settings.auc(),
    model_perf_Best_GBM_model_from_Grid.auc(),
]

# The highest of them is the figure the ensemble has to beat
best_auc = max(base_learner_aucs)

print("Best AUC out of all the models performed: ", format(best_auc))

### AUC from the Ensemble Learner

In [0]:
# Evaluate the ensemble on the test data and keep only its AUC.
# Despite its name, Ensemble_model holds the AUC value, not a model object.
Ensemble_model = ensemble.model_performance(test).auc()


In [0]:
# Test-set AUC of the stacked ensemble, to compare with the best base-learner AUC
print(Ensemble_model)