In [1]:
import h2o
from h2o.automl import H2OAutoML
import random, os, sys
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import psutil
import numpy as np

In [2]:
pct_memory=0.5
virtual_memory=psutil.virtual_memory()
min_mem_size=int(round(int(pct_memory*virtual_memory.available)/1073741824,0))
print(min_mem_size)

4


In [3]:
# 65535 Highest port no
port_no=random.randint(5555,55555)
h2o.init(strict_version_check=False,min_mem_size_GB=min_mem_size,port=port_no) # start h2o


Checking whether there is an H2O instance running at http://localhost:41331..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-macosx) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-macosx) (build 25.121-b15, mixed mode)
  Starting server from /Users/bear/anaconda/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/lh/42j8mfjx069d1bkc2wlf2pw40000gn/T/tmpt14e3kwx
  JVM stdout: /var/folders/lh/42j8mfjx069d1bkc2wlf2pw40000gn/T/tmpt14e3kwx/h2o_bear_started_from_python.out
  JVM stderr: /var/folders/lh/42j8mfjx069d1bkc2wlf2pw40000gn/T/tmpt14e3kwx/h2o_bear_started_from_python.err
  Server is running at http://127.0.0.1:41331
Connecting to H2O server at http://127.0.0.1:41331... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.3
H2O cluster version age:,"14 days, 3 hours and 42 minutes"
H2O cluster name:,H2O_from_python_bear_oroawx
H2O cluster total nodes:,1
H2O cluster free memory:,3.833 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [4]:
# location of clean data file
path = 'data/loan.csv'

In [5]:
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}

In [6]:
frame = h2o.import_file(path=path, col_types=col_types) # import from url

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
frame.describe() # summarize table

Rows:163987
Cols:15




Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
type,int,enum,real,int,enum,real,enum,enum,real,int,real,int,enum,int,enum
mins,500.0,,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,,0.0,
mean,13074.169141456336,,13.715904065566173,5.68435293299533,,71915.67051974901,,,15.881530121290117,0.2273570060625282,54.07917280242258,24.579733834274638,,14.854273655448353,
maxs,35000.0,,26.06,10.0,,7141778.0,,,39.99,29.0,150.70000000000002,118.0,,65.0,
sigma,7993.556188734649,,4.391939870545795,3.6106637311002365,,59070.915654918244,,,7.587668224192549,0.6941679229284182,25.285366766770505,11.685190365910659,,6.947732922546696,
zeros,0,,0,14248,,0,,,270,139459,1562,0,,11,
missing,0,0,0,5804,0,4,0,0,0,29,193,29,0,29,0
0,5000.0,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0.0,83.7,9.0,0,26.0,verified
1,2500.0,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400.0,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified


In [8]:
# split into training and test for cross validation
train, test = frame.split_frame([0.7])

In [9]:
# assign target and inputs for logistic regression
y = 'bad_loan'
X = [name for name in frame.columns if name != y]
print(y)
print(X)

bad_loan
['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status']


In [10]:
# determine column types
reals, enums = [], []
for key, val in frame.types.items():
    if key in X:
        if val == 'enum':
            enums.append(key)
        else: 
            reals.append(key)

print(enums)
print(reals)

['term', 'home_ownership', 'purpose', 'addr_state', 'verification_status']
['loan_amnt', 'int_rate', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length']


In [11]:
# impute missing values
_ = frame[reals].impute(method='mean')

In [12]:
# set target to factor for logisitic regression
# just to be safe ...
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [13]:
# automl
# runs for 300 seconds then builds a stacked ensemble
auto = H2OAutoML(max_runtime_secs=300) # init automl, run for 300 seconds
auto.train(x=X,  
           y=y,
           training_frame=train,   # training data split into 70/30 train/valid
           leaderboard_frame=test) 

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [14]:
# view leaderboard
lb = auto.leaderboard
lb

model_id,auc,logloss,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20190208_233125,0.705949,0.436496,0.352658,0.370509,0.137277
StackedEnsemble_AllModels_AutoML_20190208_233125,0.705949,0.436496,0.352658,0.370509,0.137277
XGBoost_1_AutoML_20190208_233125,0.705049,0.43492,0.352097,0.370148,0.13701
GLM_grid_1_AutoML_20190208_233125_model_1,0.697069,0.439172,0.355497,0.37174,0.138191
XRT_1_AutoML_20190208_233125,0.690484,0.441469,0.362372,0.372741,0.138936
DRF_1_AutoML_20190208_233125,0.68615,0.444452,0.363315,0.373784,0.139714




In [15]:
# view best model 
best = auto.leader
best # must use predict(), no POJO/MOJO available yet for Stacked Ensemble

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_AutoML_20190208_233125
No model summary for this model


ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.11194802177960891
RMSE: 0.334586344281426
LogLoss: 0.364526311884174
Null degrees of freedom: 114881
Residual degrees of freedom: 114877
Null deviance: 109373.19598219445
Residual deviance: 83755.02352375534
AIC: 83765.02352375534
AUC: 0.8682171551562193
pr_auc: 0.6206654262910508
Gini: 0.7364343103124387
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.224877838927852: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,79108.0,14742.0,0.1571,(14742.0/93850.0)
1,6641.0,14391.0,0.3158,(6641.0/21032.0)
Total,85749.0,29133.0,0.1861,(21383.0/114882.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2248778,0.5737466,228.0
max f2,0.1549138,0.6933756,292.0
max f0point5,0.3450173,0.5930796,151.0
max accuracy,0.3702581,0.8552689,137.0
max precision,0.8606823,1.0,0.0
max recall,0.0800694,1.0,378.0
max specificity,0.8606823,1.0,0.0
max absolute_mcc,0.2526012,0.4690388,209.0
max min_per_class_accuracy,0.1927640,0.7745340,256.0


Gains/Lift Table: Avg response rate: 18.31 %, avg score: 18.43 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100016,0.6215286,5.2245523,5.2245523,0.9564839,0.6820886,0.9564839,0.6820886,0.0522537,0.0522537,422.4552267,422.4552267
,2,0.0200031,0.5547107,4.6445747,4.9345635,0.8503046,0.5851554,0.9033943,0.6336220,0.0464530,0.0987067,364.4574673,393.4563470
,3,0.0300047,0.5110801,4.3450780,4.7380683,0.7954743,0.5314731,0.8674209,0.5995723,0.0434576,0.1421643,334.5078046,373.8068329
,4,0.0400063,0.4786207,4.1026284,4.5792083,0.7510879,0.4940351,0.8383377,0.5731880,0.0410327,0.1831970,310.2628396,357.9208345
,5,0.0500078,0.4498012,3.7841161,4.4201899,0.6927763,0.4634461,0.8092254,0.5512396,0.0378471,0.2210441,278.4116110,342.0189898
,6,0.1000070,0.3557563,3.1086505,3.7644773,0.5691156,0.3981944,0.6891810,0.4747237,0.1554298,0.3764739,210.8650544,276.4477299
,7,0.1500061,0.2979540,2.4534470,3.3274926,0.4491643,0.3251228,0.6091801,0.4248596,0.1226702,0.4991442,145.3447049,232.7492574
,8,0.2000052,0.2573386,1.9285235,2.9777655,0.3530641,0.2766323,0.5451538,0.3878044,0.0964245,0.5955687,92.8523494,197.7765526
,9,0.3000035,0.2023755,1.5005967,2.4853902,0.2747214,0.2275237,0.4550123,0.3343790,0.1500571,0.7456257,50.0596683,148.5390198




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.13748186280088928
RMSE: 0.3707854673539529
LogLoss: 0.4367329135632413
Null degrees of freedom: 114881
Residual degrees of freedom: 114877
Null deviance: 109383.27031825924
Residual deviance: 100345.50115194457
AIC: 100355.50115194457
AUC: 0.7073339990532225
pr_auc: 0.3443936029165522
Gini: 0.41466799810644495
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.18517325178997945: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,66110.0,27740.0,0.2956,(27740.0/93850.0)
1,8550.0,12482.0,0.4065,(8550.0/21032.0)
Total,74660.0,40222.0,0.3159,(36290.0/114882.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1851733,0.4075489,261.0
max f2,0.1166577,0.5672765,332.0
max f0point5,0.2847464,0.3707921,185.0
max accuracy,0.5601740,0.8179784,48.0
max precision,0.8199763,1.0,0.0
max recall,0.0612037,1.0,399.0
max specificity,0.8199763,1.0,0.0
max absolute_mcc,0.1851733,0.2415183,261.0
max min_per_class_accuracy,0.1684787,0.6492963,277.0


Gains/Lift Table: Avg response rate: 18.31 %, avg score: 18.31 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100016,0.5841206,2.9902124,2.9902124,0.5474326,0.6399320,0.5474326,0.6399320,0.0299068,0.0299068,199.0212353,199.0212353
,2,0.0200031,0.5258412,2.6194070,2.8048097,0.4795474,0.5521569,0.5134900,0.5960445,0.0261982,0.0561050,161.9407006,180.4809680
,3,0.0300047,0.4834752,2.5148209,2.7081467,0.4604003,0.5035736,0.4957934,0.5652208,0.0251521,0.0812571,151.4820882,170.8146747
,4,0.0400063,0.4532940,2.3104025,2.6087107,0.4229765,0.4675400,0.4775892,0.5408006,0.0231076,0.1043648,131.0402550,160.8710698
,5,0.0500078,0.4296574,2.2295860,2.5328858,0.4081810,0.4410291,0.4637076,0.5208463,0.0222994,0.1266641,122.9586000,153.2885758
,6,0.1000070,0.3462783,2.0150598,2.2739953,0.3689067,0.3836394,0.4163113,0.4522488,0.1007512,0.2274154,101.5059805,127.3995317
,7,0.1500061,0.2938816,1.7820774,2.1100322,0.3262535,0.3183604,0.3862937,0.4076219,0.0891023,0.3165177,78.2077430,111.0032203
,8,0.2000052,0.2560072,1.5994953,1.9824035,0.2928273,0.2738798,0.3629281,0.3741879,0.0799734,0.3964911,59.9495324,98.2403538
,9,0.3000035,0.2037546,1.3498714,1.7715656,0.2471274,0.2281252,0.3243290,0.3255017,0.1349848,0.5314758,34.9871351,77.1565593







In [16]:
# shutdown h2o ... be careful this can erase your work
h2o.cluster().shutdown(prompt=True)

Are you sure you want to shutdown the H2O instance running at http://127.0.0.1:41331 (Y/N)? Y
H2O session _sid_9a0d closed.
