In [1]:
## first step is to import h2o module
## make sure h20 is installed
## pip install h2o 
## or it can be installed from h2o git for the latest version
import h2o

In [3]:
h2o.init()
# h2o.init(ip, port) -- use this when h2o is installed on a separate machine in the network
# h2o.init(nthreads = -1, max_mem_size = 8) # nthreads uses all possible cores. mem size is what h2o will work on
# h2o.remove_all()  #clean slate, in case cluster was already running

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,25 secs
H2O cluster version:,3.10.3.4
H2O cluster version age:,2 months
H2O cluster name:,H2O_from_python_Hari_2p2iss
H2O cluster total nodes:,1
H2O cluster free memory:,869 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [4]:
## h2o's syntax is very similar to python's scikit learn

## initiate a h2o dataframe with tupple
df = h2o.H2OFrame(zip(*((1, 2, 3), ('a', 'b', 'c'), (0.1, 0.2, 0.3))))
print df

## initiate a h2o dataframe with list
df = h2o.H2OFrame(zip(*[[1, 2, 3], ['a', 'b', 'c'], [0.1, 0.2, 0.3]]))
print df

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3
1,a,0.1
2,b,0.2
3,c,0.3



Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3
1,a,0.1
2,b,0.2
3,c,0.3





In [5]:
## initialize an h2o frame using a dictionary
## this can enable us to name the columns
df = h2o.H2OFrame({'A': [1, 2, 3],'B': ['a', 'b', 'c'],'C': [0.1, 0.2, 0.3]})
print df
print df.types

Parse progress: |█████████████████████████████████████████████████████████| 100%


A,C,B
1,0.1,a
2,0.2,b
3,0.3,c



{u'A': u'int', u'C': u'real', u'B': u'string'}


In [6]:
df = h2o.H2OFrame.from_python({'A': [1, 2, 3], 'B': ['a', 'a', 'b'], 'C': ['hello', 'all', 'world'], \
                               'D': ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']},\
                              column_types=['numeric', 'enum', 'string', 'time'])

print df
print df.types

Parse progress: |█████████████████████████████████████████████████████████| 100%


A,C,B,D
1,hello,a,2015-03-12 11:00:00
2,all,a,2015-03-13 12:00:00
3,world,b,2015-03-14 13:00:00



{u'A': u'int', u'C': u'enum', u'B': u'string', u'D': u'time'}


In [7]:
## generate a dataframe using randomly created numbers
import numpy as np
df = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [9]:
## see the top 10 rows of the h2o dataframe
df.head(2) ## df.head(2)

A,B,C,D
-1.03543,0.909379,0.994444,0.373655
0.974353,-0.0169278,-0.571329,-1.08699




In [10]:
## see the last 10 rows of the dataframe
df.tail() # df.tail(2)

A,B,C,D
1.6099,-0.29257,-0.766423,-0.293612
-0.0161793,-1.9786,0.764942,0.50233
-2.5635,1.72895,0.171417,2.49418
-1.64574,-0.649679,0.851393,-0.0257901
0.681083,1.99798,0.362346,0.795162
0.59534,0.253213,0.305017,-0.49355
-0.147343,-0.654514,0.326571,0.150587
-1.38354,-1.00327,0.02295,1.22599
-1.01918,0.173406,-0.264354,-0.883294
-0.769345,-0.160151,0.385937,-0.430516




In [11]:
## get the column names ## same as that in sklearn
df.columns

[u'A', u'B', u'C', u'D']

In [12]:
## give a basic summary about the h2o dataframe
df.describe()

Rows:100
Cols:4




Unnamed: 0,A,B,C,D
type,real,real,real,real
mins,-3.02012750236,-2.98638960275,-2.57208429425,-2.4369225965
mean,-0.0715529345131,-0.0178428779686,-0.0225369614545,0.0713415205026
maxs,3.90130990131,2.73108841508,2.75166106298,3.48332225106
sigma,1.20921873371,0.977444966177,1.06953137966,1.09328346902
zeros,0,0,0,0
missing,0,0,0,0
0,-1.03542620857,0.909378842725,0.994444034398,0.373655390315
1,0.97435274277,-0.0169277574882,-0.571328989491,-1.08699268481
2,2.40537107008,0.367050331205,-1.51747799918,0.799445681213


In [13]:
## subsetting the h2o dataframe. take out the column A. By column name
df['A']

A
-1.03543
0.974353
2.40537
3.90131
-0.0761141
-3.02013
-0.759712
-0.698607
-0.203799
0.282423




In [14]:
## subset the h2o dataframe using index
df[1]

B
0.909379
-0.0169278
0.36705
0.452033
-1.38733
0.749367
-0.0714153
2.73109
-0.910066
-0.432436




In [15]:
## subset more than one columns
df[['B', 'C']]

B,C
0.909379,0.994444
-0.0169278,-0.571329
0.36705,-1.51748
0.452033,0.157483
-1.38733,-1.34165
0.749367,-1.31955
-0.0714153,1.31608
2.73109,-0.994619
-0.910066,0.678528
-0.432436,0.000752553




In [17]:
## Dealing with missing values in H2O

## generate a h2o frame with some NA or None data
df = h2o.H2OFrame.from_python(
{'A': [1, 2, 3,None,''],
 'B': ['a', 'a', 'b', 'NA', 'NA'],
 'C': ['hello', 'all', 'world', None, None],
 'D': ['12MAR2015:11:00:00',None,
'13MAR2015:12:00:00',None,
 '14MAR2015:13:00:00']},
 column_types=['numeric', 'enum', 'string', 'time'])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [18]:
## check if the column A contains a missing value
df["A"].isna()

isNA(A)
0
0
0
1
1




In [19]:
## assign the missing values in column A to 5
df[ df["A"].isna(), "A"] = 5
print df["A"].isna()

isNA(A)
0
0
0
0
0





In [20]:
## taking mean when NAs are present
df = h2o.H2OFrame.from_python( {'A': [1, 2, 3,None,''],  'B': ['a', 'a', 'b', 'NA', 'NA'],  \
                                 'C': ['hello', 'all', 'world', None, None], 'D': ['12MAR2015:11:00:00',None, \
                                                                                   '13MAR2015:12:00:00',None,\
                                                                                   '14MAR2015:13:00:00']}, \
                               column_types=['numeric', 'enum', 'string', 'time']) 

print '***NA removed***'
print df.mean(na_rm=True)

Parse progress: |█████████████████████████████████████████████████████████| 100%
***NA removed***
[2.0, nan, nan, 1426248000000.0]


In [21]:
## apply functions to multiple columns
df = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))

## like in sklearn, apply function works well with lambda function
print df.apply(lambda x: x.mean()), type(df.apply(lambda x: x.mean()))
print df.mean(), type(df.mean())

Parse progress: |█████████████████████████████████████████████████████████| 100%


A,B,C,D
-0.136173,-0.0374335,-0.0513363,0.104493


 <class 'h2o.frame.H2OFrame'>
[-0.13617266610991108, -0.03743352538621636, -0.05133632825747932, 0.10449299515034864] <type 'list'>


In [22]:
df.apply(lambda row: row.sum(), axis=0)

A,B,C,D
-13.6173,-3.74335,-5.13363,10.4493




In [23]:
## methods to deal with strings
df = h2o.H2OFrame.from_python(['Hello', 'World', 'Welcome', 'To', 'H2O', 'World'])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [24]:
print 'number of l\'s in the dataframe are = ', df.countmatches('l')

number of l's in the dataframe are =  

C1
2
1
1
0
0
1





In [25]:
## substituting l with x
print df.sub('l','x')

C1
Hexlo
Worxd
Wexcome
To
H2O
Worxd





In [26]:
## merging / joining dataframes
## syntax works like dataframe in R

df1 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))
df2 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [27]:
## row bind. append both rows. need to have same number of columns
df2.rbind(df1)

A,B,C,D
-0.336579,1.05786,0.630106,-0.224133
-0.322339,0.979067,-1.32546,-0.943295
0.724562,-0.228323,-1.92471,-1.25866
-0.0916786,0.432251,-0.00438264,2.11848
-1.07114,0.464086,0.337923,0.0491678
0.970062,-1.89083,0.987645,0.513328
0.232859,-1.01059,-0.587221,-0.387639
-0.647283,0.493918,-0.846988,-0.0714801
-1.23586,-1.56229,0.608167,-0.10527
-2.04096,-0.218677,0.413752,0.719071




In [28]:
## column bind. append side by side
df2.cbind(df1)

A,B,C,D,A0,B0,C0,D0
-0.336579,1.05786,0.630106,-0.224133,-1.1067,-0.0485177,-0.522742,-0.99105
-0.322339,0.979067,-1.32546,-0.943295,-0.469044,0.130979,-0.15931,-0.456781
0.724562,-0.228323,-1.92471,-1.25866,1.825,0.827959,0.549254,-0.868524
-0.0916786,0.432251,-0.00438264,2.11848,-1.5467,0.685545,-0.264445,-1.15441
-1.07114,0.464086,0.337923,0.0491678,0.949694,0.0222718,0.544893,0.226752
0.970062,-1.89083,0.987645,0.513328,0.385319,-0.199407,-0.638651,-1.1183
0.232859,-1.01059,-0.587221,-0.387639,1.01137,0.00964041,0.0682824,-0.792892
-0.647283,0.493918,-0.846988,-0.0714801,-1.44081,1.00483,-0.250032,-0.304646
-1.23586,-1.56229,0.608167,-0.10527,0.936107,0.365372,-0.136865,-0.416424
-2.04096,-0.218677,0.413752,0.719071,-0.104475,0.125852,1.00419,1.95182




In [29]:
## merging dataframe
df1 = h2o.H2OFrame.from_python( {'A': ['Hello', 'World', 'Welcome', 'To', 'H2O', 'World'], \
                                  'n': [0,1,2,3,4,5]} )
df2 = h2o.H2OFrame.from_python(np.random.randint(0,6,(100,1)), column_names= list('n'))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [30]:
# Combine column "n" from both datasets
df2.merge(df1)

n,A
4,H2O
5,World
5,World
0,Hello
3,To
1,World
0,Hello
5,World
2,Welcome
4,H2O




In [31]:
# grouping

df = h2o.H2OFrame(
 {'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
 'C' : np.random.randn(8).tolist(),
 'D' : np.random.randn(8).tolist()})

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [32]:
df.group_by('A').sum().frame

A,sum_C,sum_B,sum_D
bar,-0.176285,3,0.432173
foo,0.895666,5,0.463436




In [33]:
df.group_by('A').mean().frame

A,mean_D,mean_B,mean_C
bar,0.144058,1,-0.0587618
foo,0.0926871,1,0.179133




In [34]:
print df.group_by('A').min().frame
print df.group_by('A').max().get_frame()

A,min_D,min_B,min_C
bar,0.0338973,0,-1.46841
foo,-1.18631,0,-1.15827





A,max_D,max_B,max_C
bar,0.266902,2,1.08642
foo,1.25459,2,1.71932





In [35]:
df1 = df.group_by(['A','B']).sum().frame
print df1

A,B,sum_C,sum_D
bar,one,1.08642,0.0338973
bar,three,0.205707,0.266902
bar,two,-1.46841,0.131374
foo,one,-0.923943,-1.62136
foo,three,-0.371698,1.04974
foo,two,2.19131,1.03506





In [56]:
## working with date and time

df = h2o.H2OFrame.from_python({'D': ['18OCT2015:11:00:00', '19OCT2015:12:00:00', '20OCT2015:13:00:00']}, \
                              column_types=['time'])
print df['D'].day()
print df['D'].dayOfWeek()

Parse progress: |█████████████████████████████████████████████████████████| 100%


D
18
19
20





D
Sun
Mon
Tue





In [58]:
## machine learning

## H2O supports the following models:
# Deep Learning
# Naive Bayes
# Principal Components Analysis (PCA)
# K-means
# Generalized Linear Models (GLM)
# Gradient Boosting Machine (GBM)
# Generalized Low Rank Model (GLRM)
# Distributed Random Forest (DRF)

In [43]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#https://h2o-release.s3.amazonaws.com/h2o/rel-turan/4/docs-website/h2o-py/docs/modeling.html#h2ogradientboostingestimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
#https://h2o-release.s3.amazonaws.com/h2o/rel-turan/4/docs-website/h2o-py/docs/modeling.html#h2ogeneralizedlinearestimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
#https://h2o-release.s3.amazonaws.com/h2o/rel-turan/4/docs-website/h2o-py/docs/modeling.html#h2orandomforestestimator

from h2o.grid.grid_search import H2OGridSearch
## same as sklearn
## from sklearn.ensemble import GradientBoostingClassifier

In [36]:
## read data set
## data source -- http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
path = 'C:\\Users\\Hari\\Documents\\Experiment_with_H2O\\bank\\bank-full.csv'
df = h2o.import_file(path=path)

## sklearn -- df = pd.read_csv(path) # sklearn has no read method

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [37]:
## check out the first 5 rows
df.head(5)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no




In [None]:
# Input variables:
# # bank client data:
# 1 - age (numeric)
# 2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
# 3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
# 4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
# 5 - default: has credit in default? (categorical: 'no','yes','unknown')
# 6 - housing: has housing loan? (categorical: 'no','yes','unknown')
# 7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# # related with the last contact of the current campaign:
# 8 - contact: contact communication type (categorical: 'cellular','telephone') 
# 9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
# 10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
# 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# # other attributes:
# 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
# 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
# 14 - previous: number of contacts performed before this campaign and for this client (numeric)
# 15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
    
# Output variable (desired target):
# 21 - y - has the client subscribed a term deposit? (binary: 'yes','no')

In [62]:
## check what data types h2o parsed
df.types

{u'age': u'int',
 u'balance': u'int',
 u'campaign': u'int',
 u'contact': u'enum',
 u'day': u'int',
 u'default': u'enum',
 u'duration': u'int',
 u'education': u'enum',
 u'housing': u'enum',
 u'job': u'enum',
 u'loan': u'enum',
 u'marital': u'enum',
 u'month': u'enum',
 u'pdays': u'int',
 u'poutcome': u'enum',
 u'previous': u'int',
 u'y': u'enum'}

In [63]:
## to change a variable to a factor. Not there in scikit learn. There in R
## df["var"] = df["var"].asfactor()   

In [38]:
## get the categories for a categorical variable
df['marital'].levels()

[['divorced', 'married', 'single']]

In [39]:
## get a brief summary of the data
df.describe()

Rows:45211
Cols:17




Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
type,int,enum,enum,enum,enum,int,enum,enum,enum,int,enum,int,int,int,int,enum,enum
mins,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
mean,40.9362102143,,,,,1362.27205769,,,,15.8064187919,,258.163079781,2.76384065825,40.1978279622,0.580323372631,,
maxs,95.0,,,,,102127.0,,,,31.0,,4918.0,63.0,871.0,275.0,,
sigma,10.618762041,,,,,3044.76582917,,,,8.32247615304,,257.527812265,3.09802088328,100.128745991,2.30344104493,,
zeros,0,,,,,3514,,,,0,,3,0,0,36954,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,may,261.0,1.0,-1.0,0.0,unknown,no
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,may,151.0,1.0,-1.0,0.0,unknown,no
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,may,76.0,1.0,-1.0,0.0,unknown,no


In [40]:
## any machine learning task, we divide the data into three parts
## training set - the algorithm learns using this data
## validation set - the algorithm prunes or avoids overfitting using this data
## test set - the final output and the accuracy is gauged using this data

## METHOD 1

## a little different syntax that scikit learn
splits = df.split_frame(ratios=[0.70, 0.15], seed=1)  

train = splits[0]
valid = splits[1]
test = splits[2]

## METHOD 2
# # Construct validation and training datasets by sampling (20/80)
iloc = df[0].runif()
train = df[iloc < 0.7]
valid = df[iloc >= 0.7]

In [41]:
## get the shape of each of the datasets
print 'train shape', train.shape
print 'validation shape', valid.shape

train shape (31680, 17)
validation shape (13531, 17)


In [44]:
## read about the documentation
help(H2OGeneralizedLinearEstimator)

Help on class H2OGeneralizedLinearEstimator in module h2o.estimators.glm:

class H2OGeneralizedLinearEstimator(h2o.estimators.estimator_base.H2OEstimator)
 |  Generalized Linear Modeling
 |  
 |  Fits a generalized linear model, specified by a response variable, a set of predictors, and a
 |  description of the error distribution.
 |  
 |  A subclass of :class:`ModelBase` is returned. The specific subclass depends on the machine learning task
 |  at hand (if it's binomial classification, then an H2OBinomialModel is returned, if it's regression then a
 |  H2ORegressionModel is returned). The default print-out of the models is shown, but further GLM-specific
 |  information can be queried out of the object. Upon completion of the GLM, the resulting object has
 |  coefficients, normalized coefficients, residual/null deviance, aic, and a host of model metrics including
 |  MSE, AUC (for logistic regression), degrees of freedom, and confusion matrices.
 |  
 |  Method resolution order:
 |  

In [45]:
print df['y'].unique()

C1
no
yes





In [48]:
## MODEL 1 - GENERALIZED LINEAR MODEL OR LOGISTIC REGRESSION (BINOMIAL)
## lambda, controls the amount of regularization in a GLM model and we can find the optimal
## value for lambda automatically by setting lambda_search = True and passing in a validation 
## frame (which is used to evaluate model performance using a particular value of lambda).

glm_model = H2OGeneralizedLinearEstimator(family='binomial', model_id='glm_model', lambda_search=True)

In [49]:
#Prepare predictors and response columns
predictors = df.col_names[:-1]     #last column is response, our desired response variable 
response = df.col_names[-1] 

In [50]:
glm_model.train(predictors, response, training_frame=train, validation_frame=valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [51]:
glm_model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_model


ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.0707011639635
RMSE: 0.265896904765
LogLoss: 0.238470624639
Null degrees of freedom: 31679
Residual degrees of freedom: 31640
Null deviance: 22925.2338485
Residual deviance: 15109.4987771
AIC: 15189.4987771
AUC: 0.909048872279
Gini: 0.818097744558
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.190614726004: 


0,1,2,3,4
,no,yes,Error,Rate
no,25436.0,2523.0,0.0902,(2523.0/27959.0)
yes,1160.0,2561.0,0.3117,(1160.0/3721.0)
Total,26596.0,5084.0,0.1163,(3683.0/31680.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1906147,0.5817149,246.0
max f2,0.1159957,0.6921813,287.0
max f0point5,0.3591864,0.5833824,179.0
max accuracy,0.3812761,0.9041035,171.0
max precision,0.7700209,0.7220259,59.0
max recall,0.0034762,1.0,397.0
max specificity,0.9993617,0.9996066,0.0
max absolute_mcc,0.1634900,0.5265882,259.0
max min_per_class_accuracy,0.1159957,0.8379468,287.0


Gains/Lift Table: Avg response rate: 11.75 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100063,0.9138298,5.9892316,5.9892316,0.7034700,0.7034700,0.0599301,0.0599301,498.9231550,498.9231550
,2,0.0200126,0.8277311,6.0966617,6.0429466,0.7160883,0.7097792,0.0610051,0.1209352,509.6661713,504.2946632
,3,0.0300189,0.7429166,6.2040919,6.0966617,0.7287066,0.7160883,0.0620801,0.1830153,520.4091875,509.6661713
,4,0.0400253,0.6580526,5.6400835,5.9825172,0.6624606,0.7026814,0.0564364,0.2394518,464.0083523,498.2517165
,5,0.05,0.5780672,5.1190812,5.8102661,0.6012658,0.6824495,0.0510615,0.2905133,411.9081232,481.0266058
,6,0.1,0.3260161,4.3590433,5.0846547,0.5119949,0.5972222,0.2179522,0.5084655,335.9043268,408.4654663
,7,0.15,0.2064084,3.0314432,4.4002508,0.3560606,0.5168350,0.1515722,0.6600376,203.1443160,340.0250829
,8,0.2,0.1461806,2.2789573,3.8699274,0.2676768,0.4545455,0.1139479,0.7739855,127.8957270,286.9927439
,9,0.3,0.0864680,1.1959151,2.9785900,0.1404672,0.3498527,0.1195915,0.8935770,19.5915077,197.8589985




ModelMetricsBinomialGLM: glm
** Reported on validation data. **

MSE: 0.071290409637
RMSE: 0.267002639757
LogLoss: 0.239223543276
Null degrees of freedom: 13530
Residual degrees of freedom: 13491
Null deviance: 9705.81755637
Residual deviance: 6473.86752814
AIC: 6553.86752814
AUC: 0.90713141135
Gini: 0.8142628227
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.198961499733: 


0,1,2,3,4
,no,yes,Error,Rate
no,10882.0,1081.0,0.0904,(1081.0/11963.0)
yes,495.0,1073.0,0.3157,(495.0/1568.0)
Total,11377.0,2154.0,0.1165,(1576.0/13531.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1989615,0.5765717,241.0
max f2,0.1053972,0.6821206,294.0
max f0point5,0.4038401,0.5741231,162.0
max accuracy,0.4038401,0.9039982,162.0
max precision,0.9301438,0.7383178,17.0
max recall,0.0081749,1.0,392.0
max specificity,0.9995739,0.9995820,0.0
max absolute_mcc,0.1967256,0.5198490,242.0
max min_per_class_accuracy,0.1126933,0.8316327,289.0


Gains/Lift Table: Avg response rate: 11.59 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100510,0.9049510,6.1548385,6.1548385,0.7132353,0.7132353,0.0618622,0.0618622,515.4838498,515.4838498
,2,0.0200281,0.8158974,5.6251323,5.8909627,0.6518519,0.6826568,0.0561224,0.1179847,462.5132275,489.0962704
,3,0.0300052,0.7188915,5.6890542,5.8238257,0.6592593,0.6748768,0.0567602,0.1747449,468.9054233,482.3825651
,4,0.0400562,0.6387991,5.0761555,5.6362184,0.5882353,0.6531365,0.0510204,0.2257653,407.6155462,463.6218371
,5,0.0500333,0.5721107,5.6890542,5.6467543,0.6592593,0.6543575,0.0567602,0.2825255,468.9054233,464.6754326
,6,0.1000665,0.3385575,4.4358251,5.0412897,0.5140325,0.5841950,0.2219388,0.5044643,343.5825069,404.1289697
,7,0.1500259,0.2153062,3.0381842,4.3742457,0.3520710,0.5068966,0.1517857,0.65625,203.8184172,337.4245690
,8,0.2000591,0.1454468,2.1031929,3.8062728,0.2437223,0.4410787,0.1052296,0.7614796,110.3192920,280.6272758
,9,0.3000517,0.0863514,1.2819825,2.9650499,0.1485588,0.3435961,0.1281888,0.8896684,28.1982499,196.5049921



Scoring History: 


0,1,2,3,4,5,6,7
,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test
,2017-04-03 19:49:27,0.000 sec,3,.23E0,2,0.7111968,0.7052345
,2017-04-03 19:49:27,0.093 sec,5,.21E0,2,0.6999791,0.6943485
,2017-04-03 19:49:27,0.183 sec,7,.19E0,2,0.6899445,0.6845913
,2017-04-03 19:49:27,0.247 sec,9,.18E0,2,0.6809676,0.6758427
,2017-04-03 19:49:27,0.325 sec,11,.16E0,2,0.6729199,0.6679821
---,---,---,---,---,---,---,---
,2017-04-03 19:49:32,4.907 sec,135,.2E-3,38,0.4770548,0.4785171
,2017-04-03 19:49:32,4.952 sec,136,.18E-3,38,0.4770202,0.4784969
,2017-04-03 19:49:32,4.989 sec,137,.16E-3,39,0.4769912,0.4784816



See the whole table with table.as_data_frame()




In [52]:
## accuracy, auc, confusion matrix
print 'accuracy', glm_model.accuracy()
print 'auc', glm_model.auc()
print 'confusion matrix', glm_model.confusion_matrix()
print glm_model.confusion_matrix(thresholds=[0.1, 0.5, 0.99])

accuracy [[0.38127608888448294, 0.9041035353535354]]
auc 0.909048872279
confusion matrix Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.190614726004: 


0,1,2,3,4
,no,yes,Error,Rate
no,25436.0,2523.0,0.0902,(2523.0/27959.0)
yes,1160.0,2561.0,0.3117,(1160.0/3721.0)
Total,26596.0,5084.0,0.1163,(3683.0/31680.0)



Could not find exact threshold 0.1; using closest threshold found 0.10067554089.
Could not find exact threshold 0.5; using closest threshold found 0.500238198461.
Could not find exact threshold 0.99; using closest threshold found 0.992534695742.
Confusion Matrix (Act/Pred) @ threshold = 0.10067554089: 


0,1,2,3,4
,no,yes,Error,Rate
no,22653.0,5306.0,0.1898,(5306.0/27959.0)
yes,486.0,3235.0,0.1306,(486.0/3721.0)
Total,23139.0,8541.0,0.1828,(5792.0/31680.0)


Confusion Matrix (Act/Pred) @ threshold = 0.500238198461: 


0,1,2,3,4
,no,yes,Error,Rate
no,27290.0,669.0,0.0239,(669.0/27959.0)
yes,2414.0,1307.0,0.6488,(2414.0/3721.0)
Total,29704.0,1976.0,0.0973,(3083.0/31680.0)


Confusion Matrix (Act/Pred) @ threshold = 0.992534695742: 


0,1,2,3,4
,no,yes,Error,Rate
no,27933.0,26.0,0.0009,(26.0/27959.0)
yes,3683.0,38.0,0.9898,(3683.0/3721.0)
Total,31616.0,64.0,0.1171,(3709.0/31680.0)


[, , ]


In [53]:
prediction = glm_model.predict(valid[:-1])

glm prediction progress: |████████████████████████████████████████████████| 100%


In [54]:
print prediction['predict']
print valid['y']

predict
no
no
no
no
no
no
no
no
no
no





y
no
no
no
no
no
no
no
no
no
no





In [57]:
## Gradient Boosting Estimator with cross validation
gbm_model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, max_depth=3, min_rows=2, learn_rate=0.2, nfolds=5)

In [58]:
# train the model
gbm_model.train(predictors, response, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [59]:
## accuracy, auc, confusion matrix
print 'accuracy', gbm_model.accuracy()
print 'auc', gbm_model.auc()
print 'confusion matrix', gbm_model.confusion_matrix()
print gbm_model.confusion_matrix(thresholds=[0.1, 0.5, 0.99])

accuracy [[0.48121123557609496, 0.9136679292929293]]
auc 0.936265823803
confusion matrix Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.280851394502: 


0,1,2,3,4
,no,yes,Error,Rate
no,25991.0,1968.0,0.0704,(1968.0/27959.0)
yes,1059.0,2662.0,0.2846,(1059.0/3721.0)
Total,27050.0,4630.0,0.0955,(3027.0/31680.0)



Could not find exact threshold 0.1; using closest threshold found 0.0997466948207.
Could not find exact threshold 0.5; using closest threshold found 0.498556625655.
Could not find exact threshold 0.99; using closest threshold found 0.976099373476.
Confusion Matrix (Act/Pred) @ threshold = 0.0997466948207: 


0,1,2,3,4
,no,yes,Error,Rate
no,22946.0,5013.0,0.1793,(5013.0/27959.0)
yes,323.0,3398.0,0.0868,(323.0/3721.0)
Total,23269.0,8411.0,0.1684,(5336.0/31680.0)


Confusion Matrix (Act/Pred) @ threshold = 0.498556625655: 


0,1,2,3,4
,no,yes,Error,Rate
no,27215.0,744.0,0.0266,(744.0/27959.0)
yes,1999.0,1722.0,0.5372,(1999.0/3721.0)
Total,29214.0,2466.0,0.0866,(2743.0/31680.0)


Confusion Matrix (Act/Pred) @ threshold = 0.976099373476: 


0,1,2,3,4
,no,yes,Error,Rate
no,27959.0,0.0,0.0,(0.0/27959.0)
yes,3719.0,2.0,0.9995,(3719.0/3721.0)
Total,31678.0,2.0,0.1174,(3719.0/31680.0)


[, , ]


In [60]:
## Grid Search
ntrees_opt = [5,50,100]
max_depth_opt = [2,3,5]
learn_rate_opt = [0.1,0.2]

hyper_params = {'ntrees': ntrees_opt, 
                'max_depth': max_depth_opt,
                'learn_rate': learn_rate_opt}
search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': 60}  #updated

In [61]:
gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params = hyper_params, search_criteria = search_criteria)

In [62]:
gs.train(predictors, response, training_frame=train, validation_frame=valid)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [63]:
print gs

     learn_rate max_depth ntrees  \
0           0.2         5    100   
1           0.1         5    100   
2           0.2         3    100   
3           0.2         3     50   
4           0.2         2    100   
5           0.1         3     50   
6           0.1         2    100   
7           0.2         2     50   
8           0.1         2     50   
9           0.2         5      5   
10          0.2         3      5   
11          0.2         2      5   
12          0.1         5      5   
13          0.1         3      5   
14          0.1         2      5   

                                                          model_ids  \
0   Grid_GBM_py_31_sid_8e36_model_python_1491261070999_182_model_11   
1   Grid_GBM_py_31_sid_8e36_model_python_1491261070999_182_model_12   
2    Grid_GBM_py_31_sid_8e36_model_python_1491261070999_182_model_2   
3    Grid_GBM_py_31_sid_8e36_model_python_1491261070999_182_model_1   
4    Grid_GBM_py_31_sid_8e36_model_python_1491261070999_182_model_5 

In [64]:
# sort the models by auc score
gbm_gridperf = gs.get_grid(sort_by='auc', decreasing=True)

In [65]:
# get the best model
best_model = gbm_gridperf.models[0]

In [66]:
# get the auc score
gbm_perf = best_model.model_performance(valid)
print gbm_perf.auc()

0.934139244388


In [98]:
## RANDOM FOREST

In [67]:
rf = H2ORandomForestEstimator(seed=1, nfolds=5, model_id="rf",
    ntrees=200,
    max_depth=30,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    score_each_iteration=True)

In [68]:
rf.train(x=predictors, y=response, training_frame=train, validation_frame = valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [69]:
rf.varimp()

[(u'duration', 17205.51953125, 1.0, 0.2693027175820283),
 (u'month', 8219.2353515625, 0.4777092221268929, 0.12864839173276904),
 (u'age', 5823.7744140625, 0.3384829155251551, 0.09115436900601238),
 (u'poutcome', 5762.84326171875, 0.334941543104922, 0.09020066778927358),
 (u'day', 5531.85693359375, 0.32151641358730043, 0.08658524392628736),
 (u'job', 4810.06640625, 0.2795653102781107, 0.07528769779955816),
 (u'balance', 3758.682861328125, 0.2184579695196831, 0.058831324869102565),
 (u'pdays', 2612.758544921875, 0.1518558355751118, 0.04089518920106726),
 (u'campaign', 2199.911865234375, 0.12786082171123775, 0.034433266759107524),
 (u'education', 2018.1630859375, 0.11729742204366195, 0.03158851452172427),
 (u'marital', 1523.8519287109375, 0.08856762075351456, 0.023851500958695915),
 (u'housing', 1300.3089599609375, 0.07557510586060569, 0.020352581389810585),
 (u'contact', 1245.947265625, 0.07241555614534126, 0.019501706065154348),
 (u'previous', 1138.54736328125, 0.06617337890979298, 0.01

In [102]:
print rf.accuracy

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.0701776590924
RMSE: 0.264910662474
LogLoss: 0.47675875444
Mean Per-Class Error: 0.156155161358
AUC: 0.896125706055
Gini: 0.79225141211
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.303231111832: 


0,1,2,3,4
,no,yes,Error,Rate
no,25364.0,2556.0,0.0915,(2556.0/27920.0)
yes,1159.0,2570.0,0.3108,(1159.0/3729.0)
Total,26523.0,5126.0,0.1174,(3715.0/31649.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3032311,0.5804630,235.0
max f2,0.1511398,0.6978951,305.0
max f0point5,0.4741819,0.5626811,154.0
max accuracy,0.5133341,0.8997441,142.0
max precision,0.9464286,0.7681159,6.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9988539,0.0
max absolute_mcc,0.2352305,0.5264974,266.0
max min_per_class_accuracy,0.1666723,0.8402579,297.0


Gains/Lift Table: Avg response rate: 11.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100161,0.9370000,8.4872620,8.4872620,1.0,1.0,0.0850094,0.0850094,748.7262001,748.7262001
,2,0.0200006,0.9072962,8.4872620,8.4872620,1.0,1.0,0.0847412,0.1697506,748.7262001,748.7262001
,3,0.0303643,0.8928571,8.4872620,8.4872620,1.0,1.0,0.0879592,0.2577098,748.7262001,748.7262001
,4,0.0436665,0.8571429,8.4872620,8.4872620,1.0,1.0,0.1128989,0.3706087,748.7262001,748.7262001
,5,0.0557364,0.8214286,8.4872620,8.4872620,1.0,1.0,0.1024403,0.4730491,748.7262001,748.7262001
,6,0.1004139,0.6785714,8.4872620,8.4872620,1.0,1.0,0.3791901,0.8522392,748.7262001,748.7262001
,7,0.1502733,0.1964286,2.9635497,6.6545416,0.3491762,0.7840622,0.1477608,1.0,196.3549659,565.4541632
,8,0.2153939,0.1071429,0.0,4.6426581,0.0,0.5470148,0.0,1.0,-100.0,364.2658061
,9,0.3608645,0.0357143,0.0,2.7711234,0.0,0.3265038,0.0,1.0,-100.0,177.1123369




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.0626579193298
RMSE: 0.250315639403
LogLoss: 0.268976543602
Mean Per-Class Error: 0.13860894979
AUC: 0.923098927956
Gini: 0.846197855913
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.323041640982: 


0,1,2,3,4
,no,yes,Error,Rate
no,11062.0,940.0,0.0783,(940.0/12002.0)
yes,442.0,1118.0,0.2833,(442.0/1560.0)
Total,11504.0,2058.0,0.1019,(1382.0/13562.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3230416,0.6180210,222.0
max f2,0.1563736,0.7232571,304.0
max f0point5,0.4444131,0.6049945,157.0
max accuracy,0.5000018,0.9098953,131.0
max precision,0.7250000,0.7892977,39.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9999167,0.0
max absolute_mcc,0.3075893,0.5685077,227.0
max min_per_class_accuracy,0.1785736,0.8539410,293.0


Gains/Lift Table: Avg response rate: 11.50 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0112078,0.8214286,6.7489710,6.7489710,0.7763158,0.7763158,0.0756410,0.0756410,574.8970985,574.8970985
,2,0.0211621,0.75,6.8904748,6.8155320,0.7925926,0.7839721,0.0685897,0.1442308,589.0474834,581.5532029
,3,0.0349506,0.6785714,5.7647333,6.4009764,0.6631016,0.7362869,0.0794872,0.2237179,476.4733306,540.0976415
,4,0.0439463,0.6428571,5.3444199,6.1847014,0.6147541,0.7114094,0.0480769,0.2717949,434.4419924,518.4701428
,5,0.0531633,0.6071429,5.7029949,6.1011878,0.656,0.7018031,0.0525641,0.3243590,470.2994872,510.1187809
,6,0.1013125,0.4642857,4.5797778,5.3781305,0.5267994,0.6186317,0.2205128,0.5448718,357.9777752,437.8130482
,7,0.1501991,0.3285714,3.4092509,4.7373022,0.3921569,0.5449190,0.1666667,0.7115385,240.9250880,373.7302217
,8,0.2102197,0.2142857,2.1360171,3.9945993,0.2457002,0.4594879,0.1282051,0.8397436,113.6017136,299.4599286
,9,0.3374134,0.0714286,0.8668391,2.8155376,0.0997101,0.3238636,0.1102564,0.95,-13.3160907,181.5537587




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.06679400542
RMSE: 0.258445362543
LogLoss: 0.277824404731
Mean Per-Class Error: 0.142660508206
AUC: 0.918190534616
Gini: 0.836381069231
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.295157658688: 


0,1,2,3,4
,no,yes,Error,Rate
no,25304.0,2616.0,0.0937,(2616.0/27920.0)
yes,977.0,2752.0,0.262,(977.0/3729.0)
Total,26281.0,5368.0,0.1135,(3593.0/31649.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2951577,0.6050346,240.0
max f2,0.1615565,0.7193602,303.0
max f0point5,0.4394788,0.5775748,175.0
max accuracy,0.4634618,0.9012923,165.0
max precision,1.0,1.0,0.0
max recall,0.0,1.0,399.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.2951577,0.5534992,240.0
max min_per_class_accuracy,0.1851879,0.8533119,292.0


Gains/Lift Table: Avg response rate: 11.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0102373,0.8148148,6.5226180,6.5226180,0.7685185,0.7685185,0.0667739,0.0667739,552.2618019,552.2618019
,2,0.0200006,0.7469953,5.9603102,6.2481265,0.7022654,0.7361769,0.0581925,0.1249665,496.0310207,524.8126528
,3,0.0315018,0.6923077,5.4560970,5.9589603,0.6428571,0.7021063,0.0627514,0.1877179,445.6097000,495.8960281
,4,0.0413915,0.6538462,4.9350853,5.7143245,0.5814696,0.6732824,0.0488067,0.2365245,393.5085253,471.4324492
,5,0.0514076,0.6153846,4.9531340,5.5660163,0.5835962,0.6558082,0.0496112,0.2861357,395.3133975,456.6016321
,6,0.1020885,0.4615385,4.4605747,5.0172301,0.5255611,0.5911483,0.2260660,0.5122017,346.0574730,401.7230090
,7,0.1529274,0.3333333,3.5288864,4.5224481,0.4157862,0.5328512,0.1794047,0.6916063,252.8886438,352.2448078
,8,0.2014914,0.2307692,2.3854894,4.0073931,0.2810670,0.4721656,0.1158488,0.8074551,138.5489385,300.7393113
,9,0.3048122,0.0909091,1.1368259,3.0343699,0.1339450,0.3575205,0.1174578,0.9249128,13.6825919,203.4369922



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8845808,0.0060399,0.86962,0.8916587,0.8814538,0.886885,0.8932869
auc,0.9182724,0.0026785,0.9157462,0.9209911,0.9168177,0.9136567,0.9241503
err,0.1154191,0.0060399,0.1303799,0.1083413,0.1185463,0.1131150,0.1067131
err_count,730.8,40.460846,827.0,678.0,760.0,715.0,674.0
f0point5,0.5435946,0.0107998,0.5168953,0.5463259,0.5412999,0.5498008,0.5636511
f1,0.60691,0.0059715,0.5991275,0.6021127,0.6033403,0.6069269,0.6230425
f2,0.6876460,0.0106300,0.7124740,0.6705882,0.6814430,0.6773006,0.6964241
lift_top_group,6.3798437,0.2627986,6.329697,6.7492576,6.8541565,6.05984,5.906267
logloss,0.2777598,0.0132357,0.2891578,0.2529644,0.2794759,0.3052329,0.2619678


Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2017-02-23 19:26:52,54.119 sec,0.0,,,,,,,,,,
,2017-02-23 19:26:52,54.203 sec,1.0,0.3586046,4.4069587,0.6914080,6.7963311,0.1306151,0.3487614,4.1692628,0.7050370,4.0789564,0.1232119
,2017-02-23 19:26:52,54.313 sec,2.0,0.3449803,3.8351851,0.7177361,8.1513728,0.1330026,0.3035062,2.1248388,0.7824082,5.1812798,0.1450376
,2017-02-23 19:26:53,54.495 sec,3.0,0.3379472,3.4512549,0.7361613,8.4252789,0.1381265,0.2871126,1.4264772,0.8235846,5.5283426,0.1633240
,2017-02-23 19:26:53,54.658 sec,4.0,0.3288017,3.0295213,0.7510073,8.4709403,0.1437331,0.2774813,1.0749095,0.8480628,5.6419623,0.1127415
---,---,---,---,---,---,---,---,---,---,---,---,---,---
,2017-02-23 19:27:01,1 min 2.955 sec,24.0,0.2673525,0.5440972,0.8888501,8.4872620,0.1207937,0.2514514,0.2793526,0.9211744,6.6480392,0.1048518
,2017-02-23 19:27:02,1 min 3.608 sec,25.0,0.2665146,0.5211094,0.8913222,8.4872620,0.1240482,0.2509901,0.2789712,0.9215679,6.5360565,0.1086123
,2017-02-23 19:27:02,1 min 4.296 sec,26.0,0.2657235,0.5034343,0.8936158,8.4872620,0.1165598,0.2506824,0.2740809,0.9222134,6.8376549,0.1019761



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
duration,16109.1289062,1.0,0.2709734
month,7392.9511719,0.4589293,0.1243576
age,5529.1386719,0.3432301,0.0930062
poutcome,5217.3823242,0.3238774,0.0877621
day,5088.8173828,0.3158965,0.0855995
job,4493.6137695,0.2789483,0.0755875
balance,3706.7463379,0.2301022,0.0623516
pdays,2460.3674316,0.1527313,0.0413861
campaign,2097.1188965,0.1301820,0.0352759


<bound method ?.accuracy of >


In [70]:
print rf.auc()

0.901961075975


In [71]:
h2o.shutdown(prompt=False)

    >>> h2o.shutdown(prompt=False)
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
H2O session _sid_8e36 closed.
