In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
# The star import is what brings H2OContext (used below) into scope.
from pysparkling import *
import h2o

# Obtain the Spark entry points, creating them only if none exist yet.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()
# Obtain the Sparkling Water context for this Spark session; this call connects
# to the H2O cluster and prints the cluster summary shown below the cell.
hc = H2OContext.getOrCreate(ss)

Connecting to H2O server at http://192.168.1.13:54321 ... successful.


0,1
H2O cluster uptime:,10 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,19 days
H2O cluster name:,sparkling-water-dwoodbridge_local-1578195511536
H2O cluster total nodes:,1
H2O cluster free memory:,546 Mb
H2O cluster total cores:,8
H2O cluster allowed cores:,8



Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-dwoodbridge_local-1578195511536
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,192.168.1.13,54321)
  ------------------------

  Open H2O Flow in browser: http://192.168.1.13:54321 (CMD + click in Mac OSX)

    


## Create H2OFrames

In [2]:
def _load_labeled_h2o_frame(parquet_path, frame_name, label_col="label"):
    """Read a parquet table with Spark and convert it to an H2OFrame.

    Args:
        parquet_path: Path handed to ``ss.read.parquet``.
        frame_name: Name passed to ``hc.as_h2o_frame`` for the resulting frame.
        label_col: Column that is replaced by its ``asfactor()`` version, so
            the response is a factor (categorical) column.

    Returns:
        A ``(spark_df, h2o_frame)`` tuple: the Spark DataFrame that was read
        and the H2OFrame built from it.
    """
    spark_df = ss.read.parquet(parquet_path)
    h2o_frame = hc.as_h2o_frame(spark_df, frame_name)
    h2o_frame[label_col] = h2o_frame[label_col].asfactor()
    return spark_df, h2o_frame


# Same preparation for both splits, so it goes through one helper.
adult_train_df, adult_train_h2o = _load_labeled_h2o_frame(
    "../Day3/spark-warehouse/adulttrain", "adult_train")
adult_valid_df, adult_valid_h2o = _load_labeled_h2o_frame(
    "../Day3/spark-warehouse/adultvalid", "adult_valid")

## Set variables

In [4]:
response = "label"
# Take the feature list from the frame the models are trained on; every column
# except the response is a predictor. `.remove` raises ValueError if the
# response column is missing, which surfaces a schema problem early.
predictors = adult_train_h2o.names[:]
predictors.remove(response)

# Cross-validation settings reused by the models below.
n_folds = 3
fold_assignment = "Modulo"
keep_cross_validation_predictions = True

## ML Algorithms

Added nfolds, fold_assignment, keep_cross_validation_predictions

Only training_frame is used for the three algorithms

In [5]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.xgboost import H2OXGBoostEstimator


# Cross-validation options applied identically to every base model. Keeping
# them in one place guarantees the three models use the same folds and keep
# their cross-validation predictions.
# NOTE(review): H2O's Stacked Ensemble docs describe these as requirements for
# base models — confirm against the installed H2O version.
cv_params = dict(nfolds=n_folds,
                 fold_assignment=fold_assignment,
                 keep_cross_validation_predictions=keep_cross_validation_predictions)

# Gradient boosting base model.
model_gbm = H2OGradientBoostingEstimator(ntrees=50,
                                         max_depth=6,
                                         learn_rate=0.1,
                                         distribution="bernoulli",
                                         **cv_params)
model_gbm.train(x=predictors,
                y=response,
                training_frame=adult_train_h2o)

# Deep learning base model.
model_dl = H2ODeepLearningEstimator(variable_importances=True,
                                    loss="Automatic",
                                    **cv_params)
model_dl.train(x=predictors,
               y=response,
               training_frame=adult_train_h2o)

# XGBoost base model (library defaults apart from the CV options).
model_xg = H2OXGBoostEstimator(**cv_params)
model_xg.train(x=predictors,
               y=response,
               training_frame=adult_train_h2o)


gbm Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%


## Develop a stacked ensemble
http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/_modules/h2o/estimators/stackedensemble.html

http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/modeling.html#h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator

In [6]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# The ensemble refers to its base learners by model id.
base_model_ids = [base.model_id for base in (model_gbm, model_dl, model_xg)]

model_stack = H2OStackedEnsembleEstimator(
    training_frame=adult_train_h2o,
    validation_frame=adult_valid_h2o,
    base_models=base_model_ids,
)

In [7]:
# Fit the ensemble on the training frame; metrics are also reported on the
# validation frame. The response column comes from the shared `response`
# variable rather than a repeated string literal.
model_stack.train(x=predictors,
                  y=response,
                  training_frame=adult_train_h2o,
                  validation_frame=adult_valid_h2o)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [8]:
# Display the fitted ensemble: model details followed by its metrics on the
# training and validation data.
model_stack

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_model_python_1578195512168_261

No model summary for this model

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.08056416432589214
RMSE: 0.28383827142563445
LogLoss: 0.26400230472638075
Null degrees of freedom: 39013
Residual degrees of freedom: 39010
Null deviance: 42878.5702775286
Residual deviance: 20599.57183319004
AIC: 20607.57183319004
AUC: 0.9435300680691157
AUCPR: 0.8161787502529673
Gini: 0.8870601361382313

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.31084217123369695: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,27112.0,2591.0,0.0872,(2591.0/29703.0)
1,1,2051.0,7260.0,0.2203,(2051.0/9311.0)
2,Total,29163.0,9851.0,0.119,(4642.0/39014.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.310842,0.75775,235.0
1,max f2,0.11903,0.832787,323.0
2,max f0point5,0.646423,0.79282,122.0
3,max accuracy,0.47916,0.886656,173.0
4,max precision,0.970022,1.0,0.0
5,max recall,0.03443,1.0,397.0
6,max specificity,0.970022,1.0,0.0
7,max absolute_mcc,0.377828,0.679947,209.0
8,max min_per_class_accuracy,0.200022,0.861361,282.0
9,max mean_per_class_accuracy,0.137736,0.863947,313.0



Gains/Lift Table: Avg response rate: 23.87 %, avg score: 23.83 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010022,0.969772,4.190098,4.190098,1.0,0.970055,1.0,0.970055,0.041993,0.041993,319.009773,319.009773
1,,2,0.020018,0.969033,4.190098,4.190098,1.0,0.969425,1.0,0.96974,0.041886,0.083879,319.009773,319.009773
2,,3,0.030015,0.967948,4.190098,4.190098,1.0,0.96853,1.0,0.969337,0.041886,0.125765,319.009773,319.009773
3,,4,0.040011,0.96617,4.190098,4.190098,1.0,0.967168,1.0,0.968795,0.041886,0.167651,319.009773,319.009773
4,,5,0.050008,0.962869,4.190098,4.190098,1.0,0.964661,1.0,0.967969,0.041886,0.209537,319.009773,319.009773
5,,6,0.100015,0.850089,3.900163,4.04513,0.930805,0.913441,0.965402,0.940705,0.195038,0.404575,290.016273,304.513023
6,,7,0.149997,0.706474,3.012573,3.701062,0.718974,0.783329,0.883288,0.888264,0.150575,0.55515,201.257283,270.106206
7,,8,0.200005,0.492813,2.495589,3.399655,0.595592,0.603443,0.811355,0.81705,0.124799,0.679948,149.558871,239.96551
8,,9,0.299995,0.219101,1.685276,2.828244,0.402205,0.329888,0.674983,0.654676,0.16851,0.848459,68.527643,182.824437
9,,10,0.40001,0.099007,0.937457,2.355487,0.223731,0.149407,0.562156,0.528343,0.09376,0.942219,-6.254348,135.548683




ModelMetricsBinomialGLM: stackedensemble
** Reported on validation data. **

MSE: 0.09286343061453382
RMSE: 0.30473501704683337
LogLoss: 0.2993826852143767
Null degrees of freedom: 9827
Residual degrees of freedom: 9824
Null deviance: 10872.215747616709
Residual deviance: 5884.66606057379
AIC: 5892.66606057379
AUC: 0.9251287363706848
AUCPR: 0.7726598059322806
Gini: 0.8502574727413696

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2341250521270587: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,6478.0,974.0,0.1307,(974.0/7452.0)
1,1,482.0,1894.0,0.2029,(482.0/2376.0)
2,Total,6960.0,2868.0,0.1481,(1456.0/9828.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.234125,0.722349,264.0
1,max f2,0.10082,0.806787,329.0
2,max f0point5,0.751119,0.768641,87.0
3,max accuracy,0.544071,0.869862,152.0
4,max precision,0.970011,1.0,0.0
5,max recall,0.033929,1.0,399.0
6,max specificity,0.970011,1.0,0.0
7,max absolute_mcc,0.379229,0.630901,205.0
8,max min_per_class_accuracy,0.179835,0.839909,287.0
9,max mean_per_class_accuracy,0.128466,0.843755,312.0



Gains/Lift Table: Avg response rate: 24.18 %, avg score: 23.82 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010073,0.969838,4.136364,4.136364,1.0,0.970084,1.0,0.970084,0.041667,0.041667,313.636364,313.636364
1,,2,0.020045,0.969144,4.136364,4.136364,1.0,0.969534,1.0,0.96981,0.041246,0.082912,313.636364,313.636364
2,,3,0.030016,0.968009,4.136364,4.136364,1.0,0.968618,1.0,0.969414,0.041246,0.124158,313.636364,313.636364
3,,4,0.04009,0.965865,4.136364,4.136364,1.0,0.967078,1.0,0.968827,0.041667,0.165825,313.636364,313.636364
4,,5,0.050061,0.962708,4.136364,4.136364,1.0,0.96426,1.0,0.967918,0.041246,0.207071,313.636364,313.636364
5,,6,0.10002,0.853874,3.58878,3.86285,0.867617,0.91362,0.933876,0.940796,0.179293,0.386364,258.877986,286.285027
6,,7,0.150081,0.708955,2.934128,3.553066,0.70935,0.785109,0.858983,0.888865,0.146886,0.533249,193.412786,255.306626
7,,8,0.200041,0.479684,2.198759,3.214834,0.531568,0.59498,0.777213,0.815469,0.109848,0.643098,119.875949,221.4834
8,,9,0.300061,0.218511,1.66212,2.697263,0.401831,0.326264,0.652085,0.6524,0.166246,0.809343,66.211967,169.726255
9,,10,0.39998,0.102274,1.019348,2.278104,0.246436,0.149984,0.55075,0.526892,0.101852,0.911195,1.934827,127.810411







In [11]:
# Called without arguments, so the metrics shown are the ones reported on the
# training data.
# NOTE(review): if held-out performance was intended here, pass valid=True or
# test_data=<frame> — confirm the intent.
model_stack.model_performance()


ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.08090528124824824
RMSE: 0.2844385368550616
LogLoss: 0.26487554019945564
Null degrees of freedom: 39013
Residual degrees of freedom: 39010
Null deviance: 42878.5702775286
Residual deviance: 20667.708650683126
AIC: 20675.708650683126
AUC: 0.9430759011764169
AUCPR: 0.768663415361708
Gini: 0.8861518023528339

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2918062875427942: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,26861.0,2842.0,0.0957,(2842.0/29703.0)
1,1,1920.0,7391.0,0.2062,(1920.0/9311.0)
2,Total,28781.0,10233.0,0.1221,(4762.0/39014.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.291806,0.756345,239.0
1,max f2,0.110195,0.831263,323.0
2,max f0point5,0.644975,0.791369,124.0
3,max accuracy,0.462364,0.886118,181.0
4,max precision,0.969852,1.0,0.0
5,max recall,0.034606,1.0,397.0
6,max specificity,0.969852,1.0,0.0
7,max absolute_mcc,0.318492,0.677745,229.0
8,max min_per_class_accuracy,0.203574,0.86081,277.0
9,max mean_per_class_accuracy,0.128498,0.863287,313.0



Gains/Lift Table: Avg response rate: 23.87 %, avg score: 23.93 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010022,0.969843,4.190098,4.190098,1.0,0.970082,1.0,0.970082,0.041993,0.041993,319.009773,319.009773
1,,2,0.020018,0.969514,4.190098,4.190098,1.0,0.969689,1.0,0.969885,0.041886,0.083879,319.009773,319.009773
2,,3,0.030015,0.968903,4.190098,4.190098,1.0,0.969241,1.0,0.969671,0.041886,0.125765,319.009773,319.009773
3,,4,0.040011,0.96718,4.190098,4.190098,1.0,0.968168,1.0,0.969295,0.041886,0.167651,319.009773,319.009773
4,,5,0.050008,0.964366,4.190098,4.190098,1.0,0.965905,1.0,0.968618,0.041886,0.209537,319.009773,319.009773
5,,6,0.100015,0.851754,3.895867,4.042983,0.92978,0.91542,0.96489,0.942019,0.194823,0.40436,289.58674,304.298256
6,,7,0.149997,0.710912,3.010424,3.698914,0.718462,0.785782,0.882775,0.889958,0.150467,0.554828,201.042406,269.891403
7,,8,0.200005,0.4951,2.474112,3.392674,0.590466,0.608242,0.809689,0.81952,0.123725,0.678552,147.411204,239.267429
8,,9,0.299995,0.221809,1.686351,2.823948,0.402461,0.331975,0.673958,0.657019,0.168618,0.84717,68.635054,182.39483
9,,10,0.40001,0.099627,0.948195,2.35495,0.226294,0.151054,0.562027,0.530511,0.094834,0.942004,-5.180515,135.494984





