In [1]:
import h2o

from h2o import H2OFrame
from h2o.estimators import *

# Connect to an H2O cluster; the recorded output of this cell shows that no
# instance was found at localhost:54321, so a local server was started.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.6" 2020-01-14; OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
  Starting server from /home/josh/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpicupi_ox
  JVM stdout: /tmp/tmpicupi_ox/h2o_josh_started_from_python.out
  JVM stderr: /tmp/tmpicupi_ox/h2o_josh_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Denver
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,1 month and 2 days
H2O cluster name:,H2O_from_python_josh_lz8ucu
H2O cluster total nodes:,1
H2O cluster free memory:,3.875 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [2]:
class ModelTester:
    """Train H2O estimators and either score them on a hold-out split or
    write a submission file.

    The training frame is split once at construction time into a training
    part and a hold-out part.  ``test`` / ``test_trained`` score a model on
    the hold-out part; ``submit`` retrains on the complete training frame and
    writes 0/1 predictions for the unlabeled test frame to a CSV file.

    Attributes:
        train_data: First part of the split of ``df_train``.
        test_data: Second (hold-out) part of the split of ``df_train``.
        full_train: The complete, unsplit training frame.
        full_test: The frame to predict for in ``submit``.
        target: Name of the response column.
        columns: Predictor column names (all training columns except the target).
    """

    # Cut-off applied to the ``predict`` column to obtain a 0/1 label.  The
    # same value is used for scoring and for the submission so that the
    # hold-out accuracy measures the rule that is actually submitted.
    DECISION_THRESHOLD = 0.5

    def __init__(self, df_train: H2OFrame, df_test: H2OFrame,
                 target: str = 'Buy', train_ratio: float = 0.8, seed: int = 1234):
        """Split ``df_train`` and remember the predictor columns.

        Args:
            df_train: Labeled frame; must contain the ``target`` column.
            df_test: Frame to predict for when calling ``submit``.
            target: Name of the response column.
            train_ratio: Fraction of ``df_train`` passed to ``split_frame``
                for the training part; the remainder is the hold-out part.
            seed: Seed passed to ``split_frame`` so the split is repeatable.

        Raises:
            ValueError: If ``target`` is not a column of ``df_train``.
        """
        if target not in df_train.columns:
            raise ValueError('target column %r not found in training frame' % (target,))

        self.target = target
        self.train_data, self.test_data = df_train.split_frame(ratios=[train_ratio], seed=seed)
        self.full_train = df_train
        self.full_test = df_test

        # Build a new list instead of mutating whatever ``columns`` returned.
        self.columns = [name for name in df_train.columns if name != target]

    def _fit(self, model: H2OEstimator, frame: H2OFrame) -> None:
        """Train ``model`` on ``frame`` using the stored predictors and target."""
        model.train(
            x=self.columns,
            y=self.target,
            training_frame=frame,
        )

    def _predict_labels(self, model: H2OEstimator, frame: H2OFrame):
        """Return the model's ``predict`` column for ``frame`` as 0/1 integers."""
        scores = model.predict(frame).as_data_frame()['predict']
        return (scores > self.DECISION_THRESHOLD).astype(int)

    def test_trained(self, model: H2OEstimator) -> float:
        """Return the accuracy of an already trained model on the hold-out part.

        Predictions are binarised with ``DECISION_THRESHOLD``.  Thresholding
        at the mean of the predictions is deliberately avoided: when every
        prediction is identical none of them is strictly greater than the
        mean, so every row would be labelled 0.

        Raises:
            ValueError: If the hold-out part contains no rows.
        """
        predicted = self._predict_labels(model, self.test_data)
        # Only the target column is needed, so only that column is converted.
        actual = self.test_data[self.target].as_data_frame()[self.target]

        if len(predicted) == 0:
            raise ValueError('hold-out split is empty; accuracy is undefined')

        return float((predicted.values == actual.values).mean())

    def test(self, model: H2OEstimator) -> float:
        """Train ``model`` on the training part and return its hold-out accuracy."""
        self._fit(model, self.train_data)
        return self.test_trained(model)

    def submit(self, model: H2OEstimator, path: str = 'submission.csv',
               id_column: str = 'Unnamed: 0') -> None:
        """Train ``model`` on the full training frame and write a submission CSV.

        The file contains ``id_column`` and the 0/1 prediction stored under
        the target name.  An existing file at ``path`` is overwritten.

        Args:
            model: Estimator to train and predict with.
            path: Destination of the CSV file.
            id_column: Column of the test frame written next to the prediction.
                NOTE(review): the column must exist in ``full_test``; verify
                that it survives the column drop done when loading the data.
        """
        self._fit(model, self.full_train)

        result_df = self.full_test.as_data_frame()
        result_df[self.target] = self._predict_labels(model, self.full_test)
        result_df[[id_column, self.target]].to_csv(
            path,
            index=False,
            header=True,
        )

In [3]:
# Columns removed from BOTH frames; kept in one place so the train and test
# feature sets cannot drift apart.
DROPPED_COLUMNS = ['C1', 'operatingProfitMargin', 'Ticker', 'Sector', 'Yr']

train_data = h2o.import_file('data/stock_XY_train.csv').drop(DROPPED_COLUMNS, axis=1)
test_data = h2o.import_file('data/stock_X_test.csv').drop(DROPPED_COLUMNS, axis=1)

# Cast the target to a factor (categorical) column.
train_data['Buy'] = train_data['Buy'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# Wrap the labeled and unlabeled frames in the evaluation/submission helper.
model_tester = ModelTester(df_train=train_data, df_test=test_data)

In [7]:
# Cross-validation settings shared by every base model.  They are defined
# once so the models that are stacked later are guaranteed to use identical
# fold settings and to keep their cross-validation predictions.
shared_cv_settings = dict(
    nfolds=5,
    fold_assignment='Modulo',
    keep_cross_validation_predictions=True,
    seed=1234,
)

gb = H2OGradientBoostingEstimator(
    distribution='bernoulli',
    ntrees=10,
    max_depth=3,
    min_rows=2,
    learn_rate=0.2,
    **shared_cv_settings,
)
# submit() trains on the full training frame and rewrites submission.csv.
model_tester.submit(gb)

rf = H2ORandomForestEstimator(ntrees=50, **shared_cv_settings)
# Same as above: trains on the full frame and overwrites submission.csv.
model_tester.submit(rf)

gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
GB: None
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%
RF: None


In [8]:
# Stack the gradient boosting and random forest models built earlier; the
# submit() call trains the ensemble on the full training frame and writes
# its predictions to submission.csv.
base_learners = [gb, rf]
ensemble = H2OStackedEnsembleEstimator(base_models=base_learners)
model_tester.submit(ensemble)

stackedensemble Model Build progress: |███████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
