In [None]:
# H2O client library; every frame and estimator used below comes from it.
import h2o

from h2o import H2OFrame
# NOTE(review): wildcard import -- the estimator classes referenced later in
# the notebook (H2OEstimator, H2OGradientBoostingEstimator, ...) are expected
# to come from here. Prefer explicit names once the full list of estimators
# used by the notebook is known.
from h2o.estimators import *

# Initialise the H2O session for this kernel (see the h2o.init documentation
# for connection options).
h2o.init()

In [26]:
class ModelTester:
    """Hold-out evaluation and submission helper for binary ``'Buy'`` models.

    The training frame is split 80/20 into a fitting part and a hold-out part
    used by :meth:`test` / :meth:`test_trained`. :meth:`submit` refits on the
    complete training frame and writes predictions for the test frame to
    ``submission.csv``.

    Both evaluation and submission turn the model's ``'predict'`` column into
    0/1 labels with the same cut-off (``threshold``), so the hold-out accuracy
    measures the decision rule that is actually submitted.
    """

    TARGET = 'Buy'
    # Identifier column expected in the submission file.
    ID_COLUMN = 'Unnamed: 0'

    def __init__(self, df_train: "H2OFrame", df_test: "H2OFrame",
                 seed=None, threshold: float = 0.5):
        """Split the training frame and remember the feature columns.

        :param df_train: labelled frame; must contain a ``'Buy'`` column.
        :param df_test: unlabelled frame to predict on in :meth:`submit`.
        :param seed: optional seed forwarded to ``split_frame`` so the 80/20
            split is reproducible; ``None`` keeps H2O's default behaviour.
        :param threshold: predictions strictly above this value become 1.
        :raises ValueError: if ``df_train`` has no ``'Buy'`` column.
        """
        all_columns = list(df_train.columns)
        if self.TARGET not in all_columns:
            raise ValueError("training frame has no %r column" % self.TARGET)

        self.train_data, self.test_data = df_train.split_frame(ratios=[0.8], seed=seed)
        self.full_train = df_train
        self.full_test = df_test
        self.threshold = threshold

        # Build a fresh list instead of mutating whatever ``columns`` returned.
        self.columns = [name for name in all_columns if name != self.TARGET]

    def _to_labels(self, raw_predictions):
        """Map a pandas Series of raw predictions to 0/1 integer labels."""
        return (raw_predictions > self.threshold).astype(int)

    def test_trained(self, model: "H2OEstimator"):
        """Return the accuracy of an already trained model on the hold-out split.

        :raises ValueError: if the hold-out split produced no predictions.
        """
        raw_predictions = model.predict(self.test_data).as_data_frame()['predict']
        # A fixed cut-off (not the mean of the predictions) keeps constant
        # predictions intact: with a mean cut-off an all-ones prediction
        # vector would be mapped to all zeros.
        predictions = self._to_labels(raw_predictions)

        # Only the target column is needed, so only that column is downloaded.
        actuals = self.test_data[self.TARGET].as_data_frame()[self.TARGET]

        if len(predictions) == 0:
            raise ValueError("hold-out split is empty; cannot compute accuracy")

        # Compare positionally (``.values``) so index labels cannot interfere.
        return float((predictions.values == actuals.values).mean())

    def test(self, model: "H2OEstimator"):
        """Train ``model`` on the 80% split and return its hold-out accuracy."""
        model.train(
            x=self.columns,
            y=self.TARGET,
            training_frame=self.train_data,
        )

        return self.test_trained(model)

    def submit(self, model: "H2OEstimator"):
        """Train ``model`` on the full training frame and write ``submission.csv``.

        The file has two columns: the identifier (``'Unnamed: 0'``) and the
        0/1 ``'Buy'`` prediction. Returns ``None``.
        """
        model.train(
            x=self.columns,
            y=self.TARGET,
            training_frame=self.full_train,
        )

        raw_predictions = model.predict(self.full_test).as_data_frame()['predict']

        result_df = self.full_test.as_data_frame()
        if self.ID_COLUMN not in result_df.columns:
            # The test frame carries no identifier column, so fall back to the
            # row position instead of failing with a KeyError.
            # NOTE(review): assumes row position is an acceptable id for the
            # submission format -- confirm against the original CSV.
            result_df[self.ID_COLUMN] = result_df.index

        result_df[self.TARGET] = self._to_labels(raw_predictions)
        result_df[[self.ID_COLUMN, self.TARGET]].to_csv(
            'submission.csv',
            index=False,
            header=True,
        )

In [9]:
# Feature columns excluded from both frames before modelling.
DROPPED_FEATURES = ['operatingProfitMargin', 'Ticker', 'Sector', 'Yr']

# 'C1' is the row-id column of the CSVs; it is not a feature, so it is
# removed from the training frame together with the excluded features.
train_data = h2o.import_file('data/stock_XY_train.csv').drop(['C1'] + DROPPED_FEATURES, axis=1)

# The test frame keeps its row-id column because the submission file needs it.
# It is renamed to 'Unnamed: 0', the name ModelTester.submit selects.
# The extra column is harmless for prediction: ModelTester derives the
# feature list from the training frame, which no longer contains it.
test_data = h2o.import_file('data/stock_X_test.csv').drop(DROPPED_FEATURES, axis=1)
test_data.set_name(test_data.names.index('C1'), 'Unnamed: 0')

# Treat the target as categorical so H2O fits classifiers rather than regressors.
train_data['Buy'] = train_data['Buy'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [29]:
# Wrap the prepared frames: train_data is split 80/20 for evaluation,
# test_data is kept for the final submission.
model_tester = ModelTester(df_train=train_data, df_test=test_data)

In [30]:
# A stacked ensemble combines the cross-validation predictions of its base
# models, so every base model must use the same fold layout: identical
# nfolds and fold_assignment, with CV predictions kept.
# NOTE: GB previously used 50 folds and RF 10; both now share N_FOLDS.
N_FOLDS = 10

gb = H2OGradientBoostingEstimator(
    nfolds=N_FOLDS,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)
print('GB:', model_tester.test(gb))

rf = H2ORandomForestEstimator(
    nfolds=N_FOLDS,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)
print('RF:', model_tester.test(rf))

# model_tester.test() fitted the base models on the 80% split only. The
# ensemble is trained on the full training frame, so refit the base models
# on that same frame; otherwise their CV predictions would not line up
# row-for-row with the ensemble's training data.
for base_model in (gb, rf):
    base_model.train(
        x=model_tester.columns,
        y='Buy',
        training_frame=model_tester.full_train,
    )

ensemble = H2OStackedEnsembleEstimator(base_models=[gb, rf])

# submit() writes submission.csv and returns None, so there is nothing to print.
model_tester.submit(ensemble)

gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
0.609375
