# Write-Up Outline

1. Trial run with random model (~17)
2. Initial random forest test (untuned) (~14)
3. Initial neural network test
4. Initial gradient boosting test

In [4]:
import numpy as np
import pandas as pd
import h2o

from sklearn.model_selection import train_test_split
from h2o import H2OFrame
from h2o.estimators import *

# Initialise the H2O connection before any frames are imported or models are trained
# (the recorded output below shows it connecting to http://localhost:54321).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 min 11 secs
H2O cluster timezone:,America/Denver
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,27 days
H2O cluster name:,H2O_from_python_josh_5z3iu4
H2O cluster total nodes:,1
H2O cluster free memory:,3.875 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [9]:
class ModelTester:
    """Benchmark H2O estimators on a hold-out split and write competition submissions.

    The training frame is split 80/20 once, at construction time, so every
    estimator passed to ``test`` is scored on the same hold-out rows.

    Raw model outputs are turned into 0/1 labels by thresholding at the mean
    of the predictions themselves: a row is labelled 1 when its prediction is
    strictly greater than the mean prediction for the scored frame.
    NOTE(review): this presumes the estimator emits numeric scores in its
    ``predict`` column (i.e. ``Buy`` is treated as a numeric target) - confirm.
    """

    TARGET = 'Buy'                       # response column in the training frame
    ID_COLUMN = 'Unnamed: 0'             # row identifier copied into the submission file
    SUBMISSION_PATH = 'submission.csv'   # where ``submit`` writes its output

    def __init__(self, df_train: H2OFrame, df_test: H2OFrame, seed=None):
        """Store the frames and carve a hold-out set out of ``df_train``.

        Args:
            df_train: labelled frame; must contain the ``Buy`` column.
            df_test: unlabelled frame that ``submit`` predicts on.
            seed: optional seed forwarded to ``split_frame`` so the 80/20
                split (and therefore the reported accuracies) can be
                reproduced after a kernel restart. ``None`` keeps the
                original unseeded behaviour.

        Raises:
            ValueError: if ``df_train`` has no ``Buy`` column.
        """
        self.train_data, self.test_data = df_train.split_frame(ratios=[0.8], seed=seed)
        self.full_train = df_train
        self.full_test = df_test

        column_names = df_train.columns
        if self.TARGET not in column_names:
            raise ValueError(f"training frame has no '{self.TARGET}' column")
        # Build a fresh list rather than mutating whatever ``.columns`` handed back.
        self.columns = [name for name in column_names if name != self.TARGET]

    @staticmethod
    def _binarize(raw_predictions):
        """Label a prediction 1 when it is strictly above the mean prediction, else 0."""
        return (raw_predictions > raw_predictions.mean()).astype(int)

    def _fit_predict(self, model, training_frame, scoring_frame):
        """Train ``model`` on ``training_frame`` and return 0/1 labels for ``scoring_frame``."""
        model.train(
            x=self.columns,
            y=self.TARGET,
            training_frame=training_frame,
        )
        raw_predictions = model.predict(scoring_frame).as_data_frame()['predict']
        return self._binarize(raw_predictions)

    def test(self, model: H2OEstimator):
        """Train ``model`` on the 80% split and return its accuracy on the 20% hold-out.

        Returns:
            Fraction of hold-out rows whose thresholded prediction equals ``Buy``.

        Raises:
            ValueError: if the hold-out split contains no rows.
        """
        predictions = self._fit_predict(model, self.train_data, self.test_data)
        # Only the target column is needed, so avoid converting the whole frame.
        actuals = self.test_data[self.TARGET].as_data_frame()[self.TARGET]

        if len(predictions) == 0:
            raise ValueError('hold-out split is empty; cannot compute accuracy')

        # Compare positionally (as the frames are row-aligned) and average the matches.
        return float((predictions.to_numpy() == actuals.to_numpy()).mean())

    def submit(self, model: H2OEstimator):
        """Train ``model`` on the full training frame and write ``submission.csv``.

        The file contains the ``Unnamed: 0`` identifier column from the test
        frame and the thresholded 0/1 ``Buy`` prediction, with a header row
        and no pandas index column.
        """
        predictions = self._fit_predict(model, self.full_train, self.full_test)

        result_df = self.full_test.as_data_frame()
        result_df[self.TARGET] = predictions
        result_df[[self.ID_COLUMN, self.TARGET]].to_csv(
            self.SUBMISSION_PATH,
            index=False,
            header=True,
        )

In [5]:
# Columns removed from BOTH frames before modelling. Keeping the list in one
# place guarantees the train and test frames always drop the same set.
DROPPED_COLUMNS = ('operatingProfitMargin', 'Ticker', 'Sector', 'Yr')

train_data = h2o.import_file('data/stock_XY_train.csv').drop(list(DROPPED_COLUMNS), axis=1)
test_data = h2o.import_file('data/stock_X_test.csv').drop(list(DROPPED_COLUMNS), axis=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# Shared harness: built once so every estimator is scored on the same split.
model_tester = ModelTester(train_data, test_data)

# Accuracy per configuration label; populated by the benchmarking cell.
results = dict()

In [None]:
# Each entry is (estimator class, constructor kwargs). The result label is
# generated from the same pair, so it can never drift from what was trained.
# Commented-out rows are configurations that are currently disabled.
candidates = [
    (H2ODeepLearningEstimator, {}),

    (H2OGradientBoostingEstimator, {}),
    (H2OGradientBoostingEstimator, {'balance_classes': True}),

    # (H2OGeneralizedLinearEstimator, {}),
    # (H2OGeneralizedLinearEstimator, {'alpha': 0.25}),
    # (H2OGeneralizedLinearEstimator, {'alpha': 0.5}),
    # (H2OGeneralizedLinearEstimator, {'alpha': 0.75}),

    # (H2ONaiveBayesEstimator, {}),
    # (H2ONaiveBayesEstimator, {'balance_classes': True}),
    # (H2ONaiveBayesEstimator, {'nfolds': 5}),
    # (H2ONaiveBayesEstimator, {'nfolds': 5, 'balance_classes': True}),

    # (H2OSupportVectorMachineEstimator, {'disable_training_metrics': False}),

    (H2ORandomForestEstimator, {}),

    # (H2OStackedEnsembleEstimator, {}),

    # (H2OTargetEncoderEstimator, {}),

    (H2OXGBoostEstimator, {}),
]

for estimator_cls, params in candidates:
    # Produces e.g. "H2OGradientBoostingEstimator(balance_classes=True)".
    rendered_params = ', '.join(f'{key}={value!r}' for key, value in params.items())
    label = f'{estimator_cls.__name__}({rendered_params})'
    results[label] = model_tester.test(estimator_cls(**params))

deeplearning Model Build progress: |█████████████████████████

In [89]:
# Display the accuracy recorded for each tested configuration.
results
# Uncomment to retrain on the full training frame and write submission.csv:
# model_tester.submit(H2OXGBoostEstimator())

xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%
