# H2O Models on lagged data

In [341]:
import random

import h2o
import numpy as np
import pandas as pd
import sklearn
from h2o.automl import H2OAutoML
from sklearn import metrics
from sklearn import preprocessing

In [342]:
# Load the master table once and keep it unscaled.
# The earlier version of this cell standardised a copy of the frame and then
# threw it away by re-reading the same CSV into `data`, so the scaling never
# reached the model. That discarded code also re-attached `invest`/`date` to a
# frame with a fresh 0..n-1 index, which misaligns the values whenever
# `dropna()` removed rows. It has been removed rather than repaired because
# its result was never used.
data = pd.read_csv("Data/20180920_mastertable.csv")

# Add the first four lags of "return_day+1" for a quick look at the data.
for lag in range(1, 5):
    data["lag_return_day" + str(lag)] = data["return_day+1"].shift(lag)

# Preview the first rows only instead of rendering the entire frame.
data.head()

In [343]:
# Add 12 daily lags (window length taken from academic papers) for every
# source column listed below. Each entry pairs the prefix of the new lag
# column with the column it is shifted from; the lag number is appended to
# the prefix.
N_LAGS = 12
LAG_SOURCES = [
    ("lag_return_day", "return_day+1"),
    ("lag_close_day", "close"),
    ("lag_volume_day", "volume"),
    ("lag_googletrends_buy_sell_day", "googletrends_buy_sell"),
    ("lag_Tweets (#)", "Tweets (#)"),
    ("lag_Active Influencers (#)", "Active Influencers (#)"),
    ("lag_Twitter Average SA", "Twitter Average SA"),
    ("lag_Forum SA Merit (weighted)", "Forum SA Merit (weighted)"),
    ("lag_Reddit Comments (#)", "Reddit Comments (#)"),
    ("lag_Reddit Average SA", "Reddit Average SA"),
    ("lag_volatility_14", "volatility_14"),
]

for lag in range(1, N_LAGS + 1):
    for prefix, source in LAG_SOURCES:
        # shift(lag) moves the series down by `lag` rows, so the first `lag`
        # rows of each new column are NaN.
        data[prefix + str(lag)] = data[source].shift(lag)

In [344]:
# Remove "return_day+1" because the dependent variable is derived from it,
# together with its first lag.
dropped_columns = ["return_day+1", "lag_return_day1"]
data = data.drop(columns=dropped_columns)

In [345]:
# Here we initialize h2o so that this session is attached to an H2O cluster.
# The recorded output of this cell shows it checking for a running instance
# at http://localhost:54321 and connecting to it.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,3 hours 5 mins
H2O cluster timezone:,Europe/Amsterdam
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.7
H2O cluster version age:,19 days
H2O cluster name:,Daniel
H2O cluster total nodes:,1
H2O cluster free memory:,1.303 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [346]:
# Chronological split into training, validation and test sets.
# Rows 260-261 and 300-301 are left out between consecutive sets to reduce
# the effect of autocorrelation across the boundaries.
# Only the training slice drops rows with missing values (the leading rows
# have NaN lags); validation and test are taken as-is.
train_rows = data.iloc[:260].dropna()
validation_rows = data.iloc[262:300]
test_rows = data.iloc[302:]

# Convert the three pandas frames to H2OFrames, in the same order as before.
train, validation, test = (
    h2o.H2OFrame(rows) for rows in (train_rows, validation_rows, test_rows)
)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [347]:
# Identify the response column and the predictor columns used for modelling.
y = "invest"

# Besides the response itself, "Unnamed: 0" is kept out of the predictors:
# it is the unnamed first column of the CSV (presumably a saved row index --
# TODO confirm), which the scaling step also treated as a non-feature.
# NOTE(review): "date" is still passed as a predictor, as before -- confirm
# that this is intended.
excluded_columns = {y, "Unnamed: 0"}
x = [column for column in train.columns if column not in excluded_columns]

In [None]:
# The later cells read the third column of the prediction frame and score it
# with ROC AUC, which only works for a classification model. H2O picks
# classification only when the response is categorical, so convert it to a
# factor here; this leaves an already-categorical column as it is.
train[y] = train[y].asfactor()
validation[y] = validation[y].asfactor()

# Run AutoML for 120 seconds with a fixed seed.
aml = H2OAutoML(max_runtime_secs=120, seed=12)
aml.train(
    x=x,
    y=y,
    training_frame=train,
    validation_frame=validation,
)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb

AutoML progress: |█████████████████████████

In [None]:
# Score the test set with the AutoML leader model, then bring the H2O
# prediction frame back into pandas.
best_model = aml.leader
preds = best_model.predict(test)
predictions = preds.as_data_frame(use_pandas=True)

In [None]:
# Evaluate the leader model on the test set with ROC AUC.
test_df = test.as_data_frame(use_pandas=True)
y_test = test_df["invest"]

# Keep only the third column of the prediction frame as the score
# (presumably the positive-class probability -- TODO confirm against the
# frame's column names).
# The scores are re-derived from `preds` rather than from `predictions`
# itself, so this cell can be re-run: the earlier form rebound `predictions`
# from a DataFrame to an array and failed on a second run.
predictions = preds.as_data_frame(use_pandas=True).iloc[:, 2].values

# Last expression of the cell, so the AUC value is displayed.
metrics.roc_auc_score(y_test, predictions)

In [None]:
# Write the test-set prediction scores to disk as a CSV file.
output_path = "Data/model_lagged_predictions_final.csv"
output = pd.DataFrame(predictions)
output.to_csv(output_path)