In [1]:
#!/usr/bin/python


'''
rightSphnix:
* create Labels and store to disk
* build models
'''

##################################################################################
#########################       Configure       ##################################
##################################################################################

assets = 'Test'   # Typically AllStocks, SchwabOneSource, SchwabETFs, or Test
horizon = 2       # prediction horizon in days

totalBuildTimeAllowed_seconds = 28800


startDate = '2001-01-01'



In [2]:

##################################################################################
###########################       Imports       ##################################
##################################################################################
print('importing packages')
from multiprocessing import Pool
import pandas
import os
import uyulala
#reload(uyulala)

import datetime
import numpy
import random
import string
import subprocess
import time
from psutil import virtual_memory
import shutil
from pathlib import Path
import math
import glob

totMem = virtual_memory().total
availMem = virtual_memory().available

folderName = 'Assets-'+assets+'--Hrzn-'+str(horizon)


importing packages


In [3]:

##################################################################################
#################              Clear directories          ########################
##################################################################################
print('clearing directories')
try:
    [ os.remove(os.path.join(uyulala.dataDir,'labeled',folderName,f)) for f in os.listdir(os.path.join(uyulala.dataDir,'labeled',folderName)) if f.endswith(".csv") ]
except:
    os.makedirs(os.path.join(uyulala.dataDir,'labeled',folderName))

try:
    [ os.remove(os.path.join(uyulala.modelsDir,folderName,f)) for f in os.listdir(os.path.join(uyulala.modelsDir,folderName)) if f!='pca_model_id.txt'  ]
except:
    os.makedirs(os.path.join(uyulala.modelsDir,folderName))


clearing directories


In [4]:

'''
##################################################################################
################# Get and transform data (run leftSphnix) ########################
##################################################################################
print('getting and transforming data')
if assets!="Test":
    import warnings
    warnings.filterwarnings("ignore")


filePath = os.path.join(uyulala.uyulalaDir,'greatRiddleGate','leftSphnix.py')
print('making call: '+'python %s --assets=%s --horizon=%i --start=%s' % (filePath,assets,horizon,startDate))
subprocess.call('python %s --assets=%s --horizon=%i --start=%s' % (filePath,assets,horizon,startDate), shell=True)
'''




In [5]:

##################################################################################
########################       Create        ###############################
##################################################################################
print('creating ')

evaluate = [ f.replace('.csv','') for f in os.listdir(os.path.join(uyulala.dataDir,'transformed',folderName)) if f.endswith(".csv") ]


def createLabels(asset=''):
    try:
        labeled = pandas.read_csv(os.path.join(uyulala.dataDir,'raw',folderName,asset+'.csv'),parse_dates=['DateCol']).set_index('DateCol',drop=False)
        labeled = labeled.drop_duplicates(subset=['Date'], keep='last')
        #########################################################################
        # THE BELOW MUST REMAIN IN CORRECT ORDER SINCE CALLED BELOW BY POSITION #
        #########################################################################
        # Key Regression Field (what's the biggest loss?)
        print('label for biggest loss')
        labeled = uyulala.lowPercentChange(df=labeled,horizon=horizon)
        # Key Regression Field (what's the predicted return?)
        print('label for highest gain')
        labeled = uyulala.percentChange(df=labeled,horizon=horizon,HighOrClose='High')
        # Key Classification Field (is it a good buy?)
        print('label for whether higest gain comes before biggest loss')
        labeled = uyulala.expectedReturnPct(df=labeled,horizon=horizon)
        #add weights
        print('add weights column')
        labeled = uyulala.weights(df=labeled, horizon=horizon,weightForIncrease=1,weightForDecrease=2)
        # Clean-up
        labeled = labeled.drop(['Open','High','Low','Close','Volume'],axis=1)
        labeled.to_csv(os.path.join(uyulala.dataDir,'labeled',folderName,asset+'.csv'),index=False)
        return asset
    except:
        print('unable to create label for '+asset)
        pass


print('labelling data')
for i in range(0,len(evaluate),500):
    l = evaluate[i:i+500]
    pool = Pool(uyulala.availableCores,maxtasksperchild=1)
    pool.map(createLabels, l)
    pool.close()
    pool.join()

print('Done labelling data')



creating 
labelling data
label for biggest losslabel for biggest losslabel for biggest loss


label for highest gainlabel for highest gainlabel for highest gain


label for whether higest gain comes before biggest losslabel for whether higest gain comes before biggest loss

label for whether higest gain comes before biggest loss
add weights column
add weights column
add weights column
Done labelling data


In [6]:

##################################################################################
##########################       Load Data       #################################
##################################################################################


import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.frame import H2OFrame

try:
    h2o.init(nthreads = -1,max_mem_size="%sG" % int(totMem/1500000000/1.5),min_mem_size="%sG" % int(availMem/1500000000/1.5))
except:
    time.sleep(20)
    h2o.init(nthreads = -1,max_mem_size="%sG" % int(totMem/1500000000/1.5),min_mem_size="%sG" % int(availMem/1500000000/1.5))


Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_101"; Java(TM) SE Runtime Environment (build 1.8.0_101-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.101-b13, mixed mode)
  Starting server from /Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpu7n830tt
  JVM stdout: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpu7n830tt/h2o_Damian_started_from_python.out
  JVM stderr: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpu7n830tt/h2o_Damian_started_from_python.err
  Server is running at http://127.0.0.1:54323
Connecting to H2O server at http://127.0.0.1:54323 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,23 days
H2O_cluster_name:,H2O_from_python_Damian_arqt2n
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.223 Gb
H2O_cluster_total_cores:,24
H2O_cluster_allowed_cores:,24


In [7]:


print('importing data')

dataSize = sum(f.stat().st_size for f in Path(os.path.join(uyulala.dataDir,'transformed',folderName)).glob('**/*') if f.is_file() ) + sum(os.path.getsize(os.path.join(uyulala.dataDir,'labeled',folderName,f)) for f in os.listdir(os.path.join(uyulala.dataDir,'labeled',folderName)))
ratio = ((availMem/2000000000) / (20.0000000000000)) / (dataSize/1000000000)
print('full data size: {}gb'.format(dataSize/1000000000.00))

transformed_files = [file for file in os.listdir(os.path.join(uyulala.dataDir,'transformed',folderName)) if file not in ['.DS_Store']]
#transformed_pca_files = [file for file in os.listdir(os.path.join(uyulala.dataDir,'transformed_pca',folderName)) if file not in ['.DS_Store']]

def sampleAndCleanDataAsNeeded(transformed_files=transformed_files):
    if ratio < 0.98:
        print('reducing file size by {}%'.format(100*(1-ratio)))
        k=math.ceil(len(transformed_files)*ratio)
        sampledFiles=random.choices(transformed_files, k=max(1,min(k,len(transformed_files))))
    else:
        sampledFiles=transformed_files

    print('Files to use: {}'.format(sampledFiles))

    fullDF = h2o.import_file(path=os.path.join(uyulala.dataDir,'transformed',folderName),pattern = "(%s)" % ('|'.join(sampledFiles),),col_types={'DateCol':'enum','Date':'enum'}).na_omit().merge(h2o.import_file(path=os.path.join(uyulala.dataDir,'labeled',folderName),pattern = ".*\.csv",col_types={'DateCol':'enum','Date':'enum'}).na_omit()).na_omit()
    #fullDF = h2o.import_file(path=os.path.join(uyulala.dataDir,'transformed_pca',folderName),pattern = "(%s)" % ('|'.join(sampledFiles),),col_types={'DateCol':'enum','Date':'enum'}).na_omit().merge(h2o.import_file(path=os.path.join(uyulala.dataDir,'labeled',folderName),pattern = ".*\.csv",col_types={'DateCol':'enum','Date':'enum'}).na_omit()).na_omit()

    ##################################################################################
    #####################       Clean and Split Data       ############################
    ##################################################################################

    uniqueMonths = list(set([x[0:7] for x in fullDF['Date'].unique().as_data_frame()['C1'].tolist()]))
    holdoutMonths = random.choices(uniqueMonths, k=int(len(uniqueMonths)*.15))
    fullDF = fullDF.cbind(H2OFrame(fullDF.as_data_frame()['Date'].apply(lambda x:x[0:7]).to_frame(name='mnth'),column_types=['enum']))
    oot = fullDF[fullDF['mnth'].isin(holdoutMonths),:]
    fullDF = fullDF.drop('mnth')
    oot = oot.drop('mnth')

    print('Final data size is %s' % (fullDF.shape,))
    train,test = fullDF.split_frame(ratios=[.85])
    test = test.rbind(oot)
    print('Training data size: %s' % (train.shape,))
    print('Validation data size: %s' % (test.shape,))
    print(train.head(2))
    print(test.head(2))
    features = [s for s in fullDF.columns if "feat_" in s]
    #features = [s for s in fullDF.columns if "PC" in s]
    labels = [s for s in fullDF.columns if "lab_" in s]

    train = train[(train[labels[0]]>=train[labels[0]].mean()[0] - (2*train[labels[0]].sd()[0])) &
    (train[labels[0]]<=train[labels[0]].mean()[0] + (2*train[labels[0]].sd()[0])) &
    (train[labels[1]]>=train[labels[1]].mean()[0] - (2*train[labels[1]].sd()[0])) &
    (train[labels[1]]<=train[labels[1]].mean()[0] + (2*train[labels[1]].sd()[0]))]

    h2o.remove('oot')
    h2o.remove('fullDF')
    return train,test,labels,features



importing data
full data size: 0.003383502gb


In [10]:
train,test,labels,features = sampleAndCleanDataAsNeeded(transformed_files=transformed_files)

Files to use: ['CHIX.csv', 'QQQC.csv', 'SDEM.csv']
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Final data size is (1472, 128)
Training data size: (1233, 128)
Validation data size: (417, 128)


Date,Symbol,DateCol,feat_Open,feat_High,feat_Low,feat_Close,feat_Volume,feat_VROC11,feat_VROC7,feat_VROC5,feat_VROC3,feat_PROC11,feat_PROC7,feat_PROC2,feat_PROC3,feat_DOW,feat_RSI17,feat_RSI13,feat_RSI11,feat_RSIg7,feat_RSIg5,feat_RSIg3,feat_Close11/19SMARatio,feat_Close5/11SMARatio,feat_Close3/7SMARatio,feat_Close2PctFromSMA,feat_Close4PctFromSMA,feat_Close8PctFromSMA,feat_10CCI,feat_6CCI,feat_3CCI,feat_10FI,feat_6FI,feat_3FI,feat_10EMV,feat_6EMV,feat_3EMV,feat_Close10PctFromUpperBB,feat_Close10PctFromLowerBB,feat_Close10BBBandwidth,feat_Close10PctB,feat_Close6PctFromUpperBB,feat_Close6PctFromLowerBB,feat_Close6BBBandwidth,feat_Close6PctB,feat_Close3PctFromUpperBB,feat_Close3PctFromLowerBB,feat_Close3BBBandwidth,feat_Close3PctB,feat_DIRatio17,feat_ADX17,feat_DMI17,feat_AdjATR17,feat_DIRatio14,feat_ADX14,feat_DMI14,feat_AdjATR14,feat_DIRatio11,feat_ADX11,feat_DMI11,feat_AdjATR11,feat_DIRatio9,feat_ADX9,feat_DMI9,feat_AdjATR9,feat_DIRatio7,feat_ADX7,feat_DMI7,feat_AdjATR7,feat_DIRatio5,feat_ADX5,feat_DMI5,feat_AdjATR5,feat_DIRatio4,feat_ADX4,feat_DMI4,feat_AdjATR4,feat_DIRatio3,feat_ADX3,feat_DMI3,feat_AdjATR3,feat_DIRatio2,feat_ADX2,feat_DMI2,feat_AdjATR2,feat_Close91226MACD,feat_Close71123MACD,feat_235MACDg,feat_357MACDg,feat_5711MACDg,feat_11StocOsc,feat_11StocOscRatio,feat_7StocOsc,feat_7StocOscRatio,feat_5StocOsc,feat_5StocOscRatio,feat_3StocOsc,feat_3StocOscRatio,feat_11PriceChannelDist,feat_7PriceChannelDist,feat_5PriceChannelDist,feat_3PriceChannelDist,feat_PSAR,feat_10ADL,feat_10CMF,feat_10ChaikinOscillator,feat_5ADL,feat_5CMF,feat_5ChaikinOscillator,feat_10AroonUp,feat_10AroonDown,feat_10AroonOscillator,feat_5AroonUp,feat_5AroonDown,feat_5AroonOscillator,feat_autocorr10High1,feat_autocorr5High1,feat_autocorr5High2,feat_autocorr3High1,feat_autocorr3High2,feat_autocorr3High3,feat_autocorr2High2,feat_autocorr2High1,lab_lowPercentChange_H2,lab_percentChange_H2High,lab_expectedReturnPct_H2,weights
2018-01-29,QQQC,2018-01-29,33.52,33.52,33.0,33.09,43500,3.02778,1.00461,1.51445,0.520979,0.0200852,0.0172989,-0.00681481,-0.0158544,1,100,100,100,100,100,100,0.0055529,0.0040961,0.000439317,-0.0147387,-0.015618,-0.0123862,-24.5769,-104.075,-65.2611,0.00532378,0.00583593,0.00498633,0.000254589,-0.0274721,-0.0933507,-0.0336929,0.0178335,5.19383,0.334443,-0.0339628,0.00313279,3.76764,0.0815836,-0.0439932,0.0143894,5.92598,0.235625,72.0074,72.0074,51.8507,0.0147691,71.2545,71.2545,50.772,0.0153551,70.4897,70.4897,49.688,0.016836,69.9576,69.9576,48.9407,0.0168328,68.9846,68.9846,47.5888,0.0166809,64.9319,64.9319,42.1615,0.0190533,58.4735,58.4735,34.1915,0.0211492,43.663,53.8032,23.4921,0.0198802,9.00012,57.5215,5.177,0.0224129,0.0644791,0.0226054,-1.32587,-0.269581,-0.0960192,35.8491,-0.486126,8.10811,-0.865892,8.10811,-0.848038,8.10811,-0.848038,-0.00675371,-0.0138578,-0.0138578,-0.0138578,-0.0299033,0.177129,0.155732,0.0,0.158157,0.0791925,-0.171255,90,20,70,80,100,-20,0.592387,-0.576956,0.352371,-0.964164,0.911224,-0.567063,1,-1,-0.0231496,0.0261956,n-2,2
2018-01-30,CHIX,2018-01-30,18.6,18.71,18.48,18.55,56600,0.280543,0.612536,-0.358277,0.607955,0.0458357,0.00483351,-0.0285566,-0.00478723,2,100,100,100,100,100,100,0.0137898,0.00578382,0.000379075,-0.00562852,-0.0142155,-0.0134946,-26.4599,-88.3808,-60.7058,0.0054197,0.00450123,0.004331,0.00127872,-0.00727705,-0.00876567,-0.0386485,0.0245082,6.36063,0.373056,-0.0408596,0.00907903,5.0745,0.174375,-0.0515094,0.0224548,7.50546,0.287953,79.9021,79.9021,63.8434,0.0144861,75.4258,75.4258,56.8906,0.0159506,67.8277,67.8277,46.006,0.0174314,59.3509,59.3509,35.2253,0.0166187,45.5136,52.2672,23.7887,0.0182098,21.3328,43.7308,9.32899,0.0208352,2.44845,40.6149,0.994438,0.0218795,-23.5569,48.4502,-11.4134,0.0229671,-59.6172,70.3765,-41.9565,0.0204319,0.0557491,0.0120404,-5.63807,-0.314211,-0.133945,48.5507,-0.330371,8.97436,-0.810371,8.97436,-0.793471,8.97436,-0.793196,-0.00107701,-0.0169581,-0.0169581,-0.0169581,-0.036864,0.264216,0.154216,0.0200817,-0.216544,-0.220365,-0.3144,80,10,70,60,100,-40,0.665055,-0.482434,-0.268992,-0.380017,0.0364025,0.312363,-1,1,-0.0227393,0.0,p0,2





Date,Symbol,DateCol,feat_Open,feat_High,feat_Low,feat_Close,feat_Volume,feat_VROC11,feat_VROC7,feat_VROC5,feat_VROC3,feat_PROC11,feat_PROC7,feat_PROC2,feat_PROC3,feat_DOW,feat_RSI17,feat_RSI13,feat_RSI11,feat_RSIg7,feat_RSIg5,feat_RSIg3,feat_Close11/19SMARatio,feat_Close5/11SMARatio,feat_Close3/7SMARatio,feat_Close2PctFromSMA,feat_Close4PctFromSMA,feat_Close8PctFromSMA,feat_10CCI,feat_6CCI,feat_3CCI,feat_10FI,feat_6FI,feat_3FI,feat_10EMV,feat_6EMV,feat_3EMV,feat_Close10PctFromUpperBB,feat_Close10PctFromLowerBB,feat_Close10BBBandwidth,feat_Close10PctB,feat_Close6PctFromUpperBB,feat_Close6PctFromLowerBB,feat_Close6BBBandwidth,feat_Close6PctB,feat_Close3PctFromUpperBB,feat_Close3PctFromLowerBB,feat_Close3BBBandwidth,feat_Close3PctB,feat_DIRatio17,feat_ADX17,feat_DMI17,feat_AdjATR17,feat_DIRatio14,feat_ADX14,feat_DMI14,feat_AdjATR14,feat_DIRatio11,feat_ADX11,feat_DMI11,feat_AdjATR11,feat_DIRatio9,feat_ADX9,feat_DMI9,feat_AdjATR9,feat_DIRatio7,feat_ADX7,feat_DMI7,feat_AdjATR7,feat_DIRatio5,feat_ADX5,feat_DMI5,feat_AdjATR5,feat_DIRatio4,feat_ADX4,feat_DMI4,feat_AdjATR4,feat_DIRatio3,feat_ADX3,feat_DMI3,feat_AdjATR3,feat_DIRatio2,feat_ADX2,feat_DMI2,feat_AdjATR2,feat_Close91226MACD,feat_Close71123MACD,feat_235MACDg,feat_357MACDg,feat_5711MACDg,feat_11StocOsc,feat_11StocOscRatio,feat_7StocOsc,feat_7StocOscRatio,feat_5StocOsc,feat_5StocOscRatio,feat_3StocOsc,feat_3StocOscRatio,feat_11PriceChannelDist,feat_7PriceChannelDist,feat_5PriceChannelDist,feat_3PriceChannelDist,feat_PSAR,feat_10ADL,feat_10CMF,feat_10ChaikinOscillator,feat_5ADL,feat_5CMF,feat_5ChaikinOscillator,feat_10AroonUp,feat_10AroonDown,feat_10AroonOscillator,feat_5AroonUp,feat_5AroonDown,feat_5AroonOscillator,feat_autocorr10High1,feat_autocorr5High1,feat_autocorr5High2,feat_autocorr3High1,feat_autocorr3High2,feat_autocorr3High3,feat_autocorr2High2,feat_autocorr2High1,lab_lowPercentChange_H2,lab_percentChange_H2High,lab_expectedReturnPct_H2,weights
2018-01-29,CHIX,2018-01-29,18.88,18.92,18.75,18.76,65200,1.50769,1.61847,0.122203,0.241905,0.0768355,0.0299401,0.00638298,-0.0150963,1,100,100,100,100,100,100,0.0142327,0.00961287,0.00176616,-0.0126316,-0.00832562,-0.000798935,30.5998,-23.318,-24.0426,0.00520092,0.00403969,0.00308004,0.0111273,0.00381212,-0.028138,-0.0339961,0.0509928,8.42727,0.579596,-0.0280911,0.0160067,4.43658,0.352785,-0.0372279,0.0235743,6.12201,0.373287,87.5146,87.5146,76.5881,0.0141502,84.7113,84.7113,71.76,0.0153306,79.8944,79.8944,63.8311,0.0177479,74.4307,74.4307,55.3992,0.0173859,65.3107,65.3107,42.6548,0.0183736,48.8142,48.8142,23.8283,0.0210034,35.4905,36.2867,12.8783,0.0223027,16.5582,33.2287,5.50208,0.0241843,-11.9194,44.1973,-5.26804,0.026968,0.122564,0.0772546,-0.242097,-0.112087,-0.0216683,70.0599,-0.131625,35.0649,-0.439358,24.2424,-0.525681,24.2424,-0.48378,0.0181818,-0.00609272,-0.00898045,-0.00898045,0.00860215,0.0920873,0.0764525,0.0,0.0733793,0.0611734,-0.183515,90,10,80,80,60,20,0.777965,-0.579427,0.230812,-0.9382,0.759993,0.305754,1,-1,-0.00645161,0.0166667,n0,1
2018-02-01,CHIX,2018-02-01,18.54,18.64,18.48,18.48,38100,-0.362876,-0.568027,0.0823864,-0.415644,0.0191361,-0.0179136,-0.00374131,-0.0147992,4,100,100,100,100,100,100,0.0122732,0.000873024,-0.00485344,-0.00778523,-0.00858369,-0.017152,-75.4162,-67.1874,-49.1627,0.00423439,0.0034489,0.00318974,-0.00243184,-0.0210309,-0.0197531,-0.0386309,0.00909642,4.84427,0.183229,-0.0417795,0.014219,5.67809,0.243308,-0.0223596,0.00998259,3.25435,0.301754,67.1859,67.1859,45.1395,0.0155406,60.5062,60.5062,36.61,0.0174848,49.7552,49.7552,24.7558,0.0167422,38.5929,41.9822,16.2021,0.0180072,22.1364,35.7941,7.92353,0.0198245,-1.76651,30.8511,-0.544987,0.0206993,-16.4175,31.9194,-5.24038,0.0188648,-31.2854,38.1181,-11.9254,0.0165202,-44.9132,54.5986,-24.522,0.0172977,-0.0296007,-0.0658694,0.371613,-0.775105,-0.264502,19.5876,-0.529051,0.0,-1.0,0.0,-1.0,0.0,-1.0,-0.0157124,-0.0206677,-0.0206677,-0.0115004,-0.0389572,0.065686,-0.00769396,-0.0204671,-0.321731,-0.328254,-0.223028,60,100,-40,20,100,-80,0.241813,-0.178163,0.404035,-0.956895,0.116569,0.874376,1,-1,-0.0438312,0.000541126,p0,2





In [12]:
gbm = H2OGradientBoostingEstimator(distribution="quantile", quantile_alpha = quantile_alpha, nfolds = 6, ntrees=10000, learn_rate_annealing=0.99)
gbm.train(x=features,y=label,training_frame=train,weights_column='weights')



gbm Model Build progress: |███████████████████████████████████████████████ (failed) 100%


OSError: Job with key $03017f00000134d4ffffffff$_82f8e5ccb41dae98894ae021b8454a34 failed with an exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_model_python_1603379487373_145_cv_1.  Details: ERRR on field: _ntrees: The tree model will not fit in the driver node's memory ( 182  B per tree x 10000 > Zero  ) - try decreasing ntrees and/or max_depth or increasing min_rows!

stacktrace: 
water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_model_python_1603379487373_145_cv_1.  Details: ERRR on field: _ntrees: The tree model will not fit in the driver node's memory ( 182  B per tree x 10000 > Zero  ) - try decreasing ntrees and/or max_depth or increasing min_rows!

	at water.exceptions.H2OModelBuilderIllegalArgumentException.makeFromBuilder(H2OModelBuilderIllegalArgumentException.java:19)
	at hex.tree.SharedTree.doScoringAndSaveModel(SharedTree.java:729)
	at hex.tree.SharedTree$Driver.scoreAndBuildTrees(SharedTree.java:443)
	at hex.tree.SharedTree$Driver.computeImpl(SharedTree.java:370)
	at hex.ModelBuilder$Driver.compute2(ModelBuilder.java:238)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1563)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:974)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1477)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


In [11]:

print('building models')
#timePerRun = int(totalBuildTimeAllowed_seconds / (len(labels)*1.0000000000))
timePerRun = 999999
print('Time per run: ' + str(timePerRun) + ' seconds')


label='lab_lowPercentChange_H2'
quantile_alpha=.05
perf_metric='mse'

print('building model for {}'.format(label))
# GBM hyperparameters
hyper_params = {'learn_rate': [i * 0.01 for i in range(1,21,2)],
                'max_depth': list(range(3, 22, 3)),
                'sample_rate': [i * 0.1 for i in range(5, 10)],
                'col_sample_rate': [i * 0.1 for i in range(3, 10)],
                'col_sample_rate_per_tree': [i * 0.1 for i in range(5, 8)],
                'min_rows': [50, 100, 500],
                'min_split_improvement': [1e-3, 1e-5]}
# Search criteria
search_criteria = {'strategy':'RandomDiscrete', 'max_models':50, 'max_runtime_secs':timePerRun,
                    'stopping_metric':perf_metric, 'stopping_rounds':5}
gbm = H2OGradientBoostingEstimator(distribution="quantile", quantile_alpha = quantile_alpha, nfolds = 6, ntrees=10000, learn_rate_annealing=0.99)
# Train and validate a random grid of GBMs
gbm_grid = H2OGridSearch(model=gbm,
                          grid_id='gbm_grid_{}'.format(label),
                          hyper_params=hyper_params,
                          search_criteria=search_criteria,parallelism=0)
gbm_grid.train(x=features,y=label,training_frame=train,weights_column='weights')


building models
Time per run: 999999 seconds
building model for lab_lowPercentChange_H2
gbm Grid Build progress: |█ (cancelled)


H2OJobCancelled: Job<$03017f00000134d4ffffffff$_a9e57f9fb067acca2978f54385384bc4> was cancelled by the user.

In [None]:
# Grab the top GBM model, chosen by validation metric
print('choosing top model...')
for i in range(max(10,len(gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models))):
    ithPerf = eval('''gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models[{}].model_performance(test).{}()'''.format(i,perf_metric))
    print('model {0} {1}: {2}'.format(i,perf_metric,ithPerf))
    if i==0:
        leadingPerf = ithPerf
        leadingModel = gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models[i]
        print('''leading test performance: {}'''.format(leadingPerf))
    else:
        if ithPerf < leadingPerf:
            leadingPerf = ithPerf
            leadingModel = gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models[i]
            print('''leading test performance: {}'''.format(leadingPerf))
print('variable importance:')
print(leadingModel.varimp(use_pandas=True))


In [8]:

##################################################################################
#######################       Building Models       ##############################
##################################################################################


print('building models')
#timePerRun = int(totalBuildTimeAllowed_seconds / (len(labels)*1.0000000000))
timePerRun = 999999
print('Time per run: ' + str(timePerRun) + ' seconds')

#executionOrder = []


def createQuantileModel(label,quantile_alpha=.05,perf_metric='mse'):
    print('building model for {}'.format(label))
    # GBM hyperparameters
    hyper_params = {'learn_rate': [i * 0.01 for i in range(1,21,2)],
                    'max_depth': list(range(3, 22, 3)),
                    'sample_rate': [i * 0.1 for i in range(5, 10)],
                    'col_sample_rate': [i * 0.1 for i in range(3, 10)],
                    'col_sample_rate_per_tree': [i * 0.1 for i in range(5, 8)],
                    'min_rows': [50, 100, 500],
                    'min_split_improvement': [1e-3, 1e-5]}
    # Search criteria
    search_criteria = {'strategy':'RandomDiscrete', 'max_models':500, 'max_runtime_secs':timePerRun,
                        'stopping_metric':perf_metric, 'stopping_rounds':5}
    gbm = H2OGradientBoostingEstimator(distribution="quantile", quantile_alpha = quantile_alpha, nfolds = 6, ntrees=10000, learn_rate_annealing=0.99)
    # Train and validate a random grid of GBMs
    gbm_grid = H2OGridSearch(model=gbm,
                              grid_id='gbm_grid_{}'.format(label),
                              hyper_params=hyper_params,
                              search_criteria=search_criteria,parallelism=0)
    gbm_grid.train(x=features,y=label,training_frame=train,weights_column='weights')
    # Grab the top GBM model, chosen by validation metric
    print('choosing top model...')
    for i in range(max(10,len(gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models))):
        ithPerf = eval('''gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models[{}].model_performance(test).{}()'''.format(i,perf_metric))
        print('model {0} {1}: {2}'.format(i,perf_metric,ithPerf))
        if i==0:
            leadingPerf = ithPerf
            leadingModel = gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models[i]
            print('''leading test performance: {}'''.format(leadingPerf))
        else:
            if ithPerf < leadingPerf:
                leadingPerf = ithPerf
                leadingModel = gbm_grid.get_grid(sort_by=perf_metric, decreasing=False).models[i]
                print('''leading test performance: {}'''.format(leadingPerf))
    print('variable importance:')
    print(leadingModel.varimp(use_pandas=True))
    return leadingModel

r=1
print('''model building round {}'''.format(r))
while len(transformed_files)>0:
    train,test,labels,features = sampleAndCleanDataAsNeeded(transformed_files=transformed_files)
    low_quantile_model = createQuantileModel(label=labels[0], quantile_alpha=.05, perf_metric='mae')
    #executionOrder.append(low_quantile_model.model_id)
    h2o.save_model(model=low_quantile_model, path=os.path.join(uyulala.modelsDir,folderName,labels[0]), force=True)
    high_quantile_model = createQuantileModel(label=labels[1], quantile_alpha=.05, perf_metric='mae')
    #executionOrder.append(high_quantile_model.model_id)
    h2o.save_model(model=high_quantile_model, path=os.path.join(uyulala.modelsDir,folderName,labels[1]), force=True)

    print('building model for {}'.format(labels[2]))
    aml = H2OAutoML(project_name=labels[2],
                    stopping_rounds=5,max_models=500,
                    max_runtime_secs = timePerRun)
    aml.train(x=features,y=labels[2],training_frame=train,leaderboard_frame=test,weights_column='weights')
    #executionOrder = executionOrder + [aml._leader_id]
    h2o.save_model(model=aml.leader, path=os.path.join(uyulala.modelsDir,folderName,labels[2]), force=True)
    print('variable importance:')
    print(aml.leader.varimp(use_pandas=True))
    transformed_files = [x for x in transformed_files if x not in sampledFiles]
    r=r+1
print('done building models')





building models
Time per run: 999999 seconds
model building round 1
Files to use: ['CHIX.csv', 'QQQC.csv', 'SDEM.csv']
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Final data size is (1472, 128)
Training data size: (1261, 128)
Validation data size: (400, 128)


DateCol,Symbol,Date,feat_Open,feat_High,feat_Low,feat_Close,feat_Volume,feat_VROC11,feat_VROC7,feat_VROC5,feat_VROC3,feat_PROC11,feat_PROC7,feat_PROC2,feat_PROC3,feat_DOW,feat_RSI17,feat_RSI13,feat_RSI11,feat_RSIg7,feat_RSIg5,feat_RSIg3,feat_Close11/19SMARatio,feat_Close5/11SMARatio,feat_Close3/7SMARatio,feat_Close2PctFromSMA,feat_Close4PctFromSMA,feat_Close8PctFromSMA,feat_10CCI,feat_6CCI,feat_3CCI,feat_10FI,feat_6FI,feat_3FI,feat_10EMV,feat_6EMV,feat_3EMV,feat_Close10PctFromUpperBB,feat_Close10PctFromLowerBB,feat_Close10BBBandwidth,feat_Close10PctB,feat_Close6PctFromUpperBB,feat_Close6PctFromLowerBB,feat_Close6BBBandwidth,feat_Close6PctB,feat_Close3PctFromUpperBB,feat_Close3PctFromLowerBB,feat_Close3BBBandwidth,feat_Close3PctB,feat_DIRatio17,feat_ADX17,feat_DMI17,feat_AdjATR17,feat_DIRatio14,feat_ADX14,feat_DMI14,feat_AdjATR14,feat_DIRatio11,feat_ADX11,feat_DMI11,feat_AdjATR11,feat_DIRatio9,feat_ADX9,feat_DMI9,feat_AdjATR9,feat_DIRatio7,feat_ADX7,feat_DMI7,feat_AdjATR7,feat_DIRatio5,feat_ADX5,feat_DMI5,feat_AdjATR5,feat_DIRatio4,feat_ADX4,feat_DMI4,feat_AdjATR4,feat_DIRatio3,feat_ADX3,feat_DMI3,feat_AdjATR3,feat_DIRatio2,feat_ADX2,feat_DMI2,feat_AdjATR2,feat_Close91226MACD,feat_Close71123MACD,feat_235MACDg,feat_357MACDg,feat_5711MACDg,feat_11StocOsc,feat_11StocOscRatio,feat_7StocOsc,feat_7StocOscRatio,feat_5StocOsc,feat_5StocOscRatio,feat_3StocOsc,feat_3StocOscRatio,feat_11PriceChannelDist,feat_7PriceChannelDist,feat_5PriceChannelDist,feat_3PriceChannelDist,feat_PSAR,feat_10ADL,feat_10CMF,feat_10ChaikinOscillator,feat_5ADL,feat_5CMF,feat_5ChaikinOscillator,feat_10AroonUp,feat_10AroonDown,feat_10AroonOscillator,feat_5AroonUp,feat_5AroonDown,feat_5AroonOscillator,feat_autocorr10High1,feat_autocorr5High1,feat_autocorr5High2,feat_autocorr3High1,feat_autocorr3High2,feat_autocorr3High3,feat_autocorr2High2,feat_autocorr2High1,lab_lowPercentChange_H2,lab_percentChange_H2High,lab_expectedReturnPct_H2,weights
2018-01-29,CHIX,2018-01-29,18.88,18.92,18.75,18.76,65200,1.50769,1.61847,0.122203,0.241905,0.0768355,0.0299401,0.00638298,-0.0150963,1,100,100,100,100,100,100,0.0142327,0.00961287,0.00176616,-0.0126316,-0.00832562,-0.000798935,30.5998,-23.318,-24.0426,0.00520092,0.00403969,0.00308004,0.0111273,0.00381212,-0.028138,-0.0339961,0.0509928,8.42727,0.579596,-0.0280911,0.0160067,4.43658,0.352785,-0.0372279,0.0235743,6.12201,0.373287,87.5146,87.5146,76.5881,0.0141502,84.7113,84.7113,71.76,0.0153306,79.8944,79.8944,63.8311,0.0177479,74.4307,74.4307,55.3992,0.0173859,65.3107,65.3107,42.6548,0.0183736,48.8142,48.8142,23.8283,0.0210034,35.4905,36.2867,12.8783,0.0223027,16.5582,33.2287,5.50208,0.0241843,-11.9194,44.1973,-5.26804,0.026968,0.122564,0.0772546,-0.242097,-0.112087,-0.0216683,70.0599,-0.131625,35.0649,-0.439358,24.2424,-0.525681,24.2424,-0.48378,0.0181818,-0.00609272,-0.00898045,-0.00898045,0.00860215,0.0920873,0.0764525,0,0.0733793,0.0611734,-0.183515,90,10,80,80,60,20,0.777965,-0.579427,0.230812,-0.9382,0.759993,0.305754,1,-1,-0.00645161,0.0166667,n0,1
2018-01-29,QQQC,2018-01-29,33.52,33.52,33.0,33.09,43500,3.02778,1.00461,1.51445,0.520979,0.0200852,0.0172989,-0.00681481,-0.0158544,1,100,100,100,100,100,100,0.0055529,0.0040961,0.000439317,-0.0147387,-0.015618,-0.0123862,-24.5769,-104.075,-65.2611,0.00532378,0.00583593,0.00498633,0.000254589,-0.0274721,-0.0933507,-0.0336929,0.0178335,5.19383,0.334443,-0.0339628,0.00313279,3.76764,0.0815836,-0.0439932,0.0143894,5.92598,0.235625,72.0074,72.0074,51.8507,0.0147691,71.2545,71.2545,50.772,0.0153551,70.4897,70.4897,49.688,0.016836,69.9576,69.9576,48.9407,0.0168328,68.9846,68.9846,47.5888,0.0166809,64.9319,64.9319,42.1615,0.0190533,58.4735,58.4735,34.1915,0.0211492,43.663,53.8032,23.4921,0.0198802,9.00012,57.5215,5.177,0.0224129,0.0644791,0.0226054,-1.32587,-0.269581,-0.0960192,35.8491,-0.486126,8.10811,-0.865892,8.10811,-0.848038,8.10811,-0.848038,-0.00675371,-0.0138578,-0.0138578,-0.0138578,-0.0299033,0.177129,0.155732,0,0.158157,0.0791925,-0.171255,90,20,70,80,100,-20,0.592387,-0.576956,0.352371,-0.964164,0.911224,-0.567063,1,-1,-0.0231496,0.0261956,n-2,2





DateCol,Symbol,Date,feat_Open,feat_High,feat_Low,feat_Close,feat_Volume,feat_VROC11,feat_VROC7,feat_VROC5,feat_VROC3,feat_PROC11,feat_PROC7,feat_PROC2,feat_PROC3,feat_DOW,feat_RSI17,feat_RSI13,feat_RSI11,feat_RSIg7,feat_RSIg5,feat_RSIg3,feat_Close11/19SMARatio,feat_Close5/11SMARatio,feat_Close3/7SMARatio,feat_Close2PctFromSMA,feat_Close4PctFromSMA,feat_Close8PctFromSMA,feat_10CCI,feat_6CCI,feat_3CCI,feat_10FI,feat_6FI,feat_3FI,feat_10EMV,feat_6EMV,feat_3EMV,feat_Close10PctFromUpperBB,feat_Close10PctFromLowerBB,feat_Close10BBBandwidth,feat_Close10PctB,feat_Close6PctFromUpperBB,feat_Close6PctFromLowerBB,feat_Close6BBBandwidth,feat_Close6PctB,feat_Close3PctFromUpperBB,feat_Close3PctFromLowerBB,feat_Close3BBBandwidth,feat_Close3PctB,feat_DIRatio17,feat_ADX17,feat_DMI17,feat_AdjATR17,feat_DIRatio14,feat_ADX14,feat_DMI14,feat_AdjATR14,feat_DIRatio11,feat_ADX11,feat_DMI11,feat_AdjATR11,feat_DIRatio9,feat_ADX9,feat_DMI9,feat_AdjATR9,feat_DIRatio7,feat_ADX7,feat_DMI7,feat_AdjATR7,feat_DIRatio5,feat_ADX5,feat_DMI5,feat_AdjATR5,feat_DIRatio4,feat_ADX4,feat_DMI4,feat_AdjATR4,feat_DIRatio3,feat_ADX3,feat_DMI3,feat_AdjATR3,feat_DIRatio2,feat_ADX2,feat_DMI2,feat_AdjATR2,feat_Close91226MACD,feat_Close71123MACD,feat_235MACDg,feat_357MACDg,feat_5711MACDg,feat_11StocOsc,feat_11StocOscRatio,feat_7StocOsc,feat_7StocOscRatio,feat_5StocOsc,feat_5StocOscRatio,feat_3StocOsc,feat_3StocOscRatio,feat_11PriceChannelDist,feat_7PriceChannelDist,feat_5PriceChannelDist,feat_3PriceChannelDist,feat_PSAR,feat_10ADL,feat_10CMF,feat_10ChaikinOscillator,feat_5ADL,feat_5CMF,feat_5ChaikinOscillator,feat_10AroonUp,feat_10AroonDown,feat_10AroonOscillator,feat_5AroonUp,feat_5AroonDown,feat_5AroonOscillator,feat_autocorr10High1,feat_autocorr5High1,feat_autocorr5High2,feat_autocorr3High1,feat_autocorr3High2,feat_autocorr3High3,feat_autocorr2High2,feat_autocorr2High1,lab_lowPercentChange_H2,lab_percentChange_H2High,lab_expectedReturnPct_H2,weights
2018-02-05,SDEM,2018-02-05,14.25,14.36,13.88,13.9,13100,0.617284,3.09375,1.18333,0.364583,-0.00554017,-0.0471135,-0.0264407,-0.0290737,1,100,100,100,100,100,100,0.00744729,-0.00733802,-0.00978648,-0.0145339,-0.0343869,-0.0513564,-143.872,-115.167,-66.4456,0.00448255,0.00442847,0.00460392,-0.00240797,-0.0316613,-0.0659749,-0.095523,-0.00515337,9.51598,-0.0515782,-0.0866729,0.00620723,9.67736,0.0610382,-0.0812954,0.0303929,11.4605,0.25,48.1243,49.132,23.6444,0.0131992,42.7883,46.5077,19.8999,0.0141048,33.0412,43.168,14.2632,0.015105,21.6325,42.2623,9.14239,0.0165609,2.92777,42.3498,1.2399,0.0170907,-26.977,48.7506,-13.1515,0.0193848,-46.233,56.782,-26.252,0.0198152,-66.2545,69.3652,-45.9575,0.0228143,-84.6433,86.8622,-73.5231,0.0308315,-0.462988,-0.576425,0.210675,0.567551,-4.68015,1.68067,-0.903751,1.68067,-0.824711,2.1978,-0.774824,2.29885,-0.897523,-0.0397237,-0.0397237,-0.0303453,-0.0289906,-0.074105,-0.102219,-0.140455,-0.165381,-0.665297,-0.593689,-0.418541,40,100,-60,40,100,-60,0.707053,0.738958,0.452235,0.882723,-0.518046,0.701599,1,1,0.0,0.0396254,n0,1
2018-02-09,QQQC,2018-02-09,28.69,29.25,27.8,29.23,38800,5.80702,1.20455,0.545817,-0.337884,-0.133333,-0.13179,-0.0400394,-0.0561471,5,100,100,100,100,100,100,-0.0129944,-0.0250955,-0.01812,0.00688942,-0.0144146,-0.0473785,-102.704,-88.2292,-59.4928,0.0193095,0.020015,0.021399,-0.444274,-0.740387,-0.712263,-0.142871,0.0408889,19.3634,0.190721,-0.0863411,0.0375886,12.7026,0.277118,-0.0358625,0.0334922,6.94369,0.465594,9.07685,43.0795,3.91026,0.0275966,-3.07617,44.8134,-1.37854,0.0309989,-20.4128,49.3929,-10.0825,0.0348677,-35.9765,56.0137,-20.1518,0.0376354,-55.2814,66.4742,-36.7478,0.0383102,-77.1996,80.906,-62.4592,0.0433,-87.5693,88.9221,-77.8685,0.0439803,-95.6473,95.9276,-91.7521,0.0405701,-99.5933,99.6143,-99.2092,0.0457584,0.351187,0.261811,-0.013231,0.0397584,0.151822,22.6624,0.962558,29.668,1.08018,35.8396,1.01488,53.5581,1.0815,-0.0557261,-0.0324396,-0.0189629,0.00326068,-0.0923519,-0.267035,-0.187263,-0.0946378,0.105395,0.134919,0.233549,30,100,-70,20,100,-80,0.938188,0.975839,0.982063,0.975153,0.95807,0.996139,1,1,-0.00343643,0.0439863,n0,1



building model for lab_lowPercentChange_H2
gbm Grid Build progress: |█

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2878, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-b3791d5977cd>", line 57, in <module>
    low_quantile_model = createQuantileModel(label=labels[0], quantile_alpha=.05, perf_metric='mae')
  File "<ipython-input-8-b3791d5977cd>", line 34, in createQuantileModel
    gbm_grid.train(x=features,y=label,training_frame=train,weights_column='weights')
  File "/Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/h2o/grid/grid_search.py", line 342, in train
    self.build_model(parms)
  File "/Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/h2o/grid/grid_search.py", line 361, in build_model
    self._model_build(x, y, training_frame, validation_frame, algo_params)
  File "/Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/h2o/grid/grid_search.py",

KeyboardInterrupt: 

In [None]:


##################################################################################
######################       Check Predictions       #############################
##################################################################################


train['dataset'] = 'train'
test['dataset'] = 'test'
fullDF = train.rbind(test)
h2o.remove('train')
h2o.remove('test')

preds = low_quantile_model.predict(fullDF)
preds.set_names([x+'_low' for x in preds.names])
fullDF = fullDF.cbind(preds)

preds = high_quantile_model.predict(fullDF)
preds.set_names([x+'_high' for x in preds.names])
fullDF = fullDF.cbind(preds)

preds = aml.leader.predict(fullDF)
preds.set_names([x+'_expectedReturnPct' for x in preds.names])
fullDF = fullDF.cbind(preds)


with open(os.path.join(uyulala.modelsDir,folderName,"executionOrder.txt"), "w") as output:
    output.write(str(executionOrder))
print(fullDF.head(2))
h2o.export_file(fullDF, path=os.path.join(uyulala.dataDir,'model_data',folderName), force = True, parts=-1)


print('done')
