In [1]:
#!/usr/bin/python

"""
rightSphnix notebook:
  1. create labels and store them to disk
  2. build models
"""

# ---------------------------------------------------------------------------------
# Configure: the values below are the knobs for a run of this notebook.
# ---------------------------------------------------------------------------------

# Asset set to process. Typically AllStocks, SchwabOneSource, SchwabETFs, or Test.
assets = 'Test'

# Prediction horizon in days.
horizon = 2

# Total build time allowed, in seconds (8 hours).
totalBuildTimeAllowed_seconds = 8 * 60 * 60

# Start date as a YYYY-MM-DD string.
startDate = '2010-01-01'



In [2]:

##################################################################################
###########################       Imports       ##################################
##################################################################################
print('importing packages')
from multiprocessing import Pool
import pandas
import os
import uyulala
#reload(uyulala)

import datetime
import numpy
import random
import string
import subprocess
import time
from psutil import virtual_memory
import shutil
from pathlib import Path
import math
import glob
import shutil

# Total and currently available system memory, as reported by psutil.
totMem, availMem = virtual_memory().total, virtual_memory().available

# Sub-folder name used for this run, built from the asset set and the horizon.
folderName = 'Assets-{}--Hrzn-{}'.format(assets, str(horizon))


importing packages


In [3]:

##################################################################################
#################              Clear directories          ########################
##################################################################################
print('clearing directories')
# Start the run from empty output folders: one for labeled data, one for models.
for outputDir in (os.path.join(uyulala.dataDir,'labeled',folderName),
                  os.path.join(uyulala.modelsDir,folderName)):
    # Remove the folder only when it exists, so that a genuine failure while
    # deleting (e.g. a permission error) is raised instead of being swallowed.
    if os.path.isdir(outputDir):
        shutil.rmtree(outputDir)
    os.makedirs(outputDir)


clearing directories


In [4]:
# NOTE: the whole body of this cell is wrapped in a triple-quoted string, so none
# of the code below executes; the cell only evaluates to a string literal.
# NOTE(review): if it is re-enabled, it runs leftSphnix.py through a shell
# (shell=True) with interpolated arguments -- confirm those values are trusted.
'''
##################################################################################
################# Get and transform data (run leftSphnix) ########################
##################################################################################
print('getting and transforming data')
if assets!="Test":
    import warnings
    warnings.filterwarnings("ignore")


filePath = os.path.join(uyulala.uyulalaDir,'greatRiddleGate','leftSphnix.py')
print('making call: '+'python %s --assets=%s --horizon=%i --start=%s' % (filePath,assets,horizon,startDate))
subprocess.call('python %s --assets=%s --horizon=%i --start=%s' % (filePath,assets,horizon,startDate), shell=True)

'''

getting and transforming data
making call: python /Users/Damian/Documents/uyulala/greatRiddleGate/leftSphnix.py --assets=Test --horizon=2 --start=2010-01-01


0

In [5]:

##################################################################################
#################       Collect the assets to label       ########################
##################################################################################
print('creating ')

# Each CSV in the raw data folder is one asset to label. The asset name is the
# file name with every '.csv' occurrence removed.
rawDir = os.path.join(uyulala.dataDir,'raw',folderName)
evaluate = []
for fileName in os.listdir(rawDir):
    if fileName.endswith(".csv"):
        evaluate.append(fileName.replace('.csv',''))


creating 


In [6]:

def createLabels(asset=''):
    """Build the label columns for one asset and write them to the labeled dataset.

    Reads ``<dataDir>/raw/<folderName>/<asset>.csv``, keeps the last row for each
    ``Date``, adds the label and weight columns through the ``uyulala`` helpers,
    drops the raw Open/High/Low/Close/Volume columns and writes the result as
    parquet partitioned by ``YearMo`` and ``Asset``.

    Parameters
    ----------
    asset : str
        File stem of the raw CSV to label.

    Returns
    -------
    str or None
        ``asset`` on success. ``None`` when labelling fails; the reason is printed
        so that one bad asset does not abort a whole batch of assets.
    """
    try:
        rawPath = os.path.join(uyulala.dataDir,'raw',folderName,asset+'.csv')
        labeled = pandas.read_csv(rawPath,parse_dates=['DateCol']).set_index('DateCol',drop=False)
        labeled = labeled.drop_duplicates(subset=['Date'], keep='last') # KEEP EVERYTHING BELOW THIS POINT IN ORDER
        print('label for biggest loss') # Key Regression Field (what's the biggest loss?)
        labeled = uyulala.lowPercentChange(df=labeled,horizon=horizon)
        print('label for highest gain') # Key Regression Field (what's the predicted return?)
        labeled = uyulala.percentChange(df=labeled,horizon=horizon,HighOrClose='High')
        print('label for whether higest gain comes before biggest loss') # Key Classification Field (is it a good buy?)
        labeled = uyulala.expectedReturnPct(df=labeled,horizon=horizon)
        print('add weights column')  #add weights
        labeled = uyulala.weights(df=labeled, horizon=horizon,weightForIncrease=1,weightForDecrease=2)
        labeled = labeled.drop(['Open','High','Low','Close','Volume'],axis=1)
        # Partition keys: year+month of the row and the asset symbol.
        labeled['YearMo'] = labeled['DateCol'].dt.strftime('%Y%m')
        labeled['Asset'] = labeled['Symbol']
        labeled.to_parquet(os.path.join(uyulala.dataDir,'labeled',folderName),index=False,partition_cols=['YearMo','Asset'])
        return asset
    except Exception as err:
        # Best effort: report the asset together with the reason and carry on.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print('unable to create label for '+asset+': '+repr(err))
        return None




In [None]:

print('labelling data')
# Assets are labelled in fixed-size batches, each batch with its own pool.
batchSize = 400
for batchStart in range(0,len(evaluate),batchSize):
    batch = evaluate[batchStart:batchStart+batchSize]
    pool = Pool(uyulala.availableCores,maxtasksperchild=1)
    try:
        pool.map(createLabels, batch)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when the batch fails or is interrupted.
        pool.terminate()
        raise
    finally:
        pool.join()

print('Done labelling data')


labelling data
label for biggest losslabel for biggest losslabel for biggest losslabel for biggest losslabel for biggest loss




label for highest gainlabel for highest gainlabel for highest gainlabel for highest gainlabel for highest gain



In [3]:

##################################################################################
##########################       Load Data       #################################
##################################################################################
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.frame import H2OFrame

# Shut down a cluster that is still attached to this session, if any. h2o has to
# be imported before this runs; otherwise the call raises NameError and no
# shutdown is attempted at all.
try:
    h2o.cluster().shutdown()
except Exception:
    pass  # nothing to shut down

# Heap bounds in whole gigabytes, derived from the memory figures captured at
# import time. Clamped to at least 1 so the size string can never become "0G".
maxMemGb = max(1, int(totMem/1500000000/1.5))
minMemGb = max(1, int(availMem/1500000000/1.5))

try:
    h2o.init(nthreads = -1,max_mem_size="%sG" % maxMemGb,min_mem_size="%sG" % minMemGb)
except Exception:
    # NOTE(review): presumably the pause lets a cluster that is still shutting
    # down go away before the single retry -- confirm 20 seconds is enough.
    time.sleep(20)
    h2o.init(nthreads = -1,max_mem_size="%sG" % maxMemGb,min_mem_size="%sG" % minMemGb)



Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_101"; Java(TM) SE Runtime Environment (build 1.8.0_101-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.101-b13, mixed mode)
  Starting server from /Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpi0gfopzl
  JVM stdout: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpi0gfopzl/h2o_Damian_started_from_python.out
  JVM stderr: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpi0gfopzl/h2o_Damian_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_Damian_feic16
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.223 Gb
H2O_cluster_total_cores:,24
H2O_cluster_allowed_cores:,24


In [4]:
### IMPORT DATA FOR HYPERPARAMETER TUNING
labeledRoot = Path(os.path.join(uyulala.dataDir,'labeled',folderName))
transformedRoot = Path(os.path.join(uyulala.dataDir,'transformed',folderName))

# Months present in the labeled data, taken from the 'YearMo=<value>' partition
# folders. glob() returns entries in arbitrary order, so the values are sorted to
# make "latest" below mean what it says.
YrMos = sorted({f.name[len('YearMo='):] for f in labeledRoot.glob('**/*')
                if f.is_dir() and f.name.startswith('YearMo=')})
YrMos = [f for f in YrMos if f.startswith(('20','19'))]

# Use the latest 10% of months as holdout set. Slicing from an explicit start index
# keeps the holdout empty when there are fewer than 10 months; YrMos[-0:] would
# select every month instead.
holdoutCount = int(len(YrMos)*0.1)
ootMonths = YrMos[len(YrMos)-holdoutCount:]
ootMarkers = ['YearMo=' + sub for sub in ootMonths]


def _inSample(f):
    """Return True for a regular file whose path contains no holdout-month marker."""
    return f.is_file() and not any(marker in str(f) for marker in ootMarkers)


# Bytes on disk of all in-sample files (transformed + labeled).
dataSize = sum(f.stat().st_size for f in transformedRoot.glob('**/*') if _inSample(f)) \
            + sum(f.stat().st_size for f in labeledRoot.glob('**/*') if _inSample(f))
print('full data size: {}gb'.format(dataSize/1000000000.00))
if dataSize == 0:
    raise RuntimeError('no in-sample data found under {} or {}'.format(transformedRoot, labeledRoot))

# Equals availMem / (40 * dataSize): below 1 means the in-sample data is larger
# than 1/40th of the memory that was available at import time.
ratio = ((availMem/2000000000) / (20.0000000000000)) / (dataSize/1000000000)

# Paths of the in-sample transformed files, relative to their 'YearMo=' folder.
transformed_files = ['YearMo='+str(f).split('YearMo=')[1] for f in transformedRoot.glob('**/*') if _inSample(f)]

if ratio < 0.98:
    print('reducing file size by {}%'.format(100*(1-ratio)))
    k=math.ceil(len(transformed_files)*ratio)
    # Sample without replacement so that no file is picked twice.
    sampledFiles=random.sample(transformed_files, k=min(len(transformed_files),max(1,k)))
else:
    sampledFiles=transformed_files
#print('Files to use: {}'.format(sampledFiles))

full data size: 0.061800569gb


In [5]:
# Both imports read the two date columns as categorical ('enum').
dateColTypes = {'DateCol':'enum','Date':'enum'}

# Transformed data (only the sampled files) merged with the labeled parquet files.
# Rows with missing values are dropped from each side and again after the merge.
# NOTE(review): merge() is called without explicit keys -- presumably it joins on
# the columns the two frames have in common; confirm.
fullDF = (
    h2o.import_file(path=os.path.join(uyulala.dataDir,'transformed',folderName),
                    pattern = "(%s)" % ('|'.join(sampledFiles),),
                    col_types=dateColTypes).na_omit()
    .merge(
        # Raw string: "\." in a normal string literal is an invalid escape sequence.
        h2o.import_file(path=os.path.join(uyulala.dataDir,'labeled',folderName),
                        pattern = r".*\.parquet",
                        col_types=dateColTypes).na_omit()
    )
    .na_omit()
)
fullDF

Parse progress: |█████████████████████████████████████████████████████████| 100%




Parse progress: |█████████████████████████████████████████████████████████| 100%




Symbol,Date,DateCol,feat_Open,feat_High,feat_Low,feat_Close,feat_Volume,feat_VROC11,feat_VROC7,feat_VROC5,feat_VROC3,feat_PROC11,feat_PROC7,feat_PROC2,feat_PROC3,feat_DOW,feat_RSIg7,feat_RSIg5,feat_RSIg3,feat_RSI17,feat_RSI13,feat_RSI11,feat_Close11/19SMARatio,feat_Close5/11SMARatio,feat_Close3/7SMARatio,feat_Close2PctFromSMA,feat_Close4PctFromSMA,feat_Close8PctFromSMA,feat_10CCI,feat_6CCI,feat_3CCI,feat_10FI,feat_6FI,feat_3FI,feat_10EMV,feat_6EMV,feat_3EMV,feat_Close10PctFromUpperBB,feat_Close10PctFromLowerBB,feat_Close10BBBandwidth,feat_Close10PctB,feat_Close6PctFromUpperBB,feat_Close6PctFromLowerBB,feat_Close6BBBandwidth,feat_Close6PctB,feat_Close3PctFromUpperBB,feat_Close3PctFromLowerBB,feat_Close3BBBandwidth,feat_Close3PctB,feat_DIRatio17,feat_ADX17,feat_DMI17,feat_AdjATR17,feat_DIRatio14,feat_ADX14,feat_DMI14,feat_AdjATR14,feat_DIRatio11,feat_ADX11,feat_DMI11,feat_AdjATR11,feat_DIRatio9,feat_ADX9,feat_DMI9,feat_AdjATR9,feat_DIRatio7,feat_ADX7,feat_DMI7,feat_AdjATR7,feat_DIRatio5,feat_ADX5,feat_DMI5,feat_AdjATR5,feat_DIRatio4,feat_ADX4,feat_DMI4,feat_AdjATR4,feat_DIRatio3,feat_ADX3,feat_DMI3,feat_AdjATR3,feat_DIRatio2,feat_ADX2,feat_DMI2,feat_AdjATR2,feat_Close91226MACD,feat_Close71123MACD,feat_235MACDg,feat_357MACDg,feat_5711MACDg,feat_11StocOsc,feat_11StocOscRatio,feat_7StocOsc,feat_7StocOscRatio,feat_5StocOsc,feat_5StocOscRatio,feat_3StocOsc,feat_3StocOscRatio,feat_11PriceChannelDist,feat_7PriceChannelDist,feat_5PriceChannelDist,feat_3PriceChannelDist,feat_PSAR,feat_10ADL,feat_10CMF,feat_10ChaikinOscillator,feat_5ADL,feat_5CMF,feat_5ChaikinOscillator,feat_10AroonUp,feat_10AroonDown,feat_10AroonOscillator,feat_5AroonUp,feat_5AroonDown,feat_5AroonOscillator,feat_autocorr10High1,feat_autocorr5High1,feat_autocorr5High2,feat_autocorr3High1,feat_autocorr3High2,feat_autocorr3High3,feat_autocorr2High2,feat_autocorr2High1,lab_lowPercentChange_H2,lab_percentChange_H2High,lab_expectedReturnPct_H2,weights
AAPL,2010-01-29,1264720000000000.0,6.21,6.24,5.87,5.93,1245950000.0,1.05639,1.03536,0.413017,-0.332684,-0.0414747,-0.0616541,-0.04,-0.0545455,5,28.5714,40.8602,17.9104,33.9535,35.7513,33.8983,-0.00624869,-0.00835469,-0.00556729,-0.0182119,-0.0458568,-0.0546034,-126.409,-108.512,-69.0777,0.0214999,0.0165793,0.00946995,-0.0113371,-0.0229038,-0.043297,-0.120238,0.00584483,13.3732,0.0407832,-0.0970525,0.0149558,11.6803,0.120565,-0.109277,0.0447835,15.9194,0.258921,-41.5142,44.924,-18.6498,0.0324032,-41.4171,43.9512,-18.2033,0.0358748,-40.6747,42.2358,-17.1793,0.0392435,-39.4963,40.4135,-15.9619,0.0440468,-37.6183,38.4104,-14.4493,0.0467407,-36.3066,39.7738,-14.4405,0.0490369,-38.1593,43.262,-16.5085,0.0525935,-46.313,52.3854,-24.2613,0.052448,-70.2081,73.9173,-51.8959,0.0525181,0.318865,0.272589,0.215742,0.213029,0.251032,7.69231,-0.710169,8.21918,-0.692354,8.21918,-0.716793,9.52381,-0.607913,-0.0527157,-0.0489174,-0.0489174,-0.0412288,-0.100866,-0.327443,-0.234136,0.0,-0.17178,-0.179354,0.0685022,30,100,-70,40,100,-60,0.462217,0.110884,-0.722802,0.965116,-0.730831,-0.592137,1,1,-0.00673401,0.020202,n0,1
AAPL,2010-02-01,1264980000000000.0,5.94,6.05,5.9,6.01,749876000.0,0.73224,0.233036,-0.296353,-0.564675,-0.0692308,-0.0805471,-0.0457413,-0.0692308,1,34.5865,34.5238,20.2899,38.2075,39.6907,37.5691,-0.00706215,-0.0093068,-0.0120538,0.00670017,-0.0191758,-0.0318164,-102.339,-83.7009,-45.6555,0.0197697,0.0148737,0.0113073,-0.0124845,-0.016851,-0.0392331,-0.109107,0.0326528,14.7394,0.205208,-0.0865798,0.0361473,12.5902,0.269033,-0.0388169,0.0349062,7.38675,0.455099,-42.1464,45.132,-19.0215,0.0326214,-42.2515,44.4239,-18.7697,0.0361237,-42.037,43.3295,-18.2145,0.0407981,-41.6998,42.4309,-17.6936,0.0421162,-41.6801,42.2733,-17.6196,0.0461575,-44.6336,46.9446,-20.9531,0.0470335,-50.1768,53.2383,-26.7133,0.0455343,-62.5215,65.5577,-40.9877,0.0432765,-85.121,86.3574,-73.5083,0.0420441,0.290946,0.239662,0.0786703,0.147925,0.230613,17.9487,0.419929,19.1781,0.45003,19.1781,0.408124,29.7872,1.05116,-0.0399361,-0.0360866,-0.0360866,-0.015561,-0.0847101,-0.194253,-0.148324,0.0155388,-0.198123,-0.18101,0.0279967,20,90,-70,20,80,-60,0.673402,0.502528,-0.062761,0.95241,0.998995,-0.487992,1,1,-0.0132231,0.0214876,n-1,1
AAPL,2010-02-02,1265070000000000.0,6.05,6.06,5.97,6.04,698342000.0,0.175527,-0.20802,-0.625977,-0.404908,-0.0719755,-0.053125,-0.0288462,-0.044164,2,47.1698,30.0,38.2979,39.2523,37.766,40.8046,-0.0069817,-0.0121987,-0.0144088,0.00248963,0.00124327,-0.0194805,-74.8634,-51.2046,54.713,0.0160368,0.0108711,0.0062963,-0.0133829,-0.0159471,-0.0286926,-0.0893855,0.0390296,13.1732,0.276766,-0.0776851,0.0495962,12.9094,0.359388,-0.0109804,0.0272793,3.79504,0.705174,-41.9258,44.545,-18.6758,0.0326855,-41.9135,43.7787,-18.3492,0.0359701,-41.5508,42.6221,-17.7098,0.0396424,-41.0747,41.6579,-17.1109,0.0407588,-40.7021,41.1466,-16.7475,0.0406899,-41.6642,43.2046,-18.0009,0.0393978,-43.0551,44.8919,-19.3283,0.0361703,-42.3272,43.8452,-18.5584,0.0329799,-18.7937,38.3658,-7.21034,0.0198224,0.245126,0.191633,-0.0413305,0.0478716,0.151407,21.7949,0.378378,23.2877,0.378378,26.9841,0.488605,45.9459,0.616734,-0.0351438,-0.0312751,-0.0234438,-0.00247729,-0.0762233,-0.290507,-0.180957,0.000505782,0.00901638,-0.0253977,0.156353,10,80,-70,20,60,-40,0.779028,0.936664,0.352342,0.733273,0.905635,0.763015,-1,-1,-0.0182724,0.0265781,p2,2
AAPL,2010-02-03,1265160000000000.0,6.02,6.18,6.0,6.15,615328000.0,-0.157094,-0.422607,-0.642785,-0.506138,-0.0692771,-0.0221519,0.0214876,-0.00961538,3,44.2308,34.8315,89.2857,41.1765,42.0513,34.3949,-0.00905731,-0.0130671,-0.00693741,0.00902379,0.0194778,-0.00263531,-36.0081,-3.08695,73.6842,0.0159094,0.0122875,0.0150361,-0.011699,-0.0174184,0.000938243,-0.0588243,0.0538727,11.2977,0.449911,-0.0478132,0.0650242,11.1875,0.548712,-0.0103135,0.038984,4.86008,0.782635,-37.9301,40.2315,-15.2598,0.0335893,-36.5615,38.165,-13.9537,0.0357464,-33.7299,36.1025,-12.1773,0.0384567,-30.2407,35.7914,-10.8235,0.0407867,-24.2781,37.0597,-8.99738,0.0398779,-13.0883,43.4852,-5.69149,0.0347615,-2.74848,50.019,-1.37476,0.0320165,16.1142,59.2003,9.53965,0.0229237,55.2106,74.2638,41.0015,0.0219888,0.172134,0.118074,-0.205572,-0.0606142,0.0610807,35.8974,0.423729,38.3562,0.423729,59.5745,0.690269,89.2857,0.623191,-0.0175719,-0.0136327,0.00737101,0.0182119,-0.0555377,-0.214868,-0.123108,0.0112292,-0.0478759,-0.143589,0.066193,40,70,-30,20,40,-20,0.745217,0.749336,0.59657,-0.519266,-0.961844,-0.832671,-1,1,-0.029654,0.00823723,p0,2
AAPL,2010-02-04,1265240000000000.0,6.07,6.12,5.91,5.93,757652000.0,0.237684,-0.594211,-0.354367,0.0103692,-0.0796992,-0.0727273,0.00990099,0.0115702,4,31.6239,36.9048,40.4762,38.3966,39.6135,32.1429,-0.0110884,-0.0123066,-0.00412201,-0.0182119,-0.0169913,-0.0316391,-72.3002,-51.7604,-56.1404,0.0139517,0.0108149,0.0120376,-0.0126356,-0.0169249,0.00178927,-0.0843427,0.0231586,11.0894,0.197257,-0.0486569,0.0159838,6.57142,0.23524,-0.0527157,0.0189003,7.28477,0.25,-35.8086,37.8336,-13.5477,0.0346426,-33.9158,35.2959,-11.9709,0.0379981,-30.3539,32.3238,-9.81155,0.039561,-26.2875,30.7198,-8.07546,0.0392412,-19.9251,29.5056,-5.87901,0.0379044,-9.96076,30.224,-3.01054,0.0334563,-3.13613,31.4982,-0.987825,0.0269967,4.22056,33.4367,1.41122,0.0277311,2.30512,40.853,0.941712,0.0341709,0.180134,0.131145,-0.00871899,-0.0161322,0.0572124,8.21918,-0.625899,9.52381,-0.598533,16.2162,-0.526648,7.40741,-0.844207,-0.0489174,-0.0412288,-0.0206441,-0.019024,-0.0857197,-0.196998,-0.1397,0.0178084,-0.0571158,-0.0754995,0.0379253,30,60,-30,20,20,0,0.704768,0.401495,0.209319,0.0691164,-0.888459,-0.339422,-1,-1,-0.00841751,0.0286195,n0,1
AAPL,2010-02-05,1265330000000000.0,5.94,6.05,5.89,6.03,850307000.0,0.398176,-0.506373,-0.317545,0.217607,-0.0805471,-0.0692308,-0.0210356,-0.00165017,5,33.8843,58.3333,48.9362,42.0833,33.8624,38.3234,-0.0118387,-0.00776121,0.000197246,0.0083612,-0.00124224,-0.00863132,-59.2123,-39.8016,-36.8979,0.0140858,0.0132183,0.0166425,-0.00823815,-0.0146253,-0.00184567,-0.0687747,0.0427726,11.3017,0.357077,-0.024087,0.030563,5.44735,0.545779,-0.0362747,0.0367302,7.29882,0.484869,-34.3799,36.1638,-12.4331,0.0351982,-32.2493,33.4383,-10.7836,0.0379181,-28.5102,30.1467,-8.59489,0.0393402,-24.5582,28.0988,-6.90055,0.0383119,-19.0297,26.2118,-4.98803,0.0342099,-12.4403,25.9485,-3.22807,0.0268866,-10.2199,27.2371,-2.78361,0.0274099,-12.759,31.5876,-4.03027,0.0315961,-34.3989,48.785,-16.7815,0.032831,0.142397,0.0950007,-0.073343,-0.042833,0.0280504,21.9178,-0.00425532,34.0426,0.246637,48.2759,0.167338,48.2759,-0.000975364,-0.0328789,-0.012285,-0.0008285,-0.0008285,-0.0667566,-0.00840058,-0.00896869,0.065133,0.235575,0.319357,0.220243,20,50,-30,60,100,-40,0.725713,-0.347048,-0.834494,-0.461084,-0.917103,0.816912,-1,1,-0.00827815,0.0115894,n0,1
AAPL,2010-02-08,1265590000000000.0,6.04,6.11,5.99,5.99,478271000.0,-0.4576,-0.592442,-0.3622,-0.222738,-0.0453125,-0.0362776,-0.00163399,-0.0113269,1,42.0,48.2759,28.9474,39.1489,35.1648,46.0432,-0.0105959,-0.0072761,-0.00234229,-0.00332779,-0.00580913,-0.00642753,-34.0802,12.451,76.7959,0.0124508,0.0117297,0.0123066,-0.00628556,-0.000497528,-0.0022232,-0.0701512,0.0399138,11.1755,0.337199,-0.0292032,0.0187403,4.81957,0.379468,-0.01545,0.0182453,3.36483,0.533113,-30.6969,32.2701,-9.90589,0.034456,-27.5556,29.0818,-8.01366,0.036309,-22.1762,26.58,-5.89444,0.0362961,-16.4747,25.6009,-4.21767,0.0346019,-8.13185,25.7879,-2.09704,0.0304811,3.92358,29.5153,1.15806,0.0258559,11.755,34.2291,4.02363,0.028607,21.3436,43.5169,9.28805,0.028434,36.7692,64.4972,23.7151,0.0230431,0.120131,0.0759134,-0.106756,-0.0783141,-0.011143,16.4384,0.0588235,32.4324,0.280248,34.4828,0.0451977,43.4783,0.315377,-0.0392943,-0.0107349,-0.0074565,-0.00249792,-0.069541,-0.101702,-0.0865385,0.0519868,0.0431287,0.10127,0.0185215,10,40,-30,40,80,-40,0.732099,-0.0786525,-0.878342,0.0879584,-0.924473,0.316438,-1,-1,-0.00990099,0.00660066,p0,2
AAPL,2010-02-09,1265670000000000.0,6.06,6.1,6.01,6.05,632887000.0,-0.40613,-0.492046,-0.09373,-0.164673,-0.0348101,-0.0224359,0.00826446,-0.00326797,2,58.1081,50.8197,75.0,41.3502,39.548,41.4063,-0.0107148,-0.00547267,-0.000434628,0.00498339,0.00833333,0.00560981,-12.4978,30.8046,60.1287,0.0109899,0.0101528,0.00934796,-0.00884392,0.00306034,0.00271604,-0.0485324,0.046459,9.509,0.46535,-0.0205955,0.0278435,4.82641,0.562977,-0.00565942,0.0147206,2.02881,0.718218,-27.4713,28.86,-7.92824,0.0350642,-23.5265,25.3422,-5.96212,0.0356636,-16.9226,23.6215,-3.99737,0.0347709,-10.0218,23.6068,-2.36582,0.0307936,0.036734,25.4702,0.00935623,0.0245865,14.8323,31.8931,4.73048,0.0264922,24.9399,38.4243,9.58299,0.0258337,38.3948,49.4815,18.9983,0.021373,60.4919,69.7346,42.1838,0.0188364,0.0796397,0.0376132,-0.34493,-0.16404,-0.0722036,24.6575,0.173913,55.1724,0.360631,55.1724,0.2,72.7273,0.326483,-0.0296712,0.0024855,0.0024855,0.00833333,-0.0568944,-0.02281,-0.02868,0.0631313,-0.0473614,-0.0341826,-0.0479628,10,30,-20,20,60,-40,0.880018,-0.343064,-0.762065,-0.725823,-0.749057,0.933257,1,-1,-0.00991736,0.0181818,n0,1
AAPL,2010-02-10,1265760000000000.0,6.05,6.07,6.0,6.02,370362000.0,-0.801639,-0.506103,-0.398107,-0.564438,-0.080303,0.00330579,-0.00654664,0.00330579,3,50.7246,36.7347,47.0588,42.6087,47.2973,36.0656,-0.011224,-0.00509174,-0.000829876,-0.0024855,-0.00041511,-0.0012443,-12.6651,-4.88299,-38.49,0.0092384,0.00781708,0.00591359,-0.00742001,0.00160347,0.00418554,-0.0260702,0.0240087,5.01306,0.466919,-0.0249259,0.0233262,4.82908,0.471374,-0.00986842,0.0100671,1.99336,0.5,-25.2823,26.5091,-6.70213,0.0341211,-21.0215,22.5891,-4.74857,0.0344248,-14.1984,19.7711,-2.80719,0.0309983,-7.47461,19.4078,-1.45066,0.0269925,1.4449,20.5164,0.296442,0.0226921,11.7022,23.076,2.7004,0.0229734,15.4278,23.5184,3.62837,0.0189128,13.6008,30.3374,4.12613,0.0164016,-9.50931,52.9181,-5.03215,0.0147825,0.0556968,0.0175878,-0.235132,-0.170331,-0.0985314,23.8095,0.100503,44.8276,0.0154821,56.5217,0.16,25.0,-0.468859,-0.0266774,-0.0024855,0.00249792,-0.00495868,-0.0583711,-0.143762,-0.175192,0.026931,-0.190243,-0.221049,-0.127201,10,20,-10,20,40,-20,0.775167,0.128854,-0.771342,-0.572856,-0.148034,0.972128,-1,1,-0.00332779,0.0349418,n0,1
AAPL,2010-02-11,1265850000000000.0,6.01,6.16,5.99,6.13,550346000.0,-0.680509,-0.211926,-0.273617,0.150699,-0.0523077,0.0165017,0.00983607,0.00818331,4,55.8442,75.0,79.1667,38.0282,45.0704,38.5827,-0.012,0.000406278,0.00196618,0.0090535,0.013642,0.0144808,95.7092,104.424,71.6919,0.0111957,0.0120547,0.0162187,-0.00491438,-0.000929145,0.0016656,-0.00688029,0.0418912,4.79324,0.853018,-0.00446839,0.040308,4.39881,0.896184,-0.00815344,0.029743,3.74917,0.77845,-20.9469,24.8403,-5.20327,0.033264,-15.6066,22.0646,-3.44353,0.0326014,-7.12881,21.1376,-1.50686,0.028752,1.2384,22.7313,0.281504,0.0234706,12.5973,26.899,3.38856,0.0245129,27.2466,34.8291,9.48975,0.0206497,35.4281,40.2825,14.2713,0.0192006,43.4915,51.8598,22.5546,0.0190541,52.2136,73.0227,38.1278,0.0195648,-0.00393171,-0.0411313,2.86904,-0.342424,-0.167725,55.3191,0.599032,82.7586,0.358491,88.8889,0.329458,82.3529,0.371938,0.004095,0.0157415,0.0174274,0.0090535,-0.0380164,0.00553938,-0.00295712,0.0531477,0.0841142,0.0994105,0.0980895,10,10,0,100,20,80,0.356927,-0.692983,-0.507699,-0.838628,0.0339422,0.662849,-1,-1,-0.0130933,0.0294599,n-1,1




In [None]:
### HYPERPARAMETER TUNING AND VARIABLE SELECTION
# Predictors are the columns containing "feat_", candidate targets those containing "lab_".
features = list(filter(lambda s: "feat_" in s, fullDF.columns))
#features = [s for s in fullDF.columns if "PC" in s]
labels = list(filter(lambda s: "lab_" in s, fullDF.columns))

# Time limit handed to the grid search, in seconds (half an hour).
timePerRun = int(60*60*.5)

# Only the first label column is modelled in this cell.
label = labels[0]
quantile_alpha=.05
perf_metric='mae'

print('building model for {}'.format(label))

# GBM hyperparameters
hyper_params = dict(
    learn_rate=[i * 0.01 for i in range(1,21,2)],
    max_depth=list(range(2, 11, 2)),
    sample_rate=[i * 0.1 for i in range(5, 10)],
    col_sample_rate=[i * 0.1 for i in range(3, 10)],
    col_sample_rate_per_tree=[i * 0.1 for i in range(5, 8)],
    min_rows=[50, 100, 500],
    min_split_improvement=[1e-3, 1e-5],
)

# Search criteria
search_criteria = dict(
    strategy='RandomDiscrete',
    max_models=1000,
    max_runtime_secs=timePerRun,
    stopping_metric=perf_metric,
    stopping_rounds=5,
    stopping_tolerance=0.0001,
)

# Base estimator: quantile regression with 6-fold cross-validation.
gbm = H2OGradientBoostingEstimator(
    distribution="quantile",
    quantile_alpha=quantile_alpha,
    nfolds=6,
    ntrees=500,
    learn_rate_annealing=0.99,
)

# Train and validate a random grid of GBMs
gbm_grid = H2OGridSearch(
    model=gbm,
    grid_id='gbm_grid_{}'.format(label),
    hyper_params=hyper_params,
    search_criteria=search_criteria,
    parallelism=0,
)
gbm_grid.train(x=features, y=label, training_frame=fullDF, weights_column='weights')

# Grid models sorted by the chosen metric, smallest first.
gbm_gridperf = gbm_grid.get_grid(sort_by=perf_metric, decreasing=False)

building model for lab_lowPercentChange_H2
gbm Grid Build progress: |████████████████████████████████████████████████

In [None]:
# Hyperparameter table of the grid, without its residual_deviance column.
resultingGrid = pandas.DataFrame(gbm_grid.sorted_metric_table()).drop('residual_deviance',axis=1)


def _validOrXval(metricByFrame):
    """Return the 'valid' value of a metric dict, or its 'xval' value when 'valid' is None."""
    value = metricByFrame.get('valid')
    return metricByFrame.get('xval') if value is None else value


# One score row per grid model, collected in a list and turned into a frame once.
# The cross-validation value is the fallback because the validation value is None
# for a model that was trained without a validation frame.
scoreRows = []
for gridModel in gbm_gridperf.models:
    mae=gridModel.mae(train=True,valid=True,xval=True)
    rmse=gridModel.rmse(train=True,valid=True,xval=True)
    scoreRows.append({'model_ids': gridModel.model_id, 'mae': _validOrXval(mae), 'rmse': _validOrXval(rmse)})
modelScores = pandas.DataFrame(scoreRows, columns=['model_ids','mae','rmse'])

modelScores = modelScores.merge(resultingGrid,how='inner',on='model_ids')
modelScores = modelScores.sort_values('mae',ascending=True).reset_index(drop=True)

# Grouping key: the model id without its last 5 characters.
# NOTE(review): models that share this prefix are averaged together below --
# confirm that a 5-character suffix matches the grid's model-id format.
modelScores['trimmedModelId']=modelScores['model_ids'].apply(lambda x: x[:-5])

# Drop the unnamed column if the grid table carried one.
modelScores = modelScores.drop(columns=[''], errors='ignore')

# Everything except the two id columns is numeric (scores and hyperparameters).
idColumns = ['trimmedModelId','model_ids']
numericColumns = [x for x in modelScores.columns if x not in idColumns]
modelScores[numericColumns] = modelScores[numericColumns].astype('float64')

# numeric_only=True keeps the string column model_ids out of the mean.
groupedModelScores = modelScores.dropna(subset=['mae']).groupby('trimmedModelId').mean(numeric_only=True).sort_values('mae',ascending=True)

# Ranking score: MAE weighted twice as much as RMSE.
groupedModelScores['aggregate'] = (2 * groupedModelScores['mae'] + groupedModelScores['rmse'])/3
groupedModelScores = groupedModelScores.sort_values('aggregate',ascending=True)
groupedModelScores.head()

In [None]:
# Id of the lowest-MAE model inside the top-ranked group (modelScores is sorted by MAE).
bestModelId = list(modelScores[modelScores['trimmedModelId']==groupedModelScores.index[0]]['model_ids'])[0]
# Look the model up by its id. Indexing the model list with a comparison such as
# `'model_id' == bestModelId` evaluates to False and therefore always yields models[0].
bestModel = next(m for m in gbm_gridperf.models if m.model_id == bestModelId)
featureImportance = bestModel.varimp(use_pandas=True)
# Variables whose 'percentage' value is at least .005.
topFeatures = list(featureImportance[featureImportance['percentage']>=.005]['variable'].values)
print(featureImportance.head(30))

In [None]:
# Show the last 20 rows of the variable-importance table.
print(featureImportance.tail(20))

In [None]:
### ITERATE THROUGH 4MO TRAINING + 1MO VALIDATION TO BUILD GBMS WITH CHECKPOINTING (USING HYPERPARAMETERS AND VARIABLES FROM ABOVE)
# NOTE(review): each 5-month chunk trains on chunk[0:-2] (3 months of a full chunk)
# and validates on its last month, so the month in between is unused -- confirm
# this gap is intended, since the title says 4 months of training.
# NOTE(review): training uses every column in `features`; `topFeatures` is not
# used here -- confirm.


def _importMerged(transformedPattern):
    """Import the transformed files matching the regex, merged with all labeled parquet files.

    Rows with missing values are dropped from each side and from the merged frame.
    """
    dateColTypes = {'DateCol':'enum','Date':'enum'}
    transformedFrame = h2o.import_file(path=os.path.join(uyulala.dataDir,'transformed',folderName),
                                       pattern=transformedPattern,
                                       col_types=dateColTypes).na_omit()
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    labeledFrame = h2o.import_file(path=os.path.join(uyulala.dataDir,'labeled',folderName),
                                   pattern=r".*\.parquet",
                                   col_types=dateColTypes).na_omit()
    return transformedFrame.merge(labeledFrame).na_omit()


# Hyperparameters of the top-ranked row. .iloc is used because the frame is indexed
# by model-id strings, so a plain [0] would be a positional lookup on a label index.
bestParams = groupedModelScores.iloc[0]
gbmParams = dict(distribution="quantile", quantile_alpha = quantile_alpha, nfolds = 0,
                 learn_rate_annealing=0.99,
                 col_sample_rate=bestParams['col_sample_rate'],
                 col_sample_rate_per_tree=bestParams['col_sample_rate_per_tree'],
                 learn_rate = bestParams['learn_rate'],
                 max_depth=int(bestParams['max_depth']),
                 min_rows=int(bestParams['min_rows']),
                 min_split_improvement=bestParams['min_split_improvement'],
                 sample_rate=bestParams['sample_rate'])

# Sorted so that the 5-month chunks are chronological regardless of the order in
# which the month list was built.
subsetMonths = sorted(i for i in YrMos if i not in ootMonths)
subsetMonths = [subsetMonths[i:i + 5] for i in range(0, len(subsetMonths), 5)]
valMos=[]
rounds=1
for chunk in subsetMonths:
    # A chunk needs at least 3 months: one to train on, the skipped one, one to validate on.
    if len(chunk)<3: break
    print('''Round {}'''.format(rounds))
    # NOTE(review): valMos accumulates, so each round validates on the validation
    # months of all rounds so far -- confirm this is intended.
    valMos.append(chunk[-1])
    trainMos = chunk[0:-2]
    trainMarkers = ['YearMo=' + sub for sub in trainMos]
    trainFiles = ['YearMo='+str(f).split('YearMo=')[1] for f in Path(os.path.join(uyulala.dataDir,'transformed',folderName)).glob('**/*')
                  if f.is_file() and any(marker in str(f) for marker in trainMarkers)]
    train = _importMerged("(%s)" % ('|'.join(trainFiles),))
    validation = _importMerged("(%s)" % ('|'.join(valMos),))
    # Each round raises the tree count by 100; from round 2 on the estimator is
    # given the previous round's model as its checkpoint.
    if rounds == 1:
        gbm = H2OGradientBoostingEstimator(ntrees=int(100*rounds), **gbmParams)
    else:
        gbm = H2OGradientBoostingEstimator(checkpoint = gbm.model_id, ntrees=int(100*rounds), **gbmParams)
    gbm.train(x=features,y=label,training_frame=train,validation_frame=validation,weights_column='weights')
    print('''====================Round {} Performance on Validation Set===================='''.format(rounds))
    print('''MAE: {}'''.format(gbm.mae(train=False,valid=True,xval=False)))
    print('''RMSE: {}'''.format(gbm.rmse(train=False,valid=True,xval=False)))
    rounds=rounds+1

In [None]:
# Dimensions of the training frame built in the last round that ran.
train.shape