In [1]:
#!/usr/bin/python


'''
rightSphnix:
* create Labels and store to disk
* build models
'''

##################################################################################
#########################       Configure       ##################################
##################################################################################

# Asset universe to process.
# Typically AllStocks, SchwabOneSource, SchwabETFs, or Test.
assets = 'Test'

# Prediction horizon in days.
horizon = 2

# Total build time allowed, in seconds (8 hours).
totalBuildTimeAllowed_seconds = 8 * 60 * 60

# Start date, formatted YYYY-MM-DD.
startDate = '2001-01-01'



In [2]:

##################################################################################
###########################       Imports       ##################################
##################################################################################
print('importing packages')
from multiprocessing import Pool
import pandas
import os
import uyulala
#reload(uyulala)

import datetime
import numpy
import random
import string
import subprocess
import time
from psutil import virtual_memory
import shutil
from pathlib import Path
import math
import glob
import shutil

# Total and currently available memory as reported by psutil; both figures are
# used further down to size the H2O JVM and to scale against the data size.
totMem, availMem = virtual_memory().total, virtual_memory().available

# Sub-folder name shared by the raw, transformed, labeled and model directories
# for this asset set / horizon combination.
folderName = 'Assets-{}--Hrzn-{}'.format(assets, str(horizon))


importing packages


In [3]:

##################################################################################
#################              Clear directories          ########################
##################################################################################
print('clearing directories')
# Start the run from empty output folders so labels and models left behind by
# an earlier run cannot mix with the ones produced now.
for staleDir in (
    os.path.join(uyulala.dataDir,'labeled',folderName),
    os.path.join(uyulala.modelsDir,folderName),
):
    try:
        shutil.rmtree(staleDir)
    except FileNotFoundError:
        # Nothing to clear on a first run. Any other failure (e.g. permissions)
        # propagates with its real cause instead of being hidden by a bare
        # except and resurfacing as a confusing FileExistsError from makedirs.
        pass
    os.makedirs(staleDir)


clearing directories


In [4]:
# NOTE: this whole cell is a bare string literal, so none of the code inside it
# runs. It preserves the call that launches leftSphnix.py as a subprocess with
# the configured assets, horizon and start date. Remove the surrounding triple
# quotes to re-enable it.
'''
##################################################################################
################# Get and transform data (run leftSphnix) ########################
##################################################################################
print('getting and transforming data')
if assets!="Test":
    import warnings
    warnings.filterwarnings("ignore")


filePath = os.path.join(uyulala.uyulalaDir,'greatRiddleGate','leftSphnix.py')
print('making call: '+'python %s --assets=%s --horizon=%i --start=%s' % (filePath,assets,horizon,startDate))
subprocess.call('python %s --assets=%s --horizon=%i --start=%s' % (filePath,assets,horizon,startDate), shell=True)

'''




In [5]:

##################################################################################
########################       Create        ###############################
##################################################################################
print('creating ')

# Asset names are the raw CSV file names minus their extension. Only the
# trailing '.csv' is stripped: str.replace would also delete a '.csv' that
# appears earlier in a file name, and the name would then no longer match the
# file when '.csv' is appended again to read it.
rawDir = os.path.join(uyulala.dataDir,'raw',folderName)
evaluate = [ os.path.splitext(f)[0] for f in os.listdir(rawDir) if f.endswith(".csv") ]


creating 


In [6]:

def createLabels(asset=''):
    '''
    Build the label columns for one asset and store them in the labeled dataset.

    Reads <dataDir>/raw/<folderName>/<asset>.csv, keeps the last row for each
    'Date', adds the label and weight columns through the uyulala helpers,
    drops the raw price/volume columns and writes the result as parquet
    partitioned by 'YearMo' and 'Asset' under <dataDir>/labeled/<folderName>.

    asset: name of the raw file to label, without the '.csv' extension.

    Returns the asset name on success and None on failure. Ordinary failures
    are reported on stdout and not raised, so a single bad asset does not
    abort the caller's batch.
    '''
    try:
        labeled = pandas.read_csv(os.path.join(uyulala.dataDir,'raw',folderName,asset+'.csv'),parse_dates=['DateCol']).set_index('DateCol',drop=False)
        labeled = labeled.drop_duplicates(subset=['Date'], keep='last') # KEEP EVERYTHING BELOW THIS POINT IN ORDER
        print('label for biggest loss') # Key Regression Field (what's the biggest loss?)
        labeled = uyulala.lowPercentChange(df=labeled,horizon=horizon)
        print('label for highest gain') # Key Regression Field (what's the predicted return?)
        labeled = uyulala.percentChange(df=labeled,horizon=horizon,HighOrClose='High')
        print('label for whether higest gain comes before biggest loss') # Key Classification Field (is it a good buy?)
        labeled = uyulala.expectedReturnPct(df=labeled,horizon=horizon)
        print('add weights column')  #add weights
        labeled = uyulala.weights(df=labeled, horizon=horizon,weightForIncrease=1,weightForDecrease=2)
        # The raw price/volume columns are not part of the labeled output.
        labeled = labeled.drop(['Open','High','Low','Close','Volume'],axis=1)
        # Partition keys for the parquet dataset.
        labeled['YearMo'] = labeled['DateCol'].dt.strftime('%Y%m')
        labeled['Asset'] = labeled['Symbol']
        labeled.to_parquet(os.path.join(uyulala.dataDir,'labeled',folderName),index=False,partition_cols=['YearMo','Asset'])
        return asset
    except Exception as err:
        # Best effort: say which asset failed and why, then carry on. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit through, so a running job can still be stopped.
        print('unable to create label for {}: {!r}'.format(asset, err))
        return None




In [7]:

print('labelling data')
# Assets are labelled in fixed-size batches, each with its own worker pool.
# maxtasksperchild=1 gives every asset a fresh worker process.
labelBatchSize = 400
for batchStart in range(0,len(evaluate),labelBatchSize):
    assetBatch = evaluate[batchStart:batchStart+labelBatchSize]
    pool = Pool(uyulala.availableCores,maxtasksperchild=1)
    try:
        pool.map(createLabels, assetBatch)
        pool.close()
    except BaseException:
        # On an error or an interrupt, stop the workers right away instead of
        # leaving them running for the rest of the kernel's lifetime.
        pool.terminate()
        raise
    finally:
        # Valid after either close() or terminate(); reaps the worker processes.
        pool.join()

print('Done labelling data')


labelling data
label for biggest losslabel for biggest losslabel for biggest loss


label for highest gainlabel for highest gainlabel for highest gain


label for whether higest gain comes before biggest losslabel for whether higest gain comes before biggest losslabel for whether higest gain comes before biggest loss


add weights column
add weights column
add weights column
Done labelling data


In [7]:
#h2o.cluster().shutdown()

In [8]:

##################################################################################
##########################       Load Data       #################################
##################################################################################


import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.frame import H2OFrame

# Heap bounds handed to the H2O JVM, derived from the total and available
# memory read earlier. They are computed once so the first attempt and the
# retry are guaranteed to use the same values.
h2oMaxMem = "%sG" % int(totMem/1500000000/1.5)
h2oMinMem = "%sG" % int(availMem/1500000000/1.5)

try:
    h2o.init(nthreads = -1,max_mem_size=h2oMaxMem,min_mem_size=h2oMinMem)
except Exception:
    # Presumably guards against a transient startup failure: wait, then try
    # exactly once more and let a second failure propagate. Catching Exception
    # instead of using a bare except keeps a kernel interrupt from being
    # treated as a failed start and triggering the 20 second retry.
    time.sleep(20)
    h2o.init(nthreads = -1,max_mem_size=h2oMaxMem,min_mem_size=h2oMinMem)



Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_101"; Java(TM) SE Runtime Environment (build 1.8.0_101-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.101-b13, mixed mode)
  Starting server from /Users/Damian/opt/anaconda3/envs/uyulala/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpa3obf7e7
  JVM stdout: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpa3obf7e7/h2o_Damian_started_from_python.out
  JVM stderr: /var/folders/5b/s4769fcn60d842cy18f7nc3h0000gn/T/tmpa3obf7e7/h2o_Damian_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_Damian_tn0qwt
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.223 Gb
H2O_cluster_total_cores:,24
H2O_cluster_allowed_cores:,24


importing data


In [9]:
def _bytesUnder(folder):
    '''Return the summed st_size of every regular file found anywhere below folder.'''
    return sum(entry.stat().st_size for entry in Path(folder).glob('**/*') if entry.is_file() )

# Combined on-disk size of the transformed and the labeled data for this run.
transformedDir = os.path.join(uyulala.dataDir,'transformed',folderName)
labeledDir = os.path.join(uyulala.dataDir,'labeled',folderName)
dataSize = _bytesUnder(transformedDir) + _bytesUnder(labeledDir)

# Available memory (scaled by the notebook's fixed divisors) relative to the
# data size in units of 1e9 bytes.
memoryShare = (availMem/2000000000) / (20.0000000000000)
ratio = memoryShare / (dataSize/1000000000)
print('full data size: {}gb'.format(dataSize/1000000000.00))


full data size: 0.017388292gb


In [21]:
# Collect the YearMo values present in the labeled dataset from its partition
# folder names (presumably 'YearMo=YYYYMM', as written by to_parquet with
# partition_cols; as before, only the last six characters of each path are
# inspected).
labeledRoot = Path(os.path.join(uyulala.dataDir,'labeled',folderName))
YrMos = [str(d)[-6:] for d in labeledRoot.glob('**/*') if d.is_dir()]
# Keep six-digit tails starting with 19/20, de-duplicate them and sort them.
# glob() does not return folders in chronological order, and the holdout slice
# below only means "latest months" on a sorted list ('YYYYMM' strings sort
# chronologically).
YrMos = sorted(set(ym for ym in YrMos if ym.isdigit() and ym.startswith(('20','19'))))
# Use the latest 10% of months as the holdout set, but never fewer than one:
# with under 10 months the count would round to 0, and YrMos[-0:] would select
# every month instead of none.
holdoutCount = max(1, int(len(YrMos)*0.1))
ootMonths = YrMos[-holdoutCount:]

In [23]:
# NOTE(review): scratch note kept as a bare string literal; it has no effect
# beyond being echoed as the cell's output. It appears to list consecutive
# groups of five month numbers (1-12, wrapping around) — presumably a sketch of
# rolling month windows for the tuning step that follows; TODO confirm.
'''
1,2,3,4,5
6,7,8,9,10
11,12,1,2,3
4,5,6,7,8
9,10,11,12,1

2,3,4,5,6
7,8,9,10,11
12,1,2,3,4
5,6,7,8,9
10,11,12,1,2

3,4,5,6,7
8,9,10,11,12
'''

'\n1,2,3,4,5\n6,7,8,9,10\n11,12,1,2,3\n4,5,6,7,8\n9,10,11,12,1\n\n2,3,4,5,6\n7,8,9,10,11\n12,1,2,3,4\n5,6,7,8,9\n10,11,12,1,2\n\n3,4,5,6,7\n8,9,10,11,12\n'

In [None]:
### HYPERPARAMETER TUNING AND VARIABLE SELECTION
