In [1]:
%load_ext autoreload
%autoreload 2

# get project dir
# import standard libs
from IPython.display import display
from IPython.core.debugger import set_trace as bp
from pathlib import PurePath, Path
import sys
import time
from collections import OrderedDict as od
import re
import os
import json

pp = PurePath(Path.cwd()).parts[:]
pdir = PurePath(*pp)
data_script_dir = pdir / 'src' / 'data'
bars_script_dir = pdir / 'src' / 'features'
sys.path.append(data_script_dir.as_posix())
sys.path.append(bars_script_dir.as_posix())
viz_dir = pdir / 'reports' / 'figures'
data_dir = pdir / 'data'

# import python scientific stack
import pandas as pd
pd.set_option('display.max_rows', 100)
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from multiprocessing import cpu_count
pbar = ProgressBar()
pbar.register()
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from numba import jit
import math

# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns

plt.style.use('seaborn-talk')
plt.style.use('bmh')
#plt.rcParams['font.family'] = 'DejaVu Sans Mono'
plt.rcParams['font.size'] = 9.5
plt.rcParams['font.weight'] = 'medium'
plt.rcParams['figure.figsize'] = 10,7
blue, green, red, purple, gold, teal = sns.color_palette('colorblind', 6)

# import util libs
# from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from utils import *
from bars import *
from labelling import *
from mpEngine import *
from sampleWeights import *
from ffd import *
from cvFin import *
from featureImportance import *
RANDOM_STATE = 777


In [2]:
infp=PurePath(data_dir/'processed'/'clean_IVE_fut_prices.parq')
df = pd.read_parquet(infp)
df = df.assign(dates = df.index)
dbars = dollar_bar_df(df, 'dv', 1_000_000)
dbars = dbars.drop(['dates'], axis = 1)
# x = np.log(dbars.price).cumsum()
x = dbars.price.cumsum()
x = x[~x.index.duplicated()]
dfx2 = fracDiff_FFD(x.to_frame(),2)

joined = dfx2.join(x.rename('original'),how='left')
joined.corr()
coint_pval = sm.tsa.stattools.coint(joined.price, joined.original)[1]
dfx2 = -dfx2[~dfx2.index.duplicated()]

ffd_std = dfx2.std()[0]

ffd_std

19.2953597377909

In [3]:
def getTEvents2(gRaw, h, symmetric = True, isReturn = False):
    """
    Symmetric CUSUM Filter
    Sample a bar t iff S_t >= h at which point S_t is reset
    Multiple events are not triggered by gRaw hovering around a threshold level
    It will require a full run of length h for gRaw to trigger an event
    
    Two arguments:
        gRaw: the raw time series we wish to filter (gRaw), e.g. return
        h: threshold
        
    Return:
        pd.DatatimeIndex.append(tEvents): 
    """
    tEvents, sPos, sNeg = [], 0, 0
    if isReturn:
        diff = gRaw
    else:
        diff = gRaw.diff()
    if symmetric:
        if np.shape(h) == ():

            for i in diff.index[1:]:
                tmp = diff.loc[i].mean()
                sPos, sNeg = max(0,sPos+tmp), min(0,sNeg+tmp)
                if sNeg < -h and tradableHour(i):
                    sNeg = 0; tEvents.append(i)
                elif sPos > h and tradableHour(i):
                    sPos = 0; tEvents.append(i)
        else:
            for i in diff.index[1:]:
                tmp = diff.loc[i].mean()
                sPos, sNeg = max(0,sPos+tmp), min(0,sNeg+tmp)
                if sNeg < -h[i] and tradableHour(i):
                    sNeg = 0; tEvents.append(i)
                elif sPos > h[i] and tradableHour(i):
                    sPos = 0; tEvents.append(i)
    else:
        if np.shape(h) == ():

            for i in diff.index[1:]:
                tmp = diff.loc[i].mean()
                sAbs = sAbs+tmp
                if sAbs > h and tradableHour(i):
                    sNeg = 0; tEvents.append(i)
                
        else:
            for i in diff.index[1:]:
                tmp = diff.loc[i].mean()
                sAbs = sAbs+tmp
                if sAbs > h[i] and tradableHour(i):
                    sNeg = 0; tEvents.append(i)
            
    return pd.DatetimeIndex(tEvents)

def getDailyVol2(close,span0=100):
    # daily vol reindexed to close
    df0=close.index.searchsorted(close.index-pd.Timedelta(days=1))
    #bp()
    df0=df0[df0>0]
    #bp()
    df0=(pd.Series(close.index[df0-1],
                   index=close.index[close.shape[0]-df0.shape[0]:]))
    #bp()
    try:
        df0=close.loc[df0.index]/close.loc[df0.values].values-1 # daily rets
    except Exception as e:
        print(e)
        print('adjusting shape of close.loc[df0.index]')
        cut = close.loc[df0.index].shape[0] - close.loc[df0.values].shape[0]
        # I dont't think it make sense
        df0=close.loc[df0.index].iloc[:-cut]/close.loc[df0.values].values-1
    df0=df0.ewm(span=span0).std().rename('dailyVol')
    return df0

def evaluate(X,y,clf):
    from sklearn import metrics
    # The random forest model by itself
    y_pred_rf = clf.predict_proba(X)[:, 1]
    y_pred = clf.predict(X)
    fpr_rf, tpr_rf, _ = metrics.roc_curve(y, y_pred_rf)
    print(metrics.classification_report(y, y_pred))

    plt.figure(figsize=(9,6))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rf, tpr_rf, label='clf')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
    

tEvents = getTEvents2(dfx2,h=ffd_std)
dbars_feat = dbars.price.loc[tEvents]
frac_diff_feat = dfx2.loc[tEvents]
ftMtx = (pd.DataFrame()
         .assign(dbars=dbars_feat,
                 frac_diff_feat=frac_diff_feat)
         .drop_duplicates().dropna())
cprint(ftMtx)

ftMtx = ftMtx[~ftMtx.index.duplicated()]
dailyVol = getDailyVol2(ftMtx.dbars)
t1 = addVerticalBarrier(tEvents, ftMtx.dbars, hour=120)

ptsl = [1,1]
#ptsl = [daily]
target=dailyVol*2
# select minRet
minRet = 0.01
# get cpu count - 1
cpus = cpu_count() - 1
events = getEvents(ftMtx.dbars,tEvents,ptsl,target,minRet,cpus,t1=t1)

close=ftMtx.dbars
out = get_Concur_Uniqueness(close,events,cpus)
# get avg uniqueness for bootstrapping
avgU = out['tW'].mean()
labels = getBins(events, ftMtx.dbars)
clean_labels = dropLabels(labels)
trgt = clean_labels.bin


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier

# model data# model  
data = ftMtx.join(out,how='left').join(trgt,how='left').dropna()
data_ = data.drop(['t1'],axis = 1)
X = data_.iloc[:,:-1].values
y = data_.iloc[:,-1].values.reshape(-1,1)
XX = data_.iloc[:,:-1]
yy = data_.iloc[:,-1]

-------------------------------------------------------------------------------
dataframe information
-------------------------------------------------------------------------------
                        dbars  frac_diff_feat
dates                                        
2018-10-02 11:49:10  116.4980        116.1370
2018-10-02 12:12:54  116.5699         -0.0719
2018-10-04 13:44:24  116.1600       -116.1610
2018-10-04 13:54:45  116.0337        116.1663
2018-10-04 14:01:44  116.0100          0.0237
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1677 entries, 2009-11-03 11:20:59 to 2018-10-04 14:01:44
Data columns (total 2 columns):
dbars             1677 non-null float64
frac_diff_feat    1677 non-null float64
dtypes: float64(2)
memory usage: 39.3 KB
None
-------------------------------------------------------------------------------



2018-10-28 11:25:17.302596 100.0% applyPtSlOnT1 done after 0.01 minutes. Remaining 0.0 minutes..
2018-10-28 11:25:17.583000 100.0% mpNumCoEvents done after 0.0 minutes. Remaining 0.0 minutes..
2018-10-28 11:25:17.847053 100.0% mpSampleTW done after 0.0 minutes. Remaining 0.0 minutes..


dropped label:  0.0 0.0012338062924120913


2018-10-28 11:25:18.236623 14.29% mpSampleW done after 0.0 minutes. Remaining 0.01 minutes.2018-10-28 11:25:18.275094 28.57% mpSampleW done after 0.0 minutes. Remaining 0.01 minutes.2018-10-28 11:25:18.276405 42.86% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.2018-10-28 11:25:18.279181 57.14% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.2018-10-28 11:25:18.279471 71.43% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.2018-10-28 11:25:18.280019 85.71% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.2018-10-28 11:25:18.280050 100.0% mpSampleW done after 0.0 minutes. Remaining 0.0 minutes.


In [4]:
def test(trnsX, cont, n_estimators = 1000, cv = 10):
    # arguments
    dict0 = {'minWLeaf': [0.], 'scoring': ['accuracy'], 'method': ['MDI','MDA', 'SFI'], 'max_samples': [1.]}    
    # split dict0 into 3 different jobs (by method)
    jobs =(dict(zip(dict0, i)) for i in product(*dict0.values()))
    out = [] # empty list
    # key arguments
    kargs = {'pathOut': './testFunc/', 'n_estimators': n_estimators, 'tag': 'testFunc', 'cv': cv}
    for job in jobs: # for each jobs
        # job params
        job['simNum'] = job['method'] + '_' + job['scoring'] + '_'+ '%.2f'%job['minWLeaf'] + '_' + str(job['max_samples'])
        print (job['simNum']) # print job params
        kargs.update(job) # update/add the elemets to the dictionary
        imp, oob, oos = featImportance(trnsX = trnsX, cont = cont, **kargs) #  find faeture importance using imp, oob, oos
        plotFeatImportance(imp = imp, oob = oob, oos = oos, **kargs) # plot the feature importance
        df0 = imp[['mean']] / imp['mean'].abs().sum() # normalised
        df0['type'] = [i[0] for i in df0.index] # 
        df0 = df0.groupby('type')['mean'].sum().to_dict() 
        df0.update({'oob': oob, 'oos': oos}) # update/add the elemets to the dictionary
        df0.update(job) # update/add the elemets to the dictionary
        out.append(df0) # append df0 to out
    out = pd.DataFrame(out).sort_values(['method', 'scoring', 'minWLeaf', 'max_samples']) # sort the df by
#     # only the followings are output
    out = out[['method', 'scoring', 'minWLeaf', 'max_samples', 'I', 'R', 'N', 'oob', 'oos']]
#    # out = out['method', 'scoring', 'minWLeaf', 'max_samples', 'oob', 'oos']
    out.to_csv(kargs['pathOut'] + 'stats.csv')
    return

## Before PCA

In [5]:
SFI_imp, SFI_oob, SFI_oos = featImportance(XX, data[['bin','w','t1']], method = 'SFI')

print(SFI_imp.sort_values(by=['mean'], ascending=False))

2018-10-28 11:25:25.183056 75.0% auxFeatImpSFI done after 0.06 minutes. Remaining 0.02 minutes.

                    mean std
tW              0.543083   0
w               0.529795   0
dbars           0.477375   0
frac_diff_feat  0.451235   0


2018-10-28 11:25:26.389552 100.0% auxFeatImpSFI done after 0.08 minutes. Remaining 0.0 minutes.


In [6]:
MDI_imp, MDI_oob, MDI_oos = featImportance(XX, data[['bin','w','t1']], method = 'MDI')

print(MDI_imp.sort_values(by=['mean'], ascending=False))

                    mean       std
w               0.312872  0.001526
dbars           0.286996  0.001430
frac_diff_feat  0.213445  0.001166
tW              0.186688  0.000874


In [7]:
MDA_imp, MDA_oob, MDA_oos = featImportance(XX, data[['bin','w','t1']], method = 'MDI')
print(MDA_imp.sort_values(by=['mean'], ascending=False))

                    mean       std
w               0.317509  0.001478
dbars           0.284453  0.001373
frac_diff_feat  0.213380  0.001123
tW              0.184658  0.000877


## After PCA

In [9]:
pcaX = orthoFeats(XX)

In [15]:
pcaX = pd.DataFrame(pcaX,index = XX.index)

In [22]:
pcaX.columns = [("pc" + str(i+1)) for i in pcaX.columns]

In [23]:
SFI_imp, SFI_oob, SFI_oos = featImportance(pcaX, data[['bin','w','t1']], method = 'SFI')

print(SFI_imp.sort_values(by=['mean'], ascending=False))

2018-10-28 11:36:44.965493 100.0% auxFeatImpSFI done after 0.07 minutes. Remaining 0.0 minutes.


         mean std
pc4   0.61986   0
pc2  0.531624   0
pc3  0.496696   0
pc1  0.490896   0


In [24]:
MDI_imp, MDI_oob, MDI_oos = featImportance(pcaX, data[['bin','w','t1']], method = 'MDI')

print(MDI_imp.sort_values(by=['mean'], ascending=False))

         mean       std
pc1  0.275558  0.001453
pc4  0.266737  0.001547
pc3  0.243212  0.001119
pc2  0.214493  0.001123


In [26]:
MDA_imp, MDA_oob, MDA_oos = featImportance(pcaX, data[['bin','w','t1']], method = 'MDI')
print(MDA_imp.sort_values(by=['mean'], ascending=False))

         mean       std
pc1  0.278535  0.001459
pc4  0.262368  0.001479
pc3  0.243087  0.001138
pc2  0.216011  0.001150


## pcaX merge XX

In [27]:
joinX = XX.join(pcaX)

In [29]:
SFI_imp, SFI_oob, SFI_oos = featImportance(joinX, data[['bin','w','t1']], method = 'SFI')

print(SFI_imp.sort_values(by=['mean'], ascending=False))

2018-10-28 11:42:08.149976 87.5% auxFeatImpSFI done after 0.1 minutes. Remaining 0.01 minutes..

                    mean std
pc4              0.61986   0
tW              0.543083   0
pc2             0.531624   0
w               0.529795   0
pc3             0.496696   0
pc1             0.490896   0
dbars           0.477375   0
frac_diff_feat  0.451235   0


2018-10-28 11:42:08.383152 100.0% auxFeatImpSFI done after 0.11 minutes. Remaining 0.0 minutes.


In [30]:
MDI_imp, MDI_oob, MDI_oos = featImportance(joinX, data[['bin','w','t1']], method = 'MDI')

print(MDI_imp.sort_values(by=['mean'], ascending=False))

                    mean       std
w               0.149303  0.001427
dbars           0.142741  0.001202
pc1             0.139010  0.001298
pc4             0.133645  0.001210
pc3             0.125986  0.001004
pc2             0.107286  0.000833
frac_diff_feat  0.104613  0.000867
tW              0.097417  0.000714


In [31]:
MDA_imp, MDA_oob, MDA_oos = featImportance(joinX, data[['bin','w','t1']], method = 'MDI')
print(MDA_imp.sort_values(by=['mean'], ascending=False))

                    mean       std
w               0.148162  0.001384
dbars           0.142745  0.001218
pc1             0.138361  0.001314
pc4             0.136027  0.001261
pc3             0.127085  0.000967
pc2             0.106574  0.000826
frac_diff_feat  0.105005  0.000833
tW              0.096041  0.000726


## Rerun joinX after removing the most important features the previous section

In [47]:
join_SFI = joinX[SFI_imp.sort_values(by=['mean'], ascending=False)[1:].index]

In [48]:
SFI_imp, SFI_oob, SFI_oos = featImportance(joinX, data[['bin','w','t1']], method = 'SFI')

print(SFI_imp.sort_values(by=['mean'], ascending=False))

2018-10-28 11:49:36.230492 100.0% auxFeatImpSFI done after 0.12 minutes. Remaining 0.0 minutes.


                    mean std
pc4              0.61986   0
tW              0.543083   0
pc2             0.531624   0
w               0.529795   0
pc3             0.496696   0
pc1             0.490896   0
dbars           0.477375   0
frac_diff_feat  0.451235   0


In [50]:
joinX_MDI = joinX[MDI_imp.sort_values(by=['mean'], ascending=False)[1:].index]
MDI_imp, MDI_oob, MDI_oos = featImportance(joinX_MDI, data[['bin','w','t1']], method = 'MDI')

print(MDI_imp.sort_values(by=['mean'], ascending=False))

                    mean       std
dbars           0.163858  0.001263
pc1             0.161836  0.001378
pc4             0.161214  0.001351
pc3             0.146205  0.001010
pc2             0.128153  0.000919
frac_diff_feat  0.126614  0.000930
tW              0.112120  0.000766


In [51]:
joinX_MDA = joinX[MDA_imp.sort_values(by=['mean'], ascending=False)[1:].index]
MDA_imp, MDA_oob, MDA_oos = featImportance(joinX_MDA, data[['bin','w','t1']], method = 'MDI')
print(MDA_imp.sort_values(by=['mean'], ascending=False))

                    mean       std
pc4             0.163570  0.001295
pc1             0.163480  0.001381
dbars           0.161630  0.001204
pc3             0.146044  0.001045
pc2             0.126822  0.000916
frac_diff_feat  0.125326  0.000926
tW              0.113128  0.000772
