In [1]:
import sys
sys.path.insert(0, '/Users/orentapiero/MyResearch') 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm 
from statsmodels.tsa.stattools import adfuller

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from FILTERS.utilities import strided_app
from FILTERS.wavelet_transform import WT

from joblib import Parallel, delayed
from joblib.externals.loky import set_loky_pickler

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *


set_loky_pickler()

plt.rcParams['figure.figsize'] = [10,8]
sns.set()
tqdm.pandas()

In [2]:
def VWAP(OHLC,what,L):
    Upper = OHLC[[what,'volume']].prod(1).rolling(L).sum()
    Lower = OHLC['volume'].rolling(L).sum()
    return Upper/Lower

def AnchVWAP(OHLC,what,t0,t1):
    idx = OHLC.index.date
    CumVol = OHLC.volume.groupby(idx).apply(lambda x: x.between_time(t0,t1).cumsum()).droplevel(0)
    Prod = OHLC.groupby(idx).apply(lambda x: x[[what,'volume']].prod(1).between_time(t0,t1).cumsum()).droplevel(0)
    return Prod/CumVol

def HeikenAshi(OHLC):
    cols = ['open','high','low','close']
    OHLC_ = OHLC[cols].copy()
    
    Close = OHLC_.mean(1).rename('close')
    Open = OHLC_[['open','close']].mean(1).shift(1).rename('open')
    High = pd.concat([Open,Close,OHLC_['high']],axis = 1).max(1).rename('high')
    Low = pd.concat([Open,Close,OHLC_['low']],axis = 1).min(1).rename('low')
    ha_OHLC = pd.concat([Open,High,Low,Close],axis = 1)
    return ha_OHLC

def ohlc_plot(df,date,indicator_list,filname = 'plot.html'):
    df_ = df.loc[date].copy()
    
    fig = make_subplots(rows=2, 
                         cols=1,
                         shared_xaxes=True,row_width=[0.2, 0.9])

    date = df_.index
    Op,Hi,Lo,Cl,Vol = df_.open,df_.high,df_.low,df_.close,df_.msg
#     AvwapHigh,AvwapLow = df_['AvwapHigh_9_2_18'],df_['AvwapLow_9_2_18']

    fig.append_trace(go.Candlestick(x=date,open=Op,high=Hi,low=Lo,close=Cl),row=1,col=1)
    
    for item in indicator_list:
        fig.append_trace(go.Scatter(x = date,y = df_[item],name = item),row = 1,col = 1)
#     fig.append_trace(go.Scatter(x = date,y = AvwapLow),row = 1,col = 1)

    fig.append_trace(go.Bar(x=date,y=Vol,name = 'msg'),row=2,col=1)
    fig.update_layout(xaxis_rangeslider_visible=False,legend = dict(orientation = 'h'))
    fig.layout.yaxis2.showgrid=False
    fig.write_html(filname)
    return

In [3]:
data = pd.read_csv('/Users/orentapiero/Data/bitmex_BTCUSD_1m.csv')
data.index = pd.to_datetime(data['time'],unit = 's')
del data['time']
ohlc = data.loc[:'2019-12-30'].copy()

ohlc['time'] = ohlc.index
grouper = ohlc.resample('60T',label = 'right',closed='right')

OHLC = [grouper['open'].first().rename('open'),
        grouper['high'].max().rename('high'),
        grouper['low'].min().rename('low'),
        grouper['close'].last().rename('close'),
        grouper['volume'].sum().rename('Volume')]

OHLC = pd.concat(OHLC,axis=1)
OHLC.index = pd.to_datetime(OHLC.index)
OHLC = OHLC.loc[OHLC.Volume>0]

In [52]:
from sklearn import linear_model
from sklearn.svm import SVR

def Lag(x,L):
    Lx=np.empty_like(x)
    Lx[:]=np.nan
    Lx[L:]=x[:-L]
    return Lx

def create_ar(x,order):
    AR = [x]
    for l in order:
        AR.append(Lag(x,l))
    AR = np.vstack(AR).T
    return AR

def np_dropna(X):
    return X[~np.isnan(X).any(axis=1)]

def scaleX(X):
    return StandardScaler().fit(X)

def scaleY(y):
    return StandardScaler().fit(y.reshape(-1,1))

def svr_fit_predict(x,order):
    
    Y = create_ar(x,order)
    Y = np_dropna(Y)
    
    svr = SVR()
    y,X,Xcv = Y[:,0],Y[:,1:],Y[-1,:-1]
    
    
    scale_y,scale_X = scaleY(y),scaleX(X)
    ys,Xs,Xs_cv = scale_y.transform(y.reshape(-1,1)),scale_X.transform(X),scale_X.transform(Xcv.reshape(1,-1))

    svr.fit(Xs,ys.ravel())
    
    
    fitted_val = scale_y.inverse_transform(svr.predict(Xs).reshape(-1,1)).ravel()
    error = y-fitted_val
    predicted = scale_y.inverse_transform(svr.predict(Xs_cv[0].reshape(1,-1)).reshape(1,-1))[0][0]
    return error,fitted_val,predicted

def roll_fit_predict(OHLC,dates,order):
    ohlc_ = OHLC.loc[dates]
    Pmid = ohlc_[['open','close']].mean(1).values
    Rmid = np.diff(np.log(Pmid),prepend = np.nan)
    error,fitted_val,predicted = svr_fit_predict(Rmid,order)
    
    out = dict(date = dates[-1],
               close=ohlc_.close.iloc[-1],
               Pmid = Pmid[-1],
               predicted_mid = 5*predicted,
               sigma_e = error.std())
    return out
    

In [53]:
order = (1,2)
logOHLC = np.log(OHLC)

strided_dates = strided_app(logOHLC.index.values,100,1)
N = strided_dates.shape[0]
fun = delayed(roll_fit_predict)

output = Parallel(n_jobs=-1)(fun(logOHLC,strided_dates[j,:],order) for j in tqdm(range(N)))
output = pd.DataFrame(output)
output.index = pd.to_datetime(output['date'])
del output['date']

  0%|          | 0/26153 [00:00<?, ?it/s]

In [54]:
output['predicted_pmid'] = output['close'] + (output['predicted_mid'])
output['Rmid'] = np.log(output['Pmid']).diff()
output['Rc2c'] = np.log(output['close']).diff()
output['Rmid2cl'] = output['close'] - output['Pmid'].shift(1)
Ohlc = pd.concat([logOHLC,output['predicted_pmid']],axis = 1).dropna()


In [55]:
def direc(output,var):
    pos = (output[var]>0) & (output['predicted_mid'].shift(1) > 0)
    neg = (output[var]<=0) & (output['predicted_mid'].shift(1) <= 0)
    mask = pos | neg
    return mask.sum()/mask.count()

In [56]:
error_m2m = (np.exp(output['Pmid']) - np.exp(output['predicted_pmid']).shift(1))
error_c2m = (np.exp(output['close']) - np.exp(output['predicted_pmid']).shift(1))

RMSE_m2m = error_m2m.groupby(error_m2m.index.year).apply(lambda x: np.sqrt((x**2).mean())).rename('RMSE_m2m')
RMSE_c2m = error_c2m.groupby(error_c2m.index.year).apply(lambda x: np.sqrt((x**2).mean())).rename('RMSE_c2m')
RMSE_m2m.loc['All'] = np.sqrt((error_m2m**2).mean())
RMSE_c2m.loc['All'] = np.sqrt((error_c2m**2).mean())

MAD_m2m = error_m2m.abs().groupby(error_m2m.index.year).mean().rename('MAD_m2m')
MAD_c2m = error_c2m.abs().groupby(error_m2m.index.year).mean().rename('MAD_c2m')
MAD_m2m.loc['All'] = error_m2m.abs().mean()
MAD_c2m.loc['All'] = error_c2m.abs().mean()

MDA_m2m = output.groupby(output.index.year).apply(lambda x: direc(x,'Rmid')).rename('MDA_m2m')
MDA_m2m.loc['All'] = direc(output,'Rmid')

MDA_c2m = output.groupby(output.index.year).apply(lambda x: direc(x,'Rmid2cl')).rename('MDA_c2m')
MDA_c2m.loc['All'] = direc(output,'Rmid2cl')

MDA_c2c = output.groupby(output.index.year).apply(lambda x: direc(x,'Rc2c')).rename('MDA_c2c')
MDA_c2c.loc['All'] = direc(output,'Rc2c')

pd.concat([RMSE_m2m,RMSE_c2m,MAD_m2m,MAD_c2m,MDA_m2m,MDA_c2m,MDA_c2c],axis = 1)

Unnamed: 0_level_0,RMSE_m2m,RMSE_c2m,MAD_m2m,MAD_c2m,MDA_m2m,MDA_c2m,MDA_c2c
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017,52.593487,99.59386,21.100838,39.477727,0.664087,0.599053,0.481922
2018,46.379999,88.287508,25.506422,47.386477,0.659436,0.595844,0.464208
2019,35.45875,68.154651,18.365006,34.491207,0.64782,0.582122,0.458624
All,45.34372,86.287554,21.662384,40.460654,0.657171,0.592399,0.468283


In [57]:
RMSE_0_6 = np.sqrt((error_c2m.between_time('00:00','06:00')**2).mean())
RMSE_7_12 = np.sqrt((error_c2m.between_time('07:00','12:00')**2).mean())
RMSE_13_16 = np.sqrt((error_c2m.between_time('13:00','16:00')**2).mean())
RMSE_17_23 = np.sqrt((error_c2m.between_time('17:00','23:00')**2).mean())

print('00-06:',RMSE_0_6)
print('07-12:',RMSE_7_12)
print('13-16:',RMSE_13_16)
print('17-23:',RMSE_17_23)

00-06: 85.8670509518088
07-12: 79.61710807405325
13-16: 94.12201131618878
17-23: 87.5145460634388


In [58]:
MAE_0_6 = ((error_c2m.abs().between_time('00:00','06:00')).mean())
MAE_7_12 = ((error_c2m.abs().between_time('07:00','12:00')).mean())
MAE_13_16 = ((error_c2m.abs().between_time('13:00','16:00')).mean())
MAE_17_23 = ((error_c2m.abs().between_time('17:00','23:00')).mean())

print('00-06:',MAE_0_6)
print('07-12:',MAE_7_12)
print('13-16:',MAE_13_16)
print('17-23:',MAE_17_23)

00-06: 39.280194255678964
07-12: 38.82386635899933
13-16: 43.90612154151903
17-23: 41.0742347650551


In [59]:
def count_intersections(blend_ohlc_):
    return (blend_ohlc_.close > blend_ohlc_.predicted_pmid).astype(float).diff().abs().sum()

def ttinter(blend_ohlc_):
    inter = (blend_ohlc_.close > blend_ohlc_.predicted_pmid).astype(float).diff().abs()
    if inter.sum()>0:
        ttm = inter.loc[inter==1].index[0]-blend_ohlc_.index[0]
    else:
        ttm = pd.Timedelta(np.nan)
    return ttm

In [60]:
output=output.groupby(output.index).first()
ohlc=ohlc.groupby(ohlc.index).first()

In [61]:
spread = (np.exp(output['predicted_pmid'])-np.exp(logOHLC['open'].shift(-1))).rename('spread')
spread = spread.groupby(spread.index).first()
blend_ohlc = pd.concat([ohlc[['open','high','low','close']],
                        np.exp(output['predicted_pmid']),
                        spread],axis = 1).fillna(method = 'ffill').dropna()


In [62]:
sp = spread.copy()
sp.index = pd.to_datetime(sp.index)

print(sp.abs().between_time('00:00','06:00').mean())
print(sp.abs().between_time('07:00','12:00').mean())
print(sp.abs().between_time('13:00','16:00').mean())
print(sp.abs().between_time('17:00','23:00').mean())


7.198053460524467
7.125402747620133
7.58949173908981
7.359633680431353


In [63]:
intersect = blend_ohlc.groupby(blend_ohlc.index.strftime('%Y-%m-%d %H')).apply(count_intersections).rename('n_inter')
ttm = blend_ohlc.groupby(blend_ohlc.index.strftime('%Y-%m-%d %H')).apply(ttinter).rename('t2inter')
spread = blend_ohlc.groupby(blend_ohlc.index.strftime('%Y-%m-%d %H')).first()
mspread = blend_ohlc.spread.abs().groupby(blend_ohlc.index.strftime('%Y-%m-%d %H')).min().rename('min_spr')
mspread1 = (blend_ohlc.spread**2).groupby(blend_ohlc.index.strftime('%Y-%m-%d %H')).min().rename('min_spr1')

res = pd.concat([intersect,ttm,spread,mspread,mspread1],axis = 1)
res.index = pd.to_datetime(res.index)


In [64]:
summary1 = res[['n_inter','t2inter','min_spr','min_spr1']].describe().copy()
summary1.loc['count'] /= len(res)


print(np.sqrt(summary1.loc['mean'].loc['min_spr1']))
summary1

14.565994369558286


Unnamed: 0,n_inter,t2inter,min_spr,min_spr1
count,1.0,0.753144,1.0,1.0
mean,2.820392,0 days 00:10:16.937055837,7.291484,212.1682
std,2.926524,0 days 00:13:11.159578050,12.609859,1101.279
min,0.0,0 days 00:01:00,0.000362,1.307381e-07
25%,1.0,0 days 00:01:00,1.14245,1.305191
50%,2.0,0 days 00:04:00,3.191761,10.18734
75%,4.0,0 days 00:13:00,7.832445,61.34719
max,31.0,0 days 00:59:00,200.486627,40194.89


In [65]:
summary1 = res[['n_inter','t2inter','min_spr','min_spr1']].between_time('00:00','06:00').describe()
summary1.loc['count'] /= len(res.between_time('00:00','06:00'))
print(np.sqrt(summary1.loc['mean'].loc['min_spr1']))
summary1

14.656279044570171


Unnamed: 0,n_inter,t2inter,min_spr,min_spr1
count,1.0,0.743674,1.0,1.0
mean,2.711551,0 days 00:10:31.703102961,7.194675,214.8065
std,2.867139,0 days 00:13:23.722290358,12.769673,1177.26
min,0.0,0 days 00:01:00,0.000678,4.603187e-07
25%,0.0,0 days 00:01:00,1.134339,1.286725
50%,2.0,0 days 00:04:00,3.110376,9.674441
75%,4.0,0 days 00:14:00,7.6215,58.08726
max,29.0,0 days 00:59:00,180.605755,32618.44


In [66]:
summary1 = res[['n_inter','t2inter','min_spr','min_spr1']].between_time('07:00','12:00').describe()
summary1.loc['count'] /= len(res.between_time('07:00','12:00'))
print(np.sqrt(summary1.loc['mean'].loc['min_spr1']))
summary1

14.116723044212799


Unnamed: 0,n_inter,t2inter,min_spr,min_spr1
count,1.0,0.740673,1.0,1.0
mean,2.721101,0 days 00:10:18.455821635,7.125403,199.28187
std,2.875914,0 days 00:13:12.373159528,12.18742,928.730273
min,0.0,0 days 00:01:00,0.001279,2e-06
25%,0.0,0 days 00:01:00,1.094442,1.197803
50%,2.0,0 days 00:04:00,3.001987,9.011925
75%,4.0,0 days 00:14:00,7.654785,58.595741
max,31.0,0 days 00:59:00,153.344152,23514.429077


In [67]:
summary1 = res[['n_inter','t2inter','min_spr','min_spr1']].between_time('13:00','16:00').describe()
summary1.loc['count'] /= len(res.between_time('13:00','16:00'))
print(np.sqrt(summary1.loc['mean'].loc['min_spr1']))
summary1

14.728915917705068


Unnamed: 0,n_inter,t2inter,min_spr,min_spr1
count,1.0,0.777752,1.0,1.0
mean,2.941514,0 days 00:09:43.845473311,7.589492,216.941
std,2.950695,0 days 00:12:39.873195256,12.624466,1001.735
min,0.0,0 days 00:01:00,0.000362,1.307381e-07
25%,1.0,0 days 00:01:00,1.256576,1.578985
50%,2.0,0 days 00:04:00,3.378944,11.41727
75%,4.0,0 days 00:12:00,8.297721,68.85219
max,24.0,0 days 00:59:00,139.262779,19394.12


In [68]:
summary1 = res[['n_inter','t2inter','min_spr','min_spr1']].between_time('17:00','23:00').describe()
summary1.loc['count'] /= len(res.between_time('17:00','23:00'))
print(np.sqrt(summary1.loc['mean'].loc['min_spr1']))
summary1

14.75970957566237


Unnamed: 0,n_inter,t2inter,min_spr,min_spr1
count,1.0,0.75924,1.0,1.0
mean,2.945085,0 days 00:10:20.580010357,7.360319,217.849
std,3.00635,0 days 00:13:15.339807495,12.794381,1208.257
min,0.0,0 days 00:01:00,0.000432,1.868177e-07
25%,1.0,0 days 00:01:00,1.120128,1.254687
50%,2.0,0 days 00:04:00,3.316515,10.99928
75%,4.0,0 days 00:14:00,7.917683,62.68971
max,31.0,0 days 00:59:00,200.486627,40194.89
