In [None]:
from jupyter_imports import *
from utils.io_utils import *
from histfeed.ftx_history import *
from scipy.interpolate import CubicSpline
from scipy.fft import fft, fftfreq
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,LassoLarsIC,RidgeCV,ElasticNetCV,LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit,cross_val_score,cross_val_predict,train_test_split

### get history

In [None]:
# Load the history frame used by every cell below.
# `main` is not defined in this notebook; it presumably comes from the
# `histfeed.ftx_history` star-import — TODO confirm.
# NOTE(review): the meaning of the positional arguments (None, 'get', coins, 'ftx', '500')
# is defined by that entry point and is not visible here — check it before changing them.
coins = 'wide'
futures_df = main([None,'get',coins,'ftx','500'])

### params (Lasso 5% / funding+price / pca=2 seems best)

In [None]:
# Experiment configuration: target coin, CV folds, feature columns, PCA size and window grids.
coin = 'ETH'
n_split = 5       # number of folds passed to TimeSeriesSplit
pca_n = 3         # principal components kept per feature
hedge_cost = 1e-3

# Candidate columns of futures_df; only the first four are kept,
# so the borrow-rate column is currently excluded from the features.
raw_features = [
    f'{coin}-PERP_rate_funding',
    f'{coin}_price_c',
    f'{coin}_rate_size',
    f'{coin}_price_volume',
    f'{coin}_rate_borrow',
][:4]

# Window grid: every step up to 11, then progressively coarser steps up to 144.
horizon_windows = [*range(1, 12), *range(12, 24, 4), *range(24, 48, 12), *range(48, 168, 24)]
# Every third horizon window is also used as a holding window.
holding_windows = horizon_windows[::3]

print(horizon_windows)
print(holding_windows)

### smooth features with a Laplace-like transform (exponentially weighted means over a range of half-lives)

In [None]:
# Select the raw feature columns.
# Take an explicit copy: the column assignment below (and the in-place scaling in the next
# cell) must not be performed on a selection of futures_df, which raises
# SettingWithCopyWarning and makes the cell's result depend on what ran before it.
features = futures_df[raw_features].copy()

price_col = f'{coin}_price_c'
if price_col in raw_features:
    # Replace the price level by its one-step relative change
    # (difference divided by the current price); the first row becomes NaN.
    features[price_col] = features[price_col].diff() / features[price_col]

In [None]:
# Standardize every feature column (zero mean, unit variance) and write the result back
# into the existing frame so the index and column labels are kept.
features[:]=StandardScaler().fit_transform(features)

In [None]:
# Visual sanity check of the scaled features.
# `.iplot` is not a core pandas method; presumably enabled by `jupyter_imports` (cufflinks) — TODO confirm.
features.iplot(title='features z-score')

In [None]:
# For each feature column, build one time-weighted exponential moving average per horizon
# window (half-life = window + 1 hours), side by side, keeping only rows where every
# average is available.
features_dict = {
    column: pd.concat(
        {
            window: features[column]
            .ewm(times=features.index, halflife=timedelta(hours=window + 1))
            .mean()
            for window in horizon_windows
        },
        axis=1,
    ).dropna()
    for column in features.columns
}

In [None]:
# Unsmoothed counterpart of features_dict: each feature column lagged by every horizon
# window, side by side, keeping only rows where every lag is available.
dummy_features_dict = {
    column: pd.concat(
        {window: features[column].shift(periods=window) for window in horizon_windows},
        axis=1,
    ).dropna()
    for column in features.columns
}

In [None]:
# One animated scatter per feature: x = horizon window, y = smoothed value,
# one animation frame per timestamp over the last 500 rows.
for feature_name, feature_data in features_dict.items():
    recent = feature_data.tail(500).reset_index()
    # Long format: one row per (timestamp, window) pair, ordered for the animation.
    df = recent.melt(id_vars='index', value_vars=feature_data.columns)
    df = df.sort_values(by=['index', 'variable'])
    # Frame labels are the timestamps rendered as strings.
    df['index'] = df['index'].apply(str)
    fig = px.scatter(
        df,
        x='variable',
        y='value',
        range_y=[-1, 1],
        animation_frame='index',
        title=feature_name,
    )
    fig.show()

### reduce each series by PCA

In [None]:
# Fit an independent PCA (pca_n components, exact SVD) on each feature's bank of windows.
fitted_pca = {}
for name, data in features_dict.items():
    fitted_pca[name] = PCA(n_components=pca_n, svd_solver='full').fit(data)

In [None]:
# Tabulate the explained-variance ratio of each PCA mode: one row per feature, one column per mode.
variance_by_feature = [
    pd.Series(pca.explained_variance_ratio_, index=range(pca.n_components_), name=name)
    for name, pca in fitted_pca.items()
]
explained_variance = pd.concat(variance_by_feature, axis=1).T

# NOTE: despite its name, 'total' is sqrt(1 - sum of the explained ratios),
# i.e. it is driven by the share of variance the kept modes do NOT explain.
explained_variance['total'] = np.sqrt(1 - explained_variance.sum(axis=1))
explained_variance['n'] = [pca.n_components_ for pca in fitted_pca.values()]
print(explained_variance)

In [None]:
#factors[f'{coin}-PERP/rate/funding']
#interpolate_laplace = CubicSpline(factors[f'{coin}-PERP/rate/funding'][0].index,factors[f'{coin}-PERP/rate/funding'][mode])
#lambda_laplace = lambda f: float(interpolate_laplace(float(f)))
#[invertlaplace(lambda_laplace,float(x)) for x in data[0].index]
#px.line(x=horizon_windows,y=[lambda_laplace(t) for t in horizon_windows])
#fitted_pca[f'{coin}-PERP/rate/funding'].singular_values_

In [None]:
# Arrange each fitted PCA's component vectors as columns (one per mode), indexed by horizon window.
factors = {
    name: pd.DataFrame({
        mode: pd.Series(data=component, index=horizon_windows)
        for mode, component in enumerate(pca.components_)
    })
    for name, pca in fitted_pca.items()
}

# One stacked panel per feature, all sharing the horizon axis.
fig = make_subplots(
    rows=len(factors),
    cols=1,
    subplot_titles=list(factors.keys()),
    shared_xaxes=True,
    vertical_spacing=0.02,
)

for row, (name, data) in enumerate(factors.items(), start=1):
    for mode in data:
        trace = go.Scatter(x=data[mode].index, y=data[mode].values, name=mode)
        fig.append_trace(trace, row=row, col=1)

fig.update_layout(height=1000, width=600, title_text="eigenvectors")
fig.show()

In [None]:
# Project every feature's bank of windows onto its own PCA modes.
# Columns are labelled (feature, pca_mode); rows keep the timestamps of the projected frame.
smoothed = {
    name: pd.DataFrame(
        data=pca.transform(features_dict[name]),
        index=features_dict[name].index,
        columns=pd.MultiIndex.from_product(
            [[name], range(pca.n_components_)],
            names=['feature', 'pca_mode'],
        ),
    )
    for name, pca in fitted_pca.items()
}
# Put all features side by side and keep only timestamps present for every one of them.
smoothed = pd.concat(smoothed.values(), axis=1).dropna()
# Quick look at the first mode of each feature.
smoothed.xs(0, level=1, axis=1).iplot()

### labels

In [None]:
# Regression targets: for each holding window i, the mean of (funding - borrow) over the
# i+1 rows that FOLLOW each timestamp.
#
# Column keys use the same '<coin>-PERP_rate_funding' / '<coin>_rate_borrow' naming as the
# rest of the notebook (the former '/'-separated keys do not match it).
funding_minus_borrow = (
    futures_df[f'{coin}-PERP_rate_funding'] - futures_df[f'{coin}_rate_borrow']
).sort_index(ascending=False)

# The series is in descending time order, so a trailing rolling mean looks forward in time
# and shift(1) moves each value one step back in time (excluding the current row).
# The shift is applied per column, while the row order is still known to be descending:
# the row order of an outer-joined concat should not be relied upon for a positional shift.
labels = pd.concat(
    {
        i: funding_minus_borrow.iloc[i:].rolling(i + 1).mean().shift(1)
        for i in holding_windows
    },
    axis=1,
    join='outer',
).sort_index(ascending=False)
#labels.iplot()

### linear regression

In [None]:
# Cross-validated score of the regression for every holding window.
# Alternative estimators that were tried (swap in by uncommenting):
# model = SVR(kernel='sigmoid')
# model = RandomForestRegressor()
# model = ElasticNetCV(cv=TimeSeriesSplit(n_split),l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
# model = RidgeCV(cv=TimeSeriesSplit(n_split))
# model = LassoLarsCV(cv=TimeSeriesSplit(n_split))
model = LassoCV(cv=TimeSeriesSplit(n_split))

holding_grid = {}
for j in holding_windows:
    # Align PCA features with the label of this holding window and drop incomplete rows.
    x_y = smoothed.join(pd.DataFrame({'y': labels[j]}), how='inner').dropna()
    y = x_y['y']
    x = x_y.drop(columns='y')

    fitted_model = model.fit(x, y)
    # Out-of-sample negative MAE on expanding time-ordered splits.
    holding_grid[j] = cross_val_score(
        model, x, y,
        cv=TimeSeriesSplit(n_split),
        scoring="neg_mean_absolute_error",
    )

# Cost term subtracted from the scores; it shrinks as the holding window grows.
tx_costs = pd.Series(
    data=[hedge_cost * .00024 * 365 / j for j in holding_windows],
    index=holding_windows,
)
score_summary = pd.DataFrame(holding_grid).describe().drop(index=['min', 'max', 'count', 'std'])
px.line((score_summary - tx_costs).T, title=f'{model.__class__}cross val scores by holding period')

In [None]:
# Refit the current `model` on a single holding window and inspect it.
# Requires `model` to still be the LassoCV from the previous cell (alphas_/mse_path_/alpha_
# are used below) and 10 to be one of holding_windows, since `labels` is keyed by them.
x_y = smoothed.join(pd.DataFrame({'y':labels[10]}),how='inner').dropna()
x = x_y.drop(columns='y')
y = x_y['y']

fitted_model = model.fit(x,y)

# Coefficients laid out as feature (rows) x pca_mode (columns).
# A bare expression in the middle of a cell is evaluated and thrown away, so the table has
# to be displayed explicitly to show up next to the plot below.
coef_table = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(smoothed.columns,names=['feature','pca_mode']),
    data=fitted_model.coef_,
).unstack(level='pca_mode')
display(coef_table)

# Mean cross-validation MSE for every alpha on the regularization path.
pd.Series(index=fitted_model.alphas_,data=[np.mean(x) for x in fitted_model.mse_path_]).iplot(title=f'alpha={fitted_model.alpha_}')

In [None]:
# Same holding-window scan with a plain least-squares fit.
# Alternative estimators that were tried (swap in by uncommenting):
# model = ElasticNet(l1_ratio=0.5)
# model = RidgeCV(cv=TimeSeriesSplit(n_split))
# model = LassoLarsCV(cv=TimeSeriesSplit(n_split))
model = LinearRegression()

holding_grid = {}
for j in holding_windows:
    # Align PCA features with the label of this holding window and drop incomplete rows.
    x_y = smoothed.join(pd.DataFrame({'y': labels[j]}), how='inner').dropna()
    y = x_y['y']
    x = x_y.drop(columns='y')

    fitted_model = model.fit(x, y)
    # Out-of-sample negative MAE on expanding time-ordered splits.
    holding_grid[j] = cross_val_score(
        model, x, y,
        cv=TimeSeriesSplit(n_split),
        scoring="neg_mean_absolute_error",
    )

score_summary = pd.DataFrame(holding_grid).describe().drop(index=['min', 'max', 'count', 'std'])
px.line(score_summary.T, title=f'{model.__class__}cross val scores by holding period')

### performance by holding period

In [None]:
# In-sample tracking error (label minus Lasso prediction) for every holding window.
holding_grid = {}
input_df = futures_df[f'{coin}-PERP_rate_funding']-futures_df[f'{coin}_rate_borrow']
# Naive reference predictor: time-weighted EWMA of funding minus borrow, 48h half-life.
benchmark = input_df.ewm(times=input_df.index,halflife=timedelta(hours=48)).mean()
for j in holding_windows:
    x_y = smoothed.join(pd.DataFrame({'y':labels[j]}),how='inner').dropna()
    x = x_y.drop(columns=['y'])
    y = x_y['y']

    # Select alpha on time-ordered splits, as in the cross-validation cells above: the
    # default K-fold would train on rows that come after the rows it validates on.
    fitted_model = LassoCV(cv=TimeSeriesSplit(n_split)).fit(x,y)
    #fitted_model = LinearRegression().fit(x,y)
    x_y['predicted'] = fitted_model.predict(x)
    # NOTE: this column holds (label - EWMA benchmark), not the benchmark itself;
    # it is kept for inspection and does not enter the summary below.
    x_y['benchmark'] = y-benchmark[x_y.index]
    x_y['diff'] = y-x_y['predicted']
    #x_y[['y','predicted','diff']].iplot()

    holding_grid[j] = x_y['diff'].describe()
pd.DataFrame(holding_grid).T.drop(columns=['min','max','count','std']).iplot(title='tracking error stats by holding period')

### ...or just pick a window

In [None]:
# Single-window alternative: one EWMA feature against one rolling-mean label.
input_df = futures_df[f'{coin}-PERP_rate_funding'] - futures_df[f'{coin}_rate_borrow']

# Feature: time-weighted EWMA with a 48h half-life.
half_life = timedelta(hours=48)
feature = input_df.ewm(halflife=half_life, times=input_df.index).mean()

# Label: 24-row trailing mean, moved one row down.
# NOTE(review): if futures_df is in ascending time order this label only looks backward,
# unlike the `labels` frame built earlier from a descending sort — confirm this is intended.
label = input_df.rolling(24).mean().shift(1)

In [None]:
# Plot the label next to (label - feature), on rows where both are available.
pd.concat([label,label-feature],axis=1).dropna().iplot()

In [None]:
# Grid search over (horizon window i, holding window j): regress the label of holding
# window j on each raw feature smoothed with the single horizon window i.

# Funding minus borrow as a Series. Subtracting two single-column DataFrames whose column
# names differ would align on columns and produce only NaN.
input_df = futures_df[f'{coin}-PERP_rate_funding']-futures_df[f'{coin}_rate_borrow']
# The previous version of this line carried unbalanced brackets left over from a dict
# comprehension and could not be parsed, which made the whole cell fail.
feature = input_df.ewm(times=input_df.index,halflife=timedelta(hours=24)).mean().dropna()
label = input_df.rolling(24).mean()

holding_grid = {}
#model=ElasticNet(l1_ratio=0.5)
model = LinearRegression()
for i in horizon_windows:
    # NOTE: this rebinds `smoothed` (previously the PCA projection) to a frame with one
    # column per raw feature, all taken at horizon window i.
    smoothed = pd.DataFrame({name: features_dict[name][i] for name in raw_features}).dropna()

    for j in holding_windows:
        x_y = smoothed.join(pd.DataFrame({'y':labels[j]}),how='inner').dropna()
        x = x_y.drop(columns='y')
        y = x_y['y']

        fitted_model = model.fit(x,y)
        # Out-of-sample negative MAE on expanding time-ordered splits.
        holding_grid[(i,j)] = cross_val_score(model, x, y,scoring="neg_mean_absolute_error", cv=TimeSeriesSplit(n_split))
# Tuple keys become a two-level column index: (horizon window, holding window).
holding_grid = pd.DataFrame(holding_grid)

In [None]:
# Overlay, on a single panel, the mean cross-validation score across holding windows,
# with one trace per horizon window.
horizons = holding_grid.columns.levels[0]
holdings = holding_grid.columns.levels[1]

fig = make_subplots(
    rows=1,
    subplot_titles=[f'horizon_{horizon}' for horizon in horizons],
    shared_xaxes=True,
    vertical_spacing=0.02,
)

for horizon in horizons:
    # Mean over the CV folds, indexed by holding window, for this horizon.
    mean_by_holding = holding_grid[(horizon,)].describe().T['mean']
    fig.append_trace(
        go.Scatter(x=holdings, y=mean_by_holding.values, name=f'mean_{horizon}'),
        row=1, col=1,
    )

fig.update_layout(height=1000, width=600, title_text='mean err by holding by horizon')
fig.show()