This code is based on https://www.kaggle.com/alexfir/recreating-target<br/>
Formulas:
$$R^a(t) = \log (P^a(t+16)\ /\ P^a(t+1))$$
$$M(t) = \frac{\sum_a w^a R^a(t)}{\sum_a w^a}$$
$$\beta^a = \frac{\langle M \cdot R^a \rangle}{\langle M^2 \rangle}$$
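$$\text{Target}^a(t) = R^a(t) - \beta^a M(t)$$

Note: the `calculate_target` implementation below uses the simple return $P^a(t+16)/P^a(t+1) - 1$ in place of the log return, and estimates the averages $\langle \cdot \rangle$ over a rolling window of 3750 rows.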


In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_log_error, mean_squared_error

# Asset metadata, ordered by Asset_ID. `ids` and `weights` are read as
# module-level globals by calculate_target further down.
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
asset_details = asset_details.sort_values('Asset_ID')

ids = list(asset_details.Asset_ID)
weights = np.array(list(asset_details.Weight))

# Main training history, indexed by the timestamp converted from epoch seconds.
df = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/train.csv')
df.index = pd.to_datetime(df['timestamp'], unit='s')
df = df.sort_index()

# Supplemental history, prepared the same way.
supplemental_df = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv')
supplemental_df.index = pd.to_datetime(supplemental_df['timestamp'], unit='s')
supplemental_df = supplemental_df.sort_index()

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames the same way.
df = pd.concat([df, supplemental_df])

In [None]:
# Pivot the long table into a wide one: one row per timestamp, one column per
# Asset_ID, holding the last 'Close' value of each (timestamp, Asset_ID) group.
df_price = df.groupby([df.index, 'Asset_ID'])['Close'].last().unstack()

# Disabled experiment: 16-minute price returns.
# NOTE(review): the line below reads `df_price_rets`, which is not defined
# anywhere in the visible notebook.
#df_price_rets = df_price_rets.shift(periods=-16)/df_price_rets.shift(periods=-1) - 1

# Disabled experiment: collapse the data to one row per day for prediction.
#df_price_daily = df.copy()
#df_price_daily.index = df_price_daily.index.normalize()
#df_price_daily = df_price_daily.groupby([df_price_daily.index, 'Asset_ID'])['Close'].last().unstack()

In [None]:
# Quick look: one-step simple returns for the first 30 rows of the first 5 asset columns.
(df_price/df_price.shift(1)-1).iloc[:30,:5].plot()

In [None]:
# Display the wide close-price table (timestamps x Asset_ID).
df_price

In [None]:
# Display the asset metadata sorted by Asset_ID.
asset_details

In [None]:
import gresearch_crypto

# The API reports "You can only iterate over `iter_test()` once.", so the
# environment and its iterator are created a single time here and shared by
# whichever of the loops below is enabled.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

### Possible solutions to this competition

1. Naive - replicate the test price 16 times and then calculate the target
2. ARIMA - https://towardsdatascience.com/multi-step-time-series-forecasting-with-arima-lightgbm-and-prophet-cc9e3f95dfb0
3. LGBM  - https://towardsdatascience.com/a-lightgbm-autoregressor-using-sktime-6402726e0e7b
3. ML  - predict 16 future values with SVR/H2O/LGBM+StackingRegressor. 
4. DNN - predict 16 future values with windowed DNN/CNN. 
5. Build model based on the daily data? Then how to predict minute intervals?

For 2/3:<br/>
The X will be current prices and the Y will be 16 future prices (multilabel).<br/>
We will have to rebuild the model at every iteration.<br/>
The data can be limited to the last 5000 records.<br/>
We can try to add the `calculate_target` function inside the TensorFlow model and predict the final targets directly.<br/>
Instead of the prices we can predict returns and use them when calculating targets:<br/>
`ret = price.shift(periods=-16)/price.shift(periods=-1) - 1`

In [None]:
def calculate_target(data, asset_ids=None, asset_weights=None, window=3750):
    """Recreate the competition target from a wide table of close prices.

    For every asset the forward return ``P(t+16) / P(t+1) - 1`` is computed,
    the weighted market return ``m`` is formed from those returns (missing
    values counted as 0), a rolling beta ``<m * R> / <m * m>`` is estimated,
    and ``beta * m`` is subtracted from each asset's return.

    Args:
        data: DataFrame of prices with one column per asset id.
        asset_ids: Column labels to process. Defaults to the module-level
            ``ids`` list.
        asset_weights: One weight per entry of ``asset_ids``. Defaults to the
            module-level ``weights`` array.
        window: Number of rows in the rolling mean used for the beta.

    Returns:
        DataFrame indexed by the unique timestamps of ``data`` with one
        column per asset id. Rows whose forward prices are unavailable
        (the last 16 rows at least) contain NaN.
    """
    if asset_ids is None:
        asset_ids = ids
    if asset_weights is None:
        asset_weights = weights

    all_timestamps = data.index.unique()
    targets = pd.DataFrame(index=all_timestamps)

    for asset_id in asset_ids:
        prices = data[asset_id]
        targets[asset_id] = (prices.shift(periods=-16) / prices.shift(periods=-1)) - 1

    # Market return: weighted mean over assets, missing returns counted as 0.
    targets['m'] = np.average(targets.fillna(0).values, axis=1, weights=asset_weights)

    m = targets['m']
    num = targets.multiply(m.values, axis=0).rolling(window).mean().values
    denom = m.multiply(m.values, axis=0).rolling(window).mean().values
    # 0/0 and x/0 are expected during warm-up and for flat markets; they are
    # mapped to a beta of 0 right away, so the warnings carry no information.
    with np.errstate(divide='ignore', invalid='ignore'):
        beta = np.nan_to_num(num.T / denom, nan=0., posinf=0., neginf=0.)

    targets = targets - (beta * m.values).T
    return targets.drop('m', axis=1)


#Naive prediction
def predict_values_naive(df_price_uplimit: pd.DataFrame, num_preds):
    """Extend the price table by repeating its last row ``num_preds`` times.

    Each repeated row is stamped one minute after the previous one, so the
    forecast is "prices stay where they are".

    Args:
        df_price_uplimit: Price table with a datetime index.
        num_preds: Number of one-minute rows to add.

    Returns:
        The input table followed by the repeated rows. When ``num_preds`` is
        zero or negative the input is returned unchanged.
    """
    if num_preds <= 0:
        return df_price_uplimit

    last_row = df_price_uplimit.iloc[-1:]
    future_rows = []
    for step in range(1, num_preds + 1):
        row = last_row.copy()
        row.index = last_row.index + pd.Timedelta(minutes=step)
        future_rows.append(row)

    # DataFrame.append was removed in pandas 2.0; one concat also avoids
    # re-copying the whole table on every step.
    return pd.concat([df_price_uplimit] + future_rows)


def create_sliding_windows(data: np.array, window_size):
    """Turn a 1-D sequence into lagged feature rows and next-step labels.

    Row ``r`` of the first result holds ``data[r], ..., data[r + window_size - 1]``
    and the matching entry of the second result is ``data[r + window_size]``.

    Returns:
        ``(wx, wy)`` where ``wx`` is a 2-D array with ``window_size`` columns
        and ``wy`` is ``data[window_size:]``.
    """
    lagged = []
    for offset in range(window_size):
        # offset - window_size is always negative, so every slice stops
        # before the tail and all slices share the same length.
        stop = offset - window_size
        lagged.append(data[offset:stop])

    features = np.dstack(lagged)[0]
    labels = data[window_size:]
    return features, labels


def train_model(df_price: pd.DataFrame, window_size):
    """Fit one autoregressive LightGBM pipeline per column of ``df_price``.

    Only the last ``dataset_size`` rows are used (``dataset_size`` is read
    from module scope), with gaps forward-filled. Each column is turned into
    sliding windows of ``window_size`` lagged values that predict the next
    value.

    Args:
        df_price: Wide table, one column per series to model.
        window_size: Number of lagged values fed to each regressor.

    Returns:
        ``(regressors, test_data)``: the fitted pipelines in column order,
        and the last ``window_size`` rows of the forward-filled data as an
        array, ready to be passed to ``predict_values``.
    """
    # .ffill() replaces the deprecated fillna(method='ffill'); it already
    # returns a new frame, so no explicit copy is needed.
    sliced_data = df_price.iloc[-dataset_size:].ffill()

    test_data = sliced_data.iloc[-window_size:].values
    regressors = []
    for col in range(sliced_data.shape[1]):
        # Positional access: predict_values pairs regressor i with
        # test_data[:, i], so the column must be picked by position rather
        # than by a label that happens to equal i.
        series = sliced_data.iloc[:, col].values
        x, y = create_sliding_windows(series, window_size)
        # An earlier experiment wrapped LGBM and SVR(C=0.5) in a
        # VotingRegressor here; only the plain LGBM pipeline is kept.
        regressor = Pipeline([
            ('scaler', StandardScaler()),
            ('lgb', lgb.LGBMRegressor(device='gpu', num_leaves=128, learning_rate=0.01)),
        ]).fit(x, y)
        regressors.append(regressor)
    return regressors, test_data
    

def predict_values(regressors, test_data: np.array, num_preds):
    """Roll every regressor forward ``num_preds`` steps.

    At each step regressor ``i`` predicts from column ``i`` of the current
    window; the predictions are then appended as a new last row and the
    oldest row is dropped before the next step.

    Returns:
        A list with ``num_preds`` entries, each a list holding one prediction
        per regressor.
    """
    forecasts = []
    window = test_data
    for _step in range(num_preds):
        step_pred = []
        for col, model in enumerate(regressors):
            step_pred.append(model.predict([window[:, col]])[0])
        # Slide the window: drop the oldest row, add the fresh predictions.
        window = np.append(window[1:], [step_pred], axis=0)
        forecasts.append(step_pred)
    return forecasts


def weighted_correlation(a, b, weights):
    """Return the weighted Pearson correlation between ``a`` and ``b``.

    All three inputs are flattened first; ``weights`` supplies one weight per
    flattened element.
    """
    w = np.ravel(weights)
    x = np.ravel(a)
    y = np.ravel(b)
    total = np.sum(w)

    x_bar = np.sum(x * w) / total
    y_bar = np.sum(y * w) / total

    x_var = np.sum(w * np.square(x - x_bar)) / total
    y_var = np.sum(w * np.square(y - y_bar)) / total

    covariance = np.sum((x * y * w)) / np.sum(w) - x_bar * y_bar
    return covariance / np.sqrt(x_var * y_var)

In [None]:
#debug: create_sliding_windows
# With 13 values and a window of 4 this yields 9 feature rows,
# [1,2,3,4] ... [9,10,11,12], and the labels 5..13.
create_sliding_windows([1,2,3,4,5,6,7,8,9,10,11,12,13], 4)

An autocorrelation (ACF) plot can be used to check whether a time series is stationary. It can also help to find the order of the moving-average part of an ARIMA model.<br/> 
Partial autocorrelation (PACF) plot is useful to identify the order of autoregressive part in ARIMA model. <br/>
Augmented Dickey–Fuller unit test examines if the time series is non-stationary. <br/>
The null hypothesis is that the series is non-stationary, hence if the p-value is small, it implies the time series is NOT non-stationary.

In [None]:
%%script echo skipping
#debug: checking data stationarity
# The cell magic above disables this cell. When enabled it inspects the
# 16-step forward return of asset column 0 over the last 500 rows.

import matplotlib.pyplot as plt
import statsmodels.graphics.tsaplots as smt
import statsmodels.api as sm

# Last 500 prices of column 0, gaps forward-filled, re-indexed 0..n-1.
y = df_price[0].iloc[-500:].fillna(method='ffill')
y.index = np.arange(y.index.shape[0])
# Same forward-return definition as calculate_target: P(t+16)/P(t+1) - 1.
y = (y.shift(periods=-16)/y.shift(periods=-1) - 1).dropna()

print('Min:',y.min(), 'Max:',y.max())
# NOTE(review): the rolling window is 60*24 = 1440 rows but y has at most 500,
# so this presumably always prints NaN — confirm the intended window.
print('Less STD is more stationary:', y.rolling(60*24).std().std())

# Layout: the series across the top row, ACF and PACF side by side below.
lags = None
layout = (2, 2)
fig = plt.figure(figsize=(12, 7))
ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
acf_ax = plt.subplot2grid(layout, (1, 0))
pacf_ax = plt.subplot2grid(layout, (1, 1))

y.plot(ax=ts_ax)
# Element [1] of the adfuller result is used as the p-value shown in the title.
p_value = sm.tsa.stattools.adfuller(y)[1]
ts_ax.set_title('Time Series Analysis Plots\n Dickey-Fuller: p={0:.5f}'.format(p_value))
smt.plot_acf(y, lags=lags, ax=acf_ax)
smt.plot_pacf(y, lags=lags, ax=pacf_ax)
plt.tight_layout()

In [None]:
%%script echo skipping
#debug: checking LGBM + sktime forecast

!pip install sktime pmdarima

import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sktime.forecasting.compose import make_reduction, TransformedTargetForecaster
from sktime.forecasting.model_selection import ExpandingWindowSplitter, ForecastingGridSearchCV
import lightgbm as lgb


def plot_forecast(series_train, series_test, forecast, forecast_int=None):
    """Plot train, test and forecast series and return the forecast errors.

    Args:
        series_train: Series drawn in blue as "train".
        series_test: Series drawn in green as "test"; also the reference for
            the error metrics.
        forecast: Predicted series. Its index is overwritten in place with
            ``series_test.index`` before plotting, so the caller's object is
            modified.
        forecast_int: Optional object with "lower" and "upper" entries that
            are shaded as a band around the forecast.

    Returns:
        ``(mae, mape)`` of ``forecast`` against ``series_test``.
    """
    # Metrics are computed before the index is replaced below.
    mae = mean_absolute_error(series_test, forecast)
    mape = mean_absolute_percentage_error(series_test, forecast)

    plt.figure(figsize=(12, 6))
    plt.title(f"MAE: {mae:.2f}, MAPE: {mape:.3f}", size=18)
    series_train.plot(label="train", color="b")
    series_test.plot(label="test", color="g")
    # Align the forecast with the test series on the x axis (mutates `forecast`).
    forecast.index = series_test.index
    forecast.plot(label="forecast", color="r")
    if forecast_int is not None:
        plt.fill_between(series_test.index,forecast_int["lower"],forecast_int["upper"],alpha=0.2,color="dimgray")
        
    plt.legend(prop={"size": 16})
    plt.show()
    return mae, mape

def create_forecaster():
    """Return a recursive reduction forecaster built on a default LGBMRegressor.

    The initial window length is 5; the grid search below may override it.
    """
    return make_reduction(
        lgb.LGBMRegressor(),
        window_length=5,
        strategy="recursive",
    )

def grid_serch_forecaster(train, test, forecaster, param_grid):
    """Grid-search ``forecaster`` on ``train`` and forecast over ``test``.

    The search uses an expanding-window split whose initial window is 70% of
    ``train``. The best parameters are printed, then a forecast with one step
    per element of ``test`` is produced and plotted.

    Returns:
        ``(mae, mape, y_pred)`` as reported by ``plot_forecast``.
    """
    # Grid search over the values in param_grid
    splitter = ExpandingWindowSplitter(initial_window=int(len(train) * 0.7))
    search = ForecastingGridSearchCV(
        forecaster,
        strategy="refit",
        cv=splitter,
        param_grid=param_grid,
    )
    search.fit(train)
    print(f"best params: {search.best_params_}")

    # Forecast horizons 1..len(test)
    horizon = np.arange(1, len(test) + 1)
    y_pred = search.predict(fh=horizon)
    mae, mape = plot_forecast(train, test, y_pred)
    return mae, mape, y_pred
    
param_grid = {"window_length": [20]} # parameter set to be grid searched
forecaster = create_forecaster()

# Prices of asset column 0, rows -1000..-151, gaps forward-filled and
# re-indexed 0..n-1.
y = df_price[0].iloc[-1000:-150].fillna(method='ffill')
y.index = np.arange(y.index.shape[0])
# Disabled alternative: model the 16-step forward return instead of the price.
#y = (y.shift(periods=-16)/y.shift(periods=-1) - 1).dropna()

# Hold out the last 50 points for evaluation.
test_len = 50 #int(len(y) * 0.2)
sun_train, sun_test = y.iloc[:-test_len], y.iloc[-test_len:]
sun_lgb_mae, sun_lgb_mape, y_pred = grid_serch_forecaster(sun_train, sun_test, forecaster, param_grid)
# Display the two error metrics as the cell output.
sun_lgb_mae, sun_lgb_mape

In [None]:
%%script echo skipping

# NOTE(review): `sliced_data` is only assigned as a local inside train_model in
# the visible notebook, so this cell would need it defined at notebook scope
# before it can be enabled.
sliced_data.plot()

In [None]:
%%script echo skipping

# Plot validation values next to the predictions.
# NOTE(review): `valid_data` is not defined anywhere in the visible notebook.
valid_df = pd.DataFrame(valid_data)
valid_df['pred'] = pd.Series(pred_data)
valid_df.plot()

In [None]:
%%script echo skipping

#targets that have to be predicted
# Wide table of the provided 'Target' column: timestamps x Asset_ID.
df_target = df.groupby([df.index, 'Asset_ID'])['Target'].last().unstack()

# Pull one batch from the API iterator. This consumes it from `iter_test`,
# which can only be iterated once.
df_test, df_pred = next(iter_test)
#env.predict(df_pred)

# Reshape the batch like df_price: datetime index, one 'Close' column per Asset_ID.
df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
df_test = df_test.set_index('datetime').drop('timestamp', axis=1)
df_test = df_test.groupby([df_test.index, 'Asset_ID'])['Close'].last().unstack()
df_test

In [None]:
%%script echo skipping

#for debugging public leaderboard
# Provided targets at the timestamp of the first test batch; relies on
# `df_target` and `df_test` from the previous debug cell.
df_target.loc[df_test.index[0]]

In [None]:
%%script echo skipping

#for debugging public leaderboard
df_price_uplimit = df_price.iloc[:df_price.index.get_loc(df_test.index[0], method='nearest')+17]
calculate_target(df_price_uplimit.iloc[-5000:]).dropna().iloc[-1]

In [None]:
%%script echo skipping
#debug: create dummy "iter_test"

test_size = 250
#ids = [0,1]
#weights = weights[ids]

df_price = df.groupby([df.index, 'Asset_ID'])['Close'].last().unstack()[ids]
df_target = df.groupby([df.index, 'Asset_ID'])['Target'].last().unstack()[ids]

cutoff_index = df_price.index.get_loc('2022-01-05 00:00:00', method='nearest')
#iter_test = [(df_price.iloc[cutoff_index+i].to_frame().T, df_target.iloc[cutoff_index+i].to_frame().T) for i in range(10)]
iter_test = [(df_price.iloc[cutoff_index+i].to_frame().T, None) for i in range(test_size)]

test_price = df_price.iloc[cutoff_index:cutoff_index+test_size].values
test_target = df_target.iloc[cutoff_index:cutoff_index+test_size].values
df_price = df_price.iloc[:cutoff_index]

In [None]:
%%script echo skipping
#debug: directly predict targets

i = 0
k = 0
regressor = None
window_size = 360
retrain_size = 0
dataset_size = 1500
pred_price = []
pred_target = []

for df_test, df_pred in iter_test:
    new_df = df_test.copy()
    #new_df.index = pd.to_datetime(new_df['timestamp'], unit='s')
    #new_df = new_df.groupby([new_df.index, 'Asset_ID'])['Close'].last().unstack()
    #print('df_test: ',new_df[0].values)

    if new_df.index[0]<=df_price.index[-1]:
        last_index = df_price.index.get_loc(new_df.index[0], method='nearest')
        df_price_uplimit = df_price.iloc[:last_index]
    else:
        df_price = df_price.append(new_df)
        df_price_uplimit = df_price
        
    df_price_uplimit = calculate_target(df_price_uplimit.iloc[-5000:]).dropna()

    #if i==0 or regressor is None:
    regressor, test_data = train_model(df_price_uplimit, window_size)
    #else:
    #    test_data = df_price_uplimit.fillna(method='ffill').iloc[-window_size:].values

    print('test_data', i, test_data.shape)
    pred_data = predict_values(regressor, test_data, 1)
    target = pred_data[0]
    i = i+1 if i < retrain_size else 0
    #pred_price.append(pred_data[0])
    
    #rmse_price = np.sqrt(mean_squared_error(pred_price, test_price[:len(pred_price)], multioutput='raw_values'))
    pred_target.append(weighted_correlation(target, test_target[i], weights))
    print(k, 'last metric:', pred_target[-1], 'mean metric:', np.mean(pred_target))
    k += 1
    
    try:
        for _, row in df_test.iterrows():
            df_pred.loc[df_pred['row_id']==row['row_id'], 'Target'] = target[row['Asset_ID']]
    except: 
        pass
    
    #env.predict(df_pred.fillna(0))

In [None]:
# Main submission loop: for every batch from the competition API, extend the
# price history, forecast the prices the target still needs, recreate the
# target from the extended history and submit it.
i = 0                # iterations since the last retrain; 0 forces a retrain
k = 0                # only used by the commented-out metric code below
regressor = None
window_size = 360
retrain_size = 10
dataset_size = 1500  # read as a global by train_model
pred_price = []
pred_target = []

for df_test, df_pred in iter_test:
    new_df = df_test.copy()
    new_df.index = pd.to_datetime(new_df['timestamp'], unit='s')
    new_df = new_df.groupby([new_df.index, 'Asset_ID'])['Close'].last().unstack()
    # Give the batch the same columns as the history (NaN for assets missing
    # from this batch) so it can be stacked onto the model window below.
    new_df = new_df.reindex(columns=df_price.columns)
    #print('df_test: ',new_df[0].values)

    num_preds = 17
    if new_df.index[0]<=df_price.index[-1]:
        # The batch overlaps the known history: use real future prices where
        # they exist and forecast only the remainder.
        # Index.get_loc(..., method='nearest') was removed in pandas 2.0.
        nearest_pos = df_price.index.get_indexer([new_df.index[0]], method='nearest')[0]
        last_index = nearest_pos + num_preds
        num_preds = last_index - (df_price.index.shape[0]-1)
        df_price_uplimit = df_price if num_preds>=0 else df_price.iloc[:last_index]
        num_preds = min(max(num_preds, 0), 17)
    else:
        # DataFrame.append was removed in pandas 2.0.
        df_price = pd.concat([df_price, new_df])
        df_price_uplimit = df_price

    print('num_preds: ',num_preds)
    #we have to predict 17 steps above the limit
    # NOTE(review): calculate_target needs prices through t+16 for the target
    # at t, so 17 appended rows make the last complete target row t+1 rather
    # than t — confirm whether 16 was intended.
    if num_preds>0:
        if i==0 or regressor is None:
            regressor, test_data = train_model(df_price_uplimit, window_size)
        else:
            # Slide the model window forward by the new batch; an asset absent
            # from the batch keeps its previous price, matching the
            # forward-fill done in train_model.
            latest = new_df.values
            latest = np.where(np.isnan(latest), test_data[-1], latest)
            test_data = np.append(test_data[1:], latest, axis=0)
            
        print('test_data', i, test_data.shape)
        pred_data = predict_values(regressor, test_data, num_preds)
        # Stamp the forecasts with consecutive minutes and the history's
        # columns; appending the raw list gave the rows an integer index.
        future_index = df_price_uplimit.index[-1] + pd.to_timedelta(np.arange(1, num_preds + 1), unit='min')
        pred_df = pd.DataFrame(pred_data, index=future_index, columns=df_price_uplimit.columns)
        df_price_uplimit = pd.concat([df_price_uplimit, pred_df])
        i = i+1 if i < retrain_size else 0
        #pred_price.append(pred_data[0])
    
    targets = calculate_target(df_price_uplimit.iloc[-5000:]).dropna()
    # With no complete target row, submit zeros rather than crash on iloc[-1].
    target = targets.iloc[-1] if len(targets) else pd.Series(dtype=float)
    
    #rmse_price = np.sqrt(mean_squared_error(pred_price, test_price[:len(pred_price)], multioutput='raw_values'))
    #pred_target.append(weighted_correlation(target.values, test_target[i], weights))
    #print(k, 'last metric:', pred_target[-1], 'mean metric:', np.mean(pred_target))
    #k += 1
    
    # Copy each asset's target into its submission row. Assets without a
    # computed target are skipped and end up as 0 through fillna below; a
    # bare `except: pass` used to abandon all remaining rows instead.
    for _, row in df_test.iterrows():
        asset_id = row['Asset_ID']
        if asset_id in target.index:
            df_pred.loc[df_pred['row_id']==row['row_id'], 'Target'] = target[asset_id]
    
    env.predict(df_pred.fillna(0))

In [None]:
%%script echo skipping

# Plot what the loops collected against the held-out values of the first asset.
# NOTE(review): every `pred_price.append(...)` in the visible loops is
# commented out, so `pred_price` is presumably empty here — confirm before
# enabling the second plot.
dtsize = len(pred_target)
pd.DataFrame(data={'pred':np.vstack(pred_target).T[0],'test':test_target[:dtsize,0]}).plot()
pd.DataFrame(data={'pred':np.vstack(pred_price).T[0], 'test':test_price[:dtsize,0]}).plot()
#pd.DataFrame(data={'pred':pred_price[1], 'test':test_price[1]}).plot()

In [None]:
%%script echo skipping
#debug: naive flow

for df_test, df_pred in iter_test:
    new_df = df_test.copy()
    new_df.index = pd.to_datetime(new_df['timestamp'], unit='s')
    new_df = new_df.groupby([new_df.index, 'Asset_ID'])['Close'].last().unstack()
    print('df_test index: ',new_df.index[0])

    num_preds = 17
    if new_df.index[0]<=df_price.index[-1]:
        last_index = df_price.index.get_loc(new_df.index[0], method='nearest') + num_preds
        num_preds = last_index - (df_price.index.shape[0]-1)
        df_price_uplimit = df_price if num_preds>=0 else df_price.iloc[:last_index]
        num_preds = min(max(num_preds, 0), 17)
    else:
        df_price = df_price.append(new_df)
        df_price_uplimit = df_price

    #we have to predict 17 steps above the limit
    if num_preds>0:
        df_price_uplimit = predict_values_naive(df_price_uplimit.copy(), num_preds)

    target = calculate_target(df_price_uplimit.iloc[-5000:]).dropna().iloc[-1]
    print('target index: ',target.name)
    
    try:
        for _, row in df_test.iterrows():
            df_pred.loc[df_pred['row_id']==row['row_id'], 'Target'] = target[row['Asset_ID']]
    except: 
        pass
    
    env.predict(df_pred.fillna(0))