In [1]:
from pandas import read_csv as rc
from pandas import DataFrame as df
from tqdm import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from datetime import datetime
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from pmdarima.arima import auto_arima
from feature_calculations import *
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from scipy.stats import pearsonr

In [2]:
df_dict = {}

for asset_id in range(1, 2):
    df_dict[f'Asset_{asset_id}'] = rc(f"/Users/ryanlucas/Desktop/G-Research/Data/Asset-{asset_id}.csv")


In [3]:
features_dict = {}

for asset_id in range(1,2):
    features_dict[f'Asset_{asset_id}'] = rc(f"/Users/ryanlucas/Desktop/G-Research/Data/Asset-{asset_id}_features.csv")

In [4]:
def train_and_forecast_AR1(data, ar_order= 1):

    model = ARIMA(data, order=(
        ar_order, 0, 0)).fit(method="yule_walker")

    forecasts = model.forecast(1)

    return forecasts

In [5]:
def ensemble_EN_AL(t, v0, functional_sets, forecast_df, p_norm_df):

    # Step 1: Declare T_0.
    T_0 = np.arange(t - v0 + 1, t+1)

    minimising_model_count = create_value_dict(functional_sets)

    # Re-write starts here
    # Step 2.
    for s in T_0:

        model_with_min_loss = p_norm_df.loc[s].idxmin(axis=1)

        minimising_model_count[model_with_min_loss] += 1

    # Re-write finishes here
    # Step 3: Calculate p^*_t as the empirical distribution of h^*_s.
    weights = {model: count/len(T_0)
               for model, count in minimising_model_count.items()}

    # Step 4: Produce and save the ensembled forecast and its associated ensemble weights.
    # Try removing loop and see if results are same. i.e. forecast_df.loc[t].
    forecasts_candidates = [
        np.array(forecast_df.loc[t, model]).transpose() for model in functional_sets]
    ensembled_forecasts = np.dot(list(weights.values()), forecasts_candidates)

    return weights, ensembled_forecasts

In [None]:
def create_value_dict(H):
    H_tilda = {}
    for model in H:
        H_tilda.update({model: 0})
    return H_tilda

In [None]:
def cross_validate(data, step_size, window_size, forecast_func = train_and_forecast_AR1):
    forecasts = []
    actual_values = []
    for integer_position in tqdm(range(0, len(data)-window_size, step_size), position = 0, leave = True):
                        
        train_data = data.iloc[integer_position: integer_position+window_size]

        forecasts.append(float(forecast_func(train_data)))
        actual_value = data.iloc[integer_position+window_size+1]
        actual_values.append(float(actual_value))

    correlation, _ = pearsonr(forecasts, actual_values) 
    return correlation

In [32]:
window_size_dict = {}

for window_size in [2000, 3000, 5000, 10000, 20000, 40000, 100000, 500000]:
    correlation = cross_validate(df_dict['Asset_1']["Target"].fillna(method = 'bfill').fillna(method = 'ffill'),10000, window_size)
    window_size_dict[window_size] = correlation[0]

100%|██████████| 196/196 [00:07<00:00, 25.51it/s]
100%|██████████| 196/196 [00:09<00:00, 20.26it/s]
100%|██████████| 196/196 [00:14<00:00, 13.63it/s]
100%|██████████| 195/195 [00:26<00:00,  7.29it/s]
100%|██████████| 194/194 [00:48<00:00,  4.04it/s]
100%|██████████| 192/192 [01:42<00:00,  1.87it/s]
100%|██████████| 186/186 [03:58<00:00,  1.28s/it]
100%|██████████| 146/146 [16:33<00:00,  6.81s/it]


In [45]:
df(window_size_dict, index = ["Correlation"]).transpose()

Unnamed: 0,Correlation
2000,0.624005
3000,0.692864
5000,0.750441
10000,0.775504
20000,0.774815
40000,0.777776
100000,0.783638
500000,0.815182
