# Imports

In [None]:
from pathlib import Path
from functools import reduce

import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.formula.api import ols

from more_itertools import powerset

from constants import DataSplit, Model, METRICS, DATASET_SYMBOLS

# Parameters

In [None]:
# Directory that is searched for result files.
data_dir: str = './out'
# True: gather every '*results.csv' found recursively below data_dir;
# False: read only data_dir/results.csv.
concat_results: bool = True
# p-value threshold above which an effect is treated as non-significant.
alpha: float = 0.05

# Data Loading

In [None]:
# Load every matching results file below `data_dir` into one DataFrame.
path = Path(data_dir)

# '**/*results.csv' searches recursively; 'results.csv' matches only the top-level file.
pattern = rf'{"**/*" if concat_results else ""}results.csv'

# Sort so the row order of `data` does not depend on filesystem enumeration order.
result_files = sorted(path.glob(pattern))
if not result_files:
    # pd.concat([]) would otherwise fail with the opaque "No objects to concatenate".
    raise FileNotFoundError(f'no files matching {pattern!r} found under {path.resolve()}')

# The first CSV column is the stored index; it is discarded in favour of a fresh RangeIndex.
data = pd.concat([pd.read_csv(filename, index_col=0) for filename in result_files], ignore_index=True)
display(data)

# Asset presence analysis (model-averaged, baseline models and training data excluded)

In [None]:
# Mean and standard deviation of every metric, grouped by the presence of each
# asset type. Only test-split rows of non-baseline models are considered.
df = data.loc[data[DataSplit.TEST] & ~data[Model.RANDOM_BASELINE] & ~data[Model.CONSTANT_BASELINE] & ~data[Model.PREVIOUS_BASELINE] & ~data[Model.CONSENSUS_BASELINE]]

metrics = list(METRICS)
asset_types = list(DATASET_SYMBOLS)
# String aliases instead of np.mean / np.std: pandas has deprecated passing the
# NumPy callables and will stop mapping them to its own (sample, ddof=1) versions.
aggfuncs = ['mean', 'std']

is_random = df['Random']
# Rows that use neither any additional asset type nor random data.
is_spy_only = ~df[asset_types + ['Random']].any(axis=1)

pivot = [
    df.loc[is_random].pivot_table(values=metrics, index='Random', aggfunc=aggfuncs),
    df.loc[is_spy_only].pivot_table(values=metrics, index='SPY', aggfunc=aggfuncs),
]

# One table per asset type (present vs. absent), random-data rows excluded.
non_random = df.loc[~is_random]
for asset_type in asset_types:
    pivot.append(non_random.pivot_table(values=metrics, index=asset_type, aggfunc=aggfuncs))

# Each table's index name (the grouping column) becomes the outer 'asset type' level.
pivot = pd.concat(pivot, keys=[tab.index.name for tab in pivot], names=['asset type','presence']).rename(index={'SPY':'SPY-Only', 'Random':'Random Data'})

display(pivot)

# Asset combinations (model-averaged, baselines and training data excluded)

In [None]:
# Mean and standard deviation of every metric per combination of asset types.
# Only test-split rows of non-baseline models are considered.
df = data.loc[data[DataSplit.TEST] & ~data[Model.RANDOM_BASELINE] & ~data[Model.CONSTANT_BASELINE] & ~data[Model.PREVIOUS_BASELINE] & ~data[Model.CONSENSUS_BASELINE]].copy()

# Label contributed by each flag column when it is True (empty string when False).
labels = {
    'forex': 'F',
    'bond': 'B',
    'index_futures': 'I',
    'commodities_futures': 'C',
    'Random': 'Random Data',
}
for column, label in labels.items():
    df[column] = df[column].map({True: label, False: ''})

# Concatenation order F, B, I, C matches the order produced by powerset('FBIC') below.
df['asset combination'] = df['forex'] + df['bond'] + df['index_futures'] + df['commodities_futures'] + df['Random']

# String aliases instead of np.mean / np.std: pandas has deprecated passing the
# NumPy callables and will stop mapping them to its own (sample, ddof=1) versions.
pivot = df.pivot_table(values=list(METRICS),
                       index='asset combination',
                       aggfunc=['mean', 'std'])

# Rows with no flag set have an empty combination string.
pivot = pivot.rename(index={'':'SPY-Only'})

# Every non-empty subset of the four asset codes, smallest subsets first;
# combinations absent from the data appear as all-NaN rows.
combinations = [''.join(subset) for subset in powerset('FBIC') if subset]
pivot = pivot.reindex(['Random Data', 'SPY-Only'] + combinations)

display(pivot)

# Model Performance (dataset averaged, random data excluded)
## Out-of-sample (test split)

In [None]:
# Per-model mean and standard deviation of every metric on the test split,
# random-data rows excluded.
df = data.loc[~data['Random']]

# Loop-invariant pieces, computed once instead of once per model.
test_rows = df.loc[df[DataSplit.TEST]]
metrics = list(METRICS)

# One table per model, grouped by that model's flag column.
# String aliases instead of np.mean / np.std: pandas has deprecated passing the
# NumPy callables and will stop mapping them to its own (sample, ddof=1) versions.
pivot = [test_rows.pivot_table(values=metrics, index=model, aggfunc=['mean', 'std']) for model in Model]

pivot = pd.concat(pivot, keys=[tab.index.name for tab in pivot], names=['model','used'])
# Level 1 ('used') carries the flag values; used as a boolean mask it keeps only
# the rows where the model was used (assumes the flag columns are boolean).
pivot = pivot.loc[pivot.index.get_level_values(1)].droplevel(1)

display(pivot)

## In-sample (non-test split)

In [None]:
# Per-model mean and standard deviation of every metric on the non-test rows,
# random-data rows excluded.
df = data.loc[~data['Random']]

# Loop-invariant pieces, computed once instead of once per model.
non_test_rows = df.loc[~df[DataSplit.TEST]]
metrics = list(METRICS)

# One table per model, grouped by that model's flag column.
# String aliases instead of np.mean / np.std: pandas has deprecated passing the
# NumPy callables and will stop mapping them to its own (sample, ddof=1) versions.
pivot = [non_test_rows.pivot_table(values=metrics, index=model, aggfunc=['mean', 'std']) for model in Model]

pivot = pd.concat(pivot, keys=[tab.index.name for tab in pivot], names=['model','used'])
# Level 1 ('used') carries the flag values; used as a boolean mask it keeps only
# the rows where the model was used (assumes the flag columns are boolean).
pivot = pivot.loc[pivot.index.get_level_values(1)].droplevel(1)

display(pivot)

# Statistical analysis of factor effects (per model, test split only, random data excluded)

In [None]:
def _show_anova(title, glm, aov):
    """Print *title*, then display the ANOVA table *aov* joined with the coefficients of *glm*.

    The 'Residual' row of the table and the 'Intercept' coefficient are both
    relabelled 'Residual/Intercept' so that they end up on the same row.
    """
    coefs = glm.params.rename(index={'Intercept':'Residual/Intercept'})
    coefs.name = 'coefs'
    print(title)
    display(aov.rename(index={'Residual':'Residual/Intercept'}).join(coefs))


# For every model and metric: fit a full factorial model on the four asset-type
# flags, then repeatedly drop the least significant effect until all remaining
# effects are significant at level `alpha`.
all_df = data.loc[data[DataSplit.TEST] & ~data['Random']]
all_df = all_df.replace({True: 1, False: -1}) # required to get coefficients (for identifying direction)

for model in Model:
    # Select on all_df's own flag column (recoded to 1 / -1 above) rather than
    # relying on index alignment with a mask taken from the larger `data` frame.
    df = all_df.loc[all_df[model] == 1]

    for metric in METRICS.keys():
        relation = f'Q("{metric}") ~ forex * bond * index_futures * commodities_futures'
        glm = ols(relation, data=df).fit()
        aov = sm.stats.anova_lm(glm, typ=1)

        _show_anova(f'{model} {metric} (full model):', glm, aov)

        # while non-significant effects, remove the least significant effect and associated interactions and refit effects model
        while (aov['PR(>F)'] > alpha).any():
            # Pick the term by its p-value so the removed effect and the reported
            # p always refer to the same row (the NaN of the residual row is skipped).
            least_significant = aov['PR(>F)'].idxmax()
            print(f'removing effect {least_significant} (p={aov.loc[least_significant, "PR(>F)"]:.3f})')

            # Drop the effect itself plus every interaction that contains all of its factors.
            removed_factors = set(least_significant.split(':'))
            for effect in aov.index:
                if removed_factors.issubset(effect.split(':')):
                    relation += f' - {effect}'

            glm = ols(relation, data=df).fit()
            aov = sm.stats.anova_lm(glm, typ=1)

        _show_anova(f'\n{model} {metric} (reduced model):', glm, aov)