# Imports

In [None]:
from pathlib import Path
import re

import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.formula.api import ols

from more_itertools import powerset

from constants import DataSplit, Model, Metric, DATASET_SYMBOLS, CONSENSUS_BASELINES

# Function & Parameter Definitions

In [None]:
# Notebook-wide settings
data_dir = r'./out'      # directory that is searched for results.csv files
concat_results = True    # True: also pick up *results.csv files in sub-directories
alpha = 0.05             # significance threshold for the effect-reduction loop
latex_output = False     # True: print tables as LaTeX instead of displaying them

# Aggregation handed to every pivot table: a single "mean \pm std" LaTeX cell,
# or separate mean and std columns for on-screen display.
if latex_output:
    pivot_agg_func = lambda x: f'${np.mean(x):.3f} \\pm {np.std(x):.3f}$'
else:
    pivot_agg_func = [np.mean, np.std]

# These functions are used to clean up the ANOVA printouts
def clean_effect_name(name):
    """Abbreviate an effect name to the initials of its double-quoted factors.

    Every double-quoted substring contributes its first character and the
    characters are joined with ':' (``Q("Bonds"):Q("Forex")`` -> ``B:F``).
    Names that contain no double quote are returned unchanged.
    """
    if '"' not in name:
        return name

    quoted_factors = re.findall(r'"([^"]*)"', name)
    return ':'.join(factor[0] for factor in quoted_factors)

def fix_ordering(df):
    """Return df with column level 1 arranged in the declaration order of Metric."""
    metric_order = [metric.value for metric in Metric]
    # Index.reindex returns (new_index, indexer); only the new index is needed
    ordered_columns, _ = df.columns.reindex(metric_order, level=1)
    return df.reindex(columns=ordered_columns)

def get_style(df):
    """Return a Styler for df with hline-based table rules.

    The top and bottom rules are a single hline and the mid rule is a double
    hline.

    Args:
        df: Object exposing a pandas-style ``.style`` accessor.

    Returns:
        Whatever ``df.style.set_table_styles`` returns (the configured Styler).
    """
    return df.style.set_table_styles([
        {'selector': 'toprule', 'props': ':hline;'},
        # raw string: "\h" is not a valid escape sequence in a normal literal
        {'selector': 'midrule', 'props': r':hline\hline;'},
        {'selector': 'bottomrule', 'props': ':hline;'}
    ])

def get_latex(df, name):
    """Render an effects table as LaTeX source.

    Args:
        df: Frame with two-level columns; only the 'Coefficient' and 'PR(>F)'
            entries of the second level are kept ('PR(>F)' is shown as
            'P-value'). An index entry 'Residual/Intercept' is shown as
            'Intercept'.
        name: Caption of the generated table.

    Returns:
        The LaTeX string, post-processed so that the header cell reads
        'Effect', NaN cells read '-', and the 'Intercept' row is preceded by a
        dashed rule.
    """
    df = df.loc[:, (slice(None), ['Coefficient', 'PR(>F)'])]
    df = df.rename(index={'Residual/Intercept': 'Intercept'}, columns={'PR(>F)': 'P-value'})
    df = df.sort_index(axis=1, level=[0, 1])

    s = get_style(df)
    # NOTE(review): the formatter keys use the Metric members themselves, whereas the
    # analysis tables are keyed by metric.value; this only matches if members compare
    # and hash equal to their values -- confirm against constants.Metric.
    s.format({(metric, col): f'${{:.{precision}f}}$' for metric in Metric for col, precision in {'Coefficient': 5, 'P-value': 3}.items()})

    # one 'rc|' column pair (coefficient, p-value) per metric
    latex = s.to_latex(column_format='|l|' + 'rc|' * (len(df.columns) // 2), caption=name, multicol_align='|c|', position='htbp', position_float='centering')

    # Backslashes are written explicitly: "\h" is not a valid escape sequence.
    return (
        latex
        .replace('\n & Coefficient', '\nEffect & Coefficient')
        .replace('$nan$', '-')
        .replace('\\\\\nIntercept', '\\\\ \\hdashline\nIntercept')
        # drop the dashed rule again where it would directly follow an \hline
        .replace(r'\hline \hdashline', r'\hline')
    )

# Data Loading

In [None]:
# Load the result files found under data_dir into one frame: every file whose
# name ends in results.csv at any depth when concat_results is set, otherwise
# only data_dir/results.csv.
path = Path(data_dir)

pattern = rf'{"**/*" if concat_results else ""}results.csv'

# glob() yields paths in arbitrary order; sort so the row order is reproducible
result_files = sorted(path.glob(pattern))
if not result_files:
    # pd.concat would otherwise fail with a generic "No objects to concatenate"
    raise FileNotFoundError(f'no files matching {pattern!r} found under {path}')

data = pd.concat([pd.read_csv(filename, index_col=0) for filename in result_files], ignore_index=True)
display(data)

# Asset presence analysis (model-averaged, baseline models and training data excluded)

In [None]:
# Test-split rows of every model that is not one of the four baselines
is_baseline = (
    data[Model.RANDOM_BASELINE]
    | data[Model.CONSTANT_BASELINE]
    | data[Model.PREVIOUS_BASELINE]
    | data[Model.CONSENSUS_BASELINE]
)
df = data.loc[data[DataSplit.TEST] & ~is_baseline]

metrics = list(Metric)
asset_types = list(DATASET_SYMBOLS.keys())

# One table per asset type (rows: asset absent / present), random-data runs excluded
real_rows = df.loc[~df['Random']]
pivot = [
    real_rows.pivot_table(values=metrics, index=asset_type, aggfunc=pivot_agg_func)
    for asset_type in asset_types
]

# Runs with no additional asset type and no random data, grouped by the 'SPY' column
spy_only_rows = df[~df[asset_types + ['Random']].any(axis=1)]
pivot.append(spy_only_rows.pivot_table(values=metrics, index='SPY', aggfunc=pivot_agg_func))

# Runs on random data
random_rows = df.loc[df['Random']]
pivot.append(random_rows.pivot_table(values=metrics, index='Random', aggfunc=pivot_agg_func))

# Stack the tables, keyed by the column each one was grouped on
table_names = [tab.index.name for tab in pivot]
pivot = pd.concat(pivot, keys=table_names, names=['Asset Type', 'Presence'])
pivot = pivot.rename(index={'SPY': 'SPY-Only', 'Random': 'Random Data'})
pivot = pivot.reindex([True, False], level=1)

if latex_output:
    print(get_style(pivot).to_latex(column_format='|lc|ccc|', position='htbp', position_float='centering'))
else:
    display(fix_ordering(pivot))

# Asset combinations (model-averaged, baseline models and training data excluded)

In [None]:
# Test-split rows of every model that is not one of the four baselines
is_baseline = (
    data[Model.RANDOM_BASELINE]
    | data[Model.CONSTANT_BASELINE]
    | data[Model.PREVIOUS_BASELINE]
    | data[Model.CONSENSUS_BASELINE]
)
df = data.loc[data[DataSplit.TEST] & ~is_baseline].copy()

# Turn each presence flag into a label fragment: the asset type's initial, or nothing
asset_types = list(DATASET_SYMBOLS.keys())
for asset_type in asset_types:
    initial = asset_type[0]
    df[asset_type] = df[asset_type].map({True: initial, False: ''})

df['Random'] = df['Random'].map({True: 'Random Data', False: ''})

# Concatenate the fragments of a row into its combination label
label_columns = asset_types + ['Random']
df['Asset Combination'] = df[label_columns].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

pivot = df.pivot_table(values=list(Metric), index='Asset Combination', aggfunc=pivot_agg_func)

# Order rows by the powerset of initials, then SPY-only and random data. The empty
# label is renamed to 'SPY-Only' first; the '' entry that the powerset puts back
# into the ordering is dropped again afterwards.
initials = ''.join(asset_type[0] for asset_type in asset_types)
row_order = [''.join(subset) for subset in powerset(initials)] + ['SPY-Only', 'Random Data']
pivot = pivot.rename(index={'': 'SPY-Only'}).reindex(row_order).drop(index='')

if latex_output:
    print(get_style(pivot).to_latex(column_format='|l|ccc|', position='htbp', position_float='centering'))
else:
    display(fix_ordering(pivot))

# Model Performance (dataset averaged, random data excluded)
## Out-sample

In [None]:
# Random-data runs are excluded; only test-split rows are aggregated
df = data.loc[~data['Random']]
test_rows = df.loc[df[DataSplit.TEST]]

# One table per model flag column
metrics = list(Metric)
pivot = [
    test_rows.pivot_table(values=metrics, index=model, aggfunc=pivot_agg_func)
    for model in Model
]

pivot = pd.concat(pivot, keys=[tab.index.name for tab in pivot], names=['Model'])

# Index level 1 holds the model flag itself: keep the rows where it is set, then drop the level
model_used = pivot.index.get_level_values(1)
pivot = pivot.loc[model_used].droplevel(1)

if latex_output:
    print(get_style(pivot).to_latex(column_format='|l|ccc|', position='htbp', position_float='centering'))
else:
    display(fix_ordering(pivot))

## In-sample

In [None]:
# Random-data runs are excluded; only rows outside the test split are aggregated
df = data.loc[~data['Random']]
train_rows = df.loc[~df[DataSplit.TEST]]

# One table per model flag column
metrics = list(Metric)
pivot = [
    train_rows.pivot_table(values=metrics, index=model, aggfunc=pivot_agg_func)
    for model in Model
]

pivot = pd.concat(pivot, keys=[tab.index.name for tab in pivot], names=['Model', 'used'])

# The 'used' level holds the model flag itself: keep the rows where it is set, then drop the level
model_used = pivot.index.get_level_values(1)
pivot = pivot.loc[model_used].droplevel(1)

if latex_output:
    print(get_style(pivot).to_latex(column_format='|l|ccc|', position='htbp', position_float='centering'))
else:
    display(fix_ordering(pivot))

# Statistical analysis of factor effects (random data baseline excluded)

In [None]:
def _with_coefficients(fit, table):
    """Join the fitted coefficients onto an ANOVA table.

    The 'Residual' row of the table and the 'Intercept' coefficient are both
    relabelled 'Residual/Intercept' so that they share a single row.
    """
    table = table.rename(index={'Residual': 'Residual/Intercept'})
    coefs = fit.params.rename(index={'Intercept': 'Residual/Intercept'})
    coefs.name = 'Coefficient'
    return table.join(coefs)


all_df = data.loc[data[DataSplit.TEST] & ~data['Random']]
all_df = all_df.replace({True: 1, False: -1}) # required to get coefficients (for identifying direction)

# Row order of the result tables: every non-empty combination of asset-type initials, then the intercept row
index_ordering = [':'.join(c) for c in powerset(''.join(asset_type[0] for asset_type in DATASET_SYMBOLS.keys()))][1:] + ['Residual/Intercept']

for model in Model:
    # all_df no longer holds booleans after the recoding above, so the row mask comes from the original frame
    df = all_df.loc[data[model]]

    to_join = {
        'full': {},
        'reduced': {}
    }

    any_reduced = False

    if not latex_output:
        print(f'Analyzing {model}...')

    for metric in Metric:
        if model in CONSENSUS_BASELINES and metric is Metric.ROC_AUC:
            continue

        # Full factorial model: all main effects and interactions of the asset-type factors
        relation = f'Q("{metric}") ~ ' + ' * '.join(['Q("'+asset_type+'")' for asset_type in DATASET_SYMBOLS.keys()])
        glm = ols(relation, data=df).fit()
        aov = sm.stats.anova_lm(glm, typ=1)

        to_join['full'][metric.value] = _with_coefficients(glm, aov)

        if not latex_output:
            print(f'\nReducing over {metric}...')

        no_reductions = True

        # while non-significant effects, remove the least significant effect and associated interactions and refit effects model
        while (aov['PR(>F)'] > alpha).any():
            no_reductions = False
            rem_effect = aov['F'].idxmin()
            if not latex_output:
                # report the p-value of the effect that is actually removed
                print(f'removing effect {clean_effect_name(rem_effect)} (p={aov.loc[rem_effect, "PR(>F)"]:.3f})')
            # subtract the effect itself and every interaction that contains all of its factors
            for effect in aov.index:
                if all(main_effect in effect.split(':') for main_effect in rem_effect.split(':')):
                    relation += f' - {effect}'

            glm = ols(relation, data=df).fit()
            aov = sm.stats.anova_lm(glm, typ=1)

        if no_reductions:
            if not latex_output:
                print('no effects removed')
        else:
            any_reduced = True

        to_join['reduced'][metric.value] = _with_coefficients(glm, aov)


    full_model = pd.concat(to_join['full'], axis=1)
    full_model = full_model.set_index(full_model.index.map(clean_effect_name)).reindex(index_ordering)
    if latex_output:
        # raw string: "\&" is not a valid escape sequence in a normal literal
        print(get_latex(full_model, '{} (Full{})'.format(model, r" \& Reduced" if not any_reduced else "")))
    else:
        print(f'\n{model} (Full{" & Reduced" if not any_reduced else ""}):')
        display(full_model)

    # The reduced table is only shown when at least one metric lost an effect
    if any_reduced:
        reduced_model = pd.concat(to_join['reduced'], axis=1)
        reduced_model = reduced_model.set_index(reduced_model.index.map(clean_effect_name)).reindex(index_ordering)
        reduced_model = reduced_model.dropna(how='all')
        if latex_output:
            print(get_latex(reduced_model, f'{model} (Reduced)'))
        else:
            print(f'\n{model} (Reduced):')
            display(reduced_model)