# Analyse performance for different datasets and retraining strategies

- Dataset generation
    - Steady
    - Distribution Shift
    - Linear Coefficients
    - Black Swan
- Model retraining
    - None
    - Whole Dataset
    - Window
- Training Set Analysis
    - Standard deviation difference (normalize features)
    - Feature importance difference



In [1]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
from plotly.express import line, box

In [2]:
import sys

# Make the project's source directory importable. The entry is a relative
# path, so it resolves against the current working directory.
# NOTE(review): assumes the notebook is run from its own folder - confirm.
sys.path.append('../src')

# Project-local helpers (presumably the modules living in ../src).
from feature_generation import Importance_evo, FeatureGenerator, target_fcn
from model_tuning import ModelTrainer, neg_mse
from analysis import agg_feature, get_feature_importance, get_feature_cols

In [3]:
# parameters

# step count handed to FeatureGenerator.generate
n_steps = 100

# step count handed to ModelTrainer.train_models
training_steps = 20

# regression model used for the retraining experiments
model = XGBRegressor()

# feature generation settings, one entry per scenario.
# Every scenario shares seasonal=0, n_disturbances=0 and black_swan_impact=0.75;
# only the values listed in the table below vary.
fg_settings = {
    scenario: {
        'importance_evo': Importance_evo(const=const, linear=linear, seasonal=0),
        'dist_drift': drift,
        'n_disturbances': 0,
        'n_black_swans': swans,
        'black_swan_impact': 0.75,
    }
    # columns: scenario name, const, linear, dist_drift, n_black_swans
    for scenario, const, linear, drift, swans in [
        ('Steady', 10, 0, False, 0),
        ('Linear Coefs', 0, 10, False, 0),
        ('Dist Shift', 10, 0, True, 0),
        ('Black Swan', 10, 0, False, 1),
    ]
}

# model retraining settings, one entry per strategy.
# 'model_selection' is 'new' for every strategy.
retraining_settings = {
    strategy: {'data_selection': selection, 'retrain_trigger': trigger, 'model_selection': 'new'}
    # columns: strategy name, data_selection, retrain_trigger
    for strategy, selection, trigger in [
        ('None', 'all', 'off'),
        ('All Data', 'all', 'steps'),
        ('Window', 'window', 'steps'),
    ]
}

In [4]:
# generate features: one dataframe per scenario, keyed by scenario name
features = {}
for scenario_name, scenario_cfg in fg_settings.items():
    # configure a fresh generator from the scenario's settings
    generator = FeatureGenerator(
        imp_evo=scenario_cfg['importance_evo'],
        dist_drift=scenario_cfg['dist_drift'],
        n_disturbances=scenario_cfg['n_disturbances'],
        n_black_swans=scenario_cfg['n_black_swans'],
        black_swan_impact=scenario_cfg['black_swan_impact'],
    )

    # run the generation for n_steps with target_fcn
    generator.generate(n_steps, target_fcn)

    # store the generated data as a dataframe
    features[scenario_name] = generator.to_df()

In [5]:
# train models: every retraining strategy is run on every scenario
df_lst = []
for retraining_name, retraining_setting in retraining_settings.items():
    for fg_name, fg_df in features.items():
        # the setting's keys (data_selection, retrain_trigger, model_selection)
        # map one-to-one onto the ModelTrainer keyword arguments
        trainer = ModelTrainer(**retraining_setting)
        train_stats_df = trainer.train_models(fg_df, training_steps, model, neg_mse)

        # MSE between target and best prediction, computed separately per step
        mse_per_step = train_stats_df.groupby('step').apply(
            lambda grp: mean_squared_error(grp['target'], grp['best_pred']),
            include_groups=False,
        )

        # tag the per-step result with its scenario and retraining strategy
        df_lst.append(
            mse_per_step
            .rename('mse')
            .reset_index()
            .assign(scenario=fg_name, retraining=retraining_name)
        )

# stack all runs into a single long-format dataframe
eval_df = pd.concat(df_lst, ignore_index=True)
eval_df.head()

Unnamed: 0,step,mse,scenario,retraining
0,0,0.000115,Steady,
1,1,0.000152,Steady,
2,2,0.000136,Steady,
3,3,0.000119,Steady,
4,4,0.000114,Steady,


In [24]:
# MSE per step for the 'Steady' scenario, one line per retraining strategy
plot_df = eval_df[eval_df['scenario'].eq('Steady')]
fig = line(
    plot_df,
    x='step',
    y='mse',
    color='retraining',
    title='Steady Scenario',
).update_layout(
    xaxis_title='Step',
    yaxis_title='MSE',
    legend_title_text='Retraining',
)
fig.show()

In [25]:
# MSE per step for the 'Dist Shift' scenario, one line per retraining strategy
plot_df = eval_df[eval_df['scenario'].eq('Dist Shift')]
fig = line(
    plot_df,
    x='step',
    y='mse',
    color='retraining',
    title='Distribution Shift Scenario',
).update_layout(
    xaxis_title='Step',
    yaxis_title='MSE',
    legend_title_text='Retraining',
)
fig.show()

In [26]:
# MSE per step for the 'Linear Coefs' scenario, one line per retraining strategy
plot_df = eval_df[eval_df['scenario'].eq('Linear Coefs')]
fig = line(
    plot_df,
    x='step',
    y='mse',
    color='retraining',
    title='Linear Coefficients Scenario',
).update_layout(
    xaxis_title='Step',
    yaxis_title='MSE',
    legend_title_text='Retraining',
)
fig.show()

In [27]:
# MSE per step for the 'Black Swan' scenario, one line per retraining strategy
plot_df = eval_df[eval_df['scenario'].eq('Black Swan')]
fig = line(
    plot_df,
    x='step',
    y='mse',
    color='retraining',
    title='Black Swan Scenario',
).update_layout(
    xaxis_title='Step',
    yaxis_title='MSE',
    legend_title_text='Retraining',
)
fig.show()

In [10]:
# analyse training data: pull out the two scenario dataframes used in the comparison
dist_drift_df, lin_coef_df = (features[name] for name in ('Dist Shift', 'Linear Coefs'))

In [11]:
# define steps per set as [first step, last step] pairs
first_steps, second_steps = [0, 9], [10, 19]

In [12]:
# aggregate each scenario over the two step ranges and label the result with its dataset name
dist_drift_agg_df = agg_feature(dist_drift_df, first_steps, second_steps).assign(dataset='Distribution Drift')
lin_coef_agg_df = agg_feature(lin_coef_df, first_steps, second_steps).assign(dataset='Linear Coefficients')

# combine dataframes
agg_df = pd.concat(
    [dist_drift_agg_df, lin_coef_agg_df],
    ignore_index=True,
)

agg_df.head()

Unnamed: 0,feature,mean_1,std_1,mean_2,std_2,mean_diff,std_diff,dataset
0,feature_0,0.493412,0.159481,0.441654,0.152272,-0.051757,-0.007209,Distribution Drift
1,feature_1,0.455728,0.137839,0.408227,0.144593,-0.047501,0.006754,Distribution Drift
2,feature_2,0.485214,0.150871,0.458013,0.138662,-0.027201,-0.012209,Distribution Drift
3,feature_3,0.474982,0.15629,0.520072,0.153257,0.04509,-0.003033,Distribution Drift
4,feature_4,0.55369,0.141902,0.529812,0.145734,-0.023879,0.003832,Distribution Drift


In [28]:
# distribution of the per-feature std difference, one box per dataset
fig = box(
    agg_df,
    x='dataset',
    y='std_diff',
    title='Std Difference Distribution',
).update_layout(
    xaxis_title='Scenario',
    yaxis_title='Difference',
)
fig.show()

In [29]:
# analyse feature importance
model = XGBRegressor()

# scenario key in `features` -> label shown in the result
scenario_labels = {'Dist Shift': 'Distribution Drift', 'Linear Coefs': 'Linear Coefficients'}

df_lst = []
for scenario, label in scenario_labels.items():
    scenario_df = features[scenario]

    # one row per feature column, indexed by feature name
    scenario_fi_df = pd.DataFrame({'feature': get_feature_cols(scenario_df)}).set_index('feature')
    scenario_fi_df['scenario'] = label

    # importance_0 comes from first_steps, importance_1 from second_steps
    for i, (lower, upper) in enumerate([first_steps, second_steps]):
        in_range = scenario_df['step'].between(lower, upper)
        fi_df = (
            get_feature_importance(model, scenario_df.loc[in_range])
            .rename(columns={'importance': f'importance_{i}'})
            .set_index('feature')
        )
        scenario_fi_df = scenario_fi_df.join(fi_df, how='left')

    df_lst.append(scenario_fi_df.reset_index())

# stack both scenarios and compute the change in importance between the two step ranges
importance_df = pd.concat(df_lst, ignore_index=True).assign(
    importance_diff=lambda df: df['importance_1'] - df['importance_0']
)
importance_df.head()

Unnamed: 0,feature,scenario,importance_0,importance_1,importance_diff
0,feature_0,Distribution Drift,0.00215,0.001315,-0.000834
1,feature_1,Distribution Drift,0.033051,0.021789,-0.011262
2,feature_2,Distribution Drift,0.019814,0.023561,0.003748
3,feature_3,Distribution Drift,0.225589,0.294468,0.068879
4,feature_4,Distribution Drift,0.001732,0.002412,0.00068


In [30]:
# distribution of the per-feature importance difference, one box per scenario
fig = box(
    importance_df,
    x='scenario',
    y='importance_diff',
    title='Feature Importance Difference',
).update_layout(
    xaxis_title='Scenario',
    yaxis_title='Difference',
)
fig.show()