In [1]:
import pandas as pd
import numpy as np
from solar.backtesting import Backtesting
import pycaret
from pycaret.regression import *
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the time-series table and forward-fill missing cells so every row
# carries the most recent known value of each column.
raw_df = pd.read_csv('TIMESERIES_all_monthly_enriched_no_dairyntel.csv').ffill()
# Parse only the first 10 characters of each date string
# (presumably 'YYYY-MM-DD' followed by a time part -- confirm against the CSV).
# Vectorised string slicing replaces a row-wise apply over the whole frame.
raw_df['date'] = pd.to_datetime(raw_df['date'].astype(str).str[:10])
# Column whose future change is modelled by the rest of the notebook.
target_var = "ZuivelNL - Dutch SMP food quotation"
# Alternative target: "ZuivelNL - Dutch butter quotation"

In [3]:
def remove_invalid_features(df, min_num_valid=100):
    """Return the labels of the columns of ``df`` that are usable as features.

    A column is kept when both conditions hold:

    * after forward-filling, its NaN count is strictly below
      ``len(df) - min_num_valid`` (i.e. more than ``min_num_valid`` filled
      entries remain);
    * the un-filled column holds more than one distinct value
      (``Series.unique`` counts NaN as a value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are screened.
    min_num_valid : int, default 100
        Exclusive lower bound on the number of non-NaN entries after ffill.

    Returns
    -------
    list
        Column labels, in the frame's column order.
    """
    nan_budget = len(df) - min_num_valid
    nan_counts = df.ffill().isna().sum()
    return [
        column
        for column, n_missing in nan_counts.items()
        # Short-circuit: the uniqueness check only runs for columns that
        # already passed the NaN-count check.
        if n_missing < nan_budget and len(df[column].unique()) > 1
    ]

def prepare_data(time_horizon, target_var):
    """Build the modelling frame for one forecast horizon from ``raw_df``.

    Adds a ``Target`` column holding the change of ``target_var`` between a
    row and the row ``time_horizon`` positions later, keeps only the columns
    accepted by ``remove_invalid_features`` (called with
    ``99 + time_horizon``), and drops every row that still contains a NaN.

    Parameters
    ----------
    time_horizon : int
        Number of rows to look ahead (presumably months, given the monthly
        source file -- confirm).
    target_var : str
        Name of the column in ``raw_df`` whose future change is the target.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with ``Target`` cast to float.
    """
    # Series.diff(-h) yields x[t] - x[t+h]; negating gives x[t+h] - x[t].
    frame = raw_df.assign(Target=-(raw_df[target_var].diff(-time_horizon)))
    usable_columns = remove_invalid_features(frame, 99 + time_horizon)
    return (
        frame[usable_columns]
        .dropna()
        .assign(Target=lambda kept: kept['Target'].astype('float'))
    )

In [4]:
def run_pycaret(time_horizon,params):
    """Fit, tune and backtest a ridge regressor for one forecast horizon.

    Parameters
    ----------
    time_horizon : int
        Horizon passed to ``prepare_data`` and to ``Backtesting``.
    params : dict
        Keyword arguments forwarded unchanged to PyCaret's ``setup``.

    Returns
    -------
    tuple
        ``(tuned_model, feature_columns, accuracy_frame)`` where
        ``feature_columns`` are the columns of the first element returned by
        ``setup`` (presumably the transformed feature frame -- confirm for the
        installed PyCaret version) and ``accuracy_frame`` is element ``[1]``
        of ``Backtesting.run``.
    """
    dataset = prepare_data(time_horizon, target_var)
    experiment = setup(dataset, **params)
    # Alternative considered: compare_models(blacklist=['catboost']) instead
    # of fixing the estimator to ridge.
    ridge = create_model('ridge')
    print(ridge)
    tuned = tune_model(ridge, n_iter=100)
    selected_columns = experiment[0].columns
    backtest = Backtesting(
        dataset,
        tuned,
        time_horizon,
        selected_columns,
        max_iterations=36,
        target_var=target_var,
        task='regression',
    )
    run_output = backtest.run(train_length=60, test_length=1)
    return tuned, selected_columns, run_output[1]


def optimize(time_horizon, iterations = 10):
    """Run ``run_pycaret`` several times and keep the best-scoring attempt.

    Every attempt uses the same ``setup`` options; the attempts are ranked by
    the first ``backtesting_accuracy`` value of the returned accuracy frame.
    (Any variation between attempts presumably comes from randomness inside
    PyCaret -- the options themselves do not change.)

    Parameters
    ----------
    time_horizon : int
        Forecast horizon forwarded to ``run_pycaret``.
    iterations : int, default 10
        Number of attempts.

    Returns
    -------
    tuple
        ``(best_acc, best_features, best_model)``. If no attempt scores above
        0 the initial values ``(0, {}, {})`` are returned.
    """
    # Options forwarded to PyCaret's setup(); identical for every attempt.
    setup_options = dict(
        target='Target',
        train_size=0.9,
        normalize=True,
        silent=True,
        feature_selection=True,
        feature_selection_threshold=0.001,
        verbose=False,
    )
    top_acc, top_features, top_model = 0, {}, {}
    for _ in tqdm(range(iterations)):
        candidate, features, report = run_pycaret(time_horizon, setup_options)
        acc = report.backtesting_accuracy[0]
        print(f'time_horizon:{time_horizon} bt_acc:{acc} num_features:{len(features)}')
        # Strict comparison: on ties the earliest attempt is kept.
        if acc > top_acc:
            top_acc, top_features, top_model = acc, features, candidate
    return top_acc, top_features, top_model

In [5]:
# Search for the best ridge model at each horizon from 1 to 9 and store the
# winning model and its feature list, keyed by horizon.
best_models, best_features = {}, {}
for time_horizon in range(1, 10):
    acc, features, model = optimize(time_horizon)
    best_models[time_horizon], best_features[time_horizon] = model, features

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.2943,0.1147,0.3387,0.3834,0.1926,-0.313
1,0.1656,0.0517,0.2273,0.6333,0.1694,0.1933
2,0.2206,0.0687,0.2621,0.8525,0.1753,-2.0832
3,0.1761,0.0568,0.2383,0.8179,0.1762,0.3339
4,0.2803,0.0869,0.2948,0.464,0.1933,-8.1957
5,0.2931,0.12,0.3464,0.7567,0.2091,-4.1717
6,0.2391,0.1055,0.3248,0.7142,0.2072,-0.125
7,0.2602,0.1072,0.3274,0.0982,0.2199,-0.8231
8,0.3369,0.1385,0.3722,-0.127,0.2159,-0.3173
9,0.205,0.0584,0.2417,0.6377,0.1319,0.2082


time_horizon:9 bt_acc:0.5555555555555556 num_features:6



In [6]:
# Re-run the backtest for the model/feature set chosen at each horizon and
# stack the per-horizon accuracy rows into one summary table.
# The frames are collected in a list and concatenated once: growing a frame
# with DataFrame.append inside a loop is quadratic, and DataFrame.append was
# removed in pandas 2.0.
acc_frames = []
for time_horizon in [1,2,3,4,5,6,7,8,9]:
    data = prepare_data(time_horizon, target_var)
    bt_clf = Backtesting(data, best_models[time_horizon], time_horizon, best_features[time_horizon], max_iterations=36, target_var=target_var, task = 'regression')
    # run() returns a sequence; element [1] is the accuracy frame.
    acc_frames.append(bt_clf.run(train_length=60,test_length=1)[1])
acc_df = pd.concat(acc_frames)
acc_df

Unnamed: 0,time_horizon,task,backtesting_accuracy,sliding,backtesting_iterations,backtesting_training_length,backtesting_test_length,backtesting_end_feature_date,backtesting_start_feature_date,backtesting_end_target_date,backtesting_start_target_date
0,1,regression,0.666667,False,36,60,1,2020-04-01,2017-05-01,2020-05-01,2017-06-01
0,2,regression,0.75,False,36,60,1,2020-02-01,2017-03-01,2020-04-01,2017-05-01
0,3,regression,0.722222,False,36,60,1,2019-12-01,2017-01-01,2020-03-01,2017-04-01
0,4,regression,0.666667,False,36,60,1,2019-10-01,2016-11-01,2020-02-01,2017-03-01
0,5,regression,0.527778,False,36,60,1,2019-08-01,2016-09-01,2020-01-01,2017-02-01
0,6,regression,0.694444,False,36,60,1,2019-06-01,2016-07-01,2019-12-01,2017-01-01
0,7,regression,0.583333,False,36,60,1,2019-04-01,2016-05-01,2019-11-01,2016-12-01
0,8,regression,0.694444,False,36,60,1,2019-02-01,2016-03-01,2019-10-01,2016-11-01
0,9,regression,0.555556,False,36,60,1,2018-12-01,2016-01-01,2019-09-01,2016-10-01


In [7]:
# Show the selected feature names for every horizon as plain Python lists.
{horizon: list(columns) for horizon, columns in best_features.items()}

{1: ['US SMP consumption_1m_lag',
  'ZuivelNL - NZ WMP price_diff1',
  'US milk supply_4m_lag',
  'US milk supply_diff1_yoy',
  'ZuivelNL - Dutch SMP feed quotation_diff1',
  'Ratio_Butter_Cheese_10m_lag'],
 2: ['ARG milk supply_diff1_yoy',
  'GDT monthly price WMP_9m_lag',
  'US SMP consumption',
  'US milk supply_4m_lag',
  'ZuivelNL - US WMP price_1m_lag',
  'Brent crude oil price_diff1'],
 3: ['GDT monthly price WMP_9m_lag',
  'ZuivelNL - Dutch WMP quotation_RealV',
  'AUS ARG milk supply_pct_yoy',
  'USDA US NDM price_diff1',
  'ZuivelNL - US cheddar block price_12m_lag',
  'USD/NZD_11m_lag'],
 4: ['cheese_vs_smp_bucket_valorization_12m_lag',
  'cheese_vs_smp_bucket_valorization',
  'Chinese WMP stocks_9m_lag',
  'ZuivelNL - US cheddar block price_RealV',
  'USDA US cheddar price_3m_lag',
  'cheese_vs_smp_bucket_valorization_ra12m'],
 5: ['Sossna Industriesahne S',
  'cheese_vs_smp_bucket_valorization',
  'AUS ARG milk supply_RealV',
  'USDA US Whey price_RealV',
  'Ratio_SMP_Butt

In [8]:
# Display the model stored for each time horizon.
best_models

{1: Ridge(alpha=0.5710000000000001, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=True, random_state=8611, solver='auto', tol=0.001),
 2: Ridge(alpha=0.994, copy_X=True, fit_intercept=False, max_iter=None,
       normalize=False, random_state=4176, solver='auto', tol=0.001),
 3: Ridge(alpha=0.08700000000000001, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=True, random_state=3875, solver='auto', tol=0.001),
 4: Ridge(alpha=0.18, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=True, random_state=8563, solver='auto', tol=0.001),
 5: Ridge(alpha=0.989, copy_X=True, fit_intercept=False, max_iter=None,
       normalize=True, random_state=3748, solver='auto', tol=0.001),
 6: Ridge(alpha=0.977, copy_X=True, fit_intercept=False, max_iter=None,
       normalize=True, random_state=4517, solver='auto', tol=0.001),
 7: Ridge(alpha=0.163, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=True, random_state=8938, solver='auto', to

In [9]:
# Backtest the previously saved SMP models/features for every horizon and
# stack the per-horizon accuracy rows into one summary table.
from SMP_results import SMP_features,SMP_models

acc_frames = []
for time_horizon in [1,2,3,4,5,6,7,8,9]:
    data = prepare_data(time_horizon, target_var)
    # NOTE(review): the other cells in this notebook pass task='regression';
    # confirm that 'classification' is intended for the saved SMP models.
    bt_clf = Backtesting(data, SMP_models[time_horizon], time_horizon, SMP_features[time_horizon], max_iterations=36, target_var=target_var, task = 'classification')
    # Use a per-horizon name for the accuracy frame: unpacking straight into
    # `acc_df` overwrote the accumulated results on every iteration, and the
    # following `acc_df.append(acc_df)` only duplicated the latest frame.
    pred_df, horizon_acc_df = bt_clf.run(train_length=60,test_length=1)
    acc_frames.append(horizon_acc_df)
# Single concat instead of DataFrame.append (removed in pandas 2.0).
acc_df = pd.concat(acc_frames)
acc_df

IndentationError: unexpected indent (SMP_results.py, line 94)