In [1]:
import pandas as pd
import numpy as np
from solar.backtesting import Backtesting
import pycaret
from pycaret.classification import *

In [2]:
raw_df = pd.read_csv('TIMESERIES_all_monthly_enriched_no_dairyntel.csv').ffill()
raw_df['date'] = pd.to_datetime(raw_df.apply(lambda x: str(x['date'])[:10],axis=1))
target_var = "ZuivelNL - Dutch SMP food quotation"
# target_var = "ZuivelNL - Dutch butter quotation"

In [3]:
def remove_invalid_features(df, min_num_valid=100):
    v_check = df.ffill().isna().sum() < len(df) - min_num_valid 
    v_feats = v_check[v_check].index.tolist()
    return [f for f in v_feats if len(df[f].unique()) > 1]

def prepare_data(time_horizon, target_var):
    input_df = raw_df.copy() 
    input_df['Target'] = -(input_df[target_var].diff(-time_horizon)>=0)
    valid_feats = remove_invalid_features(input_df, 99+time_horizon)
    data = input_df[valid_feats].dropna()
    data['Target'] = data['Target'].astype('int')
    return data

In [11]:
def run_pycaret(time_horizon,params):
    data = prepare_data(time_horizon, target_var)
    exp_clf = setup(data, **params)
#     best = compare_models(blacklist = ['catboost','gbc','ada','qda','ridge','svm'])
    best = create_model('lr')
    print(best)
    best_model = tune_model(best,n_iter=100)
    feature_list = exp_clf[0].columns
    bt_clf = Backtesting(data, best_model, time_horizon, feature_list, max_iterations=36, target_var=target_var, task = 'classification')
    return best_model, feature_list, bt_clf.run(train_length=60,test_length=1)[1]

In [12]:
params = {
    'target': 'Target',
    'train_size':0.9,
    'normalize' : True,
    'silent':True,
    'remove_multicollinearity': True,
    'feature_selection':True,
    'feature_selection_threshold':0.005,
    }
acc_df = pd.DataFrame()
models = {}
feature_list = {}
for time_horizon in [1]:
    model, features, acc = run_pycaret(time_horizon, params)
    models[time_horizon] = model
    feature_list[time_horizon] = features
    acc_df = acc_df.append(acc)
    print(f'time_horizon:{time_horizon}   bt_acc:{acc.backtesting_accuracy[0]}')
acc_df

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6667,0.75,1.0,0.625,0.7692,0.2703,0.3953
1,0.8889,1.0,0.8,1.0,0.8889,0.7805,0.8
2,0.8889,1.0,0.8,1.0,0.8889,0.7805,0.8
3,0.8889,1.0,0.8,1.0,0.8889,0.7805,0.8
4,0.8889,1.0,0.75,1.0,0.8571,0.7692,0.7906
5,0.8889,0.95,1.0,0.8,0.8889,0.7805,0.8
6,0.6667,0.8,0.75,0.6,0.6667,0.3415,0.35
7,0.8889,1.0,0.75,1.0,0.8571,0.7692,0.7906
8,0.6667,0.55,0.5,0.6667,0.5714,0.3077,0.3162
9,0.5556,0.85,0.75,0.5,0.6,0.1429,0.1581


time_horizon:1   bt_acc:0.6944444444444444


Unnamed: 0,time_horizon,task,backtesting_accuracy,sliding,backtesting_iterations,backtesting_training_length,backtesting_test_length,backtesting_end_feature_date,backtesting_start_feature_date,backtesting_end_target_date,backtesting_start_target_date
0,1,classification,0.694444,False,36,60,1,2020-05-01,2017-06-01,2020-06-01,2017-07-01


In [23]:
from tqdm import tqdm_notebook as tqdm
for i in tqdm(range(2)):
    for threshold in [0.001]:
        params = {
        'target': 'Target',
        'train_size':0.9,
        'normalize' : True,
        'silent':True,
        'feature_selection':True,
        'feature_selection_threshold':threshold,
        'verbose':False
        }
        best_acc = 0
        best_model = {}
        best_features = {}
        model, features, acc_df = run_pycaret(time_horizon, params)
        acc = acc_df.backtesting_accuracy[0]
        print(f'time_horizon:{time_horizon} bt_acc:{acc} threshold:{threshold} num_features:{len(features)}')
        if acc > best_acc:
            best_acc = acc
            best_features = features
            best_model = model

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8889,1.0,0.75,1.0,0.8571,0.7692,0.7906
1,0.6667,0.65,0.75,0.6,0.6667,0.3415,0.35
2,0.6667,0.8,0.5,0.6667,0.5714,0.3077,0.3162
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.8889,0.9,1.0,0.8,0.8889,0.7805,0.8
5,0.6667,0.7,0.75,0.6,0.6667,0.3415,0.35
6,0.5556,0.7,0.6,0.6,0.6,0.1,0.1
7,0.6667,0.6,0.6,0.75,0.6667,0.3415,0.35
8,0.6667,0.75,0.6,0.75,0.6667,0.3415,0.35
9,0.6667,0.7,0.6,0.75,0.6667,0.3415,0.35


time_horizon:1 bt_acc:0.7222222222222222 threshold:0.001 num_features:8



In [24]:
best_features

Index(['Sossna MM Konz S_pct_yoy', 'US milk supply_4m_lag',
       'Ratio_Butter_Cheese_9m_lag',
       'ZuivelNL - Dutch SMP feed quotation_diff1',
       'ARG milk supply_diff1_yoy', 'NZ stocks-to-use_diff1',
       'USDA US butter price_diff1_yoy', 'USD/IRR_diff1'],
      dtype='object')

In [25]:
best_model

LogisticRegression(C=0.082, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=7104, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
bt_clf = Backtesting(data, best_model, time_horizon, feature_list, max_iterations=36, target_var=target_var, task = 'classification')