In [1]:
import pandas as pd
import numpy as np
from solar.backtesting import Backtesting
import pycaret
from pycaret.classification import *
from tqdm import tqdm_notebook as tqdm

In [2]:
raw_df = pd.read_csv('TIMESERIES_all_monthly_enriched_no_dairyntel.csv').ffill()
raw_df['date'] = pd.to_datetime(raw_df.apply(lambda x: str(x['date'])[:10],axis=1))
target_var = "ZuivelNL - Dutch SMP food quotation"
# target_var = "ZuivelNL - Dutch butter quotation"

In [3]:
def remove_invalid_features(df, min_num_valid=100):
    v_check = df.ffill().isna().sum() < len(df) - min_num_valid 
    v_feats = v_check[v_check].index.tolist()
    return [f for f in v_feats if len(df[f].unique()) > 1]

def prepare_data(time_horizon, target_var):
    input_df = raw_df.copy() 
    input_df['Target'] = -(input_df[target_var].diff(-time_horizon)>=0)
    valid_feats = remove_invalid_features(input_df, 99+time_horizon)
    data = input_df[valid_feats].dropna()
    data['Target'] = data['Target'].astype('int')
    return data

In [4]:
def run_pycaret(time_horizon,params):
    data = prepare_data(time_horizon, target_var)
    exp_clf = setup(data, **params)
#     best = compare_models(blacklist = ['catboost','gbc','ada','qda','ridge','svm'])
    best = create_model('lr')
    print(best)
    best_model = tune_model(best,n_iter=100)
    feature_list = exp_clf[0].columns
    bt_clf = Backtesting(data, best_model, time_horizon, feature_list, max_iterations=36, target_var=target_var, task = 'classification')
    return best_model, feature_list, bt_clf.run(train_length=60,test_length=1)[1]


def optimize(time_horizon, iterations = 10):
    best_acc = 0
    best_model = {}
    best_features = {}
    for i in tqdm(range(iterations)):
        params = {
        'target': 'Target',
        'train_size':0.9,
        'normalize' : True,
        'silent':True,
        'feature_selection':True,
        'feature_selection_threshold':0.001,
        'verbose':False
        }
        model, features, acc_df = run_pycaret(time_horizon, params)
        acc = acc_df.backtesting_accuracy[0]
        print(f'time_horizon:{time_horizon} bt_acc:{acc} num_features:{len(features)}')
        if acc > best_acc:
            best_acc = acc
            best_features = features
            best_model = model
    return best_acc, best_features, best_model

In [5]:
best_models = {}
best_features = {}
for time_horizon in [1,2,3,4,5,6,7,8,9]:
    acc, features, model = optimize(time_horizon)
    best_models[time_horizon] = model
    best_features[time_horizon] = features

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8,0.8,0.8,0.8,0.8,0.6,0.6
1,0.8,1.0,1.0,0.7143,0.8333,0.6,0.6547
2,0.7,0.88,0.8,0.6667,0.7273,0.4,0.4082
3,0.9,0.88,0.8,1.0,0.8889,0.8,0.8165
4,0.9,0.92,1.0,0.8333,0.9091,0.8,0.8165
5,0.8,0.7917,0.6667,1.0,0.8,0.6154,0.6667
6,0.9,0.875,0.8333,1.0,0.9091,0.8,0.8165
7,0.9,0.875,1.0,0.8571,0.9231,0.7826,0.8018
8,0.7778,0.8,0.8,0.8,0.8,0.55,0.55
9,0.7778,0.75,1.0,0.7143,0.8333,0.5263,0.5976


time_horizon:9 bt_acc:0.6111111111111112 num_features:6



In [None]:
acc_df = pd.DataFrame()
for time_horizon in [1,2,3,4,5,6,7,8,9]:
    data = prepare_data(time_horizon, target_var)
    bt_clf = Backtesting(data, best_models[time_horizon], time_horizon, best_features[time_horizon], max_iterations=36, target_var=target_var, task = 'classification')
    acc_df = acc_df.append(bt_clf.run(train_length=60,test_length=1)[1])
acc_df

In [None]:
best_features

In [None]:
best_models