In [2]:
import numpy as np
import pandas as pd

# import classes
from Tools import DateTimeSeriesSplit, Kraken

# model and metric for regression
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error as MAPE

# ==============================================================
# example 1: regression
# ==============================================================

# Seed NumPy's global generator so the synthetic dataset is reproducible.
seed_value = 23
np.random.seed(seed_value)

# Rows per part. With daily frequency starting 1990-01-01, 12052 rows end on
# 2022-12-30; the date range below is derived from this count, so changing
# `c` no longer requires hand-adjusting an end date to match.
c = 12052


def _make_part(n_rows, n_vars=9, start='1990-01-01'):
    """Build one synthetic part of the dataset.

    Columns ``var_1`` .. ``var_<n_vars>`` are filled, in that order, by one
    ``np.random.rand(n_rows)`` call each, so the sequence of draws from the
    global generator is the same as writing the assignments out by hand.
    A ``date`` column with ``n_rows`` consecutive days beginning at ``start``
    is appended last.

    Parameters
    ----------
    n_rows : int
        Number of rows (and of consecutive days) to generate.
    n_vars : int, default 9
        Number of uniform random feature columns.
    start : str, default '1990-01-01'
        First date of the daily range.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n_vars`` feature columns followed by ``date``.
    """
    part = pd.DataFrame()
    for i in range(1, n_vars + 1):
        part[f'var_{i}'] = np.random.rand(n_rows)
    # periods= (rather than a fixed end date) keeps the column length equal
    # to n_rows by construction.
    part['date'] = pd.date_range(start=start, periods=n_rows, freq='D')
    return part


def _base_target(part):
    """Return the dependency shared by all parts: var_1 + 3*var_2 - var_3**1.5."""
    return part['var_1'] + 3 * part['var_2'] - np.power(part['var_3'], 1.5)


# Part 1: target depends on var_1, var_2 and var_3 only.
X1 = _make_part(c)
y1 = _base_target(X1)

# Part 2: var_4 is added to the dependency.
X2 = _make_part(c)
y2 = _base_target(X2) + 2 * X2['var_4']

# Part 3: var_4 and var_5 are added to the dependency.
X3 = _make_part(c)
y3 = _base_target(X3) + 2 * X3['var_4'] + 4 * X3['var_5']

# Stack the parts row-wise. NOTE: every part covers the same calendar dates,
# and concat keeps each part's own 0..c-1 row labels, so both dates and index
# labels occur three times in X and y.
X = pd.concat([X1, X2, X3], axis=0)
y = pd.concat([y1, y2, y3], axis=0)

print("Regression dataset shape:", X.shape)

# Date column handed to the selector as `group_dt`, plus the date-based
# cross-validator that goes with it.
group_dt = X['date']
cv_datetime = DateTimeSeriesSplit(
    window=3000,
    n_splits=3,
    test_size=300,
    margin=0,
)

# Base estimator the selector will fit.
model_reg = LGBMRegressor(max_depth=3, verbosity=-1)

# Candidate features: every column except the date (and 'index_time', if present).
list_of_vars = [col for col in X.columns if col not in ('date', 'index_time')]

# Feature selector configured for a regression task scored with MAPE.
selector_reg = Kraken(
    estimator=model_reg,
    cv=cv_datetime,
    metric=MAPE,
    meta_info_name='example_regression',
    task_type='regression',
    greater_is_better=False,  # lower MAPE is better
    # NOTE(review): presumably the number of decimals used when comparing
    # scores -- confirm against the Kraken implementation.
    comparison_precision=2,
)

# Rank the candidate features; the result is read back from `rank_dict` below.
selector_reg.get_rank_dict(X, y, list_of_vars, group_dt)

ranked_items = list(selector_reg.rank_dict.items())
print("Rank dict (regression) top-5:", dict(ranked_items[:5]))

# Run the selection procedure using the ranking computed above.
best_vars_reg = selector_reg.get_vars(
    X,
    y,
    rank_dict=selector_reg.rank_dict,
    group_dt=group_dt,
    max_feature_search_rounds=10,
)
print("Selected vars (regression):", best_vars_reg)

# ==============================================================


Regression dataset shape: (36156, 10)
[get_rank_dict] Starting combined baseline evaluation and SHAP calculation...
Fold: 1/3 | Status: Done (0.19s)              | Fold Time:   0.19s | Total Time:    0.22s                              
Fold: 2/3 | Status: Done (0.19s)              | Fold Time:   0.19s | Total Time:    0.41s                              
Fold: 3/3 | Status: Done (0.19s)              | Fold Time:   0.19s | Total Time:    0.61s                              
------------------------------
[get_rank_dict] >> FINAL Baseline Performance (All Features)
    Mean CV Score: 1.16
    Fold Scores: [1.18 1.2  1.09]
[get_rank_dict] Completed calculation. Total time: 0.67 seconds.
Rank dict (regression) top-5: {'var_2': 1, 'var_5': 2, 'var_4': 3, 'var_3': 4, 'var_1': 5}
[get_vars] Evaluating initial feature set (if any)...
[get_vars] Starting feature selection procedure...
[get_vars] Starting from scratch (will check top 10 features first).

--- Starting Step: Selecting feature #1 (Ch