In [None]:
# Install Optuna into the environment of the running kernel.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install optuna

## Training for Y1

In [1]:
# Number of Optuna trials for the initial study over the full feature set
n_trials = 10
import random
import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
import shap
import pickle
from sklearn.metrics import mean_squared_error

# Function to prepare data (split into training and evaluation sets)
def prepare_data(X_section, y_section, train_size, eval_size):
    """Cut features and targets into one training block and four evaluation blocks.

    Rows are taken positionally and in order: the first ``train_size`` rows
    form the training set, and each of the following four consecutive runs of
    ``eval_size`` rows forms one evaluation set. Any rows after
    ``train_size + 4 * eval_size`` are not returned.

    Parameters
    ----------
    X_section, y_section : pandas DataFrame/Series or array-like
        Data to split. pandas objects are sliced with ``.iloc`` so the pieces
        keep their type (and columns); anything else is sliced as an ndarray.
    train_size : int
        Number of leading rows assigned to the training set.
    eval_size : int
        Number of rows in each evaluation set.

    Returns
    -------
    X_train, X_eval, y_train, y_eval
        ``X_eval`` and ``y_eval`` are lists of four pieces each.
    """
    n_eval_sets = 4
    # Boundaries of the five consecutive pieces: [0, train, train+e, ..., train+4e]
    bounds = [0, train_size] + [
        train_size + eval_size * i for i in range(1, n_eval_sets + 1)
    ]

    def _cut(data):
        # Slice positionally instead of calling np.split on pandas objects,
        # which goes through the deprecated DataFrame.swapaxes path.
        rows = data.iloc if hasattr(data, 'iloc') else np.asarray(data)
        return [rows[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

    X_train, *X_eval = _cut(X_section)
    y_train, *y_eval = _cut(y_section)

    return X_train, X_eval, y_train, y_eval

# Objective function for Optuna
def objective(trial, X_train, y_train, X_eval, y_eval):
    """Fit one XGBoost regressor with trial-suggested parameters and return its RMSE.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; used to draw every
        hyperparameter, including ``early_stopping_rounds``.
    X_train, y_train
        Data the regressor is fitted on.
    X_eval, y_eval
        Indexable collections of evaluation folds. Fold 0 is passed to ``fit``
        as the ``eval_set``; fold 1 is used only to score the fitted model.

    Returns
    -------
    The root of the mean squared error of the predictions on fold 1.
    """
    hyperparams = {
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 150),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10, log=True),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 3, 20)
    }

    # Fold 0 is the eval_set given to fit; fold 1 is held back for scoring.
    monitor_fold = (X_eval[0], y_eval[0])
    score_X, score_y = X_eval[1], y_eval[1]

    booster = xgb.XGBRegressor(**hyperparams)
    booster.fit(X_train, y_train, eval_set=[monitor_fold], verbose=0)

    predictions = booster.predict(score_X)
    return np.sqrt(mean_squared_error(score_y, predictions))

# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('df_storage/cleaned_train_df_Y1.pkl', 'rb') as file:
    cleaned_train_df_Y1 = pickle.load(file)

# Features are every column except the identifiers and the target/Q columns
X = cleaned_train_df_Y1.drop(columns=['sym', 'exch', 'Y1', 'Y2', 'Q1', 'Q2'])
y = cleaned_train_df_Y1['Y1']

# 60% of the rows for training, 10% for each evaluation fold
train_size, eval_size = int(len(X) * 0.6), int(len(X) * 0.1)

X_train, X_eval, y_train, y_eval = prepare_data(X, y, train_size, eval_size)

# Initial Optuna study over the full feature set (lower RMSE is better)
study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_eval, y_eval),
    n_trials=n_trials,
)

# Best parameters from initial tuning
best_params = study.best_params

# Rank features by mean absolute SHAP value, using a model fitted on all features
selected_features = X_train.columns

model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train, eval_set=[(X_eval[0], y_eval[0])], verbose=0)

# Calculate SHAP values.
# check_additivity is an argument of shap_values(), not of the Explainer
# constructor, so it is passed where it takes effect.
explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(X_train, check_additivity=False)

# Summarize the SHAP values to get feature importance (mean |SHAP| per column)
shap_sum = np.abs(shap_values).mean(axis=0)
# Build from a dict so 'importance' is a numeric column rather than object dtype
importance_df = pd.DataFrame({
    'feature': list(selected_features),
    'importance': shap_sum,
})

In [None]:
# Write the features, most important first, to importance_1.csv
(
    importance_df
    .sort_values(by='importance', ascending=False)
    .to_csv('importance_1.csv')
)

In [10]:
# Keep the ten features with the largest importance
selected_features = (
    importance_df
    .sort_values(by='importance', ascending=False)
    .head(10)['feature']
    .values
)

# Restrict the training frame and every evaluation fold to those columns
X_train = X_train[selected_features]
X_eval = [fold[selected_features] for fold in X_eval]

# Run a fresh study on the reduced feature set
study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_eval, y_eval),
    n_trials=30,
)
best_params = study.best_params

print(selected_features)

[I 2024-01-22 03:53:05,236] A new study created in memory with name: no-name-601cfcaf-c1df-49da-a9ca-3d201885d3ca
[I 2024-01-22 03:53:49,969] Trial 0 finished with value: 13.531091690063477 and parameters: {'max_depth': 15, 'learning_rate': 0.28795381806350573, 'n_estimators': 86, 'min_child_weight': 1, 'subsample': 0.9356442461717759, 'colsample_bytree': 0.7643129195847209, 'reg_alpha': 0.004905139946654672, 'reg_lambda': 3.350868773905838, 'early_stopping_rounds': 6}. Best is trial 0 with value: 13.531091690063477.
[I 2024-01-22 03:56:35,585] Trial 1 finished with value: 13.609201431274414 and parameters: {'max_depth': 23, 'learning_rate': 0.2785841337672965, 'n_estimators': 71, 'min_child_weight': 6, 'subsample': 0.614537275900993, 'colsample_bytree': 0.7916647065595368, 'reg_alpha': 5.3323640370796874e-05, 'reg_lambda': 1.412136580585176, 'early_stopping_rounds': 15}. Best is trial 0 with value: 13.531091690063477.
[I 2024-01-22 03:57:20,017] Trial 2 finished with value: 13.4163131

['time' 'X230' 'X232' 'X121' 'X53' 'X52' 'X49' 'X51' 'X316' 'X372']


In [12]:
# Display the hyperparameters currently bound to best_params
best_params

{'max_depth': 6,
 'learning_rate': 0.249407058209529,
 'n_estimators': 88,
 'min_child_weight': 7,
 'subsample': 0.591095619361343,
 'colsample_bytree': 0.7882789772211479,
 'reg_alpha': 0.00015994910444829964,
 'reg_lambda': 0.01924009042129767,
 'early_stopping_rounds': 18}

In [23]:
# Get the best model on train set and dump
# This cell reads the notebook-level X, y, selected_features and best_params,
# and the Y2 section below rebinds all of them. Fail fast if the kernel
# currently holds a different target (e.g. the cell is re-run after the Y2
# cells), so model_Y1.json is never written from the wrong data.
assert y.name == 'Y1', (
    f"expected the Y1 target but y is {y.name!r}; re-run the Y1 cells first"
)

train_size = int(len(X) * 0.85)

# split data into train and validation set (first 85% / last 15% of the rows)
X_train, X_eval = X.iloc[:train_size][selected_features], X.iloc[train_size:][selected_features]
y_train, y_eval = y.iloc[:train_size], y.iloc[train_size:]


model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], verbose=0)
model.save_model('model_Y1.json')


## Training for Y2

In [18]:
# Number of Optuna trials for the initial Y2 study over the full feature set
n_trials = 5
import random
import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
import shap
import pickle
from sklearn.metrics import mean_squared_error

# Function to prepare data (split into training and evaluation sets)
def prepare_data(X_section, y_section, train_size, eval_size):
    """Cut features and targets into one training block and four evaluation blocks.

    Rows are taken positionally and in order: the first ``train_size`` rows
    form the training set, and each of the following four consecutive runs of
    ``eval_size`` rows forms one evaluation set. Any rows after
    ``train_size + 4 * eval_size`` are not returned.

    Parameters
    ----------
    X_section, y_section : pandas DataFrame/Series or array-like
        Data to split. pandas objects are sliced with ``.iloc`` so the pieces
        keep their type (and columns); anything else is sliced as an ndarray.
    train_size : int
        Number of leading rows assigned to the training set.
    eval_size : int
        Number of rows in each evaluation set.

    Returns
    -------
    X_train, X_eval, y_train, y_eval
        ``X_eval`` and ``y_eval`` are lists of four pieces each.
    """
    n_eval_sets = 4
    # Boundaries of the five consecutive pieces: [0, train, train+e, ..., train+4e]
    bounds = [0, train_size] + [
        train_size + eval_size * i for i in range(1, n_eval_sets + 1)
    ]

    def _cut(data):
        # Slice positionally instead of calling np.split on pandas objects,
        # which goes through the deprecated DataFrame.swapaxes path.
        rows = data.iloc if hasattr(data, 'iloc') else np.asarray(data)
        return [rows[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

    X_train, *X_eval = _cut(X_section)
    y_train, *y_eval = _cut(y_section)

    return X_train, X_eval, y_train, y_eval

# Objective function for Optuna
def objective(trial, X_train, y_train, X_eval, y_eval):
    """Fit one XGBoost regressor with trial-suggested parameters and return its RMSE.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; used to draw every
        hyperparameter, including ``early_stopping_rounds``.
    X_train, y_train
        Data the regressor is fitted on.
    X_eval, y_eval
        Indexable collections of evaluation folds. Fold 0 is passed to ``fit``
        as the ``eval_set``; fold 1 is used only to score the fitted model.

    Returns
    -------
    The root of the mean squared error of the predictions on fold 1.
    """
    hyperparams = {
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 150),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10, log=True),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 3, 20)
    }

    # Fold 0 is the eval_set given to fit; fold 1 is held back for scoring.
    monitor_fold = (X_eval[0], y_eval[0])
    score_X, score_y = X_eval[1], y_eval[1]

    booster = xgb.XGBRegressor(**hyperparams)
    booster.fit(X_train, y_train, eval_set=[monitor_fold], verbose=0)

    predictions = booster.predict(score_X)
    return np.sqrt(mean_squared_error(score_y, predictions))

# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('df_storage/cleaned_train_df_Y2.pkl', 'rb') as file:
    cleaned_train_df_Y2 = pickle.load(file)

# Features are every column except the identifiers and the target/Q columns
X = cleaned_train_df_Y2.drop(columns=['sym', 'exch', 'Y1', 'Y2', 'Q1', 'Q2'])
y = cleaned_train_df_Y2['Y2']

# 60% of the rows for training, 10% for each evaluation fold
train_size, eval_size = int(len(X) * 0.6), int(len(X) * 0.1)

X_train, X_eval, y_train, y_eval = prepare_data(X, y, train_size, eval_size)

# Initial Optuna study over the full feature set (lower RMSE is better)
study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_eval, y_eval),
    n_trials=n_trials,
)

# Best parameters from initial tuning
best_params = study.best_params

[I 2024-01-22 10:31:26,156] A new study created in memory with name: no-name-c5689efc-637e-4bef-b6b3-0d95fa295882
[I 2024-01-22 10:38:27,208] Trial 0 finished with value: 18.31778335571289 and parameters: {'max_depth': 27, 'learning_rate': 0.04583373939046242, 'n_estimators': 99, 'min_child_weight': 4, 'subsample': 0.6635043073901279, 'colsample_bytree': 0.6951387099349245, 'reg_alpha': 0.006494981363932844, 'reg_lambda': 0.2913953663199943, 'early_stopping_rounds': 5}. Best is trial 0 with value: 18.31778335571289.
[I 2024-01-22 10:40:09,057] Trial 1 finished with value: 18.253154754638672 and parameters: {'max_depth': 3, 'learning_rate': 0.17046085516526174, 'n_estimators': 73, 'min_child_weight': 7, 'subsample': 0.7692173505746742, 'colsample_bytree': 0.9333325510479389, 'reg_alpha': 0.5276833176325726, 'reg_lambda': 0.9383146087442483, 'early_stopping_rounds': 12}. Best is trial 1 with value: 18.253154754638672.
[I 2024-01-22 10:56:18,543] Trial 2 finished with value: 18.3396053314

In [19]:
# Rank features by mean absolute SHAP value, using a model fitted on all features
selected_features = X_train.columns

model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train, eval_set=[(X_eval[0], y_eval[0])], verbose=0)

# Calculate SHAP values.
# check_additivity is an argument of shap_values(), not of the Explainer
# constructor, so it is passed where it takes effect.
explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(X_train, check_additivity=False)

# Summarize the SHAP values to get feature importance (mean |SHAP| per column)
shap_sum = np.abs(shap_values).mean(axis=0)
# Build from a dict so 'importance' is a numeric column rather than object dtype
importance_df = pd.DataFrame({
    'feature': list(selected_features),
    'importance': shap_sum,
})



In [20]:
# Display the hyperparameters currently bound to best_params
best_params

{'max_depth': 3,
 'learning_rate': 0.17046085516526174,
 'n_estimators': 73,
 'min_child_weight': 7,
 'subsample': 0.7692173505746742,
 'colsample_bytree': 0.9333325510479389,
 'reg_alpha': 0.5276833176325726,
 'reg_lambda': 0.9383146087442483,
 'early_stopping_rounds': 12}

In [21]:
# Write the features, most important first, to importance_2.csv
(
    importance_df
    .sort_values(by='importance', ascending=False)
    .to_csv('importance_2.csv')
)

In [22]:
# Keep the ten features with the largest importance
selected_features = (
    importance_df
    .sort_values(by='importance', ascending=False)
    .head(10)['feature']
    .values
)

# Restrict the training frame and every evaluation fold to those columns
X_train = X_train[selected_features]
X_eval = [fold[selected_features] for fold in X_eval]

# Run a fresh study on the reduced feature set
study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_eval, y_eval),
    n_trials=10,
)
best_params = study.best_params

print(selected_features)

[I 2024-01-22 11:16:54,473] A new study created in memory with name: no-name-cb200c26-2d7d-4764-9ee8-8e8e6f9830cb
[I 2024-01-22 11:17:55,444] Trial 0 finished with value: 18.299022674560547 and parameters: {'max_depth': 12, 'learning_rate': 0.24820871465526181, 'n_estimators': 116, 'min_child_weight': 7, 'subsample': 0.73169067622898, 'colsample_bytree': 0.7403223891564796, 'reg_alpha': 0.000104580853258994, 'reg_lambda': 0.34845864826319073, 'early_stopping_rounds': 14}. Best is trial 0 with value: 18.299022674560547.
[I 2024-01-22 11:20:22,212] Trial 1 finished with value: 18.357656478881836 and parameters: {'max_depth': 27, 'learning_rate': 0.22336602347585552, 'n_estimators': 73, 'min_child_weight': 9, 'subsample': 0.5395475625206507, 'colsample_bytree': 0.8308221375862896, 'reg_alpha': 0.49254364147333457, 'reg_lambda': 0.0003648636275849078, 'early_stopping_rounds': 15}. Best is trial 0 with value: 18.299022674560547.
[I 2024-01-22 11:21:42,082] Trial 2 finished with value: 18.23

['X253' 'X313' 'X250' 'X203' 'X316' 'X222' 'X373' 'time' 'X324' 'X329']


In [24]:
# Get the best model on train set and dump
# This cell reads the notebook-level X, y, selected_features and best_params,
# names that the Y1 section also binds. Fail fast if the kernel currently
# holds a different target, so model_Y2.json is never written from the
# wrong data.
assert y.name == 'Y2', (
    f"expected the Y2 target but y is {y.name!r}; re-run the Y2 cells first"
)

train_size = int(len(X) * 0.85)

# split data into train and validation set (first 85% / last 15% of the rows)
X_train, X_eval = X.iloc[:train_size][selected_features], X.iloc[train_size:][selected_features]
y_train, y_eval = y.iloc[:train_size], y.iloc[train_size:]


model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], verbose=0)
model.save_model('model_Y2.json')
