In [None]:
import ast
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Global matplotlib look: default style, LaTeX text rendering, "cm" font family
# and a half-grey, mostly transparent grid colour (RGBA).
plt.style.use("default")
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "cm",
    "grid.color": (0.5, 0.5, 0.5, 0.2),
})

In [None]:
# Training inputs: the feature table (used below for its column names), the same
# features as a raw array stored under key 'a' of the .npz archive, and the
# "Qedep" column of the targets table as regression target.
X_dataframe = pd.read_csv("/mnt/ferracci/features_dataframe_new.csv.gz")

# np.load on an .npz returns a lazily-read archive that keeps the file open;
# the context manager closes it once the array has been read into memory.
# NOTE: allow_pickle=True can execute arbitrary code stored in the file, so only
# load archives from a trusted source.
with np.load("/mnt/ferracci/features_new.npz", allow_pickle=True) as archive:
    X = archive['a']

y = np.array(pd.read_csv("/mnt/ferracci/targets_dataframe_new.csv.gz")["Qedep"])

In [None]:
# we are only interested in the selected features
with open('/home/ferracci/new_dataset/features_list.txt', 'r') as f:
    file_content = f.read()

# The first line of the file holds the selected feature names written as a Python
# literal. ast.literal_eval parses literals only, so (unlike eval) nothing in the
# file can be executed as code.
features_list = file_content.split('\n')
selected_features_names = ast.literal_eval(features_list[0])

selected_X_dataframe = X_dataframe[selected_features_names]
# Keep the matching columns of the raw array, in the same order as the names.
# NOTE: X is overwritten here, so re-running this cell without reloading X first
# would index into the already reduced array.
X = X[:, [X_dataframe.columns.get_loc(name) for name in selected_features_names]]
selected_X_dataframe.head()

In [None]:
import xgboost as xgb 

# Wrap the selected features and the targets in the DMatrix consumed by xgb.cv / xgb.train.
dm_train = xgb.DMatrix(data=X, label=y)

### Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: cross-validated MAPE of an XGBoost regressor on `dm_train`.

    The trial suggests `max_depth`, `eta`, `gamma`, `lambda` and `alpha`; the
    remaining parameters are fixed. A 5-fold `xgb.cv` with early stopping is run
    and the minimum of the "test-mape-mean" column is returned. The length of the
    cross-validation table and the "test-mape-std" value at that minimum are
    stored on the trial as user attributes ("n_estimators" and "std").
    """
    fixed_params = {
        "objective": "reg:squarederror",
        "eval_metric": "mape",
        "tree_method": "gpu_hist",
        "subsample": 0.8,
    }
    tuned_params = {
        "max_depth": trial.suggest_int("max_depth", 6, 12),
        "eta": trial.suggest_float("eta", 0.05, 0.5),
        "gamma": trial.suggest_float("gamma", 0.001, 0.01),
        "lambda": trial.suggest_float("lambda", 0.1, 5),
        "alpha": trial.suggest_float("alpha", 0.001, 0.01),
    }
    params = {**fixed_params, **tuned_params}

    # 5-fold cross-validation, stopping once the test metric stalls for 5 rounds
    cv_history = xgb.cv(
        params,
        dm_train,
        num_boost_round=2000,
        nfold=5,
        early_stopping_rounds=5,
        verbose_eval=False,
        as_pandas=True,
    )

    test_mape = cv_history["test-mape-mean"]
    mape = test_mape.min()
    std = cv_history.loc[test_mape.idxmin(), "test-mape-std"]

    # keep track of the number of boosting rounds and of the fold-to-fold spread
    trial.set_user_attr("n_estimators", len(cv_history))
    trial.set_user_attr("std", std)
    print(f"Trial #{trial.number}. MAPE = {mape:.4f} +/- {std:.4f}")
    return mape

In [None]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# set up the Optuna study; the sampler is seeded so that the sequence of
# suggested hyperparameters is the same on every run of the notebook
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)

# print the best hyperparameters and mape
best_hyperparameters = study.best_params
best_mape = study.best_value
print(f"Best MAPE: {best_mape:.4f}")

In [None]:
# Tabular view of the study (one row per trial); show the first rows.
results_dataframe = study.trials_dataframe()
results_dataframe.head(n=5)

In [None]:
# Keep the tuned hyperparameters and the objective value, with the objective
# (MAPE) as the last column. Columns are selected by name rather than by position
# so the result does not depend on how the trials dataframe orders its columns.
columns = ["params_max_depth", "params_eta", "params_gamma", "params_lambda", "params_alpha", "value"]
trials = results_dataframe[columns].copy()

# express the objective value as a percentage
trials["value"] = 100 * trials["value"]

# axis labels, in the same order as `columns` (raw strings keep the LaTeX
# backslashes without relying on invalid escape sequences)
labels = ["max depth", "learning rate", r"$\gamma$", r"$\lambda$", r"$\alpha$", r"MAPE, \%"]
trials.head()

In [None]:
# import only the name that is used instead of a wildcard, so it is clear where
# it comes from and nothing else is pulled into the notebook namespace
from helper_functions.parallel_coordinates_plot import plot_parallel_coordinates

# just 50 trials for readability purposes
# one line per trial across the hyperparameter axes, saved as a PNG
fig = plot_parallel_coordinates(trials, labels, linewidth=0.8, alpha=0.9)
fig.set_dpi(150)
fig.supylabel("Hyperparameter tuning", fontsize=15, x=0.05)
fig.savefig("/home/ferracci/new_dataset/images/BDT_hyperparameter_tuning.png", dpi=300, bbox_inches="tight", pad_inches=0.2);

In [None]:
# Persist the best hyperparameters of the study so that the training section
# can be run without repeating the search.
params = study.best_params

with open("/home/ferracci/new_dataset/xgb_study.pkl", "wb") as handle:
    pickle.dump(params, handle)

### Train Best Model

In [None]:
# Reload the tuned hyperparameters written by the tuning section.
# (pickle must only be used on trusted files; this one is produced by this notebook.)
with open("/home/ferracci/new_dataset/xgb_study.pkl", "rb") as handle:
    params = pickle.load(handle)

# add the settings that were held fixed during the search
params.update({
    "objective": "reg:squarederror",
    "eval_metric": "mape",
    "tree_method": "gpu_hist",
    "subsample": 0.8,
})

params

In [None]:
# Cross-validate once more with the final parameters to find how many boosting
# rounds to use, then train on the full training set with that many rounds.
cv = xgb.cv(params, dm_train, num_boost_round=2000, nfold=5, early_stopping_rounds=5, verbose_eval=False)

# Row k of the cross-validation table holds the metric after k+1 boosting rounds,
# so the best number of rounds is the (0-based) position of the minimum plus one.
best_num_boost_rounds = int(cv["test-mape-mean"].idxmin()) + 1
best_model = xgb.train(params, dm_train, num_boost_round=best_num_boost_rounds)

In [None]:
%%capture
import shap 

explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(dm_train)
shap.summary_plot(shap_values[:, 1:], X[:, 1:], show=False, plot_type="layered_violin", cmap="PuOr", alpha=0.5)

fig = plt.gcf()
fig.set_figheight(5)
fig.set_figwidth(8)
fig.set_dpi(150)

ax = plt.gca()
ax.set_xlabel("SHAP (measure of impact on model output)", fontsize=15)
ax.set_xlim([-0.25, 0.45])
ax.set_ylabel("Feature", fontsize=15)
ax.set_yticklabels(["$ht_{10\%-5\%}$", "$ht_{80\%-75\%}$", "$ht_{5\%-2\%}$", "$ht_{95\%-90\%}$", "$pe_{15\%}$", "$ht_{entropy}$",
                    "$pe_{mean}$", "$z_{cc}$", "$ht_{kurtosis}$", "$pe_{std}$", "$\\rho_{cc}$", "nPMTs"])
ax.tick_params(axis="both", which="major", labelsize=12)
ax.tick_params(axis="both", which='minor', labelsize=12)
ax.grid()

cbar = fig.get_axes()[1]
cbar.set_ylabel("Feature value", fontsize=15, rotation=270)
cbar.tick_params(labelsize=12)

fig.savefig("/home/ferracci/new_dataset/images/BDT_shap.png", dpi=300, bbox_inches="tight", pad_inches=0.2);

In [None]:
# Write the trained booster to disk.
# NOTE(review): XGBoost presumably picks the serialization format from the file
# extension - confirm that ".txt" produces the format expected by whoever loads it.
best_model.save_model("/mnt/ferracci/xgb_trained_new.txt")

### Model Evaluation 

In [None]:
from pathlib import Path
from helper_functions.model_evaluation import plot_gaussian_fit
from helper_functions.model_evaluation import energy_res_fit
from helper_functions.model_evaluation import get_a_tilde

In [None]:
# Load one feature array and one target array per test file, then order both
# lists by energy.
X_test_files = list(Path("/mnt/ferracci/").glob("features_test_*"))
y_test_files = list(Path("/mnt/ferracci/").glob("targets_dataframe_test_*"))
X_test, y_test = [], []

# the positions of the selected features are the same for every file: look them up once
selected_columns = [X_dataframe.columns.get_loc(name) for name in selected_features_names]

for X_test_file in X_test_files:
    # close each archive as soon as its array has been read
    with np.load(X_test_file) as archive:
        X_test.append(archive["a"][:, selected_columns])
for y_test_file in y_test_files:
    y_test.append(np.array(pd.read_csv(y_test_file)["Qedep"]))

# NOTE(review): `energies` gives the energy of each file in the order in which the
# two globs returned them; that order presumably depends on the file system, so
# verify it whenever files are added or the notebook runs on another machine.
energies = [0, 1, 10, 7, 6, 2, 0.1, 9, 5, 3, 8, 4, 0.3, 0.6]
assert len(energies) == len(X_test) == len(y_test), (
    f"expected {len(energies)} test files, found {len(X_test)} feature files "
    f"and {len(y_test)} target files"
)

# sort by energy only, so that the arrays themselves are never compared
order = sorted(range(len(energies)), key=energies.__getitem__)
X_test = [X_test[k] for k in order]
y_test = [y_test[k] for k in order]

In [None]:
# For every test set: predict, fit a Gaussian to the residuals and express the
# fitted mean (bias) and width (resolution), with their errors, as a percentage
# of the mean true value of that set.
bias, res = [], []
err_bias, err_res = [], []

for i in range(len(X_test)):
    y_pred = best_model.predict(xgb.DMatrix(X_test[i]))
    err = y_test[i] - y_pred
    # discard residuals more than 5 standard deviations away from the mean, on
    # both sides (without the absolute value only the positive tail was cut)
    err = err[np.abs(err - np.mean(err)) < 5*np.std(err)]

    mean, std, err_mean, err_std = plot_gaussian_fit(data=err, n_bins=100, name="xgb", index=i)
    mean_energy = np.mean(y_test[i])
    bias.append(100 * mean / mean_energy)
    res.append(100 * std / mean_energy)
    err_bias.append(100 * err_mean / mean_energy)
    err_res.append(100 * err_std / mean_energy)
    
# get fit parameters (the first and the last test set are left out of the fit)
a, b, c, pcov = energy_res_fit([np.mean(y_test[i]) for i in range(1, len(y_test)-1)], res[1:-1], err_res[1:-1])
# parameter errors from the diagonal of the covariance matrix, covariances from
# its off-diagonal entries
err_a, err_b, err_c = np.sqrt(np.abs(np.diag(pcov)))[:3]
cov_ab, cov_ac, cov_bc = pcov[0, 1], pcov[0, 2], pcov[1, 2]

print(f"a = {a:.3f} +/- {err_a:.3f}")
print(f"b = {b:.3f} +/- {err_b:.3f}")
print(f"c = {c:.3f} +/- {err_c:.3f}")

a_tilde, err_a_tilde = get_a_tilde(a, b, c, err_a, err_b, err_c, cov_ab, cov_ac, cov_bc)
print(f"\nã = {a_tilde:.3f} +/- {err_a_tilde:.3f}")

# one line per quantity: bias, resolution, their errors, then the fit summary
result_rows = [bias, res, err_bias, err_res,
               [a, b, c, err_a, err_b, err_c, a_tilde, err_a_tilde]]
with open('/home/ferracci/new_dataset/xgb_results.txt', 'w') as f:
    f.write('\n'.join(str(row) for row in result_rows))