In [None]:
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from config.definitions import ROOT_DIR
from utils import utils_gn, utils_noah, utils_ivc, utils_dgrd, utils_models
import importlib
# Re-import each project helper module so that edits made to the files on disk
# are picked up by the running kernel without a restart.
# NOTE(review): reload order can matter if these modules import one another
# (a module reloaded earlier keeps references to the old version of a module
# reloaded later) - confirm the order below matches their dependencies.
importlib.reload(utils_gn)
importlib.reload(utils_noah)
importlib.reload(utils_ivc)
importlib.reload(utils_models)
importlib.reload(utils_dgrd)

In [None]:
# Read the raw training data (train_1238.pkl) from <ROOT_DIR>/data
train_raw = utils_gn.read_data(fname="train_1238.pkl", path=f"{ROOT_DIR}/data")

In [None]:
# Read the raw test data and the true test labels from <ROOT_DIR>/data
test_raw = utils_gn.read_data(fname="test_1238.pkl", path=f"{ROOT_DIR}/data")
y_test = utils_gn.read_data(fname="true_test_labels_1238.pkl", path=f"{ROOT_DIR}/data")

In [None]:
# Candidate cycle-number thresholds: every integer from 10 to 100 inclusive
list_of_cycles = np.arange(10, 101)
list_of_cycles

In [None]:
# Cross-validation on training set
def _build_cv_model(model_type):
    '''
    Return a fresh, unfitted multi-output XGBoost model for the given model type.
    '''
    if model_type == 'cycle_at':
        return MultiOutputRegressor(
            XGBRegressor(n_estimators=100, reg_alpha=0.1, max_depth=2)
        )
    if model_type == 'value_at':
        # Targets are transformed with log10 before fitting and mapped back
        # through utils_models.antilog on prediction.
        return TransformedTargetRegressor(
            MultiOutputRegressor(XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=6)),
            func=np.log10,
            inverse_func=utils_models.antilog
        )
    raise ValueError(
        f"model_type must be 'cycle_at' or 'value_at', got {model_type!r}"
    )


def choosing_best_n_cross_val(target_list, model_type, n_list, ylabels):
    '''
    Function to choose the best cycle number threshold for modelling through cross-validation.

    For every threshold in `n_list` the features are rebuilt from the notebook-level
    `train_raw` data with utils_gn.FeatureTransformation(n=n), a fresh model is passed
    to utils_models.kfold_cross_validation (cv=3), and the MAE/RMSE scores together
    with their bootstrap confidence intervals (alpha=0.1, 1000 bootstraps) are collected.
    The targets are handled as two groups, one per figure panel.

    Args:
    ----
            target_list: list of target to predict; the first 3 ('cycle_at') or
                         first 2 ('value_at') entries go to the left panel, the
                         remainder to the right panel
            model_type:  'cycle_at' (predict cycles) or 'value_at' (predict values at cycles)
            n_list:      a non-empty list of cycle number threshold
            ylabels:     a list of labels for y-axis (one per panel, i.e. at least 2)

    Returns:
    -------
            None. A 1x2 figure of Cross-validated average errors vs cycle number
            is drawn as a side effect.

    Raises:
    ------
            ValueError: if model_type is not supported, n_list is empty, or
                        fewer y-axis labels than panels are given.
    '''

    # Validate everything up front so that a bad argument fails immediately
    # instead of after a long cross-validation sweep.
    # Split targets based on capacity/IR
    if model_type == 'cycle_at':
        split_list = [target_list[:3], target_list[3:]]
    elif model_type == 'value_at':
        split_list = [target_list[:2], target_list[2:]]
    else:
        raise ValueError(
            f"model_type must be 'cycle_at' or 'value_at', got {model_type!r}"
        )

    # len() rather than truthiness: n_list is typically a numpy array
    if len(n_list) == 0:
        raise ValueError('n_list must contain at least one cycle number threshold')

    if len(ylabels) < len(split_list):
        raise ValueError(
            f'ylabels must provide one label per panel ({len(split_list)}), got {len(ylabels)}'
        )

    fig, ax = plt.subplots(1, 2, figsize=(15, 4))

    for i, split in enumerate(split_list):

        this_mae, this_rmse = [], []
        mae_ci, rmse_ci = [], []

        for n in n_list:
            print('n: ', n)

            # Get training set (train_raw is the notebook-level raw training data)
            tr = utils_gn.FeatureTransformation(n=n)
            X_train, y_train = tr.fit_transform(
                data=train_raw, targets=split, with_eol=True)

            # Call k-fold cross-validation on the training set with a fresh model
            val_scores, val_scores_raw = utils_models.kfold_cross_validation(
                X=X_train,
                y=y_train,
                model=_build_cv_model(model_type),
                cv=3
            )

            # Append the scores to the list of metrics
            this_mae.append(val_scores['test_MAE'])
            this_rmse.append(val_scores['test_RMSE'])

            # Calculate the CI from the raw validation scores
            mae_ci.append(utils_models.confidence_interval_any(
                values=val_scores_raw['test_MAE'],
                n_bootstraps=1000,
                alpha=0.1)
            )
            rmse_ci.append(utils_models.confidence_interval_any(
                values=val_scores_raw['test_RMSE'],
                n_bootstraps=1000,
                alpha=0.1)
            )

        # Cast to numpy array so the two CI bounds can be sliced column-wise
        mae_ci = np.array(mae_ci)
        rmse_ci = np.array(rmse_ci)

        # Draw each metric as a line plus its CI band; MAE first, then RMSE,
        # which fixes the order of the legend entries.
        for scores, ci, colour, name in (
            (this_mae, mae_ci, 'blue', 'MAE'),
            (this_rmse, rmse_ci, 'crimson', 'RMSE'),
        ):
            ax[i].plot(n_list, scores, label=name, color=colour)
            ax[i].fill_between(
                n_list,
                ci[:, 0],
                ci[:, 1],
                color=colour,
                alpha=0.1,
                label=f'{name}: 90% CI'
            )

        # The x-axis label is only drawn for the 'value_at' figure
        if model_type == 'value_at':
            ax[i].set_xlabel('Cycle number threshold', fontsize=16)

        ax[i].set_ylabel(ylabels[i], fontsize=16)
        ax[i].set_title(', '.join(split), fontsize=18)

    # A single shared legend, anchored below the panels, only for 'value_at'
    if model_type == 'value_at':
        handles, labels = ax[0].get_legend_handles_labels()
        ax[0].legend(
            handles,
            labels,
            loc='upper center',
            ncol=4,
            fontsize=16,
            bbox_to_anchor=(1.0, -0.15)
        )

In [None]:
# Threshold sweep for the 'cycle_at' model; both panels report errors in cycles
choosing_best_n_cross_val(
    model_type='cycle_at',
    target_list=['k-o', 'k-p', 'EOL', 'e-o', 'e-p'],
    ylabels=['Cross-validated errors (cycles)'] * 2,
    n_list=list_of_cycles,
)

In [None]:
# Threshold sweep for the 'value_at' model; left panel errors are labelled in Ah,
# right panel errors in ohms
choosing_best_n_cross_val(
    model_type='value_at',
    target_list=['Qatk-o', 'Qatk-p', 'IRate-o', 'IRate-p', 'IRatEOL'],
    ylabels=[
        'Cross-validated errors (Ah)',
        r'Cross-validated errors ($\Omega$)',
    ],
    n_list=list_of_cycles,
)