In [None]:
import numpy as np
import pandas as pd
import warnings
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedKFold
import scipy.stats as st
from utils import utils_gn, utils_dgrd, utils_models
import importlib
from xgboost import XGBRegressor
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import seaborn as sns
from sklearn.compose import TransformedTargetRegressor
# Re-import the project-local helper modules so that edits made to them on disk
# are picked up without restarting the kernel.
importlib.reload(utils_gn)
importlib.reload(utils_models)
importlib.reload(utils_dgrd)
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides any warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw training data from 'train_1238.pkl' through the project helper.
# NOTE(review): no folder is given, so the helper's default location is used — confirm in utils_gn.read_data.
train_raw = utils_gn.read_data('train_1238.pkl')

In [None]:
# Load the raw test data ('test_1238.pkl') and the true test labels
# ('true_test_labels_1238.pkl') through the project helper.
test_raw = utils_gn.read_data('test_1238.pkl')
y_test_raw = utils_gn.read_data('true_test_labels_1238.pkl')

In [None]:
# Define the sub-sampling time-step codes.
# The result is used below as a mapping: its keys are passed as `step_size` to the
# feature transformation and its values are reported as the step sizes.
step_size_dict = utils_models.create_time_steps()
step_size_dict

### Cross-validation on the training set

In [None]:
# Cross validation
def ccv_sampling_eol_cross_val():
    """Cross-validate the EOL model for every sub-sampling time step.

    For each key of the notebook-level ``step_size_dict`` the raw training
    data (``train_raw``) are transformed into features with that step size,
    an XGBoost regressor fitted on a log10-transformed target is evaluated
    with ``utils_models.kfold_cross_validation`` (``cv=3``), and a bootstrap
    confidence interval (``alpha=0.1``) is computed for the per-fold MAE and
    RMSE scores.

    Side effects
    ------------
    Prints a progress line per step size and pickles the returned tuple to
    ``data/signature_ccv_subsample_crossval.pkl`` (the ``data`` directory is
    created if it does not exist).

    Returns
    -------
    tuple
        ``(step_sizes, error_metrics, mae_ci, rmse_ci)`` where ``step_sizes``
        is ``list(step_size_dict.values())`` and the other three are NumPy
        arrays built from one entry per step size.
    """
    error_metrics = []
    mae_ci, rmse_ci = [], []

    time_map = step_size_dict

    # Build model
    # NOTE(review): 'min_samples_split' is a scikit-learn tree parameter name and is
    # not among XGBoost's documented parameters — presumably it has no effect; confirm.
    params = {'n_estimators': 100, 'reg_alpha': 0.1, 'max_depth': 2, 'min_samples_split': 3}
    for step_code, step_size in time_map.items():

        tr = utils_gn.FeatureTransformation(n=100, step_size=step_code)
        X_train, y_train = tr.fit_transform(data=train_raw, targets=['EOL'], with_eol=True, sig_level=2)

        model = TransformedTargetRegressor(XGBRegressor(**params), func=np.log10, inverse_func=utils_models.antilog)

        # Call k-fold cross-validation on the training set
        val_scores, val_scores_raw = utils_models.kfold_cross_validation(X=X_train, y=y_train, model=model, cv=3)
        error_metrics.append(list(val_scores.values()))

        # Calculate the bootstrap CI with alpha=0.1 (plotted below as the 90% CI)
        mae_ci.append(utils_models.confidence_interval_any(values=val_scores_raw['test_MAE'], n_bootstraps=10000, alpha=0.1))
        rmse_ci.append(utils_models.confidence_interval_any(values=val_scores_raw['test_RMSE'], n_bootstraps=10000, alpha=0.1))

        print(f'step size: {step_size} done')

    # Assemble the results once so that the pickled and returned values are the same object.
    results = (list(time_map.values()), np.array(error_metrics), np.array(mae_ci), np.array(rmse_ci))

    # Make sure the output directory exists before writing (fresh checkouts may lack it).
    out_dir = "data"
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "signature_ccv_subsample_crossval.pkl"), "wb") as fp:
        pickle.dump(results, fp)

    return results

In [None]:
# Run the cross-validation over all sub-sampling step sizes. The function also
# pickles the same results to data/signature_ccv_subsample_crossval.pkl.
time, error, mae_ci, rmse_ci = ccv_sampling_eol_cross_val()

In [None]:
# Cross-validation errors versus sub-sampling step size, using every second
# step size: first column of `error` on the left panel, second column on the
# right panel, each with its confidence band shaded.
fig, ax = plt.subplots(1, 2, figsize=(16, 4.5))
left_panel, right_panel = ax[0], ax[1]
steps_shown = time[::2]

# Left panel: MAE
left_panel.plot(steps_shown, error[::2, 0], 'D--', label='EOL: MAE', color='blue')
left_panel.fill_between(
    steps_shown, mae_ci[::2, 0], mae_ci[::2, 1],
    color='blue', alpha=0.15, label='MAE: 90% CI',
)
left_panel.legend(loc='lower right')
left_panel.set_xlabel("Sub-sampling time steps (mins)", fontsize=16)
left_panel.set_ylabel("Cross-validation errors (cycles)", fontsize=16)

# Right panel: RMSE
right_panel.plot(steps_shown, error[::2, 1], 's-', label='EOL: RMSE', color='crimson')
right_panel.fill_between(
    steps_shown, rmse_ci[::2, 0], rmse_ci[::2, 1],
    color='crimson', alpha=0.15, label='RMSE: 90% CI',
)
right_panel.legend(loc='lower right')
right_panel.set_xlabel("Sub-sampling time steps (mins)", fontsize=16)

plt.savefig(fname="plots/sig_level2_subsample_tabs12.pdf", bbox_inches='tight')

In [None]:
# Robustness of the XGBoost model and the RRCT feature selection to data sub-sampling:
# use 10 features and the time steps 1, then 10 to 80 in increments of 10.
total_num_features = 10
times_needed = np.concatenate(([1], np.arange(10, 90, step=10)))
times_needed

In [None]:
# Use model_feature_selection_robustness() to compare the features selected by
# RRCT (restricted to `total_num_features`) under the different sub-sampling
# time steps listed in `times_needed`.
robust = utils_models.model_feature_selection_robustness(
    train_raw=train_raw,
    test_raw=test_raw,
    y_test_df=y_test_raw,
    target_list=['EOL'],
    # Alternative setting: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1}
    params={'n_estimators': 100, 'reg_alpha': 0.1, 'max_depth': 2, 'min_samples_split': 3},
    step_size_dict=step_size_dict,
    times_needed=times_needed,
    k=total_num_features,
)

In [None]:
# Display the robustness results; the cells below read robust['Selected features'] and robust.index.
robust

In [None]:
# Pairwise similarity between the feature sets selected at each time step:
# number of features the two sets share, divided by `total_num_features`.
all_features = robust['Selected features'].values
similarity_scores = np.array([
    [
        np.intersect1d(row_features, col_features).size / total_num_features
        for col_features in all_features
    ]
    for row_features in all_features
])

In [None]:
# Plot the pairwise similarity scores as an annotated heat map
fig, ax = plt.subplots(figsize=(5, 4.5))
axis_labels = np.round(robust.index, 2)
# The tick labels are handed to sns.heatmap directly. Labelling the still-empty
# axes beforehand (before any ticks are fixed) is redundant because the heat map
# sets its own ticks and labels.
sns.heatmap(similarity_scores,
            vmin=0,
            vmax=1,
            xticklabels=axis_labels,
            yticklabels=axis_labels,
            linewidths=0.5,  # documented seaborn keyword for the cell-separator width
            linecolor='black',
            ax=ax,
            cbar_kws={'label': 'Similarity scores'},
            annot=True)

ax.set_xlabel("Sub-sampling time steps (mins)", fontsize=16)
ax.set_ylabel("Sub-sampling time steps (mins)", fontsize=16)
# The last axes of the figure is taken to be the colour bar; enlarge its label.
ax.figure.axes[-1].yaxis.label.set_size(16)
plt.yticks(rotation=0)
plt.savefig(fname="plots/sig-similarity-scores.pdf", bbox_inches='tight')



In [None]:
# Check robustness by taking a model trained on high-frequency data and
# testing it on data generated at lower sampling frequencies.
# The capacity/IR model and its transformer are loaded here; the cycle-model
# alternative is 'sig_cycles.pkl' / 'sig_cycles_trans.pkl' (same 'models' folder).
test_model = utils_gn.read_data('sig_capacity_ir.pkl', folder='models')
test_model_tr =  utils_gn.read_data('sig_capacity_ir_trans.pkl', folder='models')
# Time-step keys passed as `time_steps` to the robustness test below.
time_step_keys = [10, 20, 30, 40, 50, 60, 70, 80]
# Target names passed to the robustness test and used as legend labels in the plots.
targets = ['Qatk-o', 'Qatk-p', 'IRate-o', 'IRate-p', 'IRatEOL']

In [None]:
# Evaluate the loaded model on the test data at each sub-sampling time step;
# returns the step sizes used and the MAE values (indexed below as mae_[:, i]).
time_used_in_mins, mae_ = utils_models.test_of_robustness(
    model=test_model,
    model_tr=test_model_tr,
    time_steps=time_step_keys,
    X_test_data=test_raw,
    y_test_data=y_test_raw,
    targets=targets,
    step_size_dict=step_size_dict,
)

In [None]:
# Display the MAE values returned by test_of_robustness (plotted below column by column as mae_[:, i]).
mae_

In [None]:
# Plot the errors against the times in minutes: cycle model
# NOTE(review): the y-label and file name refer to the cycle model, while the model
# loaded above is the capacity/IR one — confirm which model this figure is meant for.
list_of_markers = ["s-", "o-", "<-", ">-", "*-"]
fig, ax = plt.subplots(figsize=(5, 4))
for i, target_name in enumerate(targets):
    # Use `mae_` (the array returned by test_of_robustness); the name `mae` is not
    # defined anywhere in this notebook and raised a NameError on a fresh kernel.
    ax.plot(time_used_in_mins, mae_[:, i], list_of_markers[i], label=target_name)

# Axis labels only need to be set once, not on every loop iteration.
ax.set_xlabel('Sub-sampling time steps (mins)', fontsize=16)
ax.set_ylabel('MAE (Cycles)', fontsize=16)

ax.legend()
plt.savefig(fname="plots/sig-robust-cycles.pdf", bbox_inches='tight')

In [None]:
# Plot the errors against the times in minutes: capacity-ir model.
# Only the targets at positions 2, 3 and 4 of `targets` are drawn.
fig, ax = plt.subplots(figsize=(5, 4))
for i in (2, 3, 4):
    ax.plot(time_used_in_mins, mae_[:, i], list_of_markers[i], label=targets[i])

ax.set_xlabel('Sub-sampling time steps (mins)', fontsize=16)
# Alternative y-label kept from the original cell: 'MAE (Ah)'
ax.set_ylabel(r'MAE ($\Omega$)', fontsize=16)

ax.legend()
plt.savefig(fname="plots/sig-robust-ir.pdf", bbox_inches='tight')