In [None]:
import importlib
from utils import (
    extraction,
    generic_helper,
    experiment
)
from utils.definitions import ROOT_DIR
# Re-execute the already-imported project modules so that edits made to their
# source files are picked up without restarting the kernel.
# NOTE(review): `experiment` is reloaded last, presumably because it depends on
# the other two modules — confirm before reordering.
importlib.reload(generic_helper)
importlib.reload(extraction)
importlib.reload(experiment)

In [None]:
# Read the stored train/val/test split file from the project's data folder.
split_data = generic_helper.read_data(
    fname="noah_train_val_test_split.pkl", path=f"{ROOT_DIR}/data"
)

In [None]:
# Load the YAML model configuration; later cells read its "eol" entry.
model_config = generic_helper.load_yaml_file(path=f"{ROOT_DIR}/config/model_config.yaml")

In [None]:
# Select the data-transformation function used throughout this notebook.
# The function object is bound here without being called; it is handed to the
# `experiment` helpers in later cells through their `trans_func` argument.
trans_func = extraction.get_data_for_eol_prediction

In [None]:
# Study how the time threshold influences the model. Only the training split
# is passed in; per the original note, the helper cross-validates on it.
threshold_result = experiment.effect_time_threshold(
    train_data=split_data["train"],
    trans_func=trans_func,
    problem_type="regression",
    scorer="neg_mean_absolute_error",
    signature_depth=model_config["eol"]["signature_depth"],
    param_grid=model_config["eol"]["param_grid"],
)

In [None]:
# Persist the threshold study in the data folder so it can be plotted later.
generic_helper.dump_data(
    data=threshold_result, path=f"{ROOT_DIR}/data", fname="eol_threshold_data.pkl"
)

In [None]:
# Train the EOL model at a fixed time threshold and unpack everything that
# experiment.train_model returns for use in the following cells.
TIME_THRESHOLD_S = 120  # time threshold in seconds, as stated in the original note
(
    best_pipeline,
    best_params,
    best_model,
    best_score,
    best_std,
) = experiment.train_model(
    train_data=split_data["train"],
    signature_depth=model_config["eol"]["signature_depth"],
    threshold=TIME_THRESHOLD_S,
    param_grid=model_config["eol"]["param_grid"],
    problem_type="regression",
    trans_func=trans_func,
    scorer="neg_mean_absolute_error",
)

In [None]:
# Display the `best_params` value unpacked from experiment.train_model(...).
# Being the cell's last expression, it is rendered as the cell output.
best_params

In [None]:
# Display the magnitude of the best validation score.
# train_model was called with scorer='neg_mean_absolute_error', so the score is
# presumably a negated MAE and abs() reports it as a positive error — confirm
# against experiment.train_model.
abs(best_score)

In [None]:
# Evaluate the fitted pipeline and model on the data splits, keeping both the
# metric table and the prediction data the helper returns.
# NOTE(review): alpha=0.05 presumably sets the confidence-interval level of the
# reported metrics — confirm in experiment.display_training_result.
metric_ci_data, prediction_data = experiment.display_training_result(
    split_data=split_data, pipeline=best_pipeline, model=best_model, alpha=0.05
)

In [None]:
# Display the metrics object returned by experiment.display_training_result;
# as the cell's last expression it is rendered as the cell output.
metric_ci_data

In [None]:
# Persist the prediction data in the data folder for the later parity plot.
generic_helper.dump_data(
    data=prediction_data, path=f"{ROOT_DIR}/data", fname="eol_prediction_data.pkl"
)

In [None]:

# Log the fitted pipeline and model under the model name "eol".
experiment.log_model_pipeline(
    model_name="eol", pipeline=best_pipeline, model=best_model
)

In [None]:
# Study how the time threshold changes feature importance, using the training
# split and the same EOL settings passed to the other experiment calls.
threshold_feature_importance = experiment.time_threshold_effect_feature_importance(
    train_data=split_data["train"],
    trans_func=trans_func,
    problem_type="regression",
    scorer="neg_mean_absolute_error",
    signature_depth=model_config["eol"]["signature_depth"],
    param_grid=model_config["eol"]["param_grid"],
)

# Persist the feature-importance study in the data folder for later plotting.
generic_helper.dump_data(
    fname="eol_threshold_feature_importance.pkl",
    path=f"{ROOT_DIR}/data",
    data=threshold_feature_importance,
)