In [None]:
import importlib
from utils import (
    extraction,
    generic_helper,
    experiment,
    structure_noah
)
from utils.definitions import ROOT_DIR
# Reload the project helper modules imported above: importlib.reload
# re-executes each module's code, so changes made to them after the kernel
# started take effect without a kernel restart.
# The final call is a bare expression, so its return value (the reloaded
# module object) is echoed as this cell's output.
importlib.reload(generic_helper)
importlib.reload(extraction)
importlib.reload(experiment)
importlib.reload(structure_noah)

In [None]:
# Load the stored train/test split from the project's data directory.
split_file = {"path": f"{ROOT_DIR}/data", "fname": "noah_train_test_split.pkl"}
split_data = generic_helper.read_data(**split_file)

In [None]:
# Model settings live in a YAML file under the project's config directory.
config_path = f"{ROOT_DIR}/config/model_config.yaml"
model_config = generic_helper.load_yaml_file(path=config_path)

# Transformation function; it is handed to the experiment helpers below
# through their `trans_func` argument.
trans_func = extraction.get_data_for_classification


In [None]:
# Examine how the time threshold influences the classification model.
clf_config = model_config["classification"]  # classification section of the YAML config
threshold_kwargs = dict(
    train_data=split_data["train"],
    signature_depth=clf_config["signature_depth"],
    param_grid=clf_config["param_grid"],
    problem_type="classification",
    trans_func=trans_func,
    scorer="f1",
)
threshold_result = experiment.effect_time_threshold(**threshold_kwargs)

In [None]:
# Write the threshold results to the data directory so they can be plotted later.
output_dir = f"{ROOT_DIR}/data"
generic_helper.dump_data(
    data=threshold_result,
    path=output_dir,
    fname="classification_threshold_data.pkl",
)

In [None]:
# Fit the classification model with a fixed time threshold of 120 s.
TIME_THRESHOLD_S = 120
clf_config = model_config["classification"]  # classification section of the YAML config

training_output = experiment.train_model(
    train_data=split_data["train"],
    signature_depth=clf_config["signature_depth"],
    threshold=TIME_THRESHOLD_S,
    param_grid=clf_config["param_grid"],
    problem_type="classification",
    trans_func=trans_func,
    scorer="f1",
)

# The helper hands back five values, unpacked here in the order it returns them.
best_pipeline, best_params, best_model, best_score, best_std = training_output

In [None]:
# Display the `best_params` value returned by `experiment.train_model`.
# Left as a bare expression so the notebook renders it as the cell output.
best_params

In [None]:
# Display the best cross-validation score returned by `experiment.train_model`,
# multiplied by 100 (presumably the raw score is a fraction in [0, 1], given
# the "f1" scorer -- TODO confirm).
best_score * 100.  # convert to percentages

In [None]:
# Score the fitted model on every split in `split_data` and collect the
# resulting metrics per split name.
result_dict = {}

for split_name, split_subset in split_data.items():
    # the fitted pipeline returns the model inputs together with the true labels
    X, y = best_pipeline.transform(split_subset)

    # hold on to the test labels; a later cell stores them alongside the predictions
    if split_name == "test":
        y_test = y

    # predicted labels and predicted probabilities for the current split
    y_pred = best_model.predict(X)
    y_pred_proba = best_model.predict_proba(X)

    # only column 1 of the probability output is scored
    # (presumably the positive class -- TODO confirm)
    result_dict[split_name] = experiment.metric_calculator_classification(
        y_true=y,
        y_pred=y_pred,
        y_pred_proba=y_pred_proba[:, 1],
    )

In [None]:

# Collect the metrics for the train and test sets in one call.
# `alpha=0.05` is forwarded to the helper (presumably the significance level
# of the reported intervals -- TODO confirm).
training_summary = experiment.display_training_result_clf(
    pipeline=best_pipeline,
    model=best_model,
    split_data=split_data,
    alpha=0.05,
)
metric_ci_data, prediction_data = training_summary

In [None]:
# Display the first value returned by `experiment.display_training_result_clf`.
# Left as a bare expression so the notebook renders it as the cell output.
metric_ci_data

In [None]:
# Store the true labels, predicted labels and predicted scores of the test split.
#
# The test predictions are computed explicitly from `split_data["test"]` here.
# Reusing the `y_pred` / `y_pred_proba` variables left behind by the evaluation
# loop is fragile: they hold the values of the loop's *last* iteration, which
# only belong to the test split if "test" is the last key of `split_data` and
# the cells were executed in order. Otherwise the file would pair the test
# labels with predictions from a different split.
X_test, y_test = best_pipeline.transform(split_data["test"])
y_pred_test = best_model.predict(X_test)
# column 1 of the probability output, as used by the metric calculation
y_score_test = best_model.predict_proba(X_test)[:, 1]

generic_helper.dump_data(
    data={"y_true": y_test, "y_pred": y_pred_test, "y_score": y_score_test},
    path=f"{ROOT_DIR}/data",
    fname="classification_prediction_data.pkl"
)

In [None]:

# Log the fitted pipeline and model through the experiment helper, under the
# model name "classification".
fitted_artifacts = {"pipeline": best_pipeline, "model": best_model}
experiment.log_model_pipeline(**fitted_artifacts, model_name="classification")

In [None]:
# Examine how the time threshold influences the feature importance.
clf_config = model_config["classification"]  # classification section of the YAML config
importance_kwargs = dict(
    train_data=split_data["train"],
    signature_depth=clf_config["signature_depth"],
    param_grid=clf_config["param_grid"],
    problem_type="classification",
    trans_func=trans_func,
    scorer="f1",
)
threshold_feature_importance = experiment.time_threshold_effect_feature_importance(
    **importance_kwargs
)

# Write the result to the data directory so it can be plotted later.
importance_output_dir = f"{ROOT_DIR}/data"
generic_helper.dump_data(
    data=threshold_feature_importance,
    path=importance_output_dir,
    fname="classification_threshold_feature_importance.pkl",
)