In [1]:
import os

def go_up_n_directories(path, n):
    """Return the absolute path found ``n`` levels above ``dirname(path)``.

    The last component of ``path`` is stripped with ``os.path.dirname`` first
    and ``n`` ``".."`` segments are appended afterwards, so the result lies
    ``n + 1`` levels above the final component of ``path``.

    Args:
        path: Starting path (for example the current working directory).
        n: Number of ``".."`` segments appended after ``dirname(path)``.

    Returns:
        The normalised absolute path as a string.
    """
    segments = [os.path.dirname(path)] + [".."] * n
    return os.path.abspath(os.path.join(*segments))
# Switch the working directory so the relative "data/..." and "models/..." paths
# used by the later cells resolve (presumably this is the repository root - confirm).
# The directory the kernel started in is remembered on the first execution, which
# makes this cell idempotent: re-running it no longer climbs further up each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(go_up_n_directories(_NOTEBOOK_START_DIR, 4))

In [2]:
# Standard-library, third-party and project imports
import lightgbm as lgb
import pprint
from ocpa.algo.predictive_monitoring.obj import Feature_Storage
from ocpa.algo.predictive_monitoring import tabular
import pickle
import time
import os
import json
from datetime import datetime
import re
import utilities.evaluation_utils as evaluation_utils

In [3]:
# Column that is split off as the regression label further down.
target = "event_remaining_time"

# Pickled feature storage that the next cell loads.
feature_storage_file = "data/OTC/feature_encodings/EFG/efg/raw/OTC_split_[C2_P2_P3_O3_eas].fs"

# Directory that receives experiment_settings.json and evaluation_report.json.
model_output_path = "models/OTC/baselines/eft"

In [4]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load feature storages that come from a trusted source.
# The handle is not called `bin` so the builtin bin() stays usable in later cells.
with open(feature_storage_file, "rb") as fs_handle:
    feature_storage: Feature_Storage = pickle.load(fs_handle)

In [5]:
# Paths of the EFT train/test CSV files.
# NOTE(review): neither name is read again in the visible cells - confirm they are still needed.
train_eft_in_file, test_eft_in_file = (
    "data/OTC/feature_encodings/baselines/EFT/eft_train.csv",
    "data/OTC/feature_encodings/baselines/EFT/eft_test.csv",
)

In [6]:
# Flatten the EFG into tables, reusing the split stored on the feature storage:
# the training table is built from train_indices + validation_indices
# (presumably a list concatenation - confirm), the test table from test_indices.
train_and_validation_indices = (
    feature_storage.train_indices + feature_storage.validation_indices
)
eft_train = tabular.construct_table(feature_storage, train_and_validation_indices)
eft_test = tabular.construct_table(feature_storage, feature_storage.test_indices)

# Column names may contain JSON special characters, which LightGBM does not
# support in feature names; everything outside [A-Za-z0-9_] is stripped.
_UNSUPPORTED_COLUMN_CHARS = re.compile("[^A-Za-z0-9_]+")


def rename_columns(col_name):
    """Return ``str(col_name)`` with every character outside ``[A-Za-z0-9_]`` removed.

    Args:
        col_name: Original column label; non-string labels are converted with ``str``.

    Returns:
        The sanitised column name as a string.
    """
    return _UNSUPPORTED_COLUMN_CHARS.sub("", str(col_name))
# Run both tables through the same sanitiser so their column names stay aligned.
eft_train, eft_test = (
    table.rename(columns=rename_columns) for table in (eft_train, eft_test)
)

In [7]:
# Split each table into a feature frame and the label column, then wrap both
# for LightGBM.
# NOTE: the "valid" names hold the *test* split - they are built from eft_test.
X_train = eft_train.drop(columns=[target])
y_train = eft_train.loc[:, target]
X_valid = eft_test.drop(columns=[target])
y_valid = eft_test.loc[:, target]

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

In [8]:
# Full experiment configuration; kept in one dict because it is dumped verbatim
# into experiment_settings.json later on.
params = {
    "objective": "regression",
    "metric": ["mse", "mae", "mape", "rmse"],
    "num_boost_round": 100,
    "stopping_rounds": 100,
}

# These two keys steer the training loop rather than the booster:
# "stopping_rounds" is not a LightGBM parameter name and "num_boost_round" is a
# keyword of lgb.train itself, so both are kept out of the booster params.
training_loop_keys = ("num_boost_round", "stopping_rounds")
booster_params = {
    key: value for key, value in params.items() if key not in training_loop_keys
}

# NOTE(review): valid_data wraps the test split, so the best iteration chosen by
# the early-stopping callback is selected on the data later reported as "Test".
# perf_counter is monotonic, which makes it the appropriate clock for durations.
start_train_time = time.perf_counter()
bst = lgb.train(
    booster_params,
    train_data,
    num_boost_round=params["num_boost_round"],
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(params["stopping_rounds"])],
)
total_train_time = time.perf_counter() - start_train_time



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1881
[LightGBM] [Info] Number of data points in the train set: 402499, number of used features: 20
[LightGBM] [Info] Start training from score -0.009147
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.598033	valid_0's l1: 0.605958	valid_0's mape: 0.515541	valid_0's rmse: 0.773326


In [9]:
# Time inference separately for each split. perf_counter is a monotonic,
# high-resolution clock, so sub-second intervals are not distorted by
# wall-clock adjustments the way time.time() differences can be.
pred_start_time = time.perf_counter()
y_train_preds = bst.predict(X_train)
train_pred_time = time.perf_counter() - pred_start_time

pred_start_time = time.perf_counter()
y_valid_preds = bst.predict(X_valid)
valid_pred_time = time.perf_counter() - pred_start_time

In [10]:
# Score the training split; `time` receives the matching inference duration.
eval_train = evaluation_utils.get_evaluation(
    y_train,
    y_train_preds,
    regression=True,
    time=train_pred_time,
)
# The training duration is attached to the train report only.
eval_train["report"]["training_time"] = total_train_time

# Score the held-out split (the "valid" variables hold the test data).
eval_valid = evaluation_utils.get_evaluation(
    y_valid,
    y_valid_preds,
    regression=True,
    time=valid_pred_time,
)

# Bundle the metrics and the configuration for the JSON dumps below.
evaluation_report = {"Train": eval_train, "Test": eval_valid}
experiment_settings = {"experiment_settings": params}

In [11]:
# Persist the settings and the metrics as JSON inside model_output_path.
# The directory is created first: open(..., "w") raises FileNotFoundError when a
# parent directory is missing, e.g. on a fresh checkout.
os.makedirs(model_output_path, exist_ok=True)
for file_name, payload in (
    ("experiment_settings.json", experiment_settings),
    ("evaluation_report.json", evaluation_report),
):
    with open(os.path.join(model_output_path, file_name), "w") as fp:
        json.dump(payload, fp, indent=2)

pprint.pprint(evaluation_report)

{'Test': {'report': {'MAE': 0.6059582868587509,
                     'MAPE': 5.220105592797986,
                     'MSE': 0.5980334017444133,
                     'R^2': 0.31326232592251935,
                     'prediction_time': 0.12180972099304199}},
 'Train': {'report': {'MAE': 0.5422100656066814,
                      'MAPE': 5.667319388722915,
                      'MSE': 0.5021291797475632,
                      'R^2': 0.4829246075311774,
                      'prediction_time': 0.5992763042449951,
                      'training_time': 1.9556193351745605}}}
