In [1]:
# Python natives
import os

# Switch to the project root so the relative data path in the config section
# resolves (and, presumably, so the project-local `utilities` import further
# down is found -- TODO confirm).
# The root can be overridden through the OCPPM_ROOT environment variable so the
# notebook is not tied to one machine; when unset, the original path is used.
os.chdir(os.environ.get("OCPPM_ROOT", "/home/tim/Development/OCPPM/"))
import pickle
import re
from typing import Any
import pprint

# Data handling
import pandas as pd
import numpy as np
from ocpa.algo.predictive_monitoring.obj import Feature_Storage
from ocpa.algo.predictive_monitoring import tabular

# Booster model
import lightgbm as lgb

# Processing tools and evaluation metrics
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler
import utilities.evaluation_utils as evaluation_utils

# Config
# Location of the feature storage file (a relative path, so it is resolved
# against the current working directory).
feature_storage_in_file = "data/BPI17/feature_encodings/EFG/efg/raw/BPI_split_[C2_P2_P3_P5_O3_Action_EventOrigin_OrgResource].fs"
# Name of the column that is split off from the feature table and used as the
# label for the regression model.
target = "event_remaining_time"

In [None]:
# Load the pickled Feature_Storage from the path set in the config section.
# The config variable is named `feature_storage_in_file`; using any other name
# here raises a NameError on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open(feature_storage_in_file, "rb") as fs_file:
    feature_storage: Feature_Storage = pickle.load(fs_file)

In [3]:
# Flatten the EFG into tables using the split stored in the feature storage;
# the train and validation indices are concatenated into one training table.
train_indices = feature_storage.train_indices + feature_storage.validation_indices
eft_train = tabular.construct_table(feature_storage, train_indices)
eft_test = tabular.construct_table(feature_storage, feature_storage.test_indices)


def rename_columns(col_name):
    """Return the column name reduced to letters, digits and underscores.

    LightGBM does not support JSON special characters in feature names, so
    every other character is removed.
    """
    return re.sub("[^A-Za-z0-9_]+", "", str(col_name))


eft_train = eft_train.rename(columns=rename_columns)
eft_test = eft_test.rename(columns=rename_columns)

Unnamed: 0,event_preceding_activitiesCreateapplication,event_preceding_activitiesSubmit,event_preceding_activitiesHandleleads,event_preceding_activitiesAccept,event_preceding_activitiesCreateoffer,event_preceding_activitiesSendonline,event_preceding_activitiesComplete,event_preceding_activitiesCall,event_preceding_activitiesCanceloffer,event_preceding_activitiesSendmailandonline,...,event_preceding_activitiesAssesspotentialfraud,event_preceding_activitiesPersonalloancollection,event_preceding_activitiesShortencompletion,event_elapsed_time,event_remaining_time,event_synchronization_time,event_previous_type_countoffer,event_Action_ce,event_EventOrigin_ce,event_OrgResource_ce
0,-0.294856,-0.233835,-0.097731,-0.294856,1.893904,-0.069213,-0.286365,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.705739,8.671991,-0.163154,1.059050,0.811704,1.189447,-0.397887
1,-0.294856,-0.233835,-0.097731,-0.294856,4.312149,-0.069213,-0.286365,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.705730,8.671984,-0.162797,1.059050,0.811704,1.189447,-0.397887
2,-0.294856,-0.233835,-0.097731,-0.294856,1.893904,-0.069213,-0.286365,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.705706,8.671963,-0.163154,1.059050,0.811704,1.189447,-0.397887
3,-0.294856,-0.233835,-0.097731,-0.294856,-0.524342,-0.069213,3.492043,-0.399320,-0.134381,2.931856,...,-0.025798,-0.00276,-0.013799,-0.705706,8.671963,-0.163154,1.059050,-0.925827,-0.317357,-0.397887
4,-0.294856,-0.233835,-0.097731,-0.294856,-0.524342,-0.069213,-0.286365,2.503617,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,1.375277,6.872730,-0.163154,2.171154,0.811704,1.189447,-0.317180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393926,3.391486,-0.233835,-0.097731,-0.294856,-0.524342,-0.069213,-0.286365,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.707109,-0.624823,-0.163154,-1.165159,-0.925827,-0.317357,-0.276954
393927,-0.294856,-0.233835,-0.097731,-0.294856,-0.524342,-0.069213,3.492043,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.285802,-0.989088,-0.163154,-1.165159,0.811704,-1.207974,-0.466470
393928,-0.294856,-0.233835,-0.097731,3.391486,-0.524342,-0.069213,-0.286365,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.285615,-0.989250,-0.163154,-0.053054,0.811704,1.189447,-0.466470
393929,-0.294856,-0.233835,-0.097731,-0.294856,1.893904,-0.069213,-0.286365,-0.399320,-0.134381,-0.328234,...,-0.025798,-0.00276,-0.013799,-0.285604,-0.989259,-0.163154,-0.053054,0.811704,1.189447,-0.466470


In [4]:
# Separate the feature columns from the target column for each table.
# Note that the "valid" variables are built from the test table (eft_test).
X_train = eft_train.drop(columns=[target])
y_train = eft_train.loc[:, target]
X_valid = eft_test.drop(columns=[target])
y_valid = eft_test.loc[:, target]

# Wrap both splits as LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

In [5]:
# Training configuration: regression objective, evaluated with four metrics.
params = dict(
    objective="regression",
    metric=["mse", "mae", "mape", "rmse"],
    num_boost_round=100,
    stopping_rounds=100,
)

# NOTE(review): the early-stopping patience equals the number of boosting
# rounds, so early stopping looks unable to end training early -- confirm
# this is intended.
early_stopping_cb = lgb.early_stopping(params["stopping_rounds"])

# Fit the booster, evaluating on the held-out dataset after each round.
bst = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    callbacks=[early_stopping_cb],
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 750
[LightGBM] [Info] Number of data points in the train set: 275751, number of used features: 26
[LightGBM] [Info] Start training from score -0.000684
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.577542	valid_0's l1: 0.530389	valid_0's mape: 0.445374	valid_0's rmse: 0.759962


In [6]:
# Predict with the trained booster on the training features first, then on the
# held-out features.
y_train_preds, y_valid_preds = (
    bst.predict(features) for features in (X_train, X_valid)
)

In [7]:
# Score the predictions of both splits as a regression task.
eval_train = evaluation_utils.get_evaluation(y_train, y_train_preds, regression=True)
eval_valid = evaluation_utils.get_evaluation(y_valid, y_valid_preds, regression=True)

# Merge the training settings into the report so the printed metrics are shown
# together with the configuration that produced them (previously the settings
# dict was built but never used).
experiment_settings = {"experiment_settings": params}
evaluation_report = {
    **experiment_settings,
    "train": eval_train,
    "validation": eval_valid,
}
pprint.pprint(evaluation_report)

{'params': {'metric': ['mse', 'mae', 'mape', 'rmse'],
            'num_boost_round': 100,
            'objective': 'regression',
            'stopping_rounds': 100},
 'train': {'MAE': 0.5273921926178372,
           'MAPE': 11.768914625621097,
           'MSE': 0.5704918368991474,
           'R^2': 0.4283829477098976},
 'valid': {'MAE': 0.5303893083557005,
           'MAPE': 13.075442651352738,
           'MSE': 0.5775418072947429,
           'R^2': 0.42509667342793367}}
