In [1]:
# Imports (standard library, third-party packages and project modules)
import os

# Switch to the project root so the relative data paths used in this notebook
# resolve (presumably the `utilities` import below relies on this as well —
# TODO confirm). Set the OCPPM_ROOT environment variable to run from a
# different checkout; without it the original location is used.
os.chdir(os.environ.get("OCPPM_ROOT", "/home/tim/Development/OCPPM/"))
import pprint
import pickle
import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from utilities import evaluation_utils
from torch_geometric.data import HeteroData

In [2]:
# Experiment configuration: pick the prediction task; everything else in this
# cell is derived from that choice.
prediction_task = "regression"

# Name of the target column for each supported task.
target_name = {
    "regression": "@@object_lifecycle_duration",
    "classification": "event_ea4",
}

# Fail here, with a clear message, instead of leaving `regression` undefined
# and hitting a NameError in a later cell when the task name is misspelled.
if prediction_task not in target_name:
    raise ValueError(
        f"Unknown prediction_task {prediction_task!r}; "
        f"expected one of {sorted(target_name)}"
    )
regression = prediction_task == "regression"

# Input files, relative to the working directory.
ofg_in_file = "data/CS/feature_encodings/OFG/ofg/raw/CS_OFG.pkl"
oft_in_file = "data/CS/feature_encodings/baselines/OFT/objects_w_ea4.csv"

In [3]:
# Load the pickled graph object stored at `ofg_in_file` (annotated as a
# torch_geometric HeteroData).
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# point `ofg_in_file` at files from a trusted source.
with open(ofg_in_file, "rb") as ofg_file:
    ofg: HeteroData = pickle.load(ofg_file)

In [4]:
# Read the semicolon-separated table at `oft_in_file` into a DataFrame.
df_objects = pd.read_csv(oft_in_file, sep=";")

In [5]:
# Relative frequency of each class of the classification target, ordered by
# class label. The result is kept as the cell's final top-level expression so
# Jupyter renders it: an expression nested inside an `if` body is evaluated
# but never displayed. For regression the value is None, which shows nothing.
class_shares = (
    None
    if regression
    else df_objects[target_name[prediction_task]]
    .value_counts(normalize=True)
    .sort_index()
)
class_shares

In [6]:
# Assemble features X and target y, then hold out 30% of the rows for validation.
#   The two tasks do not share the same population: regression uses only the
#   'krs' entry of the graph, classification uses every row of the object table.
if not regression:
    target_column = target_name[prediction_task]
    X = df_objects.drop(columns=[target_column])
    y = df_objects[target_column]
else:
    krs_store = ofg["krs"]
    X = krs_store.x.numpy()
    y = krs_store.y.numpy()

# Fixed random_state keeps the split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Wrap each partition in a LightGBM Dataset (train first, then validation).
train_data, valid_data = (
    lgb.Dataset(features, label=labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

In [7]:
# Experiment settings. Besides LightGBM parameters this dict carries two
# bookkeeping entries ("num_boost_round", "stopping_rounds") so that the whole
# configuration can be reported together with the evaluation results.
params = {
    "metric": ["multi_logloss"],
    "num_boost_round": 100,
    "stopping_rounds": 100,
    "num_threads": 4,
}
if regression:
    params |= {"objective": "regression", "metric": ["mse", "mae", "mape", "rmse"]}
else:
    # NOTE(review): num_class is hard-coded to 7 — confirm it matches the
    # number of distinct values in the classification target.
    params |= {"objective": "multiclass", "num_class": 7}

# Hand the booster only the entries meant for it. "stopping_rounds" is not a
# LightGBM parameter (it is consumed by the early-stopping callback below), and
# "num_boost_round" is passed through the dedicated keyword argument instead of
# relying on the parameter alias. `params` itself is left untouched.
booster_params = {
    key: value
    for key, value in params.items()
    if key not in ("num_boost_round", "stopping_rounds")
}

# NOTE(review): the early-stopping patience equals the number of boosting
# rounds, so early stopping effectively cannot trigger with these settings.
bst = lgb.train(
    booster_params,
    train_data,
    num_boost_round=params["num_boost_round"],
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(params["stopping_rounds"])],
)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 144196, number of used features: 18
[LightGBM] [Info] Start training from score -0.039582
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.715016	valid_0's l1: 0.498823	valid_0's mape: 0.374422	valid_0's rmse: 0.845586


In [8]:
# Score both partitions with the trained booster.
train_output = bst.predict(X_train)
valid_output = bst.predict(X_valid)

if regression:
    # Regression: the booster output is used directly as the prediction.
    y_train_preds, y_valid_preds = train_output, valid_output
else:
    # Classification: each output row is passed through
    # evaluation_utils.get_preds_from_probs to obtain the prediction.
    y_train_preds, y_valid_preds = (
        np.apply_along_axis(evaluation_utils.get_preds_from_probs, axis=1, arr=output)
        for output in (train_output, valid_output)
    )

In [9]:
# Evaluate the predictions on both partitions and print the results together
# with the settings that produced them.
# TODO: store this somewhere (in JSON)
eval_train, eval_valid = (
    evaluation_utils.get_evaluation(targets, preds, regression=regression)
    for targets, preds in ((y_train, y_train_preds), (y_valid, y_valid_preds))
)

evaluation_report = {"train": eval_train, "validation": eval_valid}
experiment_settings = {"experiment_settings": params}

for section in (evaluation_report, experiment_settings):
    pprint.pprint(section)

{'train': {'report': {'MAE': 0.49466513957522246,
                      'MAPE': 1.1901358349625528,
                      'MSE': 0.7016443703397501,
                      'R^2': 0.2547469984088744}},
 'validation': {'report': {'MAE': 0.49882317212151456,
                           'MAPE': 1.2345460803823358,
                           'MSE': 0.7150163485048876,
                           'R^2': 0.24323910217070777}}}
{'experiment_settings': {'metric': ['mse', 'mae', 'mape', 'rmse'],
                         'num_boost_round': 100,
                         'num_threads': 4,
                         'objective': 'regression',
                         'stopping_rounds': 100}}
