In [1]:
from sklearn.preprocessing import StandardScaler
import logging
import re
import pickle
from typing import Any
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import numpy as np
import pandas as pd
import pm4py
import pm4py.ocel
import pm4py.read
import torch
from pm4py.algo.transformation.ocel.features.objects import (
    algorithm as object_feature_factory,
)

ocel_file = "../../../data/BPI17/source/BPI2017-Final.jsonocel"
RANDOM_SEED = 42
use_cache = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if use_cache:
    application_features = pd.read_csv('../../../data/BPI17/feature_encodings/OFT/application_features.csv')
else:
    # load OCEL
    ocel = pm4py.read.read_ocel(ocel_file)

    # encode boolean variables
    ocel.objects["event_Accepted"] = ocel.objects["event_Accepted"].replace(
        {True: 1, False: 0}
    )
    ocel.objects["event_Selected"] = ocel.objects["event_Selected"].replace(
        {True: 1, False: 0}
    )
    ocel.objects = ocel.objects.reset_index().rename(columns={"index": "object_index"})

    # define object attributes per object type
    application_attributes = {
        "str": [
            "event_LoanGoal",
            "event_ApplicationType",
        ],
        "num": [
            "event_RequestedAmount",
        ],
    }
    offer_attributes = {
        "str": [],
        "num": [
            "event_NumberOfTerms",
            "event_Accepted",
            "event_Selected",
            "event_OfferedAmount",
            "event_CreditScore",
            "event_FirstWithdrawalAmount",
            "event_MonthlyCost",
        ],
    }
    # create object-level feature matrix
    data, feature_names = object_feature_factory.apply(
        ocel,
        parameters={
            "str_obj_attr": ["ocel:type"]
            + application_attributes["str"]
            + offer_attributes["str"],
            "num_obj_attr": ["object_index"]  # include object_index for reference
            + application_attributes["num"]
            + offer_attributes["num"],
        },
    )
    # make pd.DataFrame from feature matrix
    object_features = pd.DataFrame(data, columns=feature_names)
    # NORMALIZE "@@object_lifecycle_duration" (JUST FOR TESTING)
    object_features.iloc[:, 1:2] = StandardScaler().fit_transform(
        object_features.iloc[:, 1:2]
    )
    # Split object feature matrix into one feature matrix per object type
    offer_features = object_features[
        object_features["@@object_attr_value_ocel:type_offer"] == 1
    ]
    application_features = object_features[
        object_features["@@object_attr_value_ocel:type_application"] == 1
    ]

    # clean application features
    flatten = lambda l: [item for sublist in l for item in sublist]
    # select used columns/features
    application_attribute_feature_idxs = flatten(
        [
            np.where(application_features.columns.str.contains(attr_name))[0]
            for attr_name in application_attributes["str"]
            + application_attributes["num"]
            + ["object_lifecycle_duration"]
        ]
    )
    application_features = application_features.iloc[:, application_attribute_feature_idxs]
    # strip JSON special characters from feature names, as they are now supported in LightGBM
    application_features = application_features.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    application_features.to_csv('../../../data/BPI17/feature_encodings/OFT/application_features.csv', index=False)


application_features.head()

Unnamed: 0,object_attr_value_event_LoanGoal_Debtrestructuring,object_attr_value_event_LoanGoal_Homeimprovement,object_attr_value_event_LoanGoal_Unknown,object_attr_value_event_LoanGoal_Existingloantakeover,object_attr_value_event_LoanGoal_Remainingdebthome,object_attr_value_event_LoanGoal_Notspeficied,object_attr_value_event_LoanGoal_Taxpayments,object_attr_value_event_LoanGoal_Otherseeexplanation,object_attr_value_event_LoanGoal_Extraspendinglimit,object_attr_value_event_LoanGoal_Car,object_attr_value_event_LoanGoal_Motorcycle,object_attr_value_event_LoanGoal_Businessgoal,object_attr_value_event_LoanGoal_Boat,object_attr_value_event_LoanGoal_CaravanCamper,object_attr_value_event_ApplicationType_Limitraise,object_attr_value_event_ApplicationType_Newcredit,event_num_event_RequestedAmount,object_lifecycle_duration
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,20000.0,-0.558083
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,10000.0,-1.107298
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,15000.0,-0.591179
7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,5000.0,0.502681
10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,35000.0,0.87026


In [3]:
# make train test split
X, y = application_features.drop('object_lifecycle_duration', axis=1), application_features.loc[:,"object_lifecycle_duration"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

In [17]:
params = {
    "objective": "regression",
    "metric": ["mse", "mae", "mape", "rmse"],
}
bst = lgb.train(
    params,
    train_data,
    # num_boost_round=5000,
    valid_sets=[valid_data],
    # callbacks=[lgb.early_stopping(50)],
)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 25207, number of used features: 16
[LightGBM] [Info] Start training from score 0.106434
[1]	valid_0's l2: 1.00623	valid_0's l1: 0.823543	valid_0's mape: 0.70787	valid_0's rmse: 1.00311
[2]	valid_0's l2: 1.0009	valid_0's l1: 0.820841	valid_0's mape: 0.705504	valid_0's rmse: 1.00045
[3]	valid_0's l2: 0.996449	valid_0's l1: 0.818524	valid_0's mape: 0.703514	valid_0's rmse: 0.998223
[4]	valid_0's l2: 0.99314	valid_0's l1: 0.816603	valid_0's mape: 0.70186	valid_0's rmse: 0.996564
[5]	valid_0's l2: 0.990333	valid_0's l1: 0.814905	valid_0's mape: 0.700415	valid_0's rmse: 0.995155
[6]	valid_0's l2: 0.988175	valid_0's l1: 0.813501	valid_0's mape: 0.699221	valid_0's rmse: 0.99407
[7]	valid_0's l2: 0.986367	valid_0's l1: 0.81225	valid_0's mape: 0.69817	valid_0's rmse: 0.99316
[8]	valid_0

In [18]:
y_train_preds = bst.predict(X_train)
y_valid_preds = bst.predict(X_valid)

In [19]:
train_mse_loss = mean_squared_error(y_train, y_train_preds)
valid_mse_loss = mean_squared_error(y_valid, y_valid_preds)

print(f"Training loss (MSE): {train_mse_loss}")
print(f"Validation loss (MSE): {valid_mse_loss}")

Training loss (MSE): 0.9399466044645309
Validation loss (MSE): 0.9834069699259738
