In [1]:
import os

def go_up_n_directories(path, n):
    """Return the absolute path ``n`` levels above the parent of ``path``.

    ``os.path.dirname(path)`` already strips the last component of ``path``
    (when ``path`` has no trailing separator), and ``n`` additional ``".."``
    segments are appended on top of that. For example, ``("/a/b/c/d", 2)``
    yields ``"/a"`` and ``("/a/b/c/d", 0)`` yields ``"/a/b/c"``.

    Args:
        path: Starting path.
        n: Number of ``".."`` segments to append after taking the parent.

    Returns:
        The normalised absolute path.
    """
    return os.path.abspath(os.path.join(os.path.dirname(path), *[".."] * n))


# Remember the directory the kernel started in, so that re-running this cell
# resolves the same target instead of climbing further up on every execution.
if "_KERNEL_START_DIR" not in globals():
    _KERNEL_START_DIR = os.getcwd()

os.chdir(go_up_n_directories(_KERNEL_START_DIR, 2))
os.getcwd()

'/home/tim/Development/OCPPM'

In [2]:
from typing import Any, Callable
import pickle
import re
import string
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Object centric process mining
import ocpa.objects.log.importer.ocel.factory as ocel_import_factory
from ocpa.algo.predictive_monitoring.obj import Feature_Storage
from ocpa.objects.log.ocel import OCEL
import ocpa.algo.predictive_monitoring.factory as feature_factory
from ocpa.algo.predictive_monitoring import tabular
import ocpa.objects.log.importer.csv.factory as csv_import_factory

In [3]:
# Per-object-type metadata; every type currently lists only "encoded_oid".
# The keys are also used further down as the object types for the O3 features.
objects_metadata = {
    object_type: ["encoded_oid"] for object_type in ("item", "order", "package")
}

# Event attribute names; they are prefixed with "event_" before being passed
# to the feature extraction.
event_attributes = [
    "weight",
    "price",
    "age",
    "bankaccount",
]

# Input log, the cached (unsplit) feature storage, and the final split output.
ocel_in_file = "data/OTC/source/OTC.jsonocel"
feature_storage_intermediate_file = "data/OTC/feature_encodings/EFG/efg/raw/OTC_[C2_P2_P3_O3_eas].fs"
feature_storage_out_file = "data/OTC/feature_encodings/EFG/efg/raw/OTC_split_[C2_P2_P3_O3_eas].fs"

# When True, the feature storage is loaded from the intermediate file
# instead of being recomputed.
CACHE = True

In [4]:
# Load the object-centric event log; process executions are extracted with
# the "leading_type" strategy, using "item" as the leading object type.
ocel_import_parameters = {
    "execution_extraction": "leading_type",
    "leading_type": "item",
}
ocel = ocel_import_factory.apply(ocel_in_file, parameters=ocel_import_parameters)

In [5]:
def _letter_keyed_features(prefix, feature_function, arguments):
    """Build ``{prefix + letter: (feature_function, (argument,))}`` per argument.

    Letters are taken from ``string.ascii_lowercase`` in order, so at most 26
    arguments can be keyed.

    Args:
        prefix: Key prefix shared by all generated entries (e.g. ``"C2"``).
        feature_function: Feature identifier stored as the first tuple element.
        arguments: Iterable of values; each becomes a one-element argument tuple.

    Returns:
        A dict mapping ``f"{prefix}{letter}"`` to ``(feature_function, (argument,))``.

    Raises:
        ValueError: If there are more arguments than letters. ``zip`` would
            otherwise silently drop the surplus features.
    """
    arguments = list(arguments)
    if len(arguments) > len(string.ascii_lowercase):
        raise ValueError(
            f"Cannot assign single-letter suffixes to {len(arguments)} '{prefix}' "
            f"features; only {len(string.ascii_lowercase)} letters are available."
        )
    return {
        f"{prefix}{letter}": (feature_function, (argument,))
        for argument, letter in zip(arguments, string.ascii_lowercase)
    }


# Distinct activity names found in the log's "event_activity" column.
all_acts = ocel.log.log["event_activity"].unique()

# Feature key -> (feature function, argument tuple). Only the values are passed
# to the feature extraction; the keys serve as short labels.
event_level_features = (
    # One preceding-activities feature per activity: C2a, C2b, ...
    _letter_keyed_features("C2", feature_factory.EVENT_PRECEDING_ACTIVITIES, all_acts)
    | {
        "P2": (feature_factory.EVENT_ELAPSED_TIME, ()),
        "P3": (feature_factory.EVENT_REMAINING_TIME, ()),
        # "P4": (feature_factory.EVENT_FLOW_TIME, ()),
        "P5": (feature_factory.EVENT_SYNCHRONIZATION_TIME, ()),
        # "P6": (feature_factory.EVENT_SOJOURN_TIME, ()),
    }
    # One previous-type-count feature per object type: O3a, O3b, ...
    | _letter_keyed_features(
        "O3", feature_factory.EVENT_PREVIOUS_TYPE_COUNT, objects_metadata
    )
)

In [6]:
# Reuse the pickled feature storage only when caching is enabled AND the cache
# file is actually present; otherwise compute it from the log and write the
# cache, so a fresh checkout does not fail with FileNotFoundError.
if CACHE and os.path.isfile(feature_storage_intermediate_file):
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load caches produced by this notebook or a trusted source.
    with open(feature_storage_intermediate_file, "rb") as cache_file:
        feature_storage = pickle.load(cache_file)
else:
    # Author's timing notes: this takes much too long when 'customers' is an
    # object type (10448 minutes / 7.25 days at 0% progress); the final
    # processing time was 2:03:05 (H:m:s).
    feature_storage = feature_factory.apply(
        ocel,
        event_based_features=list(event_level_features.values()),
        event_attributes=[f"event_{ea}" for ea in event_attributes],
        # workers=2,
    )

    # Make sure the target directory exists before writing the cache.
    cache_dir = os.path.dirname(feature_storage_intermediate_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    # The handle is deliberately not called `bin`, which would shadow the
    # builtin for the rest of the kernel session.
    with open(feature_storage_intermediate_file, "wb") as cache_file:
        pickle.dump(feature_storage, cache_file)

Applying feature extraction to process executions


100%|██████████| 8159/8159 [1:17:03<00:00,  1.76it/s]  


In [7]:
# Split the feature storage into train / validation / test parts and scale it.
# NOTE(review): the return value is discarded and the same `feature_storage`
# object is pickled afterwards, so this presumably mutates it in place. Confirm.
split_options = {
    "scaler": StandardScaler,
    "scaling_exempt_features": [],  # no feature is excluded from scaling
    # "category_encoder": ce.CountEncoder,  (left disabled)
    "test_size": 0.15,
    "validation_size": 0.15,
    "state": 42,
}
feature_storage.extract_normalized_train_test_split(**split_options)



In [8]:
# Persist the split feature storage. The handle is deliberately not called
# `bin`, which would shadow the builtin `bin()` in the notebook namespace.
with open(feature_storage_out_file, "wb") as out_file:
    pickle.dump(feature_storage, out_file)