In [1]:
import os

def go_up_n_directories(path, n):
    """Return the absolute path of ``dirname(path)`` followed by ``n`` ``..`` steps.

    ``os.path.dirname`` already strips the last component of ``path``, so for a
    path without a trailing separator the result lies ``n + 1`` levels above it.
    """
    return os.path.abspath(os.path.join(os.path.dirname(path), *([".."] * n)))


# Remember the directory the kernel started in. Anchoring on it makes this cell
# idempotent: re-running it no longer climbs further up the tree each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(go_up_n_directories(NOTEBOOK_START_DIR, 2))
os.getcwd()

'/home/tim/Development/OCPPM'

In [2]:
from typing import Any, Callable
import pickle
import re
import string
from sklearn.preprocessing import StandardScaler

# Object centric process mining
import ocpa.objects.log.importer.ocel.factory as ocel_import_factory
from ocpa.algo.predictive_monitoring.obj import Feature_Storage
from ocpa.objects.log.ocel import OCEL
import ocpa.algo.predictive_monitoring.factory as feature_factory
import ocpa.objects.log.importer.csv.factory as csv_import_factory

In [3]:
# Input event log (a count-encoded BPI17 OCEL, going by the file name).
ocel_in_file = "data/BPI17/source/BPI2017-CountEncoded.jsonocel"

# Event attributes handed to the feature extraction as `event_attributes`.
event_attributes = ["event_Action_ce", "event_EventOrigin_ce", "event_OrgResource_ce"]

# Pickle targets: the first receives the feature storage before the
# train/validation/test split, the second receives it after the split.
feature_storage_intermediate_file = (
    "data/BPI17/feature_encodings/EFG/efg/raw/BPI_[C2_P2_P3_P5_O3_Action_EventOrigin_OrgResource].fs"
)
feature_storage_out_file = (
    "data/BPI17/feature_encodings/EFG/efg/raw/BPI_split_[C2_P2_P3_P5_O3_Action_EventOrigin_OrgResource].fs"
)

In [4]:
# Load the object-centric event log from `ocel_in_file` via ocpa's OCEL importer factory.
ocel = ocel_import_factory.apply(ocel_in_file)

In [5]:
def _activity_suffix(index):
    """Map a zero-based index to 'a'..'z', then 'aa', 'ab', ... (spreadsheet-style).

    The first 26 suffixes are exactly the letters of ``string.ascii_lowercase``;
    further indices get longer suffixes, so no activity is ever left without a key.
    """
    letters = string.ascii_lowercase
    suffix = ""
    remaining = index + 1
    while remaining > 0:
        remaining, position = divmod(remaining - 1, len(letters))
        suffix = letters[position] + suffix
    return suffix


# Distinct values of the `event_activity` column of the event log.
activities = ocel.log.log["event_activity"].unique().tolist()

# Adams used just C2, D1, P2, and O3 with P3 as the target variable
event_level_features = {
    # One C2 (preceding activities) feature per activity. Keys are generated
    # instead of zipping with the 26-letter alphabet, because zip() would
    # silently drop every activity beyond the 26th.
    f"C2{_activity_suffix(idx)}": (feature_factory.EVENT_PRECEDING_ACTIVITIES, (act,))
    for idx, act in enumerate(activities)
} | {
    # "D1": (
    #     feature_factory.EVENT_AGG_PREVIOUS_CHAR_VALUES,
    #     ("event_RequestedAmount", max),
    # ), # error ('event_RequestedAmount' we have made an object attribute)
    "P2": (feature_factory.EVENT_ELAPSED_TIME, ()),
    "P3": (feature_factory.EVENT_REMAINING_TIME, ()),
    # "P4": (feature_factory.EVENT_FLOW_TIME, ()),
    "P5": (feature_factory.EVENT_SYNCHRONIZATION_TIME, ()),
    # "P6": (feature_factory.EVENT_SOJOURN_TIME, ()),
    # "P7": (feature_factory.EVENT_POOLING_TIME, ()), # error
    # "P8": (feature_factory.EVENT_LAGGING_TIME, ()), # error
    # "P9": (feature_factory.EVENT_SERVICE_TIME, ()), # error
    # "P10": (feature_factory.EVENT_WAITING_TIME, ()), # error
    "O3": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("offer",)),
}

In [6]:
# Build the feature storage from the event log: every event-based feature
# defined in `event_level_features` plus the selected event attributes.
feature_storage = feature_factory.apply(
    ocel,
    event_attributes=event_attributes,
    event_based_features=[*event_level_features.values()],
    # execution_based_features=[(feature_factory.EXECUTION_THROUGHPUT, ())]
)

100%|██████████| 31509/31509 [01:17<00:00, 408.15it/s]


In [None]:
# TODO: Remove process executions that contain only one event.
# Open question: decide whether to do this here, in OCPA, or in the raw OCEL.

In [7]:
# Pickle the feature storage (before the train/validation/test split) to the
# intermediate file. Create the target directory first so that open() does not
# fail when the directory is missing, e.g. on a fresh checkout.
intermediate_dir = os.path.dirname(feature_storage_intermediate_file)
if intermediate_dir:
    os.makedirs(intermediate_dir, exist_ok=True)
with open(feature_storage_intermediate_file, "wb") as binary:
    pickle.dump(feature_storage, binary)

In [8]:
# Generate the train-validation-test split (with the same split as Adams)
# but do not normalize the features here
# NOTE(review): the call below passes a scaler and exempts no features from
# scaling, which looks at odds with "do not normalize" -- confirm against the
# Feature_Storage implementation.
feature_storage.extract_normalized_train_test_split(
    test_size=0.3,
    validation_size=0.7 * 0.2,  # about 0.14; presumably 20% of the 70% left after the test split
    scaler=StandardScaler,
    scaling_exempt_features=[],
    state=42,  # fixed seed
)



In [9]:
# Pickle the feature storage again, now that extract_normalized_train_test_split
# has been called on it. Create the target directory first so that open() does
# not fail when the directory is missing, e.g. on a fresh checkout.
out_dir = os.path.dirname(feature_storage_out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
with open(feature_storage_out_file, "wb") as binary:
    pickle.dump(feature_storage, binary)