In [None]:
from typing import Any, Callable
import pickle
import re
import string
from sklearn.preprocessing import StandardScaler

# Object centric process mining
import ocpa.objects.log.importer.ocel.factory as ocel_import_factory
from ocpa.algo.predictive_monitoring.obj import Feature_Storage
from ocpa.objects.log.ocel import OCEL
import ocpa.algo.predictive_monitoring.factory as feature_factory
from ocpa.algo.predictive_monitoring import tabular
import ocpa.objects.log.importer.csv.factory as csv_import_factory

In [None]:
# --- Configuration: input log, event attributes and output locations ---------

# Event attributes requested from the log (prefixed with "event_" at extraction time).
event_attributes = ["ea2", "ea4", "resource_ce", "resource_multi"]

# Source event log that is imported below.
ocel_in_file = (
    "../../data/CS/source/cs_january_ots[krs_krv_cv]_eas[ea2_ea4_resource_ce_resource_multi].jsonocel"
)

# Pickle target for the feature storage *before* the train/test split is extracted.
feature_storage_intermediate_file = (
    "../../data/CS/feature_encodings/EFG/efg/raw/CS_[P2_P3_O3_ea2_ea4_resource_robot_person_multi].fs"
)

# Pickle target for the feature storage *after* the train/test split is extracted.
feature_storage_out_file = (
    "../../data/CS/feature_encodings/EFG/efg/raw/CS_split_[P2_P3_O3_ea2_ea4_resource_robot_person_multi].fs"
)

In [None]:
# Import the object-centric event log stored at `ocel_in_file` through the
# OCEL import factory; every later cell works on the resulting `ocel` object.
ocel = ocel_import_factory.apply(ocel_in_file)

In [None]:
# Share of each activity among all events, expressed in percent.
all_acts = ocel.log.log["event_activity"].value_counts(normalize=True) * 100
# Only activities that account for more than 1% of the events get their own feature.
most_occuring_acts = all_acts[all_acts > 1].index

# Suffixes used to name the per-activity "C2" features: a..z, then aa..zz.
# Zipping against `string.ascii_lowercase` alone would silently drop every
# activity after the 26th, although up to 99 activities can each exceed 1%.
activity_suffixes = [
    *string.ascii_lowercase,
    *(
        first + second
        for first in string.ascii_lowercase
        for second in string.ascii_lowercase
    ),
]
if len(most_occuring_acts) > len(activity_suffixes):
    # Only reachable if the frequency threshold above is lowered drastically.
    raise ValueError(
        f"{len(most_occuring_acts)} frequent activities, but only "
        f"{len(activity_suffixes)} feature suffixes are available"
    )

# Maps a short feature label to a `(feature function, parameters)` pair; only
# the values are handed to the feature factory, the labels are for readability.
event_level_features = {
    # One "preceding activities" feature per frequent activity.
    f"C2{suffix}": (feature_factory.EVENT_PRECEDING_ACTIVITIES, (act,))
    for act, suffix in zip(most_occuring_acts, activity_suffixes)
} | {
    "P2": (feature_factory.EVENT_ELAPSED_TIME, ()),
    # "P3": (feature_factory.EVENT_REMAINING_TIME, ()), # only use for target (but we use 'ea4' now)
    # "P4": (feature_factory.EVENT_FLOW_TIME, ()),
    "P5": (feature_factory.EVENT_SYNCHRONIZATION_TIME, ()),
    # "P6": (feature_factory.EVENT_SOJOURN_TIME, ()),
    # One "previous type count" feature per object type (krs, krv, cv).
    "O3a": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("krs",)),
    "O3b": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("krv",)),
    "O3c": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("cv",)),
}

In [None]:
# The factory only needs the (function, parameters) pairs, not the short labels.
event_feature_specs = list(event_level_features.values())
# Attribute names are passed with an "event_" prefix.
prefixed_event_attributes = [f"event_{ea}" for ea in event_attributes]

# NOTE: takes much too long when resource (ot2) is an object type.
# A `workers=2` argument was tried here and is currently left out.
feature_storage = feature_factory.apply(
    ocel,
    event_based_features=event_feature_specs,
    event_attributes=prefixed_event_attributes,
)

In [None]:
# Persist the freshly extracted feature storage (before any split is extracted).
# The handle is deliberately not called `bin`: in a notebook that name would
# rebind the builtin `bin()` for every cell executed afterwards.
with open(feature_storage_intermediate_file, "wb") as intermediate_fs_file:
    pickle.dump(feature_storage, intermediate_fs_file)

In [None]:
# Options for the normalized train/validation/test split.
split_options = {
    "test_size": 0.3,
    "validation_size": 0.2,
    "state": 42,  # fixed value so the split is repeatable — presumably a random seed; confirm in ocpa
    "scaler": StandardScaler,  # the scaler class itself is passed, not an instance
    "scaling_exempt_features": ["event_ea4"],  # 'ea4' is used as the target and is exempt from scaling
}
# The return value is not used; the storage object is pickled again afterwards,
# so the split is presumably stored on `feature_storage` itself — TODO confirm.
feature_storage.extract_normalized_train_test_split(**split_options)

In [None]:
# Persist the feature storage again, now that the train/test split was extracted.
# The handle is deliberately not called `bin`: in a notebook that name would
# rebind the builtin `bin()` for every cell executed afterwards.
with open(feature_storage_out_file, "wb") as split_fs_file:
    pickle.dump(feature_storage, split_fs_file)