In [1]:
import os

def go_up_n_directories(path, n):
    """Return the absolute path reached by climbing ``n`` levels above ``dirname(path)``.

    The last component of ``path`` is stripped first (``os.path.dirname``), so
    the result lies ``n + 1`` levels above ``path`` itself.

    Args:
        path: Starting path.
        n: Number of ``..`` steps applied after taking the dirname.

    Returns:
        The normalized absolute path as a string.
    """
    return os.path.abspath(os.path.join(os.path.dirname(path), *([".."] * n)))


# Remember the directory the kernel started in, so that re-running this cell
# always lands in the same place instead of climbing further up every time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(go_up_n_directories(NOTEBOOK_START_DIR, 2))

In [2]:
from typing import Any, Callable, Union
import pickle
import re
import string
from sklearn.preprocessing import StandardScaler

# Object centric process mining
import ocpa.objects.log.importer.ocel.factory as ocel_import_factory
from ocpa.algo.predictive_monitoring.obj import Feature_Storage
from ocpa.objects.log.ocel import OCEL
import ocpa.algo.predictive_monitoring.factory as feature_factory
from ocpa.algo.predictive_monitoring import tabular
import ocpa.objects.log.importer.csv.factory as csv_import_factory

# Custom local imports
import utilities.ocel_import_utils as ocel_import_utils

In [3]:
# Alternative source log (not restricted to "only_linked"; its file name additionally lists oa9):
# ocel_in_file = "data/CS/source/cs_january_ots[krs_krv_cv]_oas[oa1_oa2_oa3_oa4_oa5_oa6_oa7_oa8_oa9_oa10_oa11_oa12_oa13_oa15_oa16]_eas[resourceCE_resourceMulti_ea1_ea2_ea3_ea4_ea6_ea8_ea10_ea12_ea14].jsonocel"
ocel_in_file = "data/CS/source/cs_january_only_linked_ots[krs_krv_cv]_oas[oa1_oa2_oa3_oa4_oa5_oa6_oa7_oa8_oa10_oa11_oa12_oa13_oa15_oa16]_eas[resourceCE_resourceMulti_ea1_ea2_ea3_ea4_ea6_ea8_ea10_ea12_ea14].jsonocel"

# The file name carries `ots[...]`, `oas[...]` and `eas[...]` segments; the helper is
# queried once per key (presumably returning the names listed inside the brackets -- TODO confirm).
event_attributes, object_attributes, object_types = (
    ocel_import_utils.extract_values_from_file_string(ocel_in_file, key=segment)
    for segment in ("eas", "oas", "ots")
)

# Pickle targets: the storage before splitting, and the split storage for EFG and HOEG.
feature_storage_intermediate_file = "data/CS/feature_encodings/EFG/efg/raw/CS_[C2_P2_P3_O3_eas].fs"
feature_storage_out_file = "data/CS/feature_encodings/EFG/efg/raw/CS_split_[C2_P2_P3_O3_eas].fs"
hoeg_feature_storage_out_file = "data/CS/feature_encodings/HOEG/hoeg/raw/CS_split_[C2_P2_P3_O3_eas].fs"

In [8]:
# Event features listed as relevant (source noted by the author: "from EFT shap").
relevant_event_features = [
    "event_elapsed_time", "event_ea6", "event_ea14", "event_ea3",
    "event_previous_type_countkrv",
    "event_ea1", "event_ea2", "event_ea12", "event_ea10", "event_ea8",
    "event_resourceCE",
    "event_preceding_activitiesFinishTask",
    "event_preceding_activitiesSaveObject",
    "event_previous_type_countcv",
    "event_resourceMulti",
    "event_preceding_activitiesPerformAction",
    "event_previous_type_countkrs",
]

# We only keep the top 3 most relevant event attributes.
# Left out on purpose: ea1, ea2, ea4, ea8, ea10, ea12, resourceCE.
event_attributes = ["ea3", "ea6", "ea14"]

In [6]:
# Import the object-centric event log from the .jsonocel file configured in `ocel_in_file`.
ocel = ocel_import_factory.apply(ocel_in_file)

In [9]:
# Share (in %) of each activity among all events in the log.
all_acts = ocel.log.log["event_activity"].value_counts(normalize=True).mul(100)
# Only activities that account for more than 4% of the events get their own feature.
most_occuring_acts = all_acts.loc[all_acts.gt(4)].index

# Feature key -> (feature function, parameters). One "C2<letter>" entry is created per
# frequent activity, followed by the fixed time- and object-count-based features.
# P4 (EVENT_FLOW_TIME) and P6 (EVENT_SOJOURN_TIME) are deliberately not included.
event_level_features = {
    **{
        f"C2{letter}": (feature_factory.EVENT_PRECEDING_ACTIVITIES, (activity,))
        for activity, letter in zip(most_occuring_acts, string.ascii_lowercase)
    },
    "P2": (feature_factory.EVENT_ELAPSED_TIME, ()),
    # P3: only use for target (but we use 'ea4' now)
    "P3": (feature_factory.EVENT_REMAINING_TIME, ()),
    "P5": (feature_factory.EVENT_SYNCHRONIZATION_TIME, ()),
    "O3a": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("krs",)),
    "O3b": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("krv",)),
    "O3c": (feature_factory.EVENT_PREVIOUS_TYPE_COUNT, ("cv",)),
}

# The feature set was reduced on purpose (author's note: top 7 activities, top 3 event
# attributes); the cell displays the resulting total number of features.
len(event_level_features) + len(event_attributes)

16

In [10]:
# Compute the feature storage for the log from the configured event-level features and
# the selected event attributes (passed with their "event_" prefix).
# Note from the author: this takes much too long when resource (ot2) is an object type.
feature_storage = feature_factory.apply(
    ocel,
    event_based_features=[*event_level_features.values()],
    event_attributes=[f"event_{ea}" for ea in event_attributes],
    min_execution_length=4,
    workers=2,
)

100%|██████████| 269725/269725 [03:02<00:00, 1478.26it/s] 


In [11]:
# Report how many process execution graphs are complete, i.e. have a non-empty
# `Feature_Graph.objects`. Incomplete graphs cause issues downstream, as our pipelines
# depend on `Feature_Graph.objects`.
# NOTE: this cell only *reports*; it does not remove anything from `feature_storage`.
# If the printed success rate is below 1.0, the graphs listed under "failure_ids"
# still have to be filtered out before the storage is used.
feature_graph_success_report = {"success_ids": [], "failure_ids": []}

for fg in feature_storage.feature_graphs:
    # Sort each graph's process execution id into the matching bucket.
    feature_graph_success_report[
        "success_ids" if fg.objects else "failure_ids"
    ].append(fg._pexec_id)

feature_graph_success_report["success_count"] = len(
    feature_graph_success_report["success_ids"]
)
feature_graph_success_report["failure_count"] = len(
    feature_graph_success_report["failure_ids"]
)

# Guard against an empty feature storage: without any graphs the rate is undefined,
# so report NaN instead of failing with ZeroDivisionError.
feature_graph_total_count = (
    feature_graph_success_report["success_count"]
    + feature_graph_success_report["failure_count"]
)
feature_graph_success_report["success_rate"] = (
    feature_graph_success_report["success_count"] / feature_graph_total_count
    if feature_graph_total_count
    else float("nan")
)
print(feature_graph_success_report["success_rate"])

1.0


In [12]:
# Persist the unsplit feature storage. The handle is deliberately not called `bin`,
# which would shadow the builtin `bin()` for the rest of the kernel session.
with open(feature_storage_intermediate_file, "wb") as fs_file:
    pickle.dump(feature_storage, fs_file)

In [13]:
# Split the feature storage into train/validation/test parts (15% test, 15% validation)
# and normalize with StandardScaler, which is passed as the class, not an instance.
# The return value is not captured; the same `feature_storage` object is pickled
# afterwards, so the call presumably updates it in place -- TODO confirm.
# `state=42` is presumably the random seed of the split -- TODO confirm.
feature_storage.extract_normalized_train_test_split(
    scaler=StandardScaler,
    # Disabled option: exempting "event_ea4" from scaling.
    # scaling_exempt_features=["event_ea4"],
    test_size=0.15,
    validation_size=0.15,
    state=42,
)



In [14]:
# Persist the split feature storage twice: once for the EFG and once for the HOEG
# encoding directory. The handle is deliberately not called `bin`, which would shadow
# the builtin `bin()` for the rest of the kernel session.
for fs_out_path in (feature_storage_out_file, hoeg_feature_storage_out_file):
    with open(fs_out_path, "wb") as fs_file:
        pickle.dump(feature_storage, fs_file)