In [1]:
import os

def go_up_n_directories(path, n):
    """Return the absolute path found by ascending from the parent of ``path``.

    The last component of ``path`` is dropped first (``os.path.dirname``) and
    ``n`` ``".."`` segments are appended afterwards. For a directory path the
    result therefore lies ``n + 1`` levels above ``path``.

    Args:
        path: File-system path to start from.
        n: Number of ``".."`` segments to apply after the dirname step.

    Returns:
        The normalised absolute path as a string.
    """
    parent = os.path.dirname(path)
    return os.path.abspath(os.path.join(parent, *([".."] * n)))
# Move the working directory upwards so that the relative "data/..." paths used
# further down resolve (presumably to the repository root — confirm).
# Not idempotent: every execution climbs further up from the *current* directory.
os.chdir(go_up_n_directories(os.getcwd(), 3))  # run once (otherwise restart kernel)

In [2]:
# Python native
import logging
import pickle
from typing import Any
import os
from collections import defaultdict
from ocpa.algo.predictive_monitoring.obj import Feature_Storage as FeatureStorage

# Custom local imports
import utilities.hetero_data_utils as hetero_data_utils
import utilities.ocel_import_utils as ocel_import_utils

# Data handling
import numpy as np
import pandas as pd

# Object-centric process mining
import pm4py
import pm4py.ocel
import pm4py.read
import torch
from pm4py.algo.transformation.ocel.features.objects import (
    algorithm as object_feature_factory,
)
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import HeteroData

# --- Configuration variables -------------------------------------------------
# Input OCEL. Its file name carries bracketed sections (ots[...], oas[...],
# eas[...]) from which the object types / object attributes are read below.
ocel_in_file = "data/CS/source/cs_january_only_linked_ots[krs_krv_cv]_oas[oa1_oa2_oa3_oa4_oa5_oa6_oa7_oa8_oa10_oa11_oa12_oa13_oa15_oa16]_eas[resourceCE_resourceMulti_ea1_ea2_ea3_ea4_ea6_ea8_ea10_ea12_ea14].jsonocel"
# Output locations of the two pickles written at the end of the notebook.
ofg_out_file = "data/CS/feature_encodings/OFG/ofg/raw/CS_OFG.pkl"
objects_data_dict_out_file = "data/CS/feature_encodings/HOEG/hoeg/raw/cs_ofg+oi_graph+krs_node_map+krv_node_map+cv_node_map.pkl"

# Object attributes ("oas") and object types ("ots") extracted from the file name.
object_attributes, object_types = (
    ocel_import_utils.extract_values_from_file_string(ocel_in_file, key=section)
    for section in ("oas", "ots")
)

# Building blocks of the column names used for the object feature matrix.
factory_feature_prefix = "@@"
factory_num_oa_prefix = factory_feature_prefix + "event_num_"
factory_str_oa_prefix = factory_feature_prefix + "object_attr_value_"
target_name = factory_feature_prefix + "object_lifecycle_duration"

In [3]:
# Load the object-centric event log (OCEL) from the configured input path.
ocel = pm4py.read.read_ocel(ocel_in_file)

In [4]:
# Record the index as a column so we can later link objects (i.a. with events).
# Guarded: `ocel.objects` is replaced in place, so re-running this cell without
# the check would add a second "object_index" column.
if "object_index" not in ocel.objects.columns:
    ocel.objects = ocel.objects.reset_index().rename(
        columns={"index": "object_index"}
    )
object_feature_factory_params = {
    # object type as string attribute (filtered on `ocel:type_<type> == 1` further down)
    "str_obj_attr": ["ocel:type"],
    "num_obj_attr": ["object_index"]  # include object_index for reference
    + object_attributes,  # all oas are numerical (encoding already done in OCEL pipeline)
}
# Create the object-level feature matrix: the factory returns the data together
# with the matching list of feature (column) names.
factory_data, factory_feature_names = object_feature_factory.apply(
    ocel,
    parameters=object_feature_factory_params,
)

In [5]:
# Names of the factory columns that are kept (using PM4Py's naming scheme):
# the numerical object attributes (incl. object_index) ...
numeric_feature_names = {
    f"{factory_num_oa_prefix}{num_feature}"
    for num_feature in object_feature_factory_params["num_obj_attr"]
}
# ... one "ocel:type_<object type>" column per object type ...
object_type_feature_names = {
    f"{factory_str_oa_prefix}ocel:type_{object_type}" for object_type in object_types
}
# ... and the target column.
feature_names = numeric_feature_names.union(
    object_type_feature_names, {target_name}
)

In [6]:
object_features = pd.DataFrame(factory_data, columns=factory_feature_names)
# Keep only the relevant columns, in the order in which the factory emitted them.
# (Iterating a set intersection has no stable order: string hashing is
# randomised per interpreter session, so the column order would otherwise
# change between kernel restarts.)
relevant_columns = list(
    dict.fromkeys(name for name in factory_feature_names if name in feature_names)
)
object_features = object_features[relevant_columns]

In [7]:
# NORMALIZE "@@object_lifecycle_duration" (zero mean, unit variance).
# StandardScaler standardises every column independently, so only the target
# column is passed in; the object index is an identifier and is left untouched.
# NOTE(review): the scaler is fit on all objects at once and is not kept, so the
# scaling cannot be inverted later — confirm this is intended.
object_features.loc[:, target_name] = (
    StandardScaler().fit_transform(object_features[[target_name]]).ravel()
)

In [8]:
# Map each OCEL object id to its object index in the objects table
# (e.g. 'KRS-56423': 1), built by the project helper from the "ocel:oid" and
# "object_index" columns of `ocel.objects`.
oid_object_index_map = hetero_data_utils.get_index_map(
    ocel.objects, "ocel:oid", "object_index"
)

In [9]:
# reset column name from object_index that was passed to the object-level feature matrix factory
object_features = object_features.rename(
    columns={f"{factory_num_oa_prefix}object_index": "object_index"}
)
object_features["object_index"] = object_features["object_index"].astype(int)

# Columns kept in every per-type feature matrix: the object attribute features,
# the object index and the target (object type indicator columns are excluded).
# They are selected by their EXACT names. Substring matching with
# `columns.str.contains(attr_name)` lets e.g. "oa1" also match "oa10", "oa11",
# ... and thereby duplicates those columns in the resulting feature matrices.
attribute_feature_columns = [
    column
    for column in dict.fromkeys(
        [f"{factory_num_oa_prefix}{attr_name}" for attr_name in object_attributes]
        + ["object_index", target_name]
    )
    if column in object_features.columns  # tolerate attributes without a column
]


def _features_for_object_type(features, object_type):
    """Build the feature matrix and index mapper for a single object type.

    Args:
        features: Object-level feature matrix containing an
            ``ocel:type_<object_type>`` indicator column.
        object_type: Object type to extract, e.g. ``"krs"``.

    Returns:
        Tuple ``(typed_features, index_map)``: the rows of ``features`` that
        belong to ``object_type`` (restricted to ``attribute_feature_columns``
        and passed through ``add_object_type_index``), and the mapper from
        ``object_index`` to ``<object_type>_index``.
    """
    is_of_type = features[f"{factory_str_oa_prefix}ocel:type_{object_type}"] == 1
    # .copy(): hand the helper an independent frame rather than a slice of `features`
    typed_features = features.loc[is_of_type, attribute_feature_columns].copy()
    # presumably adds the "<object_type>_index" column used below — see hetero_data_utils
    typed_features = hetero_data_utils.add_object_type_index(
        typed_features, object_type
    )
    index_map = hetero_data_utils.get_index_map(
        typed_features, "object_index", f"{object_type}_index"
    )
    return typed_features, index_map


# Split object feature matrix into one feature matrix per object type and
# create the matching object_index -> <type>_index mappers
krs_features, object_index_krs_index_map = _features_for_object_type(
    object_features, "krs"
)
krv_features, object_index_krv_index_map = _features_for_object_type(
    object_features, "krv"
)
cv_features, object_index_cv_index_map = _features_for_object_type(
    object_features, "cv"
)

In [10]:
# Discover the graph that connects the objects of the OCEL.
# NOTE(review): the "object_descendants" variant is used, while the result is
# later stored under the key "objects_interaction_graph" and its edges are
# labelled "interacts" — confirm which graph type is intended.
graph = pm4py.ocel.discover_objects_graph(
    ocel,
    graph_type="object_descendants",
    # graph_type="object_interaction",
)

In [11]:
# Candidate object relation types (edge types) as (source type, destination type)
cs_edge_types = [
    ("krv", "cv"),
    ("krs", "krs"),
    ("krs", "krv"),
    ("krv", "krv"),
    ("cv", "cv"),
    ("krs", "cv"),
]
# Bucket the edge tuples of the object graph by edge type
cs_edges_per_edge_type = hetero_data_utils.split_on_edge_types(
    list(graph), cs_edge_types, to_undirected=True
)
# From here on only work with the edge types that were actually found in the data
cs_edge_types = [*cs_edges_per_edge_type.keys()]

# One mapper per object type from OCEL object id to node index (for HeteroData),
# derived from the oid -> object_index map and the per-type index maps.
krs_to_node_map, krv_to_node_map, cv_to_node_map = (
    hetero_data_utils.object_map_to_node_map(
        oid_object_index_map, type_index_map, object_type
    )
    for object_type, type_index_map in (
        ("krs", object_index_krs_index_map),
        ("krv", object_index_krv_index_map),
        ("cv", object_index_cv_index_map),
    )
)

In [12]:
# Rename the edge endpoints so that the edges form a correct edge_index for
# HeteroData; the krs, krv and cv node maps are applied one after the other.
# NOTE(review): this cell overwrites its own input, so re-running it feeds the
# already renamed edges back into the helper — re-run the previous cell first.
for node_map in (krs_to_node_map, krv_to_node_map, cv_to_node_map):
    cs_edges_per_edge_type = hetero_data_utils.rename_edges_in_split_dict(
        cs_edges_per_edge_type, node_map
    )

In [13]:
# define heterogeneous graph
hetero_data = HeteroData()
# For every node type ("krs", "krv" and "cv") attach the target and the node
# feature vectors. The target column is addressed through `target_name` so it
# stays in sync with the configuration cell.
# NOTE(review): the tensors take over the dtype of the DataFrame values
# (presumably float64) — confirm that downstream models cast as needed.
for node_type, node_features in (
    ("krs", krs_features),
    ("krv", krv_features),
    ("cv", cv_features),
):
    # target variable of this node type
    hetero_data[node_type].y = torch.tensor(node_features[target_name].values)
    # node feature vectors: all columns except the two index columns and the target
    hetero_data[node_type].x = torch.tensor(
        node_features.drop(
            [f"{node_type}_index", "object_index", target_name], axis=1
        ).values
    )

In [14]:
# Attach one edge_index per edge type found in the data; every relation is
# stored under the relation name "interacts".
for edge_type in cs_edge_types:
    source_type, destination_type = edge_type[0], edge_type[1]
    edge_index = hetero_data_utils.to_torch_coo_format(
        cs_edges_per_edge_type[edge_type]
    )
    hetero_data[source_type, "interacts", destination_type].edge_index = edge_index

In [15]:
# Bundle everything the HOEG encoding needs: the heterogeneous graph, the raw
# object graph, the per-type node maps and the per-type feature matrices.
objects_data = {
    "ofg": hetero_data,
    "objects_interaction_graph": graph,
    "object_feature_vector_map": {
        "krs": krs_to_node_map,
        "krv": krv_to_node_map,
        "cv": cv_to_node_map,
    },
    "object_feature_matrices": {
        "krs": krs_features,
        "krv": krv_features,
        "cv": cv_features,
    },
}

# Make sure the output folders exist: open(..., "wb") does not create missing
# directories and would otherwise fail only after all the work above is done.
for out_file in (ofg_out_file, objects_data_dict_out_file):
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# save HeteroData object (for OFG encoding)
with open(ofg_out_file, "wb") as binary_file:
    pickle.dump(hetero_data, binary_file)
# save object interaction graph information (for HOEG encoding)
with open(objects_data_dict_out_file, "wb") as binary_file:
    pickle.dump(objects_data, binary_file)

In [16]:
def check_list_item_type(lst, item_type):
    """Return True when every element of ``lst`` is an instance of ``item_type``.

    An empty ``lst`` yields True.
    """
    return all(isinstance(item, item_type) for item in lst)


def unique_item_types(input_list):
    """Return the set of exact types (``type(item)``) occurring in ``input_list``."""
    return {type(item) for item in input_list}
# quick sanity check: list the top-level keys of the saved dictionary
print(objects_data.keys())

dict_keys(['ofg', 'objects_interaction_graph', 'object_feature_vector_map', 'object_feature_matrices'])


In [17]:
# Per object type: which Python types occur among the keys and among the
# values of its node map.
res = {}
for ot, node_map in objects_data["object_feature_vector_map"].items():
    res[ot] = (
        unique_item_types(node_map.keys()),
        unique_item_types(node_map.values()),
    )

res

{'krs': ({str}, {int}), 'krv': ({str}, {int}), 'cv': ({str}, {int})}

In [19]:
# NOTE(review): performs the same key/value type check as the preceding cell;
# one of the two cells can be removed.
node_maps = objects_data["object_feature_vector_map"]
res = {
    ot: (
        unique_item_types(node_maps[ot].keys()),
        unique_item_types(node_maps[ot].values()),
    )
    for ot in node_maps
}

res

{'krs': ({str}, {int}), 'krv': ({str}, {int}), 'cv': ({str}, {int})}