In [1]:
import os

def go_up_n_directories(path, n):
    """Return the absolute path reached by climbing up from ``path``.

    The climb starts at ``os.path.dirname(path)`` and then applies ``n`` ``".."``
    components, so the result lies ``n + 1`` levels above ``path``.

    Args:
        path: starting path.
        n: number of ``".."`` components applied on top of ``dirname(path)``.

    Returns:
        The normalized absolute path.
    """
    return os.path.abspath(os.path.join(os.path.dirname(path), *[".."] * n))


# Resolve the target directory only the first time this cell runs in a kernel, so
# that re-running the cell does not climb further up the directory tree.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = go_up_n_directories(os.getcwd(), 3)
os.chdir(_PROJECT_ROOT)
print(os.getcwd())

/home/tim/Development/OCPPM


In [2]:
import logging
import pickle
from typing import Any
import os
import utilities.hetero_data_utils as hetero_data_utils
import numpy as np
import pandas as pd
import pm4py
import pm4py.ocel
import pm4py.read
import torch
from pm4py.algo.transformation.ocel.features.objects import (
    algorithm as object_feature_factory,
)
from pm4py.algo.transformation.ocel.features.objects import (
    object_lifecycle_duration,
    object_num_attributes,
    object_str_attributes,
)
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import HeteroData

# Configuration variables
# Input: the object-centric event log (OCEL) file.
ocel_in_file = "data/OTC/source/OTC.jsonocel"
# Output: pickle file receiving the HeteroData object (OFG encoding).
ofg_out_file = "data/OTC/feature_encodings/OFG/ofg/raw/OTC_OFG.pkl"
# Output: pickle file receiving the objects data dictionary (HOEG encoding).
objects_data_dict_out_file = (
    "data/OTC/feature_encodings/HOEG/hoeg/raw/"
    "otc_ofg+oi_graph+item_node_map+order_node_map+packages_node_map.pkl"
)

# Object types for which a per-type feature matrix and node type are built.
object_types = ["item", "order", "package"]
# Object attributes to use as features, grouped by kind (string / numeric).
object_attributes = dict(str=[], num=["encoded_oid"])

In [3]:
# Load the object-centric event log from disk.
ocel = pm4py.read.read_ocel(ocel_in_file)
# Suffix every object id with its object type ("<oid>-<type>"), first in the
# objects table and then in the relations table, so both use the same ids.
for _ocel_table in (ocel.objects, ocel.relations):
    _ocel_table["ocel:oid"] = _ocel_table["ocel:oid"] + "-" + _ocel_table["ocel:type"]
# Constant numeric object attribute (the same value, 1, for every object).
ocel.objects["encoded_oid"] = 1

In [4]:
# Record the row index as an "object_index" column so we can later link objects
# (i.a. with events). Guarded so that re-running this cell does not call
# reset_index() a second time and produce a duplicate "object_index" column.
if "object_index" not in ocel.objects.columns:
    ocel.objects = ocel.objects.reset_index().rename(
        columns={"index": "object_index"}
    )
object_feature_factory_params = {
    # string attribute: the object type (later used to split the matrix per type)
    "str_obj_attr": ["ocel:type"],
    # numeric attributes: object_index (carried along for reference) followed by
    # the configured numeric object attributes
    "num_obj_attr": ["object_index"] + object_attributes["num"],
}
# Create the object-level feature matrix and the matching list of feature names.
factory_data, factory_feature_names = object_feature_factory.apply(
    ocel,
    parameters=object_feature_factory_params,
)

In [5]:
# Naming scheme used to select columns from the factory's feature matrix.
factory_feature_prefix = "@@"
factory_num_oa_prefix = factory_feature_prefix + "event_num_"
factory_str_oa_prefix = factory_feature_prefix + "object_attr_value_"
target_feature_name = factory_feature_prefix + "object_lifecycle_duration"

# Collect the names of the features to keep: the numeric object attributes,
# one object-type column per configured object type, and the target feature.
feature_names = set()
feature_names.update(
    f"{factory_num_oa_prefix}{num_feature}"
    for num_feature in object_feature_factory_params["num_obj_attr"]
)
feature_names.update(
    f"{factory_str_oa_prefix}ocel:type_{object_type}" for object_type in object_types
)
feature_names.add(target_feature_name)

In [6]:
# Build the object-level feature frame from the factory output.
object_features = pd.DataFrame(factory_data, columns=factory_feature_names)
# Keep only the selected features. Walking factory_feature_names (instead of
# listing a set intersection) gives the same column order on every kernel run;
# dict.fromkeys drops repeated names while preserving that order.
selected_feature_names = list(
    dict.fromkeys(name for name in factory_feature_names if name in feature_names)
)
# .copy() makes the subset an independent frame, so later column assignments
# operate on it directly rather than on a slice of the full frame.
object_features = object_features[selected_feature_names].copy()

In [7]:
# NORMALIZE "@@object_lifecycle_duration" (JUST FOR TESTING)
# The scaler is fitted on [object_index, target], but only the second result
# column (the target) is written back; object_index itself stays untouched.
_columns_to_scale = [f"{factory_num_oa_prefix}object_index", target_feature_name]
_scaled_values = StandardScaler().fit_transform(
    object_features.loc[:, _columns_to_scale]
)
object_features.loc[:, target_feature_name] = _scaled_values[:, 1]

In [8]:
# Retrieve the mapper from the "ocel:oid" column to the "object_index" column of
# the objects table.
# NOTE(review): the object ids were suffixed with "-<type>" right after loading,
# so the keys presumably look like '880002-item' rather than '880002' — confirm.
oid_object_index_map = hetero_data_utils.get_index_map(
    ocel.objects, "ocel:oid", "object_index"
)

In [9]:
# The object_index column came back from the feature factory with the numeric
# attribute prefix; restore its plain name and cast its values to int.
object_features = object_features.rename(
    columns={f"{factory_num_oa_prefix}object_index": "object_index"}
).astype({"object_index": int})


# Split object feature matrix into one feature matrix per object type
def get_object_type_matrices(
    object_features: pd.DataFrame, ots: list[str]
) -> tuple[list[pd.DataFrame], list[dict]]:
    """Split the object feature matrix into one feature matrix per object type.

    Note this function's impurity: `object_attributes`, `factory_str_oa_prefix`
    and `hetero_data_utils` are read from the surrounding notebook scope.

    Args:
        object_features: feature matrix containing, per object type in `ots`, a
            column named `<factory_str_oa_prefix>ocel:type_<ot>`, plus the
            attribute, "object_index" and lifecycle-duration columns.
        ots: object type names to split on.

    Returns:
        Two lists aligned with `ots`: the per-type feature matrices (as returned
        by `hetero_data_utils.add_object_type_index`) and, per type, the result
        of `hetero_data_utils.get_index_map` from "object_index" to
        "<ot>_index".
    """
    # Column-name fragments identifying object attribute features; object
    # interaction features are excluded by not being listed here.
    attribute_names = (
        object_attributes["str"]
        + object_attributes["num"]
        + ["object_index", "object_lifecycle_duration"]
    )
    object_type_matrices = []
    object_index_ot_index_maps = []
    for ot in ots:
        # rows belonging to this object type
        ot_features = object_features[
            object_features[f"{factory_str_oa_prefix}ocel:type_{ot}"] == 1
        ]
        # Column positions whose name contains one of the attribute names.
        # regex=False matches the names literally (an attribute name may contain
        # regex metacharacters); dict.fromkeys keeps the first occurrence of each
        # position so a column matching two names is not selected twice.
        matching_idxs = [
            int(idx)
            for attr_name in attribute_names
            for idx in np.where(
                ot_features.columns.str.contains(attr_name, regex=False)
            )[0]
        ]
        ot_attribute_feature_idxs = list(dict.fromkeys(matching_idxs))
        # subset each object type's features, with correct columns; copy so the
        # helper below receives an independent frame rather than a slice
        ot_features = ot_features.iloc[:, ot_attribute_feature_idxs].copy()
        # add the type-specific index column ("<ot>_index")
        ot_features = hetero_data_utils.add_object_type_index(ot_features, ot)
        object_type_matrices.append(ot_features)
        # create object_index to specific ot_index mapper
        object_index_ot_index_map = hetero_data_utils.get_index_map(
            ot_features, "object_index", f"{ot}_index"
        )
        object_index_ot_index_maps.append(object_index_ot_index_map)
    return object_type_matrices, object_index_ot_index_maps


# Build, per configured object type, its feature matrix and its
# object_index -> "<type>_index" map (both lists are aligned with object_types).
object_type_matrices, object_index_ot_index_maps = get_object_type_matrices(
    object_features, object_types
)

In [10]:
# Discover the object interaction graph of the OCEL; it is consumed below as an
# iterable of edges whose two endpoints are object ids.
oi_graph = pm4py.ocel.discover_objects_graph(ocel, graph_type="object_interaction")

In [11]:
# Define the candidate object relation types (edge types): every pair of node
# type names, including self-pairs, each pair listed once.
# NOTE(review): the names "orders"/"packages" used here differ from the entries
# of `object_types` ("order"/"package") — confirm which spelling the data uses.
_otc_node_types = ["orders", "item", "packages"]
otc_edge_types = [
    (source_type, target_type)
    for position, source_type in enumerate(_otc_node_types)
    for target_type in _otc_node_types[position:]
]
# Assign the edge tuples of the object interaction graph to their edge types.
otc_edges_per_edge_type = hetero_data_utils.split_on_edge_types(
    list(oi_graph), otc_edge_types, to_undirected=True
)

In [12]:
# Replace the predefined edge types by the keys of the split result.
otc_edge_types = list(
    otc_edges_per_edge_type.keys()
)  # reset the predefined edge_types to the edge_types found in the data
# One node map per object type, in the same order as object_types.
ot_to_node_maps = []
for ot, object_index_ot_index_map in zip(object_types, object_index_ot_index_maps):
    # create ocel object index to specific object type node index (for HeteroData) mapper
    ot_to_node_map = hetero_data_utils.object_map_to_node_map(
        oid_object_index_map, object_index_ot_index_map, ot
    )
    ot_to_node_maps.append(ot_to_node_map)
    # rename edges to have correct edge_index for HeteroData; the edge dict is
    # re-assigned on every iteration, so each object type's map is applied to
    # the result of the previous one
    otc_edges_per_edge_type = hetero_data_utils.rename_edges_in_split_dict(
        otc_edges_per_edge_type, ot_to_node_map
    )  # THIS ONE MIGHT NOT BE CORRECTLY UPDATED
    # TODO(review): the original author flagged the renaming above as possibly
    # incorrect — verify the resulting edge indices before relying on them.

In [13]:
# Define the heterogeneous graph: one node type per configured object type.
hetero_data = HeteroData()
for ot, object_type_matrix in zip(object_types, object_type_matrices):
    # target variable of this node type (the lifecycle-duration column)
    hetero_data[ot].y = torch.tensor(object_type_matrix[target_feature_name].values)
    # node feature vectors of this node type: every column except the two index
    # columns and the target
    hetero_data[ot].x = torch.tensor(
        object_type_matrix.drop(
            columns=[f"{ot}_index", "object_index", target_feature_name]
        ).values
    )

In [14]:
# Attach one "interacts" relation per edge type, using the edges collected for
# that edge type converted by hetero_data_utils.to_torch_coo_format.
for edge_type in otc_edge_types:
    source_type, target_type = edge_type[0], edge_type[1]
    edge_index = hetero_data_utils.to_torch_coo_format(
        otc_edges_per_edge_type[edge_type]
    )
    hetero_data[source_type, "interacts", target_type].edge_index = edge_index

In [15]:
# SPECIFIC TO THIS DATASET:
# remove object type name from object IDs
def _strip_type_suffix(oid):
    """Drop the trailing "-<type>" part of an object id.

    Splits on the LAST "-" only, so an object id that itself contains "-" is kept
    intact (assumes the type name contains no "-", which holds for the
    configured object types).
    """
    return oid.rsplit("-", 1)[0]


# object interaction graph with the type suffix removed from both endpoints
renamed_oi_graph = {
    (_strip_type_suffix(edge[0]), _strip_type_suffix(edge[1])) for edge in oi_graph
}
# node maps (one per object type) keyed by the un-suffixed object id
renamed_ot_to_node_maps = [
    {_strip_type_suffix(oid_key): node_id for oid_key, node_id in ot_to_node_map.items()}
    for ot_to_node_map in ot_to_node_maps
]

In [16]:
# Bundle everything the HOEG encoding needs; the two per-type dictionaries are
# keyed by object type, in the order of object_types.
objects_data = {
    "ofg": hetero_data,
    "objects_interaction_graph": renamed_oi_graph,
    "object_feature_vector_map": dict(zip(object_types, renamed_ot_to_node_maps)),
    "object_feature_matrices": dict(zip(object_types, object_type_matrices)),
}

# Create the output directories if they do not exist yet, so the dumps below do
# not fail with FileNotFoundError on a fresh checkout.
for out_file in (ofg_out_file, objects_data_dict_out_file):
    out_dir = os.path.dirname(out_file)
    if out_dir:  # os.makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

# save HeteroData object (for OFG encoding)
with open(ofg_out_file, "wb") as binary_file:
    pickle.dump(hetero_data, binary_file)
# save object interaction graph information (for HOEG encoding)
with open(objects_data_dict_out_file, "wb") as binary_file:
    pickle.dump(objects_data, binary_file)