In [1]:
import logging
import pickle
from typing import Any
import os

# Run from the project root so the relative data paths and the local
# `utilities` package resolve. The root can be overridden per machine with the
# OCPPM_ROOT environment variable; the default is the original hard-coded path.
os.chdir(os.environ.get("OCPPM_ROOT", "/home/tim/Development/OCPPM/"))

import utilities.hetero_data_utils as hetero_data_utils
import numpy as np
import pandas as pd
import pm4py
import pm4py.ocel
import pm4py.read
import torch
from pm4py.algo.transformation.ocel.features.objects import (
    algorithm as object_feature_factory,
)
from pm4py.algo.transformation.ocel.features.objects import (
    object_lifecycle_duration,
    object_num_attributes,
    object_str_attributes,
)
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import HeteroData

# --- Configuration -----------------------------------------------------------
# Input: the object-centric event log (OCEL) to encode.
ocel_in_file = "data/CS/source/cs_january_ots[krs_krv_cv]_eas[ea2_ea4_resource_ce_resource_multi].jsonocel"
# Outputs: the pickled HeteroData graph and the pickled objects data dictionary.
ofg_out_file = "data/CS/feature_encodings/OFG/ofg/raw/CS_OFG.pkl"
objects_data_dict_out_file = "data/CS/feature_encodings/HOEG/hoeg/raw/cs_ofg+oi_graph+krs_node_map+krv_node_map+cv_node_map.pkl"

# Object types present in the log, and the object attributes to use as
# features, grouped by kind (string-valued vs. numeric).
object_types = ["krs", "krv", "cv"]
object_attributes = dict(
    str=[],
    num=["oa1", "oa5", "oa10"],
)

In [2]:
# Load the object-centric event log (OCEL) from the configured input file.
# Every later cell derives its data from this `ocel` object.
ocel = pm4py.read.read_ocel(ocel_in_file)

In [3]:
# Record the row index of the objects table as an "object_index" column so that
# objects can later be linked (i.a. with events).
# Guarded so that re-running this cell does not reset the index a second time,
# which would leave two columns named "object_index".
if "object_index" not in ocel.objects.columns:
    ocel.objects = ocel.objects.reset_index().rename(
        columns={"index": "object_index"}
    )

object_feature_factory_params = {
    # object type; the resulting per-type indicator columns are used further
    # down to split the feature matrix per object type
    "str_obj_attr": ["ocel:type"],
    # object_index is passed through the factory purely for reference, followed
    # by the configured numeric object attributes
    "num_obj_attr": ["object_index"] + object_attributes["num"],
}
# create object-level feature matrix (feature values plus their column names)
factory_data, factory_feature_names = object_feature_factory.apply(
    ocel,
    parameters=object_feature_factory_params,
)

In [4]:
# Naming scheme of the columns generated by the object feature factory.
factory_feature_prefix = "@@"
factory_num_oa_prefix = f"{factory_feature_prefix}event_num_"
factory_str_oa_prefix = f"{factory_feature_prefix}object_attr_value_"
target_feature_name = f"{factory_feature_prefix}object_lifecycle_duration"

# Names of the columns to keep from the factory output: the numeric attributes
# (including object_index), one indicator column per object type, and the
# prediction target.
numeric_feature_names = [
    f"{factory_num_oa_prefix}{num_feature}"
    for num_feature in object_feature_factory_params["num_obj_attr"]
]
type_indicator_names = [
    f"{factory_str_oa_prefix}ocel:type_{object_type}"
    for object_type in object_types
]
feature_names = {
    *numeric_feature_names,
    *type_indicator_names,
    target_feature_name,
}

In [5]:
# Build a DataFrame from the factory output and keep only the selected features.
object_features = pd.DataFrame(factory_data, columns=factory_feature_names)

# Warn instead of silently continuing when an expected feature is absent.
missing_feature_names = feature_names - set(factory_feature_names)
if missing_feature_names:
    logging.warning(
        "Expected features missing from factory output: %s",
        sorted(missing_feature_names),
    )

# Select in the order the factory produced the columns. Going through a set
# intersection made the column order depend on string hashing, so the layout of
# the feature matrix could change from one kernel session to the next.
selected_feature_names = list(
    dict.fromkeys(name for name in factory_feature_names if name in feature_names)
)
# .copy() so that later in-place assignments act on an independent frame.
object_features = object_features[selected_feature_names].copy()
object_features.head()

Unnamed: 0,@@event_num_oa5,@@event_num_oa10,@@object_lifecycle_duration,@@event_num_object_index,@@object_attr_value_ocel:type_krv,@@event_num_oa1,@@object_attr_value_ocel:type_krs,@@object_attr_value_ocel:type_cv
0,-0.090177,-0.116221,8270659.696,0.0,0,-1.12997,1,0
1,-1.0021,-0.116221,8165915.735,1.0,0,0.368881,1,0
2,-0.481001,-1.265612,1067805.049,2.0,0,0.368881,1,0
3,-0.090177,-0.116221,5125887.733,3.0,0,1.867732,1,0
4,-0.481001,-0.116221,2043256.291,4.0,0,1.867732,1,0


In [6]:
# NORMALIZE "@@object_lifecycle_duration" (JUST FOR TESTING)
# The scaler is fitted on [object_index, target]; only the scaled target
# (position 1) is written back, the scaled object_index is discarded.
scaler_input_columns = [
    f"{factory_num_oa_prefix}object_index",
    target_feature_name,
]
standardized_values = StandardScaler().fit_transform(
    object_features.loc[:, scaler_input_columns]
)
object_features.loc[:, target_feature_name] = standardized_values[:, 1]

In [7]:
# Mapper from OCEL object id ("ocel:oid") to the object's "object_index" value
# in the objects table (e.g. 'KRS-56423': 1).
oid_object_index_map = hetero_data_utils.get_index_map(
    ocel.objects,
    "ocel:oid",
    "object_index",
)

In [8]:
# reset column name from object_index that was passed to the object-level feature matrix factory
object_features = object_features.rename(
    columns={f"{factory_num_oa_prefix}object_index": "object_index"}
)
object_features["object_index"] = object_features["object_index"].astype(int)


def _attribute_feature_columns(columns):
    """Return the object attribute columns to keep, ordered and without duplicates.

    Columns are selected by their exact factory-generated name instead of by
    substring. A substring match on "oa1" also hits "@@event_num_oa10", which
    used to put that column into every per-type feature matrix twice.

    Order: string attribute columns, numeric attribute columns (in configured
    order), "object_index", then the target column. Names that are not present
    in `columns` are skipped.
    """
    wanted = []
    for str_attr in object_attributes["str"]:
        # string attributes are expected as "<prefix><attribute>_<value>" columns
        value_column_prefix = f"{factory_str_oa_prefix}{str_attr}_"
        wanted.extend(col for col in columns if col.startswith(value_column_prefix))
    wanted.extend(
        f"{factory_num_oa_prefix}{num_attr}" for num_attr in object_attributes["num"]
    )
    wanted.extend(["object_index", target_feature_name])
    available = set(columns)
    return [col for col in dict.fromkeys(wanted) if col in available]


def _object_type_features(object_type):
    """Return the attribute features of all objects of `object_type`.

    Rows are selected through the object type indicator column; the result is
    passed through `hetero_data_utils.add_object_type_index` (presumably adding
    the "<object_type>_index" column used by the index maps below - see that helper).
    """
    is_of_type = (
        object_features[f"{factory_str_oa_prefix}ocel:type_{object_type}"] == 1
    )
    type_features = object_features.loc[is_of_type, attribute_feature_columns].copy()
    return hetero_data_utils.add_object_type_index(type_features, object_type)


# Subset features to only include object attribute features, excluding object interaction features
attribute_feature_columns = _attribute_feature_columns(list(object_features.columns))

# Split object feature matrix into one feature matrix per object type,
# and create the object_index to <type>_index mapper for each of them
krs_features = _object_type_features("krs")
object_index_krs_index_map = hetero_data_utils.get_index_map(
    krs_features, "object_index", "krs_index"
)

krv_features = _object_type_features("krv")
object_index_krv_index_map = hetero_data_utils.get_index_map(
    krv_features, "object_index", "krv_index"
)

cv_features = _object_type_features("cv")
object_index_cv_index_map = hetero_data_utils.get_index_map(
    cv_features, "object_index", "cv_index"
)

In [9]:
# Discover the "object_interaction" objects graph from the OCEL.
# Its contents are later split per edge type to form the edges of the HeteroData graph.
graph = pm4py.ocel.discover_objects_graph(ocel, graph_type="object_interaction")

In [10]:
# Candidate object relation types (edge types) between the object types.
cs_edge_types = [
    ("krv", "cv"),
    ("krs", "krs"),
    ("krs", "krv"),
    ("krv", "krv"),
    ("cv", "cv"),
    ("krs", "cv"),
]
# Assign every edge tuple of the object interaction graph to its edge type.
interaction_edges = list(graph)
cs_edges_per_edge_type = hetero_data_utils.split_on_edge_types(
    interaction_edges, cs_edge_types, to_undirected=True
)
# From here on, only use the edge types that were actually found in the data.
cs_edge_types = [*cs_edges_per_edge_type.keys()]

# Per object type, combine the object id -> object_index map with the
# object_index -> <type>_index map to get the node mapper for HeteroData.
krs_to_node_map, krv_to_node_map, cv_to_node_map = (
    hetero_data_utils.object_map_to_node_map(
        oid_object_index_map, type_index_map, type_name
    )
    for type_name, type_index_map in (
        ("krs", object_index_krs_index_map),
        ("krv", object_index_krv_index_map),
        ("cv", object_index_cv_index_map),
    )
)

In [11]:
# Rename the edge endpoints so they can serve as edge_index for HeteroData.
# The node maps are applied one after another, each call working on the result
# of the previous one (krs first, then krv, then cv).
for node_map in (krs_to_node_map, krv_to_node_map, cv_to_node_map):
    cs_edges_per_edge_type = hetero_data_utils.rename_edges_in_split_dict(
        cs_edges_per_edge_type, node_map
    )

In [12]:
# define heterogeneous graph
hetero_data = HeteroData()
node_feature_frames = {
    "krs": krs_features,
    "krv": krv_features,
    "cv": cv_features,
}

# define the target variable for every node type ("krs", "krv" and "cv")
for node_type, node_features in node_feature_frames.items():
    hetero_data[node_type].y = torch.tensor(
        node_features["@@object_lifecycle_duration"].values
    )

# attach the node feature vectors for every node type; the per-type index, the
# object_index and the target column are not features and are dropped first
for node_type, node_features in node_feature_frames.items():
    non_feature_columns = [
        f"{node_type}_index",
        "object_index",
        "@@object_lifecycle_duration",
    ]
    hetero_data[node_type].x = torch.tensor(
        node_features.drop(non_feature_columns, axis=1).values
    )

In [20]:
# Attach the edges of every edge type found in the data as an "interacts"
# relation between its two node types.
for edge_type in cs_edge_types:
    source_type, target_type = edge_type[0], edge_type[1]
    edge_index = hetero_data_utils.to_torch_coo_format(
        cs_edges_per_edge_type[edge_type]
    )
    hetero_data[source_type, "interacts", target_type].edge_index = edge_index

0
1
2


In [21]:
# Bundle everything needed for the HOEG encoding: the graph itself, the raw
# object interaction graph, the per-type node mappers and feature matrices.
objects_data = {
    "ofg": hetero_data,
    "objects_interaction_graph": graph,
    "object_feature_vector_map": {
        "krs": krs_to_node_map,
        "krv": krv_to_node_map,
        "cv": cv_to_node_map,
    },
    "object_feature_matrices": {
        "krs": krs_features,
        "krv": krv_features,
        "cv": cv_features,
    },
}

# Make sure the output directories exist, so the notebook does not fail on the
# very last step when run against a fresh checkout.
for out_file in (ofg_out_file, objects_data_dict_out_file):
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# save HeteroData object (for OFG encoding)
with open(ofg_out_file, "wb") as binary_file:
    pickle.dump(hetero_data, binary_file)
# save object interaction graph information (for HOEG encoding)
with open(objects_data_dict_out_file, "wb") as binary_file:
    pickle.dump(objects_data, binary_file)