In [1]:
import os

os.chdir("/home/tim/Development/OCPPM/")
import pickle
import torch
from tqdm import tqdm
import random
import pandas as pd
import json
from collections import defaultdict
from ocpa.algo.predictive_monitoring.obj import Feature_Storage as FeatureStorage
from torch_geometric.data import HeteroData


def sample_dict(dictionary: dict, n: int):
    keys = random.sample(list(dictionary), n)
    return {key: dictionary[key] for key in keys}


flatten = lambda newlist: [item for items in newlist for item in items]
list_intersect = lambda big_list, smaller_list: [
    x for x in big_list if x in smaller_list
]

In [2]:
cs_objects_data_dict = "data/CS/feature_encodings/HOEG/hoeg/raw/cs_ofg+oi_graph+krs_node_map+krv_node_map+cv_node_map.pkl"
bpi_objects_data_dict = "data/BPI17/feature_encodings/HOEG/hoeg/raw/bpi17_ofg+oi_graph+app_node_map+off_node_map.pkl"
cs_fs = "data/CS/feature_encodings/EFG/efg/raw/CS_split_[C2_P2_P3_O3_eas].fs"
bpi_fs = "data/BPI17/feature_encodings/HOEG/hoeg/raw/BPI_split_[C2_P2_P3_P5_O3_Action_EventOrigin_OrgResource].fs"
cs_ocel_in_file = "data/CS/source/cs_january_ots[krs_krv_cv]_oas[oa1_oa2_oa3_oa4_oa5_oa6_oa7_oa8_oa9_oa10_oa11_oa12_oa13_oa15_oa16]_eas[resourceCE_resourceMulti_ea1_ea2_ea3_ea4_ea6_ea8_ea10_ea12_ea14].jsonocel"
object_nodes_label_key = "@@object_lifecycle_duration"
event_node_type = "event"
object_node_types = ["krs", "krv", "cs"]
edge_types = [("event", "interacts", "krs"), ("event", "interacts", "krv")]

# with open(cs_ocel_in_file, 'r') as ocel:
#     cs_data = json.load(ocel)
with open(cs_fs, "rb") as binary_file:
    csfs = pickle.load(binary_file)
# with open(bpi_fs, "rb") as binary_file:
#     bpifs = pickle.load(binary_file)

with open(cs_objects_data_dict, "rb") as binary_file:
    cs_objects_data = pickle.load(binary_file)
with open(bpi_objects_data_dict, "rb") as binary_file:
    bpi_objects_data = pickle.load(binary_file)

print(cs_objects_data.keys())
bpi_objects_data.keys()

dict_keys(['ofg', 'objects_interaction_graph', 'object_feature_vector_map', 'object_feature_matrices'])


dict_keys(['ofg', 'objects_interaction_graph', 'object_feature_vector_map', 'object_feature_matrices'])

In [3]:
class test_HOEG:
    def __set_to_split_dict(
        self, unique_objects: set[tuple[str, str]]
    ) -> dict[str, list[str]]:
        # Function that splits a set of [object type, object id]
        # into a dict with object types as keys and object ids as values
        result = defaultdict(list)
        for item in unique_objects:
            result[item[0]].append(item[1])
        return dict(result)

    def __replace_dict_values(
        self, split_dict: dict[str, list[str]], objects_map: dict[str, dict[str, int]]
    ) -> dict[str, list[int]]:
        return {
            key: [objects_map[key].get(value, value) for value in value_list]
            for key, value_list in split_dict.items()
        }

    def _get_object_mapping(
        self,
        feature_graph: FeatureStorage.Feature_Graph,
        object_to_feature_vector_map: dict[str, dict[str, int]],
    ) -> tuple[dict[str, list[int]], dict[str, dict[str, int]]]:
        """
        Function that, given a feature_graph, returns a dictionary with a
        key per object type and indices as values. These indices are the
        rows of the object feature matrices (found in self.objects_data)
        that should be added as object nodes in the HeteroData graph.
        """

        # get dict with object type as keys and object ids as values
        unique_objects_dict = self.__set_to_split_dict(
            unique_objects=set(
                item for obj in feature_graph.objects.values() for item in obj
            )
        )
        # get dict of object type as keys and feature vector indices as values
        object_feature_vector_map = self.__replace_dict_values(
            unique_objects_dict, object_to_feature_vector_map
        )
        # get dict that maps object ids to node ids for each object type
        object_node_map = {
            key: {value: index for index, value in enumerate(value_list)}
            for key, value_list in unique_objects_dict.items()
        }

        return object_feature_vector_map, object_node_map

    def _get_node_features(
        self,
        event_node_type: str,
        feature_graph: FeatureStorage.Feature_Graph,
        object_node_types: list[str],
        object_feature_matrices: dict[str, pd.DataFrame],
        object_id_to_feature_matrix_index: dict[str, list[int]],
        object_nodes_label_key: str,
    ) -> dict[str, dict[str, torch.Tensor]]:
        """
        This will return a dict with  feature matrices per node type
        [Number of Nodes, Node Feature size]
        """
        # Append event node features to matrix
        event_node_feature_matrix: list[list[float]] = [
            list(node.attributes.values()) for node in feature_graph.nodes
        ]

        node_features_dict = {
            event_node_type: {
                "x": torch.tensor(event_node_feature_matrix, dtype=torch.float)
            }
        }
        for object_node_type in object_node_types:
            if object_node_type in object_id_to_feature_matrix_index:
                object_node_feature_matrix = (
                    object_feature_matrices[object_node_type]
                    .drop(
                        [f"{object_node_type}_index", "object_index"], axis=1
                    )  # assuming naming scheme in ofg construction pipeline
                    .iloc[object_id_to_feature_matrix_index[object_node_type]]
                )
                node_features_dict[object_node_type] = {
                    "y": torch.tensor(
                        object_node_feature_matrix[object_nodes_label_key].values,
                        dtype=torch.float,
                    ),
                    "x": torch.tensor(
                        object_node_feature_matrix.drop(
                            columns=[object_nodes_label_key]
                        ).values,
                        dtype=torch.float,
                    ),
                }
        return node_features_dict


testhoeg = test_HOEG()

In [5]:
failure_ids = []
failures = []
for i, fg in enumerate(csfs.feature_graphs):
    object_feature_vector_map, object_node_map = testhoeg._get_object_mapping(
        feature_graph=fg,
        object_to_feature_vector_map=cs_objects_data["object_feature_vector_map"],
    )
    if (not object_feature_vector_map) or (not object_node_map):
        failure_ids.append(i)
        failures.extend(n.objects for n in fg.nodes)
failures = [f[1] for f in flatten(failures)]

In [4]:
failure_ids = []
success_ids = []
for i, fg in enumerate(csfs.feature_graphs):
    if fg.objects:
        success_ids.append(i)
    else:
        failure_ids.append(i)
print(len(failure_ids))
print(len(success_ids))

0
161838


In [57]:
object_occurences = []
for e in cs_data["ocel:events"].values():
    object_occurences.extend(e["ocel:omap"])

In [77]:
df = pd.DataFrame(object_occurences)
df = df[df[0].isin(failures)]
sub_object_occs = df[0].values.tolist()

In [81]:
df.value_counts()

KRS-7839107    6
KRV-4590307    6
KRV-5810932    6
KRS-2399318    6
KRS-5541891    6
              ..
KRS-4777755    1
KRS-4777754    1
KRS-4777494    1
KRS-4777491    1
KRV-9999724    1
Length: 81590, dtype: int64

: 

In [47]:
object_occurences[:4]
len(object_occurences)

1828193

In [5]:
def check_list_item_type(lst, item_type):
    return all(isinstance(item, item_type) for item in lst)


failures = []
for i, csfg in enumerate(csfs.feature_graphs):
    object_feature_vector_map, object_node_map = testhoeg._get_object_mapping(
        feature_graph=csfg,
        object_to_feature_vector_map=cs_objects_data["object_feature_vector_map"],
    )
    if (
        "krv" in object_feature_vector_map
        and type(object_feature_vector_map["krv"]) == list
    ):
        test = object_feature_vector_map["krv"]
        p = check_list_item_type(test, int)
        if not p:
            failures.append(test)

len(failures)

148

In [1]:
failures = flatten(failures)

NameError: name 'failures' is not defined

In [23]:
csfg = csfs.feature_graphs[248916]
object_feature_vector_map, object_node_map = testhoeg._get_object_mapping(
    feature_graph=csfg,
    object_to_feature_vector_map=cs_objects_data["object_feature_vector_map"],
)
node_features = testhoeg._get_node_features(
    event_node_type=event_node_type,
    object_node_types=object_node_types,
    feature_graph=csfg,
    object_feature_matrices=cs_objects_data["object_feature_matrices"],
    object_id_to_feature_matrix_index=object_feature_vector_map,
    object_nodes_label_key=object_nodes_label_key,
)

IndexError: list index out of range

In [None]:
node_features
object_feature_vector_map, object_node_map
n: FeatureStorage.Feature_Graph.Node = csfg.nodes[0]
n.objects

[('krv', 'KRV-9789608'), ('krs', 'KRS-1814915')]

In [75]:
eventx = node_features[event_node_type]["x"]

hetero_data = HeteroData()
# Attach feature matrices and target variables
hetero_data[event_node_type].x = eventx
hetero_data[event_node_type].y = eventx.shape[0]
for object_node_type in object_node_types:
    if not object_node_type in node_features:
        # continue # if object type not related to current process execution graph, skip loop and try next object type
        hetero_data[object_node_type].x = torch.tensor([])
        hetero_data[object_node_type].y = torch.tensor([])
    else:
        hetero_data[object_node_type].x = node_features[object_node_type]["x"]
        hetero_data[object_node_type].y = node_features[object_node_type]["y"]
# Define edge index per edge type
for edge_type in edge_types:
    subject, predicate, object = edge_type[0], edge_type[1], edge_type[2]
    hetero_data[subject, predicate, object].edge_index = edge_index = torch.tensor(
        [[0, 0, 1, 2], [1, 2, 3, 4]], dtype=torch.long
    )
    # = edge_index_dict[edge_type]

In [23]:
x = cs_objects_data["object_feature_vector_map"]
sample_dict(x, 1)

{'krs': {'KRS-6830105': 0,
  'KRS-2447124': 1,
  'KRS-5616964': 2,
  'KRS-6384762': 3,
  'KRS-7457068': 4,
  'KRS-9329287': 5,
  'KRS-7031790': 6,
  'KRS-7758130': 7,
  'KRS-4615297': 8,
  'KRS-8237410': 9,
  'KRS-6226077': 10,
  'KRS-754969': 11,
  'KRS-6739966': 12,
  'KRS-8382240': 13,
  'KRS-9844936': 14,
  'KRS-8075031': 15,
  'KRS-464858': 16,
  'KRS-7718580': 17,
  'KRS-1343391': 18,
  'KRS-9087536': 19,
  'KRS-1160964': 20,
  'KRS-2761065': 21,
  'KRS-250163': 22,
  'KRS-8134349': 23,
  'KRS-2148122': 24,
  'KRS-7829076': 25,
  'KRS-8332369': 26,
  'KRS-4987561': 27,
  'KRS-9032607': 28,
  'KRS-7784126': 29,
  'KRS-7546058': 30,
  'KRS-4808278': 31,
  'KRS-3530053': 32,
  'KRS-8664599': 33,
  'KRS-7673530': 34,
  'KRS-6958958': 35,
  'KRS-3142784': 36,
  'KRS-8330093': 37,
  'KRS-9621568': 38,
  'KRS-4580074': 39,
  'KRS-3111667': 40,
  'KRS-9781287': 41,
  'KRS-5878935': 42,
  'KRS-5456604': 43,
  'KRS-8785011': 44,
  'KRS-69594': 45,
  'KRS-4433582': 46,
  'KRS-5729181': 47,


In [24]:
x = bpi_objects_data["object_feature_vector_map"]
sample_dict(x, 1)

{'application': {'Application_652823628': 0,
  'Application_1691306052': 1,
  'Application_428409768': 2,
  'Application_1746793196': 3,
  'Application_828200680': 4,
  'Application_1085880569': 5,
  'Application_1266995739': 6,
  'Application_1878239836': 7,
  'Application_619403287': 8,
  'Application_1710223761': 9,
  'Application_1529124572': 10,
  'Application_387012864': 11,
  'Application_1120819670': 12,
  'Application_42838382': 13,
  'Application_180547487': 14,
  'Application_1966208034': 15,
  'Application_1806387393': 16,
  'Application_1111870538': 17,
  'Application_1017492916': 18,
  'Application_2082119944': 19,
  'Application_758985626': 20,
  'Application_1018615109': 21,
  'Application_319587802': 22,
  'Application_1846626914': 23,
  'Application_546206358': 24,
  'Application_196483749': 25,
  'Application_1363980603': 26,
  'Application_1553362978': 27,
  'Application_1197857445': 28,
  'Application_931862671': 29,
  'Application_2012588584': 30,
  'Application_1

In [7]:
bpi_objects_data["object_feature_vector_map"]

{'application': {'Application_652823628': 0,
  'Application_1691306052': 1,
  'Application_428409768': 2,
  'Application_1746793196': 3,
  'Application_828200680': 4,
  'Application_1085880569': 5,
  'Application_1266995739': 6,
  'Application_1878239836': 7,
  'Application_619403287': 8,
  'Application_1710223761': 9,
  'Application_1529124572': 10,
  'Application_387012864': 11,
  'Application_1120819670': 12,
  'Application_42838382': 13,
  'Application_180547487': 14,
  'Application_1966208034': 15,
  'Application_1806387393': 16,
  'Application_1111870538': 17,
  'Application_1017492916': 18,
  'Application_2082119944': 19,
  'Application_758985626': 20,
  'Application_1018615109': 21,
  'Application_319587802': 22,
  'Application_1846626914': 23,
  'Application_546206358': 24,
  'Application_196483749': 25,
  'Application_1363980603': 26,
  'Application_1553362978': 27,
  'Application_1197857445': 28,
  'Application_931862671': 29,
  'Application_2012588584': 30,
  'Application_1

In [7]:
# define object relation types (edge types)
edge_types = [
    ("krv", "cv"),
    ("krs", "krs"),
    ("krs", "krv"),
    ("krv", "krv"),
    ("cv", "cv"),
    ("krs", "cv"),
]

to_undirected = False


def _to_undirected(edge_types):
    undirected_edges = list(edge_types)
    for edge in edge_types:
        reversed_edge = (edge[1], edge[0])
        if reversed_edge not in undirected_edges:
            undirected_edges.append(reversed_edge)
    return undirected_edges


new_edge_types = _to_undirected(edge_types) if to_undirected else edge_types

In [8]:
print(new_edge_types)
print(set(new_edge_types) == set(edge_types))
edge_types

[('krv', 'cv'), ('krs', 'krs'), ('krs', 'krv'), ('krv', 'krv'), ('cv', 'cv'), ('krs', 'cv')]
True


[('krv', 'cv'),
 ('krs', 'krs'),
 ('krs', 'krv'),
 ('krv', 'krv'),
 ('cv', 'cv'),
 ('krs', 'cv')]