In [None]:
# SELF REMINDER. Copy the 'ocpa' directory to the forked one from github, so that I can push updates to github.

# Python native
import time
import random
import pickle
from datetime import timedelta
from statistics import median as median
from tqdm import tqdm
from ast import literal_eval

# Data handling
import pandas as pd
import numpy as np

# Object centric process mining
from ocpa.objects.log.ocel import OCEL
import ocpa.objects.log.importer.ocel.factory as ocel_import_factory  # json/xml import factory
import ocpa.objects.log.importer.csv.factory as csv_import_factory
import ocpa.objects.log.converter.factory as convert_factory
import ocpa.algo.util.filtering.log.time_filtering
import ocpa.algo.util.filtering.log.variant_filtering as trace_filtering

# import ocpa.algo.evaluation.precision_and_fitness.utils as evaluation_utils # COMMENTED OUT BY TIM
# import ocpa.algo.evaluation.precision_and_fitness.evaluator as precision_fitness_evaluator # COMMENTED OUT BY TIM
import ocpa.algo.predictive_monitoring.factory as feature_extractor
from ocpa.algo.predictive_monitoring import time_series
from ocpa.algo.predictive_monitoring import tabular, sequential
from ocpa.algo.discovery.ocpn import algorithm as ocpn_discovery_factory
import ocpa.visualization.oc_petri_net.factory as vis_factory
import ocpa.visualization.log.variants.factory as log_viz

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx  # used for the SHAP edge-attribution plot

# Simple machine learning models and procedure tools and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score
import shap

# Tensorflow deep learning
import tensorflow as tf  # the GNN training loop uses tf.GradientTape / tf.keras directly
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import keras.backend as K

# Custom GNN tools
from gnn_utils import (
    generate_graph_dataset,
    get_ordered_event_list,
    visualize_graph,
    show_remaining_times,
    visualize_instance,
    GraphDataLoader,
    GCN,
    evaluate_gnn,
)

# Imported after gnn_utils on purpose: if gnn_utils selects the DGL backend,
# that selection has to happen before the first `import dgl`
# (assumption about gnn_utils - verify).
import dgl

In [None]:
# Configuration: location of the event log and the CSV import options

filename = "example_logs/mdl/BPI2017-Final.csv"
object_types = ["application", "offer"]
# Options passed to the CSV importer in the import cell; "obj_names" refers to
# the very same list object as `object_types`.
parameters = dict(
    obj_names=object_types,
    val_names=[],
    act_name="event_activity",
    time_name="event_timestamp",
    sep=",",
    take_sample=9999,
)
# No separate object attribute table is supplied.
file_path_object_attribute_table = None


In [None]:
# Read the CSV log configured above and turn it into an OCEL object

ocel_variant = csv_import_factory.TO_OCEL
ocel = csv_import_factory.apply(
    filename,
    ocel_variant,
    parameters,
    file_path_object_attribute_table,
)

In [None]:
# Constructing feature graphs

# Distinct activity labels of the log. They are sorted so that the order of the
# per-activity features below is identical on every run; iterating a plain set
# of strings is not stable across interpreter sessions.
activities = sorted(set(ocel.log.log["event_activity"].tolist()))

# Event-level features: remaining time, a previous-type count, elapsed time,
# plus one preceding-activities feature per distinct activity.
feature_set = [
    (feature_extractor.EVENT_REMAINING_TIME, ()),
    # NOTE(review): "GDSRCPT" is not one of the object types configured for this
    # log ("application", "offer") - confirm this parameter is intended here.
    (feature_extractor.EVENT_PREVIOUS_TYPE_COUNT, ("GDSRCPT",)),
    (feature_extractor.EVENT_ELAPSED_TIME, ()),
] + [(feature_extractor.EVENT_PRECEDING_ACTIVITES, (act,)) for act in activities]
feature_storage = feature_extractor.apply(ocel, feature_set, [])

In [None]:
# Setting up for machine learning tasks

feature_storage.extract_normalized_train_test_split(0.3, state=42)

# keep list of first three events for comparability of regression use case:
# for every feature graph, collect its three smallest event ids.
events_to_remove = []
for g in tqdm(feature_storage.feature_graphs):
    # extend() appends in place; rebuilding the list with `+` on every
    # iteration copies everything collected so far.
    events_to_remove.extend(sorted(n.event_id for n in g.nodes)[:3])

label_order = None

# Filled by the use-case cells: model name -> {metric name: value}
accuracy_dict = {}

In [None]:
# Building train-val-test graphs for DGL

# Hold out 20% of the training indices for validation. The split is seeded so
# that a restarted kernel reproduces the same partition (the upstream
# train/test split is seeded with 42 as well).
train_idx, val_idx = train_test_split(
    feature_storage.training_indices, test_size=0.2, random_state=42
)
x_train, y_train = generate_graph_dataset(
    feature_storage.feature_graphs, train_idx, ocel
)
x_val, y_val = generate_graph_dataset(feature_storage.feature_graphs, val_idx, ocel)
x_test, y_test = generate_graph_dataset(
    feature_storage.feature_graphs, feature_storage.test_indices, ocel
)

In [None]:
# Case study 5 - Graph-based variant visualization

uc5 = "USE CASE 5 - Graph-based variant visualization"
uc5_rule = "_" * len(uc5)
# Banner: title framed by two underscore rules of the same width
print(uc5_rule, uc5, uc5_rule, sep="\n")

# Compute the variant layouts for the log, then show the one belonging to the
# variant stored at index 61.
layouting = log_viz.apply(ocel)
uc5_variant = ocel.variants[61]
print(layouting[uc5_variant])

In [None]:
# Case study 6 - Graph Neural Network Prediction

uc6 = "USE CASE 6 - Graph neural network prediction"
print("_" * len(uc6))
print(uc6)
print("_" * len(uc6))

# Train/validation split of the training indices (80/20). Seeded so the
# partition, and with it the reported metrics, can be reproduced after a
# kernel restart.
train_idx, val_idx = train_test_split(
    feature_storage.training_indices, test_size=0.2, random_state=42
)
x_train, y_train = generate_graph_dataset(
    feature_storage.feature_graphs, train_idx, ocel
)
x_val, y_val = generate_graph_dataset(feature_storage.feature_graphs, val_idx, ocel)
x_test, y_test = generate_graph_dataset(
    feature_storage.feature_graphs, feature_storage.test_indices, ocel
)

# initialize data loaders
# Options shared by all three loaders; only batch size and shuffling differ.
loader_options = dict(add_self_loop=True, make_bidirected=False, on_gpu=False)
train_loader = GraphDataLoader(
    x_train, y_train, batch_size=64, shuffle=True, **loader_options
)
val_loader = GraphDataLoader(
    x_val, y_val, batch_size=64, shuffle=True, **loader_options
)
test_loader = GraphDataLoader(
    x_test, y_test, batch_size=128, shuffle=False, **loader_options
)

# define GCN model
tf.keras.backend.clear_session()
model = GCN(24, 24)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and newer Keras releases no longer honour it.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss_function = tf.keras.losses.MeanAbsoluteError()

# run tensorflow training loop
epochs = 30
iter_idx = np.arange(0, train_loader.__len__())
loss_history = []  # running mean of the step losses within the current epoch (one entry per step)
val_loss_history = []  # validation MAE (one entry per epoch)
step_losses = []  # raw loss of every training step
for epoch in range(epochs):
    print("Running epoch:", epoch)
    # visit the batches in a new random order every epoch
    np.random.shuffle(iter_idx)
    current_loss = step = 0
    for batch_id in tqdm(iter_idx):
        step += 1
        dgl_batch, label_batch = train_loader.__getitem__(batch_id)
        with tf.GradientTape() as tape:
            pred = model(dgl_batch, dgl_batch.ndata["features"])
            loss = loss_function(label_batch, pred)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        step_loss = loss.numpy()
        step_losses.append(step_loss)
        current_loss += step_loss
        loss_history.append(current_loss / step)

    # validate once per epoch
    val_predictions, val_labels = evaluate_gnn(val_loader, model)
    val_loss = tf.keras.metrics.mean_absolute_error(
        np.squeeze(val_labels), np.squeeze(val_predictions)
    ).numpy()
    print("    Validation MAE GNN:", val_loss)
    # Checkpoint after the first epoch and whenever the validation MAE is
    # strictly lower than in every earlier epoch.
    if not val_loss_history or val_loss < np.min(val_loss_history):
        model.save_weights("gnn_checkpoint.tf")
        print("    GNN checkpoint saved.")
    val_loss_history.append(val_loss)

# visualize training progress: running mean loss and raw step loss side by side
training_curves = pd.DataFrame({"loss": loss_history, "step_losses": step_losses})
training_curves.plot(subplots=True, layout=(1, 2), sharey=True)

# restore weights from best epoch
cp_status = model.load_weights("gnn_checkpoint.tf")
cp_status.assert_consumed()

# generate predictions and calculate MAE for train, val & test sets
train_predictions, train_labels = evaluate_gnn(train_loader, model)
val_predictions, val_labels = evaluate_gnn(val_loader, model)
test_predictions, test_labels = evaluate_gnn(test_loader, model)

# baseline: always predict the mean of the training targets
mean_prediction = np.mean(np.array(y_train))
print("MAE baseline: ")
baseline_test_mae = mean_absolute_error(
    test_labels, np.repeat(mean_prediction, len(test_labels))
)
print(baseline_test_mae)
print("MAE GNN: ")
gnn_test_mae = mean_absolute_error(test_predictions, test_labels)
print(gnn_test_mae)

# record performance of GNN
accuracy_dict["gnn"] = {
    "train_MAE": mean_absolute_error(train_predictions, train_labels),
    "val_MAE": mean_absolute_error(val_predictions, val_labels),
    "test_MAE": gnn_test_mae,
}
print(pd.DataFrame(accuracy_dict))
# calculate shap values for the presence of edges for sample instance:
# the third test graph, with self-loops added, is the instance to explain
# visualize_instance(x_test[2], y_test[2])
test_graph = dgl.add_self_loop(x_test[2])
# node feature matrix of that graph as a numpy array
test_features = test_graph.ndata["features"].numpy()
test_features.shape

# define prediction function
def f(edge_selection):
    """Predict with the GNN on copies of ``test_graph`` restricted to selected edges.

    Parameters
    ----------
    edge_selection : iterable of 1-D array-like
        One 0/1 mask per row. Entry ``k`` of a mask decides whether the
        ``k``-th edge of ``test_graph`` is kept. A mask may be shorter than
        the edge list; the remaining trailing edges are always kept
        (presumably the self-loops appended by ``dgl.add_self_loop`` -
        TODO confirm the edge order).

    Returns
    -------
    numpy.ndarray
        The squeezed model output for every mask, in input order.

    Raises
    ------
    ValueError
        If a mask has more entries than ``test_graph`` has edges.
    """
    # Endpoints of all edges; identical for every mask, so fetched once.
    src_all, dst_all = (endpoint.numpy() for endpoint in test_graph.edges())
    n_edges = len(src_all)

    all_preds = []
    for selection in edge_selection:
        selection = np.asarray(selection)
        n_always_kept = n_edges - len(selection)
        if n_always_kept < 0:
            raise ValueError(
                "edge mask has %d entries but the graph only has %d edges"
                % (len(selection), n_edges)
            )
        # Pad the mask with ones so that the trailing edges always stay in the graph.
        keep = np.concatenate([selection, np.ones(n_always_kept)], axis=0).astype(
            "bool"
        )

        # Rebuild the graph from the kept edges and carry over the node data.
        new_graph = dgl.graph(data=(src_all[keep], dst_all[keep]))
        for key in ("features", "remaining_time", "event_indices"):
            new_graph.ndata[key] = test_graph.ndata[key]

        with tf.device("CPU:0"):
            pred = model(new_graph, new_graph.ndata["features"]).numpy().squeeze()

        all_preds.append(pred)

    return np.array(all_preds)

# explain instance
# NOTE(review): the width 4 of both arrays is hard-coded - presumably the
# number of non-self-loop edges of x_test[2]; confirm if another instance is used.
shap_background = np.zeros((1, 4))  # reference input: every maskable edge switched off
shap_instance = np.ones((1, 4))  # input to explain: every maskable edge switched on
explainer = shap.KernelExplainer(f, shap_background)
shap_values = explainer.shap_values(shap_instance, nsamples=1000)
shap_values

plt.clf()
# visualize instance
nx_G = x_test[2].cpu().to_networkx(node_attrs=["remaining_time", "event_indices"])
pos = nx.kamada_kawai_layout(nx_G)
# label every edge that is not a self-loop with its rounded SHAP value
edges = [edge for edge in nx_G.edges() if edge[0] != edge[1]]
edge_labels = dict(zip(edges, np.round(shap_values[0], 2)))
nx.draw(nx_G, pos, with_labels=True, node_color=[[0.7, 0.7, 0.7]], font_size=10)
nx.draw_networkx_edge_labels(nx_G, pos, edge_labels=edge_labels)
plt.savefig("shap_graph.png")

# persist the collected model metrics
pd.DataFrame(accuracy_dict).to_csv("results_table.csv")
