In [1]:
# Python native
import os

os.chdir("/home/tim/Development/OCPPM/")

import pickle
import random
import json
from copy import copy
from datetime import datetime
from statistics import median as median
from sys import platform
from typing import Any, Callable

# Data handling
import numpy as np
import ocpa.algo.predictive_monitoring.factory as feature_factory

# PyG
import torch

# PyTorch TensorBoard support
import torch.utils.tensorboard

# Object centric process mining
from ocpa.algo.predictive_monitoring.obj import Feature_Storage as FeatureStorage

# # Simple machine learning models, procedure tools, and evaluation metrics
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.tensorboard.writer import SummaryWriter
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
from tqdm import tqdm
from torch import tensor

# Custom imports
# from loan_application_experiment.feature_encodings.efg.efg import EFG
from loan_application_experiment.feature_encodings.hoeg.hoeg import HOEG

# from importing_ocel import build_feature_storage, load_ocel, pickle_feature_storage
from loan_application_experiment.models.geometric_models import (
    AGNN_EFG,
    AdamsGCN,
    GraphModel,
    HigherOrderGNN_EFG,
)
import torch_geometric.nn as pygnn
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as O

# Configuration
bpi17_hoeg_config = {
    "STORAGE_PATH": "data/BPI17/feature_encodings/HOEG/hoeg",
    "SPLIT_FEATURE_STORAGE_FILE": "BPI_split_[C2_P2_P3_P5_O3_Action_EventOrigin_OrgResource].fs",
    "TARGET_LABEL": (feature_factory.EVENT_REMAINING_TIME, ()),
    'OBJECTS_DATA_DICT': "bpi17_ofg+oi_graph+app_node_map+off_node_map.pkl",
    "SUBGRAPH_SIZE": 4,
    "BATCH_SIZE": 64,
    "RANDOM_SEED": 42,
    "EPOCHS": 30,
    'meta_data': (
            ["event", "application", "offer"],
            [
                ("event", "follows", "event"),
                ("event", "interacts", "application"),
                ("event", "interacts", "offer"),
                ("application", "interacts", "application"),
                ("application", "rev_interacts", "event"),
                ("offer", "rev_interacts", "event"),
            ],
        ),
    "early_stopping": 5,
    "optimizer_settings": {
        "lr": 0.001,
        "betas": (0.9, 0.999),
        "eps": 1e-08,
        "weight_decay": 0,
        "amsgrad": False,
    },
    "loss_fn": torch.nn.L1Loss(),
    "verbose": True,
    "skip_cache": False,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

# ADAPTATIONS
bpi17_hoeg_config["EPOCHS"]=64
bpi17_hoeg_config["early_stopping"]=16
bpi17_hoeg_config["optimizer_settings"]={
        "lr": 1e-3,
        "betas": (0.9, 0.999),
        "eps": 1e-08,
        "weight_decay": 0,
        "amsgrad": False,
}
bpi17_hoeg_config["BATCH_SIZE"] = 16

  from .autonotebook import tqdm as notebook_tqdm


Torch version: 1.13.1+cu117
Cuda available: True
Torch geometric version: 2.3.1



In [2]:
def load_hetero_datasets(
    storage_path: str,
    split_feature_storage_file: str,
    objects_data_file: str,
    target_label: tuple[str, tuple],
    transform=None,
    train: bool = True,
    val: bool = True,
    test: bool = True,
    skip_cache: bool = False,
) -> list[HOEG]:
    datasets = []
    if train:
        ds_train = HOEG(
            train=True,
            root=storage_path,
            events_filename=split_feature_storage_file,
            objects_filename=objects_data_file,
            label_key=target_label,
            transform=transform,
            verbosity=51,
            skip_cache=skip_cache,
        )
        datasets.append(ds_train)
    if val:
        ds_val = HOEG(
            validation=True,
            root=storage_path,
            events_filename=split_feature_storage_file,
            objects_filename=objects_data_file,
            label_key=target_label,
            transform=transform,
            verbosity=51,
            skip_cache=skip_cache,
        )
        datasets.append(ds_val)
    if test:
        ds_test = HOEG(
            test=True,
            root=storage_path,
            events_filename=split_feature_storage_file,
            objects_filename=objects_data_file,
            label_key=target_label,
            transform=transform,
            verbosity=51,
            skip_cache=skip_cache,
        )
        datasets.append(ds_test)
    return datasets


def print_hetero_dataset_summaries(
    ds_train: HOEG,
    ds_val: HOEG,
    ds_test: HOEG,
) -> None:
    print("Train set")
    print(ds_train.get_summary(), "\n")
    print("Validation set")
    print(ds_val.get_summary(), "\n")
    print("Test set")
    print(ds_test.get_summary(), "\n")


def prepare_hetero_dataloaders(
    batch_size: int,
    ds_train: HOEG = None,
    ds_val: HOEG = None,
    ds_test: HOEG = None,
    shuffle: bool = True,
    pin_memory: bool = True,
    num_workers: int = 4,
    seed_worker: Callable[[int], None] = None,
    generator: torch.Generator = None,
) -> list[DataLoader]:
    dataloaders = []
    if ds_train:
        train_loader = DataLoader(
            ds_train,
            batch_size=batch_size,
            shuffle=shuffle,
            pin_memory=pin_memory,
            num_workers=num_workers,
            worker_init_fn=seed_worker,
            generator=generator,
        )
        dataloaders.append(train_loader)
    if ds_val:
        val_loader = DataLoader(
            ds_val,
            batch_size=batch_size,
            shuffle=shuffle,
            pin_memory=pin_memory,
            num_workers=num_workers,
            worker_init_fn=seed_worker,
            generator=generator,
        )
        dataloaders.append(val_loader)
    if ds_test:
        test_loader = DataLoader(
            ds_test,
            batch_size=128,
            shuffle=shuffle,
            pin_memory=pin_memory,
            num_workers=num_workers,
            worker_init_fn=seed_worker,
            generator=generator,
        )
        dataloaders.append(test_loader)
    return dataloaders


def count_parameters(model: GraphModel) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def train_one_epoch(target_node_type:str,
    epoch_index: int,
    model: GraphModel,
    train_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    tb_writer: SummaryWriter,
    device: torch.device,
    verbose: bool = True,
) -> float:
    if verbose:
        print(f"EPOCH {epoch_index}:")

    # Enumerate over the data
    running_loss = 0.0
    last_loss = 0
    for i, batch in enumerate(tqdm(train_loader, miniters=25)):
        # Use GPU
        batch.to(device)
        # Every data instance is an input + label pair
        inputs, adjacency_matrix, labels = (
            batch.x_dict,  # k times the batch_size, where k is the subgraph size
            batch.edge_index_dict,
            batch[target_node_type].y
        )
        # Reset gradients (set_to_none is faster than to zero)
        optimizer.zero_grad(set_to_none=True)
        # Passing the node features and the connection info
        outputs = model(inputs, edge_index=adjacency_matrix, batch=batch[target_node_type].batch)
        # Compute loss and gradients
        loss = loss_fn(torch.squeeze(outputs[target_node_type]), labels)
        loss.backward()
        # Adjust learnable weights
        optimizer.step()
        # Gather data and report
        running_loss += loss.item()
        # if i % 100 == 99:
        last_loss = running_loss / 1000  # loss per batch
        if verbose:
            print(f"  batch {i+1} loss: {last_loss}")
        tb_x = epoch_index * len(train_loader) + i
        tb_writer.add_scalar("Mini-batch training loss", last_loss, tb_x)
        running_loss = 0.0

    return last_loss


def run_training(target_node_type:str,
    num_epochs: int,
    model: GraphModel,
    train_loader: DataLoader,
    validation_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    early_stopping_criterion: int,
    timestamp: str,
    device: torch.device,
    verbose: bool = True,
) -> str:
    model_path = f"models/runs/{str(model).split('(')[0]}_{timestamp}"
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    writer = SummaryWriter(f"{model_path}/run")
    best_vloss = 1_000_000_000_000_000.
    epochs_without_improvement = 0
    for epoch in range(num_epochs):
        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_one_epoch(
            target_node_type, epoch, model, train_loader, optimizer, loss_fn, writer, device, verbose
        )

        # We don't need gradients on to do reporting
        model.train(False)

        running_vloss = 0.0
        for i, vbatch in enumerate(validation_loader, start=1):
            vbatch.to(device)
            vinputs, vadjacency_matrix, vlabels = (
                vbatch.x_dict,
                vbatch.edge_index_dict,
                vbatch[target_node_type].y,
            )
            voutputs = model(vinputs, vadjacency_matrix)
            vloss = loss_fn(voutputs[target_node_type], vlabels)
            running_vloss += vloss

        avg_vloss = running_vloss / i
        if verbose:
            print(f"Epoch loss - train: {avg_loss} valid: {avg_vloss}")

        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalars(
            "Epoch loss",
            {"train": avg_loss, "valid": avg_vloss},
            epoch,
        )
        writer.flush()

        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), f"{model_path}/state_dict_epoch{epoch}.pt")
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= early_stopping_criterion:
            print(f"Early stopping after {epoch+1} epochs.")
            break
    return model_path


def evaluate_hetero_model(target_node_type:str,
    model: GraphModel,
    dataloader: DataLoader,
    metric: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    device: torch.device = torch.device("cpu"),
    verbose: bool = False,
) -> torch.Tensor:
    with torch.no_grad():

        def _eval_batch(batch, model):
            batch_inputs, batch_adjacency_matrix, batch_labels = (
                batch.x_dict,
                batch.edge_index_dict,
                batch[target_node_type].y
            )
            return (
                model(
                    batch_inputs, edge_index=batch_adjacency_matrix, batch=batch[target_node_type].batch
                ),
                batch_labels,
            )

        model.eval()
        model.train(False)
        model.to(device)
        y_preds = torch.tensor([]).to(device)
        y_true = torch.tensor([]).to(device)
        for batch in tqdm(dataloader, disable=not (verbose)):
            batch.to(device)
            batch_y_preds, batch_y_true = _eval_batch(batch, model)
            # append
            y_preds = torch.cat((y_preds, batch_y_preds[target_node_type]))
            y_true = torch.cat((y_true, batch_y_true))
        y_preds = torch.squeeze(y_preds)
    return metric(y_preds.to(device), y_true.to(device))


def evaluate_best_model(target_node_type:str,
    model_state_dir: str,
    train_loader: DataLoader,
    val_loader: DataLoader,
    test_loader: DataLoader,
    model: GraphModel,
    metric: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    device: torch.device,
    verbose: bool = True,
) -> dict[str, torch.Tensor]:
    def find_latest_state_dict(dir: str) -> str:
        latest_state_dict_path = sorted(
            [
                item
                for item in os.listdir(dir)
                if len(item.split("state_dict_epoch")) == 2
            ]
        )[-1]
        return os.path.join(dir, latest_state_dict_path)

    best_state_dict = torch.load(
        find_latest_state_dict(model_state_dir), map_location=device
    )

    model.load_state_dict(best_state_dict)
    model.eval()
    evaluation = {
        f"Train {metric}": evaluate_hetero_model(target_node_type,
            model=model,
            dataloader=train_loader,
            metric=metric,
            device=device,
            verbose=verbose,
        ),
        f"Val {metric}": evaluate_hetero_model(target_node_type,
            model=model,
            dataloader=val_loader,
            metric=metric,
            device=device,
            verbose=verbose,
        ),
        f"Test {metric}": evaluate_hetero_model(target_node_type,
            model=model,
            dataloader=test_loader,
            metric=metric,
            device=device,
            verbose=verbose,
        ),
    }
    return evaluation

In [4]:
# Data preparation
def seed_worker(worker_id: int) -> None:
    # worker_seed = torch.initial_seed() % RANDOM_SEED
    worker_seed = bpi17_hoeg_config["RANDOM_SEED"]
    np.random.seed(worker_seed)
    random.seed(worker_seed)


generator = torch.Generator().manual_seed(bpi17_hoeg_config["RANDOM_SEED"])
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Get data and dataloaders
ds_train, ds_val, ds_test = load_hetero_datasets(
    bpi17_hoeg_config["STORAGE_PATH"],
    bpi17_hoeg_config["SPLIT_FEATURE_STORAGE_FILE"],
    bpi17_hoeg_config['OBJECTS_DATA_DICT'],
    bpi17_hoeg_config["TARGET_LABEL"],
    transform=T.ToUndirected(),
    train=True,
    val=True,
    test=True,
    skip_cache=bpi17_hoeg_config["skip_cache"],
)
# print_hetero_dataset_summaries(ds_train, ds_val,ds_test)
train_loader, val_loader, test_loader = prepare_hetero_dataloaders(
    batch_size=bpi17_hoeg_config["BATCH_SIZE"],
    ds_train=ds_train,
    ds_val=ds_val,
    ds_test=ds_test,
    seed_worker=seed_worker,
    generator=generator,
)


In [5]:
class HeteroGraphConvNet(GraphModel):
    """Implementation of a Attentional Graph Neural Network for EFG"""
    # GraphConvNet(48, 1): 0.4113 MAE (test)

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = pygnn.GraphConv(-1, hidden_channels)
        self.conv2 = pygnn.GraphConv(-1, hidden_channels)
        self.act1 = nn.PReLU()
        self.act2 = nn.PReLU()
        self.pool1 = pygnn.global_mean_pool
        self.lin_out = pygnn.Linear(-1, out_channels)

    def forward(self, x, edge_index, batch=None):
        x = self.conv1(x, edge_index)
        x = self.act1(x)
        x = self.conv2(x, edge_index)
        x = self.act2(x)
        # x = self.pool1(x, batch)
        x = self.lin_out(x)
        return x
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HeteroGraphConvNet(32, 1)
model = pygnn.to_hetero(model, bpi17_hoeg_config['meta_data'])
model.to(device)

# Print summary of data and model
if bpi17_hoeg_config["verbose"]:
    print(model)
    with torch.no_grad():  # Initialize lazy modules, s.t. we can count its parameters.
        batch = next(iter(train_loader))
        batch.to(device)
        out = model(batch.x_dict, batch.edge_index_dict, batch['event'].batch)
        print(f"Number of parameters: {count_parameters(model)}")



GraphModule(
  (conv1): ModuleDict(
    (event__follows__event): GraphConv(-1, 32)
    (event__interacts__application): GraphConv(-1, 32)
    (event__interacts__offer): GraphConv(-1, 32)
    (application__interacts__application): GraphConv(-1, 32)
    (application__rev_interacts__event): GraphConv(-1, 32)
    (offer__rev_interacts__event): GraphConv(-1, 32)
  )
  (act1): ModuleDict(
    (event): PReLU(num_parameters=1)
    (application): PReLU(num_parameters=1)
    (offer): PReLU(num_parameters=1)
  )
  (conv2): ModuleDict(
    (event__follows__event): GraphConv(-1, 32)
    (event__interacts__application): GraphConv(-1, 32)
    (event__interacts__offer): GraphConv(-1, 32)
    (application__interacts__application): GraphConv(-1, 32)
    (application__rev_interacts__event): GraphConv(-1, 32)
    (offer__rev_interacts__event): GraphConv(-1, 32)
  )
  (act2): ModuleDict(
    (event): PReLU(num_parameters=1)
    (application): PReLU(num_parameters=1)
    (offer): PReLU(num_parameters=1)
  )

In [6]:
# Training
print("Training started, progress available in Tensorboard")
torch.cuda.empty_cache()



model_path = run_training(
    target_node_type="event",
    num_epochs=bpi17_hoeg_config["EPOCHS"],
    model=model,
    train_loader=train_loader,  
    validation_loader=val_loader,
    optimizer=O.Adam(
        model.parameters(), **bpi17_hoeg_config["optimizer_settings"]
    ),
    loss_fn=bpi17_hoeg_config["loss_fn"],
    early_stopping_criterion=bpi17_hoeg_config["early_stopping"],
    timestamp=datetime.now().strftime("%Y%m%d_%Hh%Mm"),
    device=bpi17_hoeg_config['device'],
    verbose=False,
)
# Write experiment settings as JSON into model path
with open(os.path.join(model_path, "experiment_settings.json"), "w") as file_path:
    bpi17_hoeg_config["loss_fn"] = str(bpi17_hoeg_config["loss_fn"])
    bpi17_hoeg_config["device"] = str(bpi17_hoeg_config["device"])
    json.dump(bpi17_hoeg_config, file_path)

Training started, progress available in Tensorboard


100%|██████████| 1103/1103 [01:02<00:00, 17.66it/s]
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self

Early stopping after 19 epochs.


In [14]:
# Evaluation
state_dict_path = "models/runs/GraphModule_20230718_16h54m"  # 0.3902 test mae | 21k params (I DO NOT BELIEVE IT)
state_dict_path = "models/runs/GraphModule_20230718_17h02m"  # 0.4182 test mae | 21k params
state_dict_path = "models/runs/GraphModule_20230718_17h07m"  # 0.4354 test mae | 21k params
state_dict_path = "models/runs/GraphModule_20230719_18h06m"  # 0.2251 test mae | 21k params // best so far! (reloading model, re-evaluating: same result)
state_dict_path = "models/runs/GraphModule_20230719_18h52m"  # 0.2185 test mae | 21k params (full re-run of previous model)


# Get MAE results
evaluation_dict = evaluate_best_model(
    target_node_type='event',
    model_state_dir=state_dict_path,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model=model,
    metric=torch.nn.L1Loss(),
    device=bpi17_hoeg_config['device'],
    verbose=bpi17_hoeg_config["verbose"],
)

# Print MAE results
print(evaluation_dict)

  0%|          | 0/1103 [00:00<?, ?it/s]

100%|██████████| 1103/1103 [00:27<00:00, 39.55it/s]
100%|██████████| 276/276 [00:06<00:00, 41.02it/s]
100%|██████████| 74/74 [00:06<00:00, 11.76it/s]

{'Train L1Loss()': tensor(0.2209, device='cuda:0'), 'Val L1Loss()': tensor(0.2224, device='cuda:0'), 'Test L1Loss()': tensor(0.2185, device='cuda:0')}



