In [5]:
# SELF REMINDER. Copy the 'ocpa' directory to the forked one from github, so that I can push updates to github.

# Python native
import pickle
from statistics import median as median
from tqdm import tqdm
import os
import random
from sys import platform
if platform == "linux" or platform == "linux2":
    os.chdir("/home/tim/Development/OCELFeatureExtractionExperiments/")
elif platform == "win32":
    os.chdir("c:\\Users\\Tim\\Development\\OCELFeatureExtractionExperiments")

# from copy import deepcopy

# Object centric process mining
from ocpa.algo.predictive_monitoring.obj import Feature_Storage as FeatureStorage
import ocpa.algo.predictive_monitoring.factory as feature_factory

# Data handling
# import pandas as pd
import numpy as np

# # Simple machine learning models, procedure tools, and evaluation metrics
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# PyG
import torch

# from replicating.ocpa_PyG_integration.EventGraphDataset import EventGraphDataset
from replicating.ocpa_PyG_integration.EventSubGraphDataset import EventSubGraphDataset

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Global variables
# from replicating.experiment_config import STORAGE_PATH, RANDOM_SEED, TARGET_LABEL
STORAGE_PATH = "data/ocpa-processed"
FEATURE_STORAGE_FILE = "BPI17-scaled-split.fs"
RANDOM_SEED = 3
TARGET_LABEL = (feature_factory.EVENT_REMAINING_TIME, ())
SUBGRAPH_SIZE = 4
EPOCHS = 30
NUM_GRAPHS_PER_BATCH = 64

Torch version: 1.13.1+cpu
Cuda available: False
Torch geometric version: 2.2.0


In [6]:
# Initializing random seeds for maximizing reproducibility
torch.manual_seed(RANDOM_SEED)
def seed_worker(worker_id) -> None:
    # worker_seed = torch.initial_seed() % RANDOM_SEED
    worker_seed = RANDOM_SEED
    np.random.seed(worker_seed)
    random.seed(worker_seed)

generator = torch.Generator().manual_seed(RANDOM_SEED)
# torch.use_deterministic_algorithms(True) # incompatible with GCN

In [8]:
# If FEATURE_STORAGE_FILE not cached, generate it
if not os.path.exists(f"{STORAGE_PATH}/raw/{FEATURE_STORAGE_FILE}"):
    if not os.path.exists(f"{STORAGE_PATH}/raw/BPI17-feature_storage-[C2,D1,P2,P3,O3].fs"):
        from importing_ocel import feature_storage
    else:
        with open(
            f"{STORAGE_PATH}/raw/BPI17-feature_storage-[C2,D1,P2,P3,O3].fs", "rb"
        ) as file:
            feature_storage: FeatureStorage = pickle.load(file)
    
    # Adams didn't give this split a random seed, 
    # thus we can split the validation set in this arbitrary manner
    feature_storage.extract_normalized_train_test_split(
        test_size=0.3,
        validation_size=0.7*0.2, 
        scaler=StandardScaler,
        scaling_exempt_features=[],
        state=RANDOM_SEED,
    )

    with open(
        f"{STORAGE_PATH}/raw/{FEATURE_STORAGE_FILE}",
        "wb",
    ) as file:
        pickle.dump(feature_storage, file)

ds_train = EventSubGraphDataset(
    train=True,
    root=STORAGE_PATH,
    filename=FEATURE_STORAGE_FILE,
    label_key=TARGET_LABEL,
    size_subgraph_samples=SUBGRAPH_SIZE,
    verbosity=51,
)
ds_val = EventSubGraphDataset(
    validation=True,
    root=STORAGE_PATH,
    filename=FEATURE_STORAGE_FILE,
    label_key=TARGET_LABEL,
    size_subgraph_samples=SUBGRAPH_SIZE,
    verbosity=51,
)
ds_test = EventSubGraphDataset(
    test=True,
    root=STORAGE_PATH,
    filename=FEATURE_STORAGE_FILE,
    label_key=TARGET_LABEL,
    size_subgraph_samples=SUBGRAPH_SIZE,
    verbosity=51,
)

  result = np.asarray(values, dtype=dtype)
  result = np.asarray(values, dtype=dtype)
  result = np.asarray(values, dtype=dtype)


No EventSubGraphDataset found with this configuration in 'data\ocpa-processed\processed'. Proceeding to processing...


Processing...
17645it [09:46, 30.09it/s]
Done!


In [10]:
print("Train set")
print(ds_train.get_summary())
print()

# Val and test look the same (most likely)
# print("Validation set")
# print(ds_val.get_summary())
# print()

# print("Test set")
# print(ds_test.get_summary())
# print()

Train set


100%|██████████| 168030/168030 [00:41<00:00, 4003.36it/s]


EventSubGraphDataset (#graphs=168030):
+------------+----------+----------+
|            |   #nodes |   #edges |
|------------+----------+----------|
| mean       |        4 |      3   |
| std        |        0 |      0.6 |
| min        |        4 |      0   |
| quantile25 |        4 |      3   |
| median     |        4 |      3   |
| quantile75 |        4 |      3   |
| max        |        4 |      5   |
+------------+----------+----------+



In [4]:
from model import GCN, GAT
import torch
from torch_geometric.loader import DataLoader
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def count_parameters(model) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# Initialize model
model = GCN(
    ds_train.num_node_features,
    {
        "num_hidden_features": ds_train.num_node_features,
        "size_subgraph_samples": SUBGRAPH_SIZE,
    },
)
print(model)
print(f"Number of parameters: {count_parameters(model)}")


# Use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
model = model.to(device)
# data = ds_train.to(device)

# Initialize Optimizer
learning_rate = 0.01
# decay = 5e-4
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    #  weight_decay=decay
)

mse = torch.nn.MSELoss()
mae = torch.nn.L1Loss()
loss_fn = mae

train_loader = DataLoader(
    ds_train,
    batch_size=NUM_GRAPHS_PER_BATCH,
    shuffle=False,
    pin_memory=True,
    num_workers = 4,
    worker_init_fn=seed_worker,
    generator=generator,
)
val_loader = DataLoader(
    ds_val,
    batch_size=NUM_GRAPHS_PER_BATCH,
    shuffle=False,
    pin_memory=True,
    num_workers = 4,
    worker_init_fn=seed_worker,
    generator=generator,
)
test_loader = DataLoader(
    ds_test,
    batch_size=128,
    shuffle=False,
    pin_memory=True,
    num_workers = 4,
    worker_init_fn=seed_worker,
    generator=generator,
)


def train_one_epoch(
    epoch_index: int, model, train_loader, optimizer, loss_fn, tb_writer
) -> float:
    # Enumerate over the data
    running_loss = 0.0
    last_loss = 0
    for i, batch in enumerate(tqdm(train_loader)):
        # Use GPU
        batch.to(device)
        # Every data instance is an input + label pair
        inputs, adjacency_matrix, labels = (
            batch.x.float(),  # k times the batch_size, where k is the subgraph size
            batch.edge_index,
            batch.y.float(),
        )
        # Reset gradients (set_to_none is faster than to zero)
        optimizer.zero_grad(set_to_none=True)
        # Passing the node features and the connection info
        outputs = model(inputs, adjacency_matrix)
        # Compute loss and gradients
        loss = loss_fn(torch.squeeze(outputs), labels)
        loss.backward()
        # Adjust learnable weights
        optimizer.step()
        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000  # loss per batch
            print(f"  batch {i + 1} loss: {last_loss}")
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar("Loss/train", last_loss, tb_x)
            running_loss = 0.0

    return last_loss


def run_training(
    num_epochs, model, train_loader, validation_loader, optimizer, loss_fn, timestamp
):
    model_path = f"models/{model.get_class_name()}_{timestamp}"
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    writer = SummaryWriter(f"{model_path}/run")
    best_vloss = 1_000_000_000_000_000.0

    for epoch in range(num_epochs):
        print(f"EPOCH {epoch + 1}:")

        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_one_epoch(
            epoch, model, train_loader, optimizer, loss_fn, writer
        )

        # We don't need gradients on to do reporting
        # model.train(False)

        running_vloss = 0.0
        for i, vdata in enumerate(validation_loader):
            vdata.to(device)
            vinputs, vadjacency_matrix, vlabels = (
                vdata.x.float(),
                vdata.edge_index,
                vdata.y.float(),
            )
            voutputs = model(vinputs, vadjacency_matrix)
            vloss = loss_fn(voutputs, vlabels)
            running_vloss += vloss

        avg_vloss = running_vloss / (i + 1)
        print(f"LOSS train {avg_loss} valid {avg_vloss}")

        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalars(
            "Training vs. Validation Loss",
            {"Training": avg_loss, "Validation": avg_vloss},
            epoch + 1,
        )
        writer.flush()

        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            torch.save(model.state_dict(), f"{model_path}/state_dict_epoch{epoch}.pt")

def evaluate_model(
    model, dataloader, metric: callable, device: str = "cpu", verbose: bool = False
) -> float or list[float]:
    with torch.no_grad():

        def _eval_batch(batch, model):
            batch_inputs, batch_adjacency_matrix, batch_labels = (
                batch.x.float(),
                batch.edge_index,
                batch.y.float(),
            )
            return model(batch_inputs, batch_adjacency_matrix), batch_labels

        model.eval()
        model.train(False)
        model.to(device)
        y_preds = torch.tensor([]).to(device)
        y_true = torch.tensor([]).to(device)
        for batch in tqdm(dataloader, disable=not (verbose)):
            batch.to(device)
            batch_y_preds, batch_y_true = _eval_batch(batch, model)
            y_preds = torch.cat((y_preds, batch_y_preds))
            y_true = torch.cat((y_true, batch_y_true))
        y_preds = torch.squeeze(y_preds)
    return metric(y_preds.to("cpu"), y_true.to("cpu"))



GCN(
  (gconv1): GCNConv(24, 24)
  (gconv2): GCNConv(24, 24)
  (out): Linear(in_features=96, out_features=1, bias=True)
)
Number of parameters: 1297
Device: cuda:0


In [None]:

# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime("%Y%m%d_%Hh%Mm")

run_training(
    num_epochs=EPOCHS,
    model=model,
    train_loader=train_loader,
    validation_loader=val_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    timestamp=timestamp,
)


In [5]:
model_path0 = "/home/tim/Development/OCELFeatureExtractionExperiments/models/GCN_20230315_17h07m/state_dict_epoch26.pt"
model_path = "/home/tim/Development/OCELFeatureExtractionExperiments/models/GCN_20230315_23h22m/state_dict_epoch6.pt"
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [12]:
normed_train_mae = evaluate_model(
    model=model,
    dataloader=train_loader,
    metric=mae,
    device=device,
    verbose=True,
)
normed_train_mae

100%|██████████| 2626/2626 [00:30<00:00, 86.32it/s] 


tensor(0.4440)

In [13]:
normed_val_mae = evaluate_model(
    model=model,
    dataloader=val_loader,
    metric=mae,
    device=device,
    verbose=True,
)
normed_val_mae

100%|██████████| 657/657 [00:07<00:00, 86.79it/s] 


tensor(0.4535)

In [14]:
normed_test_mae = evaluate_model(
    model=model,
    dataloader=test_loader,
    metric=mae,
    device=device,
    verbose=True,
)
normed_test_mae

100%|██████████| 699/699 [00:13<00:00, 51.52it/s]


tensor(0.4386)

In [9]:
with open(
    f"{STORAGE_PATH}/raw/{FEATURE_STORAGE_FILE}",
    "rb",
) as file:
    fs: FeatureStorage = pickle.load(file)


In [17]:
train_mae = fs.scaler.inverse_transform([normed_train_mae]*25)[-2]
val_mae = fs.scaler.inverse_transform([normed_val_mae]*25)[-2]
test_mae = fs.scaler.inverse_transform([normed_test_mae]*25)[-2]

print("Mean absolute error denormalized:")
print('Train MAE: ', train_mae)
print('Val MAE:   ', val_mae)
print('Test MAE:  ', test_mae)

Mean absolute error denormalized:
Train MAE:  1721453.9
Val MAE:    1732891.6
Test MAE:   1715052.8
