In [2]:
import os
import sys
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import wandb

# Resolve the parent of the current working directory and make sure it is on
# sys.path; the `ppm` import that follows depends on it being importable.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from ppm.wandb_utils import fetch_experiments

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Keep an ENTITY that is already exported; otherwise install the default.
# setdefault returns the value that ends up stored in the environment.
entity = os.environ.setdefault("ENTITY", "privajet-university-of-mannheim")
os.environ["WANDB_MODE"] = "offline"

# Echo the effective configuration so the cell output documents the run.
for label, value in (
    ("CWD:", os.getcwd()),
    ("project_root in sys.path:", project_root in sys.path),
    ("ENTITY:", entity),
    ("WANDB_MODE:", os.environ.get("WANDB_MODE")),
):
    print(label, value)

CWD: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results
project_root in sys.path: True
ENTITY: privajet-university-of-mannheim
WANDB_MODE: offline


In [3]:
# Root folder for everything this notebook reads and writes. The default is the
# original cluster location; export PPM_RESULTS_DIR to run the notebook from a
# different checkout without editing the code.
results_root = os.environ.get(
    "PPM_RESULTS_DIR", "/ceph/lfertig/Thesis/notebook/llm-peft-ppm/results"
)
output_dir_csv = os.path.join(results_root, "csv")
output_dir_plots = os.path.join(results_root, "plots")

# Create both folders up front; exist_ok makes the cell safe to re-run.
for _out_dir in (output_dir_csv, output_dir_plots):
    os.makedirs(_out_dir, exist_ok=True)

In [4]:
# Never truncate the rows of a displayed DataFrame.
pd.set_option("display.max_rows", None)

# Colour sequence that matplotlib cycles through for successive artists.
colors = [
    "#9467bd",
    "#2ca02c",
    "#bcbd22",
    "#7f7f7f",
    "#e377c2",
    "#8c564b",
    "#d62728",
    "#17becf",
    "#1f77b4",
    "#ff7f0e",
]

# Shared figure style for every plot produced below.
mpl.rcParams.update({
    # figure and text sizes
    "figure.figsize": (6, 4),
    "font.size": 10,
    "axes.labelsize": 10,
    "axes.titlesize": 10,
    "legend.fontsize": 9,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    # lines and markers
    "lines.linewidth": 1.5,
    "lines.markersize": 5,
    # dashed background grid, legend without a frame
    "axes.grid": True,
    "grid.linestyle": "--",
    "grid.linewidth": 0.5,
    "legend.frameon": False,
    # font type 42 for PDF/PS output
    "pdf.fonttype": 42,
    "ps.fonttype": 42,
    # tight bounding box and 300 dpi for saved figures
    "savefig.bbox": "tight",
    "savefig.dpi": 300,
})

mpl.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)

In [5]:
# Per-log summary table, read from the CSV output folder.
properties = pd.read_csv(os.path.join(output_dir_csv, "log_properties.csv"))
properties

Unnamed: 0,Log,# cases,# evt.,# act.,Trace length
0,BPI20PrepaidTravelCosts,2099,18246,29,8.6927±2.3
1,BPI20RequestForPayment,6886,36796,19,5.3436±1.5
2,BPI20TravelPermitData,7065,86581,51,12.2549±5.6
3,BPI12,13087,262200,24,20.0351±19.9
4,BPI17,31509,1202267,26,38.1563±16.7


In [6]:
# from ppm.models import NextEventPredictor
# import torch 

# use_cuda = torch.cuda.is_available()
# device = "cuda" if use_cuda else "cpu"
# print("Using device:", device)

# rnn_example = NextEventPredictor(
#     embedding_size=32,
#     categorical_cols=["activity"],
#     numerical_cols=["accumulated_time"],
#     categorical_sizes={"activity": 20},
#     categorical_targets=["activity"],
#     numerical_targets=["remaining_time"],
#     backbone_name="rnn",
#     backbone_hidden_size=64,
#     backbone_n_layers=2,
#     padding_idx=0,
#     strategy="sum",
#     backbone_pretrained=False,
#     backbone_finetuning=None,
#     backbone_type="lstm",
#     device=device,
# )
# pprint(rnn_example)

In [7]:
def _to_int_or_none(x):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    try:
        return int(x)
    except Exception:
        return None

def map_setting(row):
    """Map one run to the label of its fine-tuning setting.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must provide ``.get``. The keys ``fine_tuning``, ``few_shot_k``,
        ``freeze_layers`` and ``epochs`` are read; absent keys count as missing.

    Returns
    -------
    str
        ``"FewShot-LoRA"``, ``"LoRA"``, ``"ZeroShot"``, ``"FewShot-Freezing"``,
        ``"Freezing"``, ``"Freezing-[<layer indices>]"`` or ``"Other"``.

    Notes
    -----
    A missing ``freeze_layers`` value may arrive as ``None``, as float NaN
    (pandas' missing-value marker) or as an empty string/list/tuple. All of
    them mean "no explicit layer selection" and map to ``"Freezing"``. The same
    holds when the value contains no parseable layer index, so the function
    never returns the meaningless label ``"Freezing-[]"``.
    """
    ft = row.get("fine_tuning")
    fl = row.get("freeze_layers", None)
    k = _to_int_or_none(row.get("few_shot_k", None))
    ep = _to_int_or_none(row.get("epochs", None))

    if ft == "lora":
        if k == 8:
            return "FewShot-LoRA"   # LoRA trained in the 8-shot setting
        if k is None:
            return "LoRA"           # LoRA without a few-shot restriction
        return "Other"

    if ft != "freeze":
        return "Other"

    # Order matters from here on: zero-shot and few-shot take precedence over
    # any freeze_layers configuration.
    if ep == 0:
        return "ZeroShot"
    if k == 8:
        return "FewShot-Freezing"

    # Plain freezing: no layer selection given. NaN is checked the same way
    # _to_int_or_none does it for the numeric fields.
    fl_is_nan = isinstance(fl, float) and pd.isna(fl)
    if fl is None or fl_is_nan or fl in ("", [], ()):
        return "Freezing"

    # Explicit layer configuration, e.g. "-1,-2" or [0, 1].
    if isinstance(fl, (list, tuple)):
        raw_layers = fl
    else:
        raw_layers = (
            str(fl).replace("[", "").replace("]", "").replace(",", " ").split()
        )
    layers = [v for v in map(_to_int_or_none, raw_layers) if v is not None]
    if not layers:
        # Nothing parseable (e.g. the string "[]"): treat as plain freezing.
        return "Freezing"
    return f"Freezing-{layers}"

In [8]:
# Pickle file used to cache the combined results table between sessions.
pkl_path = os.path.join(output_dir_csv, "global_results.pkl")

# backbone key -> project name handed to fetch_experiments
BACKBONE_PROJECTS = dict([
    ("majority", "llm-peft-ppm_majority_baseline"),
    ("rnn", "llm-peft-ppm_rnn"),
    ("transformer", "llm-peft-ppm_transformer_baseline"),
    ("tabpfn", "llm-peft-ppm_tabpfn_baseline"),
    ("saprpt", "llm-peft-ppm_saprpt_baseline"),
    ("gpt2", "llm-peft-ppm_gpt2"),
    ("gptneo-1b3", "llm-peft-ppm_gpt-neo-1.3B"),
    ("qwen25-05b", "llm-peft-ppm_qwen25-05b"),
    ("llama32-1b", "llm-peft-ppm_llama32-1b"),
    ("gemma-2-2b", "llm-peft-ppm_gemma-2-2b"),
])

def build_global_results():
    """Fetch the runs of every project in ``BACKBONE_PROJECTS`` into one frame.

    Each project is fetched with ``fetch_experiments`` (metrics included) and
    tagged with its ``backbone`` key and ``project`` name. The per-project
    frames are stacked with a fresh index, then reduced to the columns of the
    allow-list below that are actually present.

    Returns
    -------
    pandas.DataFrame
        One row per fetched run, restricted to the allow-listed columns.
    """
    frames = [
        fetch_experiments(
            project=project_name, entity=entity, include_metrics=True
        ).assign(backbone=backbone, project=project_name)
        for backbone, project_name in BACKBONE_PROJECTS.items()
    ]
    combined = pd.concat(frames, ignore_index=True)

    wanted = [
        # run identity and bookkeeping
        "id", "log", "backbone", "project", "fine_tuning",
        "total_params", "trainable_params", "seed", "_runtime", "_timestamp",
        "categorical_features", "categorical_targets",
        "continuous_features", "continuous_targets", "device", "model", "name",
        # test metrics
        "test_next_activity_acc",
        "test_next_activity_loss",
        "test_next_remaining_time_loss",
        "test_next_time_to_next_event_loss",
        "best_test_next_activity_acc",
        "best_test_next_activity_loss",
        "best_test_next_remaining_time_loss",
        "best_test_next_time_to_next_event_loss",
        # hyperparameters
        "batch_size",
        "embedding_size",
        "epochs",
        "freeze_layers",
        "grad_clip",
        "hidden_size",
        "lr",
        "n_layers",
        "rnn_type",
        "strategy",
        "weight_decay",
        "lora_alpha",
        "r",
        "few_shot_k",
    ]

    # Not every project logs every field, so keep only the columns that exist.
    available = combined.columns
    return combined[[name for name in wanted if name in available]]


# Reuse the cached table when it exists and can be read; in every other case
# fetch everything again and (re)write the cache.
cache_loaded = False
if os.path.exists(pkl_path):
    try:
        global_results = pd.read_pickle(pkl_path)
        cache_loaded = True
    except Exception as e:
        print("Fehler beim Laden von global_results.pkl, baue neu:", repr(e))

if not cache_loaded:
    global_results = build_global_results()
    global_results.to_pickle(pkl_path)

In [9]:
cols = [
    "id",
    "log",
    "backbone",
    "project",
    "fine_tuning",
    "total_params",
    "trainable_params",
    "test_next_activity_acc",
    "test_next_activity_loss",
    "test_next_remaining_time_loss",
    "test_next_time_to_next_event_loss",
    "best_test_next_activity_acc",
    "best_test_next_activity_loss",
    "best_test_next_remaining_time_loss",
    "best_test_next_time_to_next_event_loss",
    "_runtime",
    "mt_score",
]

# Keep only runs that report all three metrics the multi-task score needs.
has_all_tasks = (
    global_results["test_next_activity_acc"].notna()
    & global_results["test_next_remaining_time_loss"].notna()
    & global_results["test_next_time_to_next_event_loss"].notna()
)
df = global_results[has_all_tasks].copy()

# One scaler per task. The two losses are negated before scaling so that a
# larger normalised value is better for every task.
# NOTE(review): each scaler is fit on all remaining runs at once, i.e. across
# all logs and backbones — confirm that global normalisation is intended.
sc_acc, sc_rt, sc_nt = MinMaxScaler(), MinMaxScaler(), MinMaxScaler()
for scaler, source_col, target_col, sign in (
    (sc_acc, "test_next_activity_acc", "na_norm", 1),
    (sc_rt, "test_next_remaining_time_loss", "rt_norm", -1),
    (sc_nt, "test_next_time_to_next_event_loss", "nt_norm", -1),
):
    df[target_col] = scaler.fit_transform(sign * df[[source_col]])

# Multi-task score: plain sum of the three normalised task scores.
df["mt_score"] = df["na_norm"] + df["rt_norm"] + df["nt_norm"]

df.head(10)

Unnamed: 0,id,log,backbone,project,fine_tuning,total_params,trainable_params,seed,_runtime,_timestamp,...,rnn_type,strategy,weight_decay,lora_alpha,r,few_shot_k,na_norm,rt_norm,nt_norm,mt_score
0,pox3cg0n,BPI20PrepaidTravelCosts,majority,llm-peft-ppm_majority_baseline,,1.0,1.0,,1.127169,1762725000.0,...,lstm,concat,0.1,,,,0.129842,0.944634,0.940268,2.014744
1,whfyo8uu,BPI12,majority,llm-peft-ppm_majority_baseline,,1.0,1.0,,35.60098,1762849000.0,...,lstm,concat,0.1,,,,0.252059,0.955857,0.943034,2.15095
2,oy378knj,BPI20TravelPermitData,majority,llm-peft-ppm_majority_baseline,,1.0,1.0,,23.808587,1762850000.0,...,lstm,concat,0.1,,,,0.087414,0.951915,0.961259,2.000588
3,3gchqw9a,BPI20RequestForPayment,majority,llm-peft-ppm_majority_baseline,,1.0,1.0,,21.075565,1762850000.0,...,lstm,concat,0.1,,,,0.18847,0.968881,0.975401,2.132752
4,cl197f3r,BPI17,majority,llm-peft-ppm_majority_baseline,,1.0,1.0,,107.474805,1762850000.0,...,lstm,concat,0.1,,,,0.169815,0.972109,0.963413,2.105338
5,vl6y1axq,BPI12,rnn,llm-peft-ppm_rnn,,88733.0,88733.0,41.0,64.434233,1764079000.0,...,lstm,sum,0.01,,,,0.896924,0.890896,0.959974,2.747794
6,wwv4s6ta,BPI12,rnn,llm-peft-ppm_rnn,,88733.0,88733.0,42.0,65.57315,1764080000.0,...,lstm,sum,0.01,,,,0.865574,0.885115,0.957324,2.708013
7,xjcuy92g,BPI12,rnn,llm-peft-ppm_rnn,,88733.0,88733.0,43.0,66.748843,1764080000.0,...,lstm,sum,0.01,,,,0.882514,0.882064,0.956911,2.721489
8,r5ax60vx,BPI12,rnn,llm-peft-ppm_rnn,,88733.0,88733.0,44.0,60.193704,1764080000.0,...,lstm,sum,0.01,,,,0.87506,0.886308,0.959447,2.720816
9,s5xrzrsn,BPI12,rnn,llm-peft-ppm_rnn,,88733.0,88733.0,45.0,68.910497,1764080000.0,...,lstm,sum,0.01,,,,0.902644,0.889049,0.956773,2.748466


In [10]:
# Metric columns that are summarised per group.
METRICS = [
    "test_next_activity_acc",
    "test_next_activity_loss",
    "test_next_remaining_time_loss",
    "test_next_time_to_next_event_loss",
    "best_test_next_activity_acc",
    "best_test_next_activity_loss",
    "best_test_next_remaining_time_loss",
    "best_test_next_time_to_next_event_loss",
]


def agg_over_seeds(group: pd.DataFrame) -> pd.Series:
    """Summarise one group of runs (intended for ``groupby(...).apply``).

    Parameters
    ----------
    group : pandas.DataFrame
        The runs of one group. Columns that are absent are skipped.

    Returns
    -------
    pandas.Series
        ``n_runs`` (row count), ``total_params`` / ``trainable_params`` taken
        from the first row, and ``<col>_mean`` / ``<col>_std`` for
        ``mt_score``, ``_runtime`` and every column listed in ``METRICS``.
        With pandas' default sample standard deviation, a column with a
        single value yields NaN for ``<col>_std``.
    """
    summary = {"n_runs": len(group)}

    # Parameter counts are copied from the first run of the group.
    summary.update(
        {
            col: group[col].iloc[0]
            for col in ("total_params", "trainable_params")
            if col in group.columns
        }
    )

    for col in ("mt_score", "_runtime"):
        if col in group.columns:
            values = group[col]
            summary[f"{col}_mean"] = values.mean()
            summary[f"{col}_std"] = values.std()

    for metric in METRICS:
        if metric in group.columns:
            observed = group[metric].dropna()
            summary[f"{metric}_mean"] = observed.mean()
            summary[f"{metric}_std"] = observed.std()

    return pd.Series(summary)

In [11]:
# The majority baseline is summarised per (log, backbone) only.
majority = df.loc[df["backbone"].eq("majority")].copy()
majority_grouped = (
    majority.groupby(["log", "backbone"], dropna=False)
    .apply(agg_over_seeds)
    .reset_index()
)

BASELINE_BACKBONES = ["rnn", "transformer", "tabpfn", "saprpt"]
baseline = df.loc[df["backbone"].isin(BASELINE_BACKBONES)].copy()

# Columns that are NOT treated as baseline hyperparameters: identifiers,
# bookkeeping, derived scores, train-side metrics and the test metrics.
NON_HP_COLS = {
    "id", "log", "backbone", "categorical_features", "categorical_targets",
    "continuous_features", "continuous_targets", "device", "project", "model",
    "name", "fine_tuning", "lora_alpha", "r", "few_shot_k", "seed", "_runtime", "_timestamp",
    "na_norm", "rt_norm", "nt_norm", "mt_score", "majority_stat",
    "total_params", "trainable_params", "best_train_next_remaining_time_loss",
    "_step", "best_train_next_activity_loss", "train_next_time_to_next_event_loss",
    "best_train_next_time_to_next_event_loss", "train_next_activity_acc",
    "train_next_activity_loss", "_wandb.runtime", "best_train_next_activity_acc",
    "train_next_remaining_time_loss", "persist_model", "project_name", "wandb",
} | set(METRICS)

# Every remaining column is a hyperparameter (column order is preserved).
HP_COLS = [name for name in baseline.columns if name not in NON_HP_COLS]
print("Hyperparameter columns:", HP_COLS)

Hyperparameter columns: ['batch_size', 'embedding_size', 'epochs', 'freeze_layers', 'grad_clip', 'hidden_size', 'lr', 'n_layers', 'rnn_type', 'strategy', 'weight_decay']


In [12]:
# One group per (log, backbone, hyperparameter combination); the runs inside a
# group are summarised by agg_over_seeds.
group_cols = ["log", "backbone", *HP_COLS]

baseline_grouped = (
    baseline.groupby(group_cols, dropna=False)
    .apply(agg_over_seeds)
    .reset_index()
)

# Rank by the multi-task score; fall back to accuracy if it is unavailable.
score_col = (
    "mt_score_mean"
    if "mt_score_mean" in baseline_grouped.columns
    else "test_next_activity_acc_mean"
)

# Best configuration per (log, backbone).
idx_best = baseline_grouped.groupby(["log", "backbone"])[score_col].idxmax()
baseline_best = baseline_grouped.loc[idx_best].reset_index(drop=True)

baseline_all = pd.concat([baseline_best, majority_grouped], ignore_index=True)

# Display names; keys without an entry keep their raw value.
DATASET_MAP = {
    "BPI12": "BPI12",
    "BPI17": "BPI17",
    "BPI20PrepaidTravelCosts": "BPI20PTC",
    "BPI20RequestForPayment": "BPI20RfP",
    "BPI20TravelPermitData": "BPI20TPD",
}
BACKBONE_MAP = {
    "majority": "Majority",
    "rnn": "RNN",
    "transformer": "Transformer",
    "tabpfn": "TabPFN",
    "saprpt": "SAP-RPT",
}

baseline_all["Dataset"] = baseline_all["log"].map(DATASET_MAP).fillna(baseline_all["log"])
baseline_all["Backbone_pretty"] = (
    baseline_all["backbone"].map(BACKBONE_MAP).fillna(baseline_all["backbone"])
)

# "<mean> ± <std>" text column per metric, both values rounded to 4 decimals.
for m in METRICS:
    mean_col, std_col = f"{m}_mean", f"{m}_std"
    if {mean_col, std_col}.issubset(baseline_all.columns):
        mean_txt = baseline_all[mean_col].round(4).astype(str)
        std_txt = baseline_all[std_col].round(4).astype(str)
        baseline_all[f"{m}_mean_std"] = mean_txt + " ± " + std_txt

# Runtime: seconds -> hours, then one formatted "<mean> ± <std>" column.
for stat in ("mean", "std"):
    if f"_runtime_{stat}" in baseline_all.columns:
        baseline_all[f"runtime_{stat}_h"] = baseline_all[f"_runtime_{stat}"] / 3600.0

if "runtime_mean_h" in baseline_all.columns and "runtime_std_h" in baseline_all.columns:
    mean_str = baseline_all["runtime_mean_h"].map("{:.5f}".format)
    std_str = baseline_all["runtime_std_h"].map("{:.5f}".format)
    baseline_all["Runtime (h)"] = mean_str + " ± " + std_str

# The numeric runtime helper columns are not part of the exported table.
cols_to_drop = [
    c
    for c in ("_runtime_mean", "_runtime_std", "runtime_mean_h", "runtime_std_h")
    if c in baseline_all.columns
]
baseline_all = baseline_all.drop(columns=cols_to_drop)

csv_path = os.path.join(output_dir_csv, "baseline_best_settings_mean_std.csv")
baseline_all.to_csv(csv_path, index=False)
print("Saved baseline summary to:", csv_path)

baseline_all

Saved baseline summary to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/baseline_best_settings_mean_std.csv


Unnamed: 0,log,backbone,batch_size,embedding_size,epochs,freeze_layers,grad_clip,hidden_size,lr,n_layers,...,Backbone_pretty,test_next_activity_acc_mean_std,test_next_activity_loss_mean_std,test_next_remaining_time_loss_mean_std,test_next_time_to_next_event_loss_mean_std,best_test_next_activity_acc_mean_std,best_test_next_activity_loss_mean_std,best_test_next_remaining_time_loss_mean_std,best_test_next_time_to_next_event_loss_mean_std,Runtime (h)
0,BPI12,rnn,32.0,128.0,25.0,,5.0,512.0,5e-05,1.0,...,RNN,0.7757 ± 0.0142,0.7689 ± 0.042,1.7738 ± 0.0889,1.3352 ± 0.0302,0.7799 ± 0.0095,0.758 ± 0.0285,1.7033 ± 0.1138,1.3205 ± 0.014,0.01943 ± 0.00025
1,BPI12,saprpt,16.0,16.0,25.0,,5.0,32.0,0.0001,1.0,...,SAP-RPT,0.6294 ± 0.0121,nan ± nan,2.1601 ± 0.1146,1.6706 ± 0.372,nan ± nan,nan ± nan,nan ± nan,nan ± nan,0.00500 ± 0.00093
2,BPI12,tabpfn,16.0,16.0,25.0,,5.0,32.0,0.0001,1.0,...,TabPFN,0.6364 ± 0.0153,nan ± nan,2.2338 ± 0.0937,1.6877 ± 0.4063,nan ± nan,nan ± nan,nan ± nan,nan ± nan,0.12074 ± 0.00085
3,BPI12,transformer,32.0,128.0,10.0,,5.0,128.0,0.001,2.0,...,Transformer,0.7562 ± 0.0246,0.7139 ± 0.1077,2.8388 ± 0.1209,1.3671 ± 0.0495,0.7687 ± 0.016,0.6635 ± 0.0383,2.6204 ± 0.1508,1.3364 ± 0.016,0.00758 ± 0.00008
4,BPI17,rnn,256.0,32.0,25.0,,5.0,512.0,0.0001,1.0,...,RNN,0.8535 ± 0.0013,0.4193 ± 0.0068,0.6725 ± 0.0325,0.7629 ± 0.0159,0.8535 ± 0.0013,0.4193 ± 0.0068,0.6097 ± 0.0138,0.7629 ± 0.0159,0.04166 ± 0.00022
5,BPI17,saprpt,16.0,16.0,25.0,,5.0,32.0,0.0001,1.0,...,SAP-RPT,0.6472 ± 0.0268,nan ± nan,1.3056 ± 0.2773,1.1287 ± 0.1969,nan ± nan,nan ± nan,nan ± nan,nan ± nan,0.00440 ± 0.00005
6,BPI17,tabpfn,16.0,16.0,25.0,,5.0,32.0,0.0001,1.0,...,TabPFN,0.669 ± 0.0194,nan ± nan,1.6011 ± 0.0589,1.1679 ± 0.2366,nan ± nan,nan ± nan,nan ± nan,nan ± nan,0.12188 ± 0.00312
7,BPI17,transformer,32.0,128.0,10.0,,5.0,128.0,0.001,1.0,...,Transformer,0.8556 ± 0.0058,0.4141 ± 0.0136,1.156 ± 0.0803,0.8836 ± 0.0296,0.8596 ± 0.0024,0.407 ± 0.0142,1.1343 ± 0.0847,0.8433 ± 0.0401,0.01953 ± 0.00030
8,BPI20PrepaidTravelCosts,rnn,32.0,128.0,25.0,,5.0,128.0,0.0005,1.0,...,RNN,0.7841 ± 0.0201,0.6578 ± 0.0553,1.0744 ± 0.0308,1.1906 ± 0.0485,0.7897 ± 0.0134,0.6549 ± 0.0492,0.9536 ± 0.027,1.1239 ± 0.0216,0.00316 ± 0.00005
9,BPI20PrepaidTravelCosts,saprpt,16.0,16.0,25.0,,5.0,32.0,0.0001,1.0,...,SAP-RPT,0.7646 ± 0.0054,nan ± nan,1.041 ± 0.0605,1.0863 ± 0.2173,nan ± nan,nan ± nan,nan ± nan,nan ± nan,0.00448 ± 0.00007


In [13]:
LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]

llm = df.loc[df["backbone"].isin(LLM_BACKBONES)].copy()

# Label every run with its fine-tuning setting.
llm["Setting"] = llm.apply(map_setting, axis=1)

# For the LLM runs the LoRA / few-shot fields count as hyperparameters, while
# the derived "Setting" label does not.
NON_HP_COLS_LLM = (set(NON_HP_COLS) - {"lora_alpha", "r", "few_shot_k"}) | {"Setting"}

HP_COLS_LLM = [name for name in llm.columns if name not in NON_HP_COLS_LLM]
print("LLM Hyperparameter columns:", HP_COLS_LLM)

group_cols_llm = ["log", "backbone", "Setting", *HP_COLS_LLM]

# Same aggregation function as for the baselines.
llm_grouped = (
    llm.groupby(group_cols_llm, dropna=False)
    .apply(agg_over_seeds)
    .reset_index()
)

# Rank by the multi-task score; fall back to accuracy if it is unavailable.
score_col = (
    "mt_score_mean"
    if "mt_score_mean" in llm_grouped.columns
    else "test_next_activity_acc_mean"
)

# Best configuration per (log, backbone, setting).
idx_best_llm = llm_grouped.groupby(["log", "backbone", "Setting"])[score_col].idxmax()

llm_all = llm_grouped.loc[idx_best_llm].reset_index(drop=True)

BACKBONE_MAP_LLM = {
    "gpt2": "GPT2",
    "gptneo-1b3": "GPT-Neo-1.3B",
    "qwen25-05b": "Qwen2.5-0.5B",
    "llama32-1b": "Llama3.2-1B",
    "gemma-2-2b": "Gemma-2-2B",
}

# Display names; keys without an entry keep their raw value.
llm_all["Dataset"] = llm_all["log"].map(DATASET_MAP).fillna(llm_all["log"])
llm_all["Backbone_pretty"] = (
    llm_all["backbone"].map(BACKBONE_MAP_LLM).fillna(llm_all["backbone"])
)

# "<mean> ± <std>" text column per metric, both values rounded to 4 decimals.
for m in METRICS:
    mean_col, std_col = f"{m}_mean", f"{m}_std"
    if {mean_col, std_col}.issubset(llm_all.columns):
        mean_txt = llm_all[mean_col].round(4).astype(str)
        std_txt = llm_all[std_col].round(4).astype(str)
        llm_all[f"{m}_mean_std"] = mean_txt + " ± " + std_txt

# Runtime: seconds -> hours, then one formatted "<mean> ± <std>" column.
for stat in ("mean", "std"):
    if f"_runtime_{stat}" in llm_all.columns:
        llm_all[f"runtime_{stat}_h"] = llm_all[f"_runtime_{stat}"] / 3600.0

if "runtime_mean_h" in llm_all.columns and "runtime_std_h" in llm_all.columns:
    mean_str = llm_all["runtime_mean_h"].map("{:.5f}".format)
    std_str = llm_all["runtime_std_h"].map("{:.5f}".format)
    llm_all["Runtime (h)"] = mean_str + " ± " + std_str

# The numeric runtime helper columns are not part of the exported table.
cols_to_drop_llm = [
    c
    for c in ("_runtime_mean", "_runtime_std", "runtime_mean_h", "runtime_std_h")
    if c in llm_all.columns
]
llm_all = llm_all.drop(columns=cols_to_drop_llm)

csv_path = os.path.join(output_dir_csv, "llm_all_settings_by_method_mean_std.csv")
llm_all.to_csv(csv_path, index=False)
print("Saved LLM summary to:", csv_path)

llm_all.head(15)

LLM Hyperparameter columns: ['batch_size', 'embedding_size', 'epochs', 'freeze_layers', 'grad_clip', 'hidden_size', 'lr', 'n_layers', 'rnn_type', 'strategy', 'weight_decay', 'lora_alpha', 'r', 'few_shot_k']
Saved LLM summary to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/llm_all_settings_by_method_mean_std.csv


Unnamed: 0,log,backbone,Setting,batch_size,embedding_size,epochs,freeze_layers,grad_clip,hidden_size,lr,...,Backbone_pretty,test_next_activity_acc_mean_std,test_next_activity_loss_mean_std,test_next_remaining_time_loss_mean_std,test_next_time_to_next_event_loss_mean_std,best_test_next_activity_acc_mean_std,best_test_next_activity_loss_mean_std,best_test_next_remaining_time_loss_mean_std,best_test_next_time_to_next_event_loss_mean_std,Runtime (h)
0,BPI12,gemma-2-2b,FewShot-Freezing,8,2304,10,,5,2304,5e-05,...,Gemma-2-2B,0.0799 ± 0.0341,3.6757 ± 0.2217,1.712 ± 0.236,1.609 ± 0.0568,0.0876 ± 0.0281,3.4803 ± 0.1926,1.3363 ± 0.123,1.5519 ± 0.0226,0.11437 ± 0.00098
1,BPI12,gemma-2-2b,FewShot-LoRA,8,2304,10,,5,2304,5e-05,...,Gemma-2-2B,0.0751 ± 0.0496,4.1549 ± 0.8314,3.5247 ± 2.3271,2.502 ± 1.3303,0.0953 ± 0.0496,3.1654 ± 0.2169,1.4307 ± 0.2371,1.6082 ± 0.0677,0.14829 ± 0.00058
2,BPI12,gemma-2-2b,Freezing,8,2304,10,,5,2304,5e-05,...,Gemma-2-2B,0.3912 ± 0.0314,1.4898 ± 0.0318,2.0048 ± 0.0841,1.5121 ± 0.0317,0.4028 ± 0.0211,1.4816 ± 0.0364,1.9842 ± 0.0928,1.4755 ± 0.0157,0.63840 ± 0.00268
3,BPI12,gemma-2-2b,"Freezing-[-1, -2]",8,2304,10,"-1,-2",5,2304,5e-05,...,Gemma-2-2B,0.7409 ± 0.0644,0.7587 ± 0.1567,2.9607 ± 0.0668,1.407 ± 0.0574,0.7921 ± 0.0109,0.6666 ± 0.0472,2.1902 ± 0.1723,1.3602 ± 0.0079,0.70089 ± 0.00147
4,BPI12,gemma-2-2b,Freezing-[-1],8,2304,10,-1,5,2304,5e-05,...,Gemma-2-2B,0.7481 ± 0.0678,0.8252 ± 0.3164,2.6484 ± 0.1867,1.4281 ± 0.0205,0.8007 ± 0.0061,0.6464 ± 0.0429,2.2036 ± 0.1755,1.3601 ± 0.0117,0.66969 ± 0.00339
5,BPI12,gemma-2-2b,"Freezing-[0, 1]",8,2304,10,01,5,2304,5e-05,...,Gemma-2-2B,0.7328 ± 0.0603,0.7753 ± 0.0927,2.413 ± 0.1309,1.3021 ± 0.0234,0.797 ± 0.0121,0.6983 ± 0.0244,2.3113 ± 0.1783,1.2989 ± 0.019,0.70158 ± 0.00118
6,BPI12,gemma-2-2b,Freezing-[0],8,2304,10,0,5,2304,5e-05,...,Gemma-2-2B,0.7857 ± 0.0103,0.729 ± 0.0074,2.4203 ± 0.1474,1.3369 ± 0.0233,0.7933 ± 0.0051,0.7206 ± 0.0217,2.2122 ± 0.0645,1.3219 ± 0.0238,0.67204 ± 0.00218
7,BPI12,gemma-2-2b,LoRA,8,2304,10,,5,2304,5e-05,...,Gemma-2-2B,0.7726 ± 0.0182,0.6433 ± 0.0493,2.6577 ± 0.2777,1.4247 ± 0.0643,0.8033 ± 0.0094,0.5982 ± 0.0168,1.912 ± 0.1038,1.3099 ± 0.0157,0.72725 ± 0.00116
8,BPI12,gemma-2-2b,ZeroShot,8,2304,0,,5,2304,5e-05,...,Gemma-2-2B,0.0298 ± 0.0184,4.4444 ± 0.6145,4.3306 ± 3.2487,3.6357 ± 2.514,0.0298 ± 0.0184,4.4444 ± 0.6145,4.3306 ± 3.2487,3.6357 ± 2.514,0.01072 ± 0.00068
9,BPI12,gpt2,FewShot-Freezing,8,768,10,,5,768,5e-05,...,GPT2,0.0372 ± 0.0315,3.5797 ± 0.3214,1.9015 ± 0.8402,1.6738 ± 0.0887,0.0387 ± 0.0323,3.5797 ± 0.3214,1.8622 ± 0.8651,1.5788 ± 0.0816,0.00966 ± 0.00005


In [14]:
# Recompute the setting labels and show how many runs fall into each one.
llm["Setting"] = llm.apply(map_setting, axis=1)
setting_counts = llm["Setting"].value_counts()
print(setting_counts)

Setting
LoRA                 729
Freezing-[-1, -2]    125
Freezing-[0, 1]      125
Freezing-[0]         125
Freezing-[-1]        125
Freezing             125
ZeroShot             125
FewShot-Freezing     125
FewShot-LoRA         125
Name: count, dtype: int64


In [15]:
# Stack baseline and LLM summaries and order the rows for the report.
multi = (
    pd.concat([baseline_all, llm_all], ignore_index=True, sort=False)
    .sort_values(["Dataset", "Backbone_pretty", "Setting"])
    .reset_index(drop=True)
)

csv_path = os.path.join(output_dir_csv, "multi_task_benchmark_results.csv")
multi.to_csv(csv_path, index=False)
print("Saved combined multi-task table to:", csv_path)

multi.head()

Saved combined multi-task table to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/multi_task_benchmark_results.csv


Unnamed: 0,log,backbone,batch_size,embedding_size,epochs,freeze_layers,grad_clip,hidden_size,lr,n_layers,...,test_next_time_to_next_event_loss_mean_std,best_test_next_activity_acc_mean_std,best_test_next_activity_loss_mean_std,best_test_next_remaining_time_loss_mean_std,best_test_next_time_to_next_event_loss_mean_std,Runtime (h),Setting,lora_alpha,r,few_shot_k
0,BPI12,gptneo-1b3,8.0,2048.0,10.0,,5.0,2048.0,5e-05,1.0,...,1.7584 ± 0.1475,0.0734 ± 0.0279,3.3621 ± 0.3649,1.5616 ± 0.2029,1.6504 ± 0.1312,0.07263 ± 0.00036,FewShot-Freezing,,,8.0
1,BPI12,gptneo-1b3,8.0,2048.0,10.0,,5.0,2048.0,5e-05,1.0,...,1.8453 ± 0.314,0.1002 ± 0.0504,3.1159 ± 0.0594,1.4986 ± 0.4039,1.5639 ± 0.0325,0.07842 ± 0.00021,FewShot-LoRA,512.0,256.0,8.0
2,BPI12,gptneo-1b3,8.0,2048.0,10.0,,5.0,2048.0,5e-05,1.0,...,1.531 ± 0.0338,0.5736 ± 0.0769,1.4783 ± 0.3746,2.2858 ± 0.1486,1.4952 ± 0.0342,0.40460 ± 0.00102,Freezing,,,
3,BPI12,gptneo-1b3,8.0,2048.0,10.0,"-1,-2",5.0,2048.0,5e-05,1.0,...,1.5124 ± 0.0677,0.5663 ± 0.071,1.4508 ± 0.2137,2.3459 ± 0.1544,1.4892 ± 0.0518,0.44785 ± 0.00081,"Freezing-[-1, -2]",,,
4,BPI12,gptneo-1b3,8.0,2048.0,10.0,-1,5.0,2048.0,5e-05,1.0,...,1.5131 ± 0.0366,0.5474 ± 0.0859,1.4897 ± 0.2106,2.3335 ± 0.0808,1.492 ± 0.0327,0.42557 ± 0.00100,Freezing-[-1],,,


In [16]:
# Reload the combined table from disk and write one CSV per log.
multi_path = os.path.join(output_dir_csv, "multi_task_benchmark_results.csv")
multi = pd.read_csv(multi_path)

per_dataset_root = os.path.join(output_dir_csv, "per_dataset")
for log_name, df_log in multi.groupby("log"):
    log_dir = os.path.join(per_dataset_root, log_name)
    os.makedirs(log_dir, exist_ok=True)

    csv_path = os.path.join(log_dir, f"multi_task_benchmark_results_{log_name}.csv")
    df_log.to_csv(csv_path, index=False)
    print(f"Saved per-log table for {log_name} to {csv_path}")

Saved per-log table for BPI12 to /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/multi_task_benchmark_results_BPI12.csv
Saved per-log table for BPI17 to /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI17/multi_task_benchmark_results_BPI17.csv
Saved per-log table for BPI20PrepaidTravelCosts to /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20PrepaidTravelCosts/multi_task_benchmark_results_BPI20PrepaidTravelCosts.csv
Saved per-log table for BPI20RequestForPayment to /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20RequestForPayment/multi_task_benchmark_results_BPI20RequestForPayment.csv
Saved per-log table for BPI20TravelPermitData to /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20TravelPermitData/multi_task_benchmark_results_BPI20TravelPermitData.csv


In [17]:
# Reload the combined table and write one CSV per (log, LLM backbone).
multi_path = os.path.join(output_dir_csv, "multi_task_benchmark_results.csv")
multi = pd.read_csv(multi_path)

LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]

llm_multi = multi.loc[multi["backbone"].isin(LLM_BACKBONES)].copy()

per_dataset_root = os.path.join(output_dir_csv, "per_dataset")
for (log_name, backbone), df_sub in llm_multi.groupby(["log", "backbone"]):
    log_dir = os.path.join(per_dataset_root, log_name)
    os.makedirs(log_dir, exist_ok=True)

    csv_path = os.path.join(log_dir, f"llm_methods_{log_name}_{backbone}.csv")
    df_sub.to_csv(csv_path, index=False)
    print(f"Saved LLM methods table for log={log_name}, backbone={backbone} to: {csv_path}")

Saved LLM methods table for log=BPI12, backbone=gemma-2-2b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_gemma-2-2b.csv
Saved LLM methods table for log=BPI12, backbone=gpt2 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_gpt2.csv
Saved LLM methods table for log=BPI12, backbone=gptneo-1b3 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_gptneo-1b3.csv
Saved LLM methods table for log=BPI12, backbone=llama32-1b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_llama32-1b.csv
Saved LLM methods table for log=BPI12, backbone=qwen25-05b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_qwen25-05b.csv
Saved LLM methods table for log=BPI17, backbone=gemma-2-2b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI17/llm_methods_BPI17_gemma-2-2b.cs

In [18]:
# Reload the combined table and keep the baselines plus the LoRA rows of the
# remaining (non-baseline) backbones; export one CSV per log.
multi_path = os.path.join(output_dir_csv, "multi_task_benchmark_results.csv")
multi = pd.read_csv(multi_path)

# Here the majority baseline is counted among the baselines as well.
BASELINE_BACKBONES = ["majority", "rnn", "transformer", "tabpfn", "saprpt"]

mask_baseline = multi["backbone"].isin(BASELINE_BACKBONES)
mask_llm_lora = ~mask_baseline & multi["Setting"].eq("LoRA")

subset = multi.loc[mask_baseline | mask_llm_lora].copy()

per_dataset_root = os.path.join(output_dir_csv, "per_dataset")
for log_name, df_log in subset.groupby("log"):
    log_dir = os.path.join(per_dataset_root, log_name)
    os.makedirs(log_dir, exist_ok=True)

    csv_path_log = os.path.join(log_dir, f"baseline_vs_lora_multi_task_results_{log_name}.csv")
    df_log.to_csv(csv_path_log, index=False)
    print(f"Saved baseline vs LoRA table for {log_name} to: {csv_path_log}")

Saved baseline vs LoRA table for BPI12 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/baseline_vs_lora_multi_task_results_BPI12.csv
Saved baseline vs LoRA table for BPI17 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI17/baseline_vs_lora_multi_task_results_BPI17.csv
Saved baseline vs LoRA table for BPI20PrepaidTravelCosts to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20PrepaidTravelCosts/baseline_vs_lora_multi_task_results_BPI20PrepaidTravelCosts.csv
Saved baseline vs LoRA table for BPI20RequestForPayment to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20RequestForPayment/baseline_vs_lora_multi_task_results_BPI20RequestForPayment.csv
Saved baseline vs LoRA table for BPI20TravelPermitData to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20TravelPermitData/baseline_vs_lora_multi_task_results_BPI20TravelPermitData.csv


In [19]:
# %% LoRA sweeps per dataset and LLM backbone (all hyperparameters, aggregated over seeds)

LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]

# LLM runs fine-tuned with LoRA only.
is_llm_lora = df["backbone"].isin(LLM_BACKBONES) & df["fine_tuning"].eq("lora")
lora_sweeps = df.loc[is_llm_lora].copy()

# Keep "full" LoRA only, i.e. drop the few-shot LoRA runs.
if "few_shot_k" in lora_sweeps.columns:
    lora_sweeps = lora_sweeps.loc[lora_sweeps["few_shot_k"].isna()].copy()

# Hyperparameters of the LoRA search space that are actually present.
HP_SWEEP_COLS = [
    name
    for name in (
        "lr",
        "batch_size",
        "epochs",
        "r",
        "lora_alpha",
        "embedding_size",
        "hidden_size",
        "strategy",
    )
    if name in lora_sweeps.columns
]

# Important: no idxmax here — every hyperparameter combination is kept.
group_cols_sweep = ["log", "backbone", *HP_SWEEP_COLS]

# Aggregation collapses only the runs that share a hyperparameter combination.
lora_sweeps_grouped = (
    lora_sweeps.groupby(group_cols_sweep, dropna=False)
    .apply(agg_over_seeds)
    .reset_index()
)

# "<mean> ± <std>" text column per metric, both values rounded to 4 decimals.
for m in METRICS:
    mean_col, std_col = f"{m}_mean", f"{m}_std"
    if {mean_col, std_col}.issubset(lora_sweeps_grouped.columns):
        mean_txt = lora_sweeps_grouped[mean_col].round(4).astype(str)
        std_txt = lora_sweeps_grouped[std_col].round(4).astype(str)
        lora_sweeps_grouped[f"{m}_mean_std"] = mean_txt + " ± " + std_txt

# Runtime: seconds -> hours, then one formatted "<mean> ± <std>" column.
for stat in ("mean", "std"):
    if f"_runtime_{stat}" in lora_sweeps_grouped.columns:
        lora_sweeps_grouped[f"runtime_{stat}_h"] = (
            lora_sweeps_grouped[f"_runtime_{stat}"] / 3600.0
        )

if (
    "runtime_mean_h" in lora_sweeps_grouped.columns
    and "runtime_std_h" in lora_sweeps_grouped.columns
):
    mean_str = lora_sweeps_grouped["runtime_mean_h"].map("{:.5f}".format)
    std_str = lora_sweeps_grouped["runtime_std_h"].map("{:.5f}".format)
    lora_sweeps_grouped["Runtime (h)"] = mean_str + " ± " + std_str

# The numeric runtime helper columns are not part of the exported tables.
cols_to_drop_sweeps = [
    c
    for c in ("_runtime_mean", "_runtime_std", "runtime_mean_h", "runtime_std_h")
    if c in lora_sweeps_grouped.columns
]
lora_sweeps_grouped = lora_sweeps_grouped.drop(columns=cols_to_drop_sweeps)

# One CSV per (log, backbone).
per_dataset_root = os.path.join(output_dir_csv, "per_dataset")
for (log_name, backbone), df_sub in lora_sweeps_grouped.groupby(["log", "backbone"]):
    log_dir = os.path.join(per_dataset_root, log_name)
    os.makedirs(log_dir, exist_ok=True)

    out_path = os.path.join(log_dir, f"llm_methods_{log_name}_{backbone}_lora_sweeps.csv")
    df_sub.to_csv(out_path, index=False)
    print(f"Saved LoRA sweeps table for log={log_name}, backbone={backbone} to: {out_path}")

Saved LoRA sweeps table for log=BPI12, backbone=gemma-2-2b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_gemma-2-2b_lora_sweeps.csv
Saved LoRA sweeps table for log=BPI12, backbone=gpt2 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_gpt2_lora_sweeps.csv
Saved LoRA sweeps table for log=BPI12, backbone=gptneo-1b3 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_gptneo-1b3_lora_sweeps.csv
Saved LoRA sweeps table for log=BPI12, backbone=llama32-1b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_llama32-1b_lora_sweeps.csv
Saved LoRA sweeps table for log=BPI12, backbone=qwen25-05b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/llm_methods_BPI12_qwen25-05b_lora_sweeps.csv
Saved LoRA sweeps table for log=BPI17, backbone=gemma-2-2b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/r

In [20]:
LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]

# Per-run LLM rows (not aggregated), each labelled with its setting.
llm = df.loc[df["backbone"].isin(LLM_BACKBONES)].assign(
    Setting=lambda runs: runs.apply(map_setting, axis=1)
)

def collapse_setting_for_main(setting: str) -> str:
    """Map every layer-specific ``Freezing-[...]`` label to plain ``"Freezing"``.

    Any other value, including non-string values, is returned unchanged.
    """
    is_layer_variant = isinstance(setting, str) and setting.startswith("Freezing-[")
    return "Freezing" if is_layer_variant else setting

llm["Setting_main"] = llm["Setting"].apply(collapse_setting_for_main)

plots_base_dir = "/ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset"

# Display order for the detailed figure (individual Freezing variants kept apart).
SETTING_ORDER_FULL = [
    "ZeroShot",
    "LoRA",
    "FewShot-LoRA",
    "Freezing",
    "Freezing-[-1]",
    "Freezing-[0]",
    "Freezing-[0, 1]",
    "Freezing-[-1, -2]",
    "FewShot-Freezing",
]

# Display order for the collapsed figure (all Freezing variants merged).
SETTING_ORDER_MAIN = [
    "ZeroShot",
    "LoRA",
    "FewShot-LoRA",
    "Freezing",
    "FewShot-Freezing",
]

# (metric column, y-axis label) for each stacked panel.
PLOTS = [
    ("test_next_activity_acc",           "NA Acc."),
    ("test_next_remaining_time_loss",    "RT MSE"),
    ("test_next_time_to_next_event_loss","NT MSE"),
]


def _save_setting_boxplots(df_runs, setting_col, setting_order, xlabel, title, out_path):
    """Draw one boxplot panel per entry of ``PLOTS`` and save the figure.

    Parameters
    ----------
    df_runs : pandas.DataFrame
        Runs of a single (log, backbone) pair.
    setting_col : str
        Column whose values form the x-axis categories.
    setting_order : list[str]
        Preferred category order; categories absent from ``df_runs`` are dropped.
    xlabel, title : str
        Label of the bottom x-axis and the figure title.
    out_path : str
        Destination image path.

    Returns
    -------
    bool
        ``True`` if a figure was written, ``False`` if none of the requested
        categories occur in ``df_runs`` (nothing is drawn in that case).
    """
    present = set(df_runs[setting_col].unique())
    settings = [s for s in setting_order if s in present]
    if not settings:
        return False

    # squeeze=False keeps ``axes`` two-dimensional regardless of the panel count.
    fig, axes = plt.subplots(len(PLOTS), 1, figsize=(8, 9), sharex=True, squeeze=False)
    axes = axes[:, 0]

    for ax, (metric, ylabel) in zip(axes, PLOTS):
        sns.boxplot(data=df_runs, x=setting_col, y=metric, order=settings, ax=ax)
        ax.set_ylabel(ylabel)
        # Pin the tick positions before assigning the rotated labels.
        ax.set_xticks(range(len(settings)))
        ax.set_xticklabels(settings, rotation=45, ha="right")

    axes[-1].set_xlabel(xlabel)
    fig.suptitle(title, fontsize=12)
    fig.tight_layout()
    fig.savefig(out_path, dpi=300)
    plt.close(fig)
    return True


for log_name, df_log in llm.groupby("log"):
    log_dir = os.path.join(plots_base_dir, log_name)
    os.makedirs(log_dir, exist_ok=True)

    for backbone, df_b in df_log.groupby("backbone"):
        # Detailed view: every Freezing variant is its own category.
        out_path = os.path.join(log_dir, f"llm_methods_boxplot_freezing_{log_name}_{backbone}.png")
        if _save_setting_boxplots(
            df_b,
            setting_col="Setting",
            setting_order=SETTING_ORDER_FULL,
            xlabel="Fine-tuning method (detailed)",
            title=f"{log_name} – {backbone} (all Freezing variants)",
            out_path=out_path,
        ):
            print(f"Saved detailed boxplot for log={log_name}, backbone={backbone} to: {out_path}")

        # Collapsed view: all Freezing variants share the "Freezing" category.
        out_path = os.path.join(log_dir, f"llm_methods_boxplot_collapsed_{log_name}_{backbone}.png")
        if _save_setting_boxplots(
            df_b,
            setting_col="Setting_main",
            setting_order=SETTING_ORDER_MAIN,
            xlabel="Fine-tuning method (collapsed)",
            title=f"{log_name} – {backbone} (collapsed Freezing)",
            out_path=out_path,
        ):
            print(f"Saved collapsed boxplot for log={log_name}, backbone={backbone} to: {out_path}")

Saved detailed boxplot for log=BPI12, backbone=gemma-2-2b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_methods_boxplot_freezing_BPI12_gemma-2-2b.png
Saved collapsed boxplot for log=BPI12, backbone=gemma-2-2b to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_methods_boxplot_collapsed_BPI12_gemma-2-2b.png
Saved detailed boxplot for log=BPI12, backbone=gpt2 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_methods_boxplot_freezing_BPI12_gpt2.png
Saved collapsed boxplot for log=BPI12, backbone=gpt2 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_methods_boxplot_collapsed_BPI12_gpt2.png
Saved detailed boxplot for log=BPI12, backbone=gptneo-1b3 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_methods_boxplot_freezing_BPI12_gptneo-1b3.png
Saved collapsed boxplot for log=BPI12, backbone=gptneo-1b3 to: /ceph/lfertig/Thesis/n

In [21]:
# Backbones treated as LLMs in the plots of this cell.
LLM_BACKBONES = [
    "gpt2",
    "gptneo-1b3",
    "qwen25-05b",
    "llama32-1b",
    "gemma-2-2b",
]

# Keep only runs with an LLM backbone and derive each row's setting label.
is_llm_run = df["backbone"].isin(LLM_BACKBONES)
llm = df.loc[is_llm_run].copy()
llm["Setting"] = llm.apply(map_setting, axis=1)

def collapse_setting_for_main(setting: str) -> str:
    """Return ``"Freezing"`` for any ``Freezing-[...]`` label, else the input as is.

    Non-string inputs pass through untouched.
    """
    if not isinstance(setting, str):
        return setting
    if not setting.startswith("Freezing-["):
        return setting
    # Layer-specific variants such as [-1], [0], [0, 1], [-1, -2] are merged.
    return "Freezing"

llm["Setting_main"] = llm["Setting"].apply(collapse_setting_for_main)

# Human-readable model names
BACKBONE_MAP_LLM = {
    "gpt2":         "GPT2",
    "gptneo-1b3":   "GPT-Neo-1.3B",
    "qwen25-05b":   "Qwen2.5-0.5B",
    "llama32-1b":   "Llama3.2-1B",
    "gemma-2-2b":   "Gemma-2-2B",
}
# Backbones without a mapping keep their raw identifier.
llm["Backbone_pretty"] = llm["backbone"].map(BACKBONE_MAP_LLM).fillna(llm["backbone"])

plots_base_dir = "/ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset"

MAIN_SETTING_ORDER = [
    "ZeroShot",
    "LoRA",
    "FewShot-LoRA",
    "Freezing",
    "FewShot-Freezing",
]

# (metric column, y-axis label) for each stacked panel.
PLOTS = [
    ("test_next_activity_acc",           "NA Acc."),
    ("test_next_remaining_time_loss",    "RT MSE"),
    ("test_next_time_to_next_event_loss","NT MSE"),
]

# One figure per (log, setting): backbones on the x-axis, one panel per metric.
for log_name, df_log in llm.groupby("log"):
    log_dir = os.path.join(plots_base_dir, log_name)
    os.makedirs(log_dir, exist_ok=True)

    for setting in MAIN_SETTING_ORDER:
        df_s = df_log[df_log["Setting_main"] == setting]
        if df_s.empty:
            continue

        # Keep the canonical backbone order, restricted to backbones that have
        # runs for this setting. ``.get(b, b)`` mirrors the fallback used for
        # the "Backbone_pretty" column, so an unmapped backbone cannot raise.
        backbones_present = set(df_s["backbone"].unique())
        backbone_order = [
            BACKBONE_MAP_LLM.get(b, b)
            for b in LLM_BACKBONES
            if b in backbones_present
        ]
        if not backbone_order:
            continue

        fig, axes = plt.subplots(3, 1, figsize=(8, 9), sharex=True)

        for ax, (metric, ylabel) in zip(axes, PLOTS):
            sns.boxplot(
                data=df_s,
                x="Backbone_pretty",
                y=metric,
                order=backbone_order,
                ax=ax,
            )
            ax.set_ylabel(ylabel)
            # Pin the tick positions before assigning labels so each label is
            # guaranteed to sit under its own box.
            ax.set_xticks(range(len(backbone_order)))
            ax.set_xticklabels(backbone_order, rotation=45, ha="right")

        axes[-1].set_xlabel("LLM backbone")

        fig.suptitle(f"{log_name} – {setting}", fontsize=12)
        plt.tight_layout()

        out_path = os.path.join(
            log_dir,
            f"llm_backbones_boxplot_{log_name}_{setting}.png"
        )
        plt.savefig(out_path, dpi=300)
        plt.close(fig)

        print(f"Saved LLM-backbone comparison for log={log_name}, setting={setting} to: {out_path}")

Saved LLM-backbone comparison for log=BPI12, setting=ZeroShot to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_backbones_boxplot_BPI12_ZeroShot.png
Saved LLM-backbone comparison for log=BPI12, setting=LoRA to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_backbones_boxplot_BPI12_LoRA.png
Saved LLM-backbone comparison for log=BPI12, setting=FewShot-LoRA to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_backbones_boxplot_BPI12_FewShot-LoRA.png
Saved LLM-backbone comparison for log=BPI12, setting=Freezing to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_backbones_boxplot_BPI12_Freezing.png
Saved LLM-backbone comparison for log=BPI12, setting=FewShot-Freezing to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/llm_backbones_boxplot_BPI12_FewShot-Freezing.png
Saved LLM-backbone comparison for log=BPI17, setting=ZeroShot to: /ceph/lf

In [22]:
# Loss curves for the best full-LoRA run per (log, backbone)

def fetch_single(
    wandb_id: str,
    targets=["na", "rt", "nt"],
    project_name: str | None = None,
    entity: str | None = None,
):
    """
    Holt Verlaufskurven (pro Epoch) für einen einzelnen W&B-Run.
    Gibt (na_acc, na_loss, rt_loss, nt_loss) als Listen zurück.
    Fehlende Targets -> entsprechende Liste = None.
    """
    if isinstance(targets, str):
        targets = [targets]

    if project_name is None:
        raise ValueError("fetch_single requires an explicit project_name.")

    if entity is None:
        entity = os.environ.get("ENTITY")
        if entity is None:
            raise ValueError("ENTITY not set and no entity passed to fetch_single().")

    api = wandb.Api()
    run = api.run(f"{entity}/{project_name}/{wandb_id}")
    history = list(run.scan_history())

    na_acc, na_loss, rt_loss, nt_loss = None, None, None, None

    if "rt" in targets:
        rt_loss = [
            row["test_next_remaining_time_loss"]
            for row in history
            if "test_next_remaining_time_loss" in row
        ]

    if "na" in targets:
        na_loss = [
            row["test_next_activity_loss"]
            for row in history
            if "test_next_activity_loss" in row
        ]
        na_acc = [
            row["test_next_activity_acc"]
            for row in history
            if "test_next_activity_acc" in row
        ]

    if "nt" in targets:
        nt_loss = [
            row["test_next_time_to_next_event_loss"]
            for row in history
            if "test_next_time_to_next_event_loss" in row
        ]

    return na_acc, na_loss, rt_loss, nt_loss


# Path of the CSV that caches the fetched loss curves
loss_csv_path = os.path.join(output_dir_csv, "loss_curves_multitask_lora_best.csv")

LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]

if os.path.exists(loss_csv_path):
    # Reuse the cached curves instead of querying W&B again.
    losses = pd.read_csv(loss_csv_path)
else:
    # LLM backbones fine-tuned with LoRA only
    df_lora = df[
        df["backbone"].isin(LLM_BACKBONES)
        & (df["fine_tuning"] == "lora")
    ].copy()

    # "Full" LoRA only (exclude few-shot LoRA runs)
    if "few_shot_k" in df_lora.columns:
        df_lora = df_lora[df_lora["few_shot_k"].isna()].copy()

    # Column used to rank runs; fall back to next-activity accuracy
    score_col = "mt_score"
    if score_col not in df_lora.columns:
        score_col = "test_next_activity_acc"

    # Best LoRA run per (log, backbone)
    best_runs = (
        df_lora
        .sort_values(score_col, ascending=False)
        .groupby(["log", "backbone"], as_index=False)
        .head(1)
    )

    losses_list = []

    for _, row in best_runs.iterrows():
        na_acc, na_loss, rt_loss, nt_loss = fetch_single(
            wandb_id=row["id"],
            project_name=row["project"],
            entity=entity,
            targets=["na", "rt", "nt"],
        )

        # Skip the run if a requested loss curve is missing entirely.
        if na_loss is None or rt_loss is None or nt_loss is None:
            continue

        curves = {
            "na_acc": na_acc,
            "na_loss": na_loss,
            "rt_loss": rt_loss,
            "nt_loss": nt_loss,
        }

        # A run that logged none of the metrics contributes nothing.
        if not any(len(values) for values in curves.values() if values is not None):
            continue

        # Wrapping each curve in a Series lets pandas pad shorter curves with
        # NaN instead of raising when the metrics were logged a different
        # number of times. The plotting cell drops NaN values after melting.
        tmp = pd.DataFrame({name: pd.Series(values) for name, values in curves.items()})
        tmp.insert(0, "epoch", np.arange(len(tmp)))
        tmp["log"] = row["log"]
        tmp["backbone"] = row["backbone"]
        losses_list.append(tmp)

    if not losses_list:
        raise RuntimeError("Keine Loss-Curves für LoRA-Runs gefunden.")

    losses = pd.concat(losses_list, axis=0, ignore_index=True)
    losses.to_csv(loss_csv_path, index=False)
    print("Saved LoRA loss curves to:", loss_csv_path)

print("Loss curves shape (LoRA best runs):", losses.shape)

Loss curves shape (LoRA best runs): (250, 7)


In [23]:
LOGS_TO_PLOT = sorted(losses["log"].unique())

# Raw backbone id -> legend label
HUE_MAP = {
    "gpt2":         "GPT2",
    "gptneo-1b3":   "GPT-Neo-1.3B",
    "qwen25-05b":   "Qwen2.5-0.5B",
    "llama32-1b":   "Llama3.2-1B",
    "gemma-2-2b":   "Gemma-2-2B",
}

HUE_ORDER = [
    "GPT2",
    "GPT-Neo-1.3B",
    "Qwen2.5-0.5B",
    "Llama3.2-1B",
    "Gemma-2-2B",
]

# Long format: one row per (log, backbone, epoch, loss type)
l = losses.melt(
    id_vars=["log", "backbone", "epoch"],
    value_vars=["na_loss", "rt_loss", "nt_loss"],
    var_name="Loss",
    value_name="Value",
).dropna(subset=["Value"])

# Backbones without a legend label are excluded from the figure.
l["Backbone"] = l["backbone"].map(HUE_MAP)
l = l[l["Backbone"].notna()]

LOSS_LABELS = {
    "na_loss": "NA Loss",
    "rt_loss": "RT Loss",
    "nt_loss": "NT Loss",
}

# Grid: one row per loss type, one column per log. squeeze=False keeps
# ``axes`` two-dimensional even when there is only a single log.
fig, axes = plt.subplots(
    3, len(LOGS_TO_PLOT),
    figsize=(4 * len(LOGS_TO_PLOT), 8),
    sharex=True,
    squeeze=False,
)

# label -> handle, merged over all panels so the shared legend lists every
# backbone that appears anywhere, not just those of the last panel drawn.
legend_entries = {}

for row_idx, loss_name in enumerate(["na_loss", "rt_loss", "nt_loss"]):
    for col_idx, log_name in enumerate(LOGS_TO_PLOT):
        ax = axes[row_idx, col_idx]
        tmp = l[(l["Loss"] == loss_name) & (l["log"] == log_name)]
        backbones_here = set(tmp["Backbone"].unique())

        sns.lineplot(
            data=tmp,
            x="epoch",
            y="Value",
            hue="Backbone",
            hue_order=[h for h in HUE_ORDER if h in backbones_here],
            ax=ax,
            linewidth=2.0,
        )

        ax.set_xlabel("Epoch")
        ax.set_ylabel(LOSS_LABELS[loss_name])
        ax.set_title(log_name)

        # Collect this panel's legend entries, then remove the per-panel legend.
        leg = ax.get_legend()
        if leg is not None:
            for handle, text in zip(leg.legend_handles, leg.get_texts()):
                legend_entries.setdefault(text.get_text(), handle)
            leg.remove()

# Order the shared legend canonically; unexpected labels go last.
legend_labels = [h for h in HUE_ORDER if h in legend_entries]
legend_labels += [lab for lab in legend_entries if lab not in HUE_ORDER]
legend_handles = [legend_entries[lab] for lab in legend_labels]

# Shared legend below the figure
if legend_handles:
    fig.legend(
        legend_handles,
        legend_labels,
        title="",
        loc="lower center",
        ncol=len(legend_labels),
        bbox_to_anchor=(0.5, -0.02),
    )

plt.tight_layout(rect=(0, 0.05, 1, 1))  # leave room at the bottom for the legend

plot_path = os.path.join(output_dir_plots, "loss_curves_multitask_lora_best.png")
plt.savefig(plot_path, dpi=300)
plt.close(fig)

print("Saved LoRA loss curve plot to:", plot_path)

Saved LoRA loss curve plot to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/loss_curves_multitask_lora_best.png


In [24]:
# PARAMETER SUMMARY

multi_path = os.path.join(output_dir_csv, "multi_task_benchmark_results.csv")
multi = pd.read_csv(multi_path)

# Optional label columns are selected together with the parameter columns.
# This avoids a second keyed lookup into ``multi``, which fails with a length
# mismatch as soon as ``multi`` holds more than one row per
# (log, backbone, Setting).
label_cols = [c for c in ("Dataset", "Backbone_pretty") if c in multi.columns]

# Only rows that carry parameter information
param_summary = (
    multi[
        [
            "log",
            "backbone",
            "Setting",
            "total_params",
            "trainable_params",
        ]
        + label_cols
    ]
    .dropna(subset=["total_params", "trainable_params"])
    .drop_duplicates()
    .copy()
)

# Share of trainable parameters in percent
param_summary["trainable_percent"] = (
    param_summary["trainable_params"] / param_summary["total_params"] * 100.0
)

param_summary["trainable_percent_fmt"] = (
    param_summary["trainable_percent"].round(1).astype(str) + "%"
)

# total_params in scientific notation with one decimal
param_summary["total_params_fmt"] = param_summary["total_params"].apply(
    lambda x: np.format_float_scientific(x, precision=1)
)

param_summary["# params\n(%trainable)"] = (
    param_summary["total_params_fmt"]
    + " ("
    + param_summary["trainable_percent_fmt"]
    + ")"
)

# Fall back to the raw identifiers when the results file has no label columns.
if "Dataset" not in param_summary.columns:
    param_summary["Dataset"] = param_summary["log"]

if "Backbone_pretty" not in param_summary.columns:
    param_summary["Backbone_pretty"] = param_summary["backbone"]

# Keep the label columns at the end so the exported column order is stable.
trailing_cols = ["Dataset", "Backbone_pretty"]
param_summary = param_summary[
    [c for c in param_summary.columns if c not in trailing_cols] + trailing_cols
]

# --- save one file per dataset (log) ---
for log_name, df_log in param_summary.groupby("log"):
    log_dir = os.path.join(output_dir_csv, "per_dataset", log_name)
    os.makedirs(log_dir, exist_ok=True)

    csv_path = os.path.join(log_dir, "param_summary_multitask.csv")
    df_log.to_csv(csv_path, index=False)
    print(f"Saved param summary for {log_name} to: {csv_path}")

Saved param summary for BPI12 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI12/param_summary_multitask.csv
Saved param summary for BPI17 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI17/param_summary_multitask.csv
Saved param summary for BPI20PrepaidTravelCosts to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20PrepaidTravelCosts/param_summary_multitask.csv
Saved param summary for BPI20RequestForPayment to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20RequestForPayment/param_summary_multitask.csv
Saved param summary for BPI20TravelPermitData to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/csv/per_dataset/BPI20TravelPermitData/param_summary_multitask.csv


In [25]:
# === PARETO: MT-Score vs. trainable parameters (LoRA, best per LLM backbone) ===

multi_path = os.path.join(output_dir_csv, "multi_task_benchmark_results.csv")
multi = pd.read_csv(multi_path)

# Relevant LLM backbones + display names
LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]
BACKBONE_MAP_LLM = {
    "gpt2":         "GPT2",
    "gptneo-1b3":   "GPT-Neo-1.3B",
    "qwen25-05b":   "Qwen2.5-0.5B",
    "llama32-1b":   "Llama3.2-1B",
    "gemma-2-2b":   "Gemma-2-2B",
}

# LLM + LoRA rows that have both plotted quantities
pareto_source = multi[
    (multi["backbone"].isin(LLM_BACKBONES))
    & (multi["Setting"] == "LoRA")
    & multi["trainable_params"].notna()
    & multi["mt_score_mean"].notna()
].copy()

# Add display names if the results file does not provide them
if "Backbone_pretty" not in pareto_source.columns:
    pareto_source["Backbone_pretty"] = (
        pareto_source["backbone"]
        .map(BACKBONE_MAP_LLM)
        .fillna(pareto_source["backbone"])
    )

plots_base_dir = "/ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset"

for log_name, df_log in pareto_source.groupby("log"):
    # Per dataset: best LoRA row per backbone
    df_best = (
        df_log
        .sort_values(
            ["backbone", "mt_score_mean", "trainable_params"],
            ascending=[True, False, True],  # highest score first, ties -> fewer params
        )
        .drop_duplicates(subset=["backbone"], keep="first")
        .reset_index(drop=True)
    )

    if df_best.empty:
        continue

    # Slightly wider so the legend fits underneath
    fig, ax = plt.subplots(figsize=(9, 4.5))

    # Scatter: one point per backbone
    for bb, df_b in df_best.groupby("backbone"):
        label = df_b["Backbone_pretty"].iloc[0]
        ax.scatter(
            df_b["trainable_params"],
            df_b["mt_score_mean"],
            label=label,
            s=70,
        )

    # Pareto front (minimise trainable_params, maximise mt_score_mean).
    # Sorting by params ascending and score descending puts the best point of
    # every equal-params group first. A point is non-dominated exactly when
    # its score strictly exceeds every score seen before it, so ties (same
    # score with more parameters, or same parameters with a lower score) are
    # correctly left off the front.
    df_pf = df_best.sort_values(
        ["trainable_params", "mt_score_mean"],
        ascending=[True, False],
    )
    pf_scores = df_pf["mt_score_mean"].astype(float)
    best_before = pf_scores.cummax().shift(1, fill_value=-np.inf)
    df_pf_pareto = df_pf[pf_scores > best_before]

    if not df_pf_pareto.empty:
        # Hollow black ring around every front member
        ax.scatter(
            df_pf_pareto["trainable_params"],
            df_pf_pareto["mt_score_mean"],
            s=140,
            facecolors="none",
            edgecolors="black",
            linewidths=1.5,
        )

    ax.set_xscale("log")
    ax.set_xlabel("Trainable parameters (LoRA, log scale)")
    ax.set_ylabel("MT-Score (mean across seeds)")
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)

    # Title: prefer the Dataset column when it exists
    if "Dataset" in df_best.columns:
        ds_label = df_best["Dataset"].iloc[0]
    else:
        ds_label = log_name
    ax.set_title(f"{ds_label} – LLM LoRA Pareto (best per backbone)")

    # Legend below the axes, spanning the full width
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        title="Backbone",
        loc="lower center",
        ncol=len(labels),
        frameon=False,
        bbox_to_anchor=(0.5, -0.02),
    )

    # Leave room for the legend at the bottom
    plt.tight_layout(rect=(0, 0.12, 1, 1))

    log_dir = os.path.join(plots_base_dir, log_name)
    os.makedirs(log_dir, exist_ok=True)
    out_path = os.path.join(log_dir, f"pareto_llm_lora_{log_name}.png")

    plt.savefig(out_path, dpi=300)
    plt.close(fig)

    print(f"Saved Pareto plot for log={log_name} to: {out_path}")

Saved Pareto plot for log=BPI12 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/pareto_llm_lora_BPI12.png
Saved Pareto plot for log=BPI17 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI17/pareto_llm_lora_BPI17.png
Saved Pareto plot for log=BPI20PrepaidTravelCosts to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI20PrepaidTravelCosts/pareto_llm_lora_BPI20PrepaidTravelCosts.png
Saved Pareto plot for log=BPI20RequestForPayment to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI20RequestForPayment/pareto_llm_lora_BPI20RequestForPayment.png
Saved Pareto plot for log=BPI20TravelPermitData to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI20TravelPermitData/pareto_llm_lora_BPI20TravelPermitData.png


In [26]:
LLM_BACKBONES = ["gpt2", "gptneo-1b3", "qwen25-05b", "llama32-1b", "gemma-2-2b"]
BACKBONE_MAP_LLM = {
    "gpt2":         "GPT2",
    "gptneo-1b3":   "GPT-Neo-1.3B",
    "qwen25-05b":   "Qwen2.5-0.5B",
    "llama32-1b":   "Llama3.2-1B",
    "gemma-2-2b":   "Gemma-2-2B",
}

# Short dataset labels used in plot titles
DATASET_MAP = {
    "BPI12": "BPI12",
    "BPI17": "BPI17",
    "BPI20PrepaidTravelCosts": "BPI20PTC",
    "BPI20RequestForPayment": "BPI20RfP",
    "BPI20TravelPermitData": "BPI20TPD",
}

# 1) All LLM LoRA runs from the raw run table ``df``
lora_all = df[
    df["backbone"].isin(LLM_BACKBONES)
    & (df["fine_tuning"] == "lora")
].copy()

# Full LoRA only (exclude few-shot LoRA)
if "few_shot_k" in lora_all.columns:
    lora_all = lora_all[lora_all["few_shot_k"].isna()].copy()

# Only rows that have both the parameter count and the MT-Score
if "mt_score" not in lora_all.columns:
    raise ValueError("Spalte 'mt_score' fehlt in df – bitte sicherstellen, dass sie vorher berechnet wird.")

lora_all = lora_all[
    lora_all["trainable_params"].notna()
    & lora_all["mt_score"].notna()
].copy()

if lora_all.empty:
    print("Keine LoRA-Sweeps mit trainable_params + mt_score gefunden – Pareto-Front wird übersprungen.")
else:
    # 2) Columns that identify one hyper-parameter configuration of the sweep
    HP_SWEEP_COLS = [
        "lr",
        "batch_size",
        "epochs",
        "r",
        "lora_alpha",
        "embedding_size",
        "hidden_size",
        "strategy",
    ]
    HP_SWEEP_COLS = [c for c in HP_SWEEP_COLS if c in lora_all.columns]

    group_cols_sweep = ["log", "backbone"] + HP_SWEEP_COLS

    # 3) Aggregate the runs of each configuration: MT-Score + trainable_params
    lora_sweeps_grouped = (
        lora_all
        .groupby(group_cols_sweep, dropna=False)
        .agg(
            mt_score_mean=("mt_score", "mean"),
            mt_score_std=("mt_score", "std"),
            trainable_params=("trainable_params", "mean"),
        )
        .reset_index()
    )

    # Backbone display label
    lora_sweeps_grouped["Backbone_pretty"] = (
        lora_sweeps_grouped["backbone"]
        .map(BACKBONE_MAP_LLM)
        .fillna(lora_sweeps_grouped["backbone"])
    )

    plots_base_dir = "/ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset"

    # 4) Per dataset: all sweep configurations + the Pareto front
    for log_name, df_log in lora_sweeps_grouped.groupby("log"):
        if df_log.empty:
            continue

        fig, ax = plt.subplots(figsize=(10, 5))

        # --- all sweep configurations: one line per backbone ---
        for bb, df_b in df_log.groupby("backbone"):
            df_b = df_b.sort_values("trainable_params")
            label = df_b["Backbone_pretty"].iloc[0]

            ax.plot(
                df_b["trainable_params"],
                df_b["mt_score_mean"],
                marker="o",
                linestyle="-",
                linewidth=1.0,
                markersize=6,
                label=label,
                alpha=0.9,
            )

        # --- Pareto front over ALL sweep configurations of this dataset ---
        # Objectives: minimise trainable_params, maximise mt_score_mean.
        # Several configurations can share one parameter count, so ties must
        # be ordered explicitly: params ascending, score descending. A point
        # is then non-dominated exactly when its score strictly exceeds every
        # score seen before it. Sorting by params alone would let a weaker
        # configuration with the same parameter count land on the front.
        df_sorted = df_log.sort_values(
            ["trainable_params", "mt_score_mean"],
            ascending=[True, False],
        )
        sweep_scores = df_sorted["mt_score_mean"].astype(float)
        best_before = sweep_scores.cummax().shift(1, fill_value=-np.inf)
        pareto_df = df_sorted[sweep_scores > best_before]

        if not pareto_df.empty:
            ax.plot(
                pareto_df["trainable_params"],
                pareto_df["mt_score_mean"],
                color="black",
                linewidth=1.3,
                marker="o",
                markersize=4,
                label="Pareto front",
            )

        ax.set_xscale("log")
        ax.set_xlabel("Trainable parameters (LoRA, log scale)")
        ax.set_ylabel("MT-Score (mean across seeds)")

        ds_label = DATASET_MAP.get(log_name, log_name)
        ax.set_title(f"{ds_label} – LLM LoRA sweeps (Pareto front)")

        ax.grid(True, which="both", linestyle="--", linewidth=0.5)

        # Legend centred below the axes, as in the other Pareto plot
        handles, labels = ax.get_legend_handles_labels()
        fig.legend(
            handles,
            labels,
            title="Backbone / front",
            loc="lower center",
            ncol=len(labels),
            bbox_to_anchor=(0.5, -0.02),
        )

        # Leave room for the legend
        plt.tight_layout(rect=(0, 0.10, 1, 1))

        log_dir = os.path.join(plots_base_dir, log_name)
        os.makedirs(log_dir, exist_ok=True)
        out_path = os.path.join(log_dir, f"pareto_llm_lora_sweeps_true_{log_name}.png")

        plt.savefig(out_path, dpi=300)
        plt.close(fig)

        print(f"Saved TRUE Pareto-front LoRA-sweeps plot for log={log_name} to: {out_path}")


Saved TRUE Pareto-front LoRA-sweeps plot for log=BPI12 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12/pareto_llm_lora_sweeps_true_BPI12.png
Saved TRUE Pareto-front LoRA-sweeps plot for log=BPI17 to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI17/pareto_llm_lora_sweeps_true_BPI17.png
Saved TRUE Pareto-front LoRA-sweeps plot for log=BPI20PrepaidTravelCosts to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI20PrepaidTravelCosts/pareto_llm_lora_sweeps_true_BPI20PrepaidTravelCosts.png
Saved TRUE Pareto-front LoRA-sweeps plot for log=BPI20RequestForPayment to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI20RequestForPayment/pareto_llm_lora_sweeps_true_BPI20RequestForPayment.png
Saved TRUE Pareto-front LoRA-sweeps plot for log=BPI20TravelPermitData to: /ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI20TravelPermitData/pareto_llm_lora_sweeps_true_BPI20TravelPe

In [None]:
# %% === CPU-only minimal evaluation: (A) prefix-topk trace viz, (B) sampled confusion matrix ===
import os
import sys
import re
from pathlib import Path
import random
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------
# User settings
# -------------------------
# All paths below are absolute and machine-specific; adjust them when running elsewhere.
# Checkpoint to evaluate.
MODEL_PATH = "/ceph/lfertig/Thesis/notebook/llm-peft-ppm/persisted_models/suffix/BPI12_gpt2_ep10_lora_seed41.pth"
DATASET_NAME = "BPI12"  # derived from model name, but keep explicit
# Directory searched for cached train/test data.
CACHE_DIR = Path("/ceph/lfertig/Thesis/notebook/llm-peft-ppm/data/BPI12/cached_train_test")
# Output directory for figures; created immediately if it does not exist.
SAVE_DIR = Path("/ceph/lfertig/Thesis/notebook/llm-peft-ppm/results/plots/per_dataset/BPI12")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Quick trace visualization
# NOTE(review): the code consuming these four values is not visible in this
# part of the notebook; the names suggest trace index, prefix length, number of
# top predictions shown and number of suffix steps -- confirm at the call site.
TRACE_IDX = 0
PREFIX_LEN = 8
TOPK = 10
SUFFIX_STEPS = 12

# Confusion matrix "light" (sampling)
DO_CONFUSION_MATRIX = True
N_SAMPLES = 2500          # reduce if too slow (e.g. 500..2000)
MAX_CTX = 64              # context window cap (big speed win)
TOP_CLASSES_CM = 30       # plot only top-N by support (+OTHER)
# Presumably the RNG seed for the sampling above (matches the seed in the checkpoint name) -- confirm.
SEED = 41

# Force CPU (GPU not available)
DEVICE = "cpu"

# Try bf16 on CPU to reduce memory; fall back to fp32.
# The capability probe is looked up defensively: having a ``torch.cpu`` module
# does not guarantee that ``is_bf16_supported`` exists in the installed torch
# build, and a missing probe must select fp32 rather than raise AttributeError.
_cpu_bf16_probe = getattr(getattr(torch, "cpu", None), "is_bf16_supported", None)
DTYPE = (
    torch.bfloat16
    if callable(_cpu_bf16_probe) and _cpu_bf16_probe()
    else torch.float32
)
print("DEVICE:", DEVICE, "| DTYPE:", DTYPE)

# Speed knobs (safe): inference only, so autograd is switched off globally,
# and the intra-op thread count is capped at 8.
torch.set_grad_enabled(False)
torch.set_num_threads(min(8, os.cpu_count() or 8))

# Make transformers offline (avoid accidental downloads).
# ``setdefault`` keeps any value already present in the environment.
os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
os.environ.setdefault("HF_HUB_OFFLINE", "1")

# -------------------------
# Make ppm importable
# -------------------------
REPO_ROOT = Path("/ceph/lfertig/Thesis/notebook/llm-peft-ppm").resolve()
if str(REPO_ROOT) not in sys.path:
    # Prepend so the repository copy takes precedence on import.
    sys.path.insert(0, str(REPO_ROOT))

from peft import LoraConfig, TaskType
from ppm.models import NextEventPredictor

# Only needed if cache sidecars not found
from skpm.event_logs import BPI12
from skpm.event_logs.split import unbiased
from skpm.feature_extraction import TimestampExtractor
from sklearn.preprocessing import StandardScaler
from ppm.datasets import ContinuousTraces
from ppm.datasets.event_logs import EventFeatures, EventLog, EventTargets

# -------------------------
# Backbone configs (stated to mirror the training script)
# -------------------------
# One entry per backbone key: hub model name, embedding/hidden width and the
# attribute path recorded as ``fine_tuning_module_path``.
PRETRAINED_CONFIGS = {
    "gpt2": {
        "name": "openai-community/gpt2",
        "embedding_size": 768,
        "hidden_size": 768,
        "fine_tuning_module_path": "h",
    },
    "gptneo-1b3": {
        "name": "EleutherAI/gpt-neo-1.3B",
        "embedding_size": 2048,
        "hidden_size": 2048,
        "fine_tuning_module_path": "h",
    },
    "qwen25-05b": {
        "name": "Qwen/Qwen2.5-0.5B",
        "embedding_size": 896,
        "hidden_size": 896,
        "fine_tuning_module_path": "layers",
    },
    "llama32-1b": {
        "name": "unsloth/Llama-3.2-1B",
        "embedding_size": 2048,
        "hidden_size": 2048,
        "fine_tuning_module_path": "layers",
    },
    "gemma-2-2b": {
        "name": "google/gemma-2-2b",
        "embedding_size": 2304,
        "hidden_size": 2304,
        "fine_tuning_module_path": "layers",
    },
}

# Names of the numerical (time-derived) feature columns.
NUMERICAL_FEATURES = [
    "accumulated_time",
    "day_of_month",
    "day_of_week",
    "day_of_year",
    "hour_of_day",
    "min_of_hour",
    "month_of_year",
    "sec_of_min",
    "secs_within_day",
    "week_of_year",
]

# -------------------------
# Helpers
# -------------------------
def parse_ckpt_name(path: str) -> dict:
    """Split a checkpoint file name into its naming components.

    The expected base name is ``<log>_<backbone>[_ep<N>][_lora|_freeze][_seed<N>]``
    with every ``.pth`` occurrence removed first.

    Returns a dict with keys ``log``, ``backbone``, ``epochs``, ``ft`` and
    ``seed``; ``epochs`` and ``seed`` are ints, and any optional part that is
    absent is ``None``.

    Raises ``ValueError`` when the name does not follow the pattern.
    """
    stem = os.path.basename(path).replace(".pth", "")
    pattern = (
        r"^(?P<log>[^_]+)_(?P<backbone>[^_]+)"
        r"(?:_ep(?P<epochs>\d+))?"
        r"(?:_(?P<ft>lora|freeze))?"
        r"(?:_seed(?P<seed>\d+))?$"
    )
    match = re.match(pattern, stem)
    if match is None:
        raise ValueError(f"Cannot parse checkpoint name: {stem}")

    fields = match.groupdict()
    # Numeric parts arrive as digit strings (or None when absent).
    for numeric_key in ("epochs", "seed"):
        raw_value = fields.get(numeric_key)
        fields[numeric_key] = int(raw_value) if raw_value else None
    return fields

def infer_lora_r_from_state_dict(sd: dict) -> int | None:
    for k, v in sd.items():
        if "lora_A" in k and hasattr(v, "shape") and len(v.shape) == 2:
            return int(v.shape[0])
    return None

def get_lora_config(backbone: str, r: int, lora_alpha: int) -> LoraConfig:
    """Build the ``LoraConfig`` for a backbone family.

    ``r``, ``lora_alpha`` and ``use_rslora=True`` are shared by all families;
    the task type and target modules depend on which substring ``backbone``
    contains (checked in the order ``"gptneo"``, ``"gpt2"``, anything else).
    """
    shared_kwargs = {"r": r, "lora_alpha": lora_alpha, "use_rslora": True}

    if "gptneo" in backbone:
        # No explicit target modules are passed for GPT-Neo.
        return LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            target_modules=None,
            **shared_kwargs,
        )

    if "gpt2" in backbone:
        return LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            target_modules=["attn.c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj"],
            **shared_kwargs,
        )

    # All remaining backbones share the same projection-module list.
    return LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "up_proj",
            "down_proj", "o_proj", "gate_proj",
        ],
        **shared_kwargs,
    )

def softmax_np(x: np.ndarray) -> np.ndarray:
    """
    Softmax over all elements of `x`.

    The global maximum is subtracted before exponentiation to avoid overflow,
    and 1e-12 is added to the normaliser to avoid division by zero.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    normaliser = weights.sum() + 1e-12
    return weights / normaliser

def try_load_sidecar_cache(cache_dir: Path):
    """
    Try to load cached tensors from sidecar ``.pt`` files in `cache_dir`.

    For each of the keys traces / cat_features / num_features / cat_targets /
    num_targets the first file (in sorted order) matching ``*<key>*.pt`` is
    loaded onto the CPU.

    Returns
    -------
    dict | None
        Dict with exactly those five keys, or None when the directory does not
        exist, a file for any key is missing, or a loaded file contains None.
        Callers index into the returned objects, so a None payload is treated
        as a cache miss rather than handed back.
    """
    if not cache_dir.exists():
        return None

    # Common naming patterns (extend if the cache uses different names)
    pats = {
        "traces":      ["*traces*.pt"],
        "cat_features":["*cat_features*.pt"],
        "num_features":["*num_features*.pt"],
        "cat_targets": ["*cat_targets*.pt"],
        "num_targets": ["*num_targets*.pt"],
    }

    picks = {}
    for k, patterns in pats.items():
        cands = []
        for pat in patterns:
            cands += sorted(cache_dir.glob(pat))
        # Drop the known "test.pt" file, which holds None
        cands = [c for c in cands if c.name != "test.pt"]
        if not cands:
            # Incomplete cache: nothing is loaded at all.
            return None
        picks[k] = cands[0]

    bundle = {}
    for k, p in picks.items():
        # NOTE: torch.load unpickles the file - only point this at trusted caches.
        obj = torch.load(p, map_location="cpu")
        if obj is None:
            print(f"[WARN] Sidecar cache file {p.name} contains None -> ignoring sidecar cache.")
            return None
        bundle[k] = obj
    return bundle

def prepare_data_like_training(df_raw: pd.DataFrame, unbiased_split_params: dict):
    """
    Turn a raw event log frame into scaled train/test frames.

    Steps, in order:
      1. keep only the case id, activity and timestamp columns;
      2. drop cases with two or fewer events;
      3. sort by (case, timestamp), then parse timestamps as UTC datetimes
         (the sort therefore uses the raw column values);
      4. add `time_to_next_event` in seconds (0 for the last event of a case,
         negatives clipped to 0);
      5. split with `unbiased(df, **unbiased_split_params)`;
      6. add TimestampExtractor features (fitted on train, applied to test);
      7. drop the timestamp column and rename to `case_id` / `activity`;
      8. standard-scale NUMERICAL_FEATURES plus both numeric targets
         (fitted on train, applied to test).

    Returns the tuple (train, test).
    """
    case_col = "case:concept:name"
    act_col = "concept:name"
    ts_col = "time:timestamp"

    events = df_raw.loc[:, [case_col, act_col, ts_col]].copy()

    # Only cases with more than two events are kept.
    case_sizes = events.groupby(case_col).size()
    long_cases = case_sizes[case_sizes > 2].index
    events = events[events[case_col].isin(long_cases)]

    events = events.sort_values(by=[case_col, ts_col])
    events[ts_col] = pd.to_datetime(events[ts_col], utc=True)

    # Gap to the following event of the same case.
    next_ts = events.groupby(case_col)[ts_col].shift(-1)
    events["time_to_next_event"] = (
        (next_ts - events[ts_col]).dt.total_seconds().fillna(0).clip(lower=0)
    )

    train, test = unbiased(events, **unbiased_split_params)

    ts = TimestampExtractor(
        case_features=["accumulated_time", "remaining_time"],
        event_features="all",
        time_unit="d",
    )
    train[ts.get_feature_names_out()] = ts.fit_transform(train)
    test[ts.get_feature_names_out()] = ts.transform(test)

    renames = {case_col: "case_id", act_col: "activity"}
    train = train.drop(columns=[ts_col]).rename(columns=renames)
    test = test.drop(columns=[ts_col]).rename(columns=renames)

    sc = StandardScaler()
    columns = NUMERICAL_FEATURES + ["remaining_time", "time_to_next_event"]
    train.loc[:, columns] = sc.fit_transform(train[columns])
    test.loc[:, columns] = sc.transform(test[columns])
    return train, test

def forward_next_activity_logits(model, x_cat, x_num, pad_id: int):
    """
    Run `model` and return the logits of the next-activity head.

    x_cat: [B,T,Ccat] long
    x_num: [B,T,Cnum] float
    returns logits: [B,T,V]

    The attention mask is 1 wherever the first categorical column differs
    from `pad_id`. The model is expected to return a pair whose first element
    is a mapping of outputs; if that mapping has no "next_activity" entry,
    the value under its first key is returned instead.
    """
    attention_mask = x_cat[..., 0].ne(pad_id).long()
    outputs, _ = model(x_cat=x_cat, x_num=x_num, attention_mask=attention_mask)

    head = "next_activity"
    if head not in outputs:
        # fallback: first key
        head = next(iter(outputs.keys()))
    return outputs[head]

# -------------------------
# 1) Load checkpoint
# -------------------------
# The checkpoint is loaded onto the CPU. Weights live under "net"; the
# vocabularies ("stoi"/"itos") are optional keys and validated further below.
ckpt = torch.load(MODEL_PATH, map_location="cpu")
state_dict = ckpt["net"]
stoi = ckpt.get("stoi")
itos = ckpt.get("itos")

# Run metadata is recovered from the checkpoint file name.
meta = parse_ckpt_name(MODEL_PATH)
log_name = meta["log"]
backbone = meta["backbone"]
epochs = meta["epochs"]
seed = meta["seed"]
print("Parsed:", meta)

pre = PRETRAINED_CONFIGS[backbone]

# Vocab/IDs (need checkpoint vocabs ideally)
if stoi is None or itos is None:
    raise RuntimeError("Checkpoint does not contain stoi/itos. For minimal CPU evaluation, please save vocabs in ckpt.")

stoi_act = stoi["activity"]
# itos keys may have been serialised as strings; normalise them to int ids.
itos_act = {int(k): v for k, v in itos["activity"].items()}
pad_id = int(stoi_act["<PAD>"])
unk_id = int(stoi_act["<UNK>"])
eos_id = int(stoi_act["<EOS>"])
vocab_size = max(itos_act.keys()) + 1

# LoRA params
r = infer_lora_r_from_state_dict(state_dict)
if r is None:
    raise RuntimeError("Could not infer LoRA r from state_dict.")
# alpha: try from df if available; else fallback alpha=r
# NOTE(review): `df` is a notebook global defined outside this section; the
# lookup is skipped when it does not exist (e.g. on a fresh kernel).
lora_alpha = None
if "df" in globals():
    cand = df.copy()
    if "log" in cand.columns: cand = cand[cand["log"] == log_name]
    if "backbone" in cand.columns: cand = cand[cand["backbone"] == backbone]
    if "fine_tuning" in cand.columns: cand = cand[cand["fine_tuning"] == "lora"]
    if epochs is not None and "epochs" in cand.columns: cand = cand[cand["epochs"] == epochs]
    if seed is not None and "seed" in cand.columns: cand = cand[cand["seed"] == seed]
    if len(cand) and "lora_alpha" in cand.columns and cand["lora_alpha"].notna().any():
        lora_alpha = int(cand.loc[cand["lora_alpha"].notna(), "lora_alpha"].iloc[0])
if lora_alpha is None:
    lora_alpha = int(r)
    print(f"[WARN] lora_alpha not found -> fallback lora_alpha=r={lora_alpha}")

lora_cfg = get_lora_config(backbone, r=r, lora_alpha=lora_alpha)

# Build model (CPU)
# Placeholder column names; only their count is taken from NUMERICAL_FEATURES.
# NOTE(review): if the model keys parameters by column name, these placeholders
# will not match the checkpoint - the load report below would show that.
numerical_cols = [f"num_{i}" for i in range(len(NUMERICAL_FEATURES))]  # will adjust if sidecar differs
model = NextEventPredictor(
    embedding_size=pre["embedding_size"],
    categorical_cols=["activity"],
    categorical_sizes={"activity": vocab_size},
    numerical_cols=numerical_cols,
    categorical_targets=["next_activity"],
    numerical_targets=["next_remaining_time", "next_time_to_next_event"],
    padding_idx=pad_id,
    strategy="sum",
    backbone_name=pre["name"],
    backbone_pretrained=True,
    backbone_finetuning=lora_cfg,
    backbone_type=None,
    backbone_hidden_size=pre["hidden_size"],
    backbone_n_layers=None,
    device=DEVICE,
).to(DEVICE)

# strict=False tolerates key mismatches, so report them instead of discarding
# the result: unexpected keys are checkpoint weights that were NOT loaded, and
# missing keys are model parameters that keep their pre-load values.
load_result = model.load_state_dict(state_dict, strict=False)
missing_keys = list(load_result.missing_keys)
unexpected_keys = list(load_result.unexpected_keys)
if missing_keys:
    print(f"[INFO] load_state_dict: {len(missing_keys)} model keys not found in checkpoint, e.g. {missing_keys[:3]}")
if unexpected_keys:
    print(f"[WARN] load_state_dict: {len(unexpected_keys)} checkpoint keys were NOT loaded, e.g. {unexpected_keys[:3]}")

# dtype downcast on CPU if possible
try:
    model = model.to(dtype=DTYPE)
except Exception as e:
    print("[WARN] Could not cast model dtype:", repr(e))
model.eval()

# -------------------------
# 2) Get test data with minimal overhead:
#    (a) sidecar cache tensors if they exist
#    (b) else rebuild test split like training
# -------------------------
# Pick one test trace (TRACE_IDX) as x_cat / x_num / y_cat, preferring the
# cached sidecar tensors and rebuilding the test split only when they are absent.
bundle = try_load_sidecar_cache(CACHE_DIR)
if bundle is None:
    print("[WARN] No sidecar cache found. Rebuilding test split (slower, but works).")
    # NOTE(review): the rebuild always uses BPI12, independent of the log name
    # parsed from the checkpoint - confirm this matches the checkpoint's log.
    raw_log = BPI12()
    train_df, test_df = prepare_data_like_training(
        raw_log.dataframe,
        raw_log.unbiased_split_params,
    )

    event_features = EventFeatures(categorical=["activity"], numerical=NUMERICAL_FEATURES)
    event_targets = EventTargets(
        categorical=["activity"],
        numerical=["remaining_time", "time_to_next_event"],
    )

    # The checkpoint vocabularies are passed in so that activity ids line up
    # with the ids the model was trained on.
    vocabs = (stoi, itos)
    train_log = EventLog(
        dataframe=train_df,
        case_id="case_id",
        features=event_features,
        targets=event_targets,
        train_split=True,
        name=log_name,
        vocabs=vocabs,
    )
    test_log = EventLog(
        dataframe=test_df,
        case_id="case_id",
        features=event_features,
        targets=event_targets,
        train_split=False,
        name=log_name,
        vocabs=train_log.get_vocabs(),
    )
    test_dataset = ContinuousTraces(log=test_log, refresh_cache=True, device="cpu")
    x_cat, x_num, y_cat, _ = test_dataset[TRACE_IDX]
else:
    print("[OK] Using sidecar cache from:", str(CACHE_DIR))
    # The cached num_targets entry is not needed for next-activity evaluation.
    traces = bundle["traces"]
    cat_features = bundle["cat_features"]
    num_features = bundle["num_features"]
    cat_targets = bundle["cat_targets"]

    # Event indices of the selected trace, as a long tensor.
    tr = traces[TRACE_IDX]
    if not torch.is_tensor(tr):
        tr = torch.tensor(tr, dtype=torch.long)

    x_cat = cat_features[tr]     # [T, Ccat]
    x_num = num_features[tr]     # [T, Cnum]
    y_cat = cat_targets[tr]      # [T, Ccat_tgt]

    # NOTE(review): the model has already been constructed at this point, so
    # updating numerical_cols here does not change the model itself.
    if x_num.shape[1] != len(numerical_cols):
        numerical_cols = [f"num_{i}" for i in range(int(x_num.shape[1]))]
        print(f"[WARN] num dim from cache = {x_num.shape[1]} -> rebuilding model numerical_cols is required for strictness.")

# -------------------------
# 3) A) Minimal trace visualization: Prefix -> Top-k Next Activity
# -------------------------
# Clamp the requested prefix length into [2, T] (or to T when the trace is shorter than 2).
T = int(x_cat.shape[0])
if PREFIX_LEN < 2 or PREFIX_LEN > T:
    PREFIX_LEN = min(max(2, PREFIX_LEN), T)


def _to_model_batch(cat_seq, num_seq):
    """Add a batch dimension, move both tensors to DEVICE and cast the numeric one to DTYPE.

    The dtype cast is best effort: on failure the tensor keeps its dtype and a
    warning is printed (same policy as the model dtype cast).
    """
    cat_b = cat_seq.unsqueeze(0).to(DEVICE)
    num_b = num_seq.unsqueeze(0).to(DEVICE)
    if num_b.dtype != DTYPE:
        try:
            num_b = num_b.to(dtype=DTYPE)
        except Exception as e:
            print("[WARN] Could not cast numerical features dtype:", repr(e))
    return cat_b, num_b


# batchify + dtype
x_cat_b, x_num_b = _to_model_batch(x_cat[:PREFIX_LEN], x_num[:PREFIX_LEN])

with torch.inference_mode():
    logits = forward_next_activity_logits(model, x_cat_b, x_num_b, pad_id=pad_id)  # [1,L,V]
    # copy so that the masking below does not write into the logits tensor
    logits_last = logits[0, PREFIX_LEN - 1].detach().cpu().float().numpy().copy()

# remove PAD/UNK for display
logits_last[[pad_id, unk_id]] = -1e9
probs = softmax_np(logits_last)
top_ids = np.argsort(probs)[::-1][:TOPK].astype(int)
top_probs = probs[top_ids]
top_names = [str(itos_act.get(i, i)) for i in top_ids]

# Ground-truth next activity for the last prefix position.
true_next_id = int(y_cat[PREFIX_LEN - 1, 0].item())
true_next_name = str(itos_act.get(true_next_id, true_next_id))

prefix_ids = x_cat[:PREFIX_LEN, 0].detach().cpu().numpy().astype(int).tolist()
prefix_names = [str(itos_act.get(i, i)) for i in prefix_ids]

print("\nPREFIX:")
print(" -> ".join(prefix_names))
print("True next:", true_next_name)
print("Pred top-1:", top_names[0], f"(p={top_probs[0]:.3f})")

# Horizontal bar chart, most probable activity at the top.
fig, ax = plt.subplots(figsize=(9, max(3, 0.35 * TOPK)))
ax.barh(range(len(top_ids))[::-1], top_probs)
ax.set_yticks(range(len(top_ids))[::-1])
ax.set_yticklabels(top_names)
ax.set_xlabel("Probability")
ax.set_title(f"{log_name} | {backbone} | Prefix L={PREFIX_LEN} | Top-{TOPK}\nTrue next: {true_next_name}")
ax.grid(True, axis="x", linewidth=0.5, alpha=0.4)
plt.tight_layout()

out_topk = SAVE_DIR / f"trace{TRACE_IDX}_prefixL{PREFIX_LEN}_top{TOPK}_{backbone}_cpu.png"
plt.savefig(out_topk, dpi=300)
plt.close(fig)
print("Saved:", out_topk)

# Teacher-forced suffix table: one extra forward pass over a slightly longer
# window (prefix + up to SUFFIX_STEPS observed events).
L2 = min(T, PREFIX_LEN + SUFFIX_STEPS)
x_cat2, x_num2 = _to_model_batch(x_cat[:L2], x_num[:L2])

with torch.inference_mode():
    logits2 = forward_next_activity_logits(model, x_cat2, x_num2, pad_id=pad_id)[0].detach().cpu().float().numpy()  # [L2,V]

rows = []
t0 = PREFIX_LEN - 1
for t in range(t0, L2):
    lg = logits2[t].copy()
    lg[[pad_id, unk_id]] = -1e9
    pr = softmax_np(lg)
    pred_id = int(np.argmax(pr))
    true_id = int(y_cat[t, 0].item())
    rows.append({
        "step_ahead": t - t0 + 1,
        "obs_activity": str(itos_act.get(int(x_cat[t, 0].item()), "")),
        "true_next": str(itos_act.get(true_id, "")),
        "pred_next": str(itos_act.get(pred_id, pred_id)),
        "p_pred": float(pr[pred_id]),
        "correct": int(pred_id == true_id),
    })

df_suffix = pd.DataFrame(rows)
out_csv = SAVE_DIR / f"trace{TRACE_IDX}_prefixL{PREFIX_LEN}_suffix_{backbone}_cpu.csv"
df_suffix.to_csv(out_csv, index=False)
print("Saved:", out_csv)
display(df_suffix)

# -------------------------
# 4) B) Confusion matrix light (sampling, max context window)
# -------------------------
if DO_CONFUSION_MATRIX:
    print("\n[CM-light] sampling next-step predictions...")
    rng = random.Random(SEED)

    # Need access to multiple traces. If we used sidecar cache, use bundle traces.
    if bundle is None:
        # from rebuilt dataset
        traces_all = test_dataset.traces
        cat_features_all = test_dataset.cat_features
        num_features_all = test_dataset.num_features
        cat_targets_all = test_dataset.cat_targets
    else:
        traces_all = bundle["traces"]
        cat_features_all = bundle["cat_features"]
        num_features_all = bundle["num_features"]
        cat_targets_all = bundle["cat_targets"]

    y_true_list, y_pred_list = [], []

    # Small batching for speed on CPU
    BATCH = 6

    def sample_one():
        """Draw one (x_cat, x_num, y_true) example, or None for traces with fewer than 3 events.

        A random trace and a random position t >= 1 are chosen; the input is
        the window of at most MAX_CTX events ending at t, the label is the
        first categorical target at t.
        """
        tr = traces_all[rng.randrange(len(traces_all))]
        tr = tr if torch.is_tensor(tr) else torch.tensor(tr, dtype=torch.long)
        if tr.numel() < 3:
            return None
        # choose a prediction position t (predict next at t)
        Tloc = int(tr.numel())
        t = rng.randrange(1, Tloc)  # t indexes label position in y_cat
        # context window: last MAX_CTX events up to t inclusive (model predicts y at t using prefix ending at t)
        start = max(0, (t + 1) - MAX_CTX)
        tr_ctx = tr[start:t+1]
        x_cat = cat_features_all[tr_ctx]
        x_num = num_features_all[tr_ctx]
        y_true = int(cat_targets_all[tr[t], 0] if torch.is_tensor(cat_targets_all) else cat_targets_all[tr[t]][0])
        return x_cat, x_num, y_true

    # Remember whether the dtype-cast warning was already printed, so that a
    # failing cast is reported once instead of once per batch.
    cast_state = {"warned": False}

    def predict_batch(seqs_cat, seqs_num):
        """Right-pad the sequences to a common length, run the model and return one predicted id per sequence.

        Padding uses pad_id for categorical and 0 for numerical features. The
        prediction is the argmax of the logits at each sequence's own last
        position, with PAD and UNK excluded.
        """
        lens = [x.shape[0] for x in seqs_cat]
        Lm = max(lens)
        Cc = seqs_cat[0].shape[1]
        Cn = seqs_num[0].shape[1]

        xcat = torch.full((len(seqs_cat), Lm, Cc), pad_id, dtype=torch.long)
        xnum = torch.zeros((len(seqs_cat), Lm, Cn), dtype=torch.float32)
        for i, (seq_c, seq_n) in enumerate(zip(seqs_cat, seqs_num)):
            n = seq_c.shape[0]
            xcat[i, :n, :] = seq_c
            xnum[i, :n, :] = seq_n

        xcat = xcat.to(DEVICE)
        xnum = xnum.to(DEVICE)
        if xnum.dtype != DTYPE:
            try:
                xnum = xnum.to(dtype=DTYPE)
            except Exception as e:
                # best effort: keep float32, but do not hide the failure
                if not cast_state["warned"]:
                    print("[WARN] Could not cast numerical features dtype:", repr(e))
                    cast_state["warned"] = True

        preds = []
        with torch.inference_mode():
            logits_b = forward_next_activity_logits(model, xcat, xnum, pad_id=pad_id)  # [B,L,V]
            # take last position per sample (length-1)
            for i, n in enumerate(lens):
                lg = logits_b[i, n - 1].detach().cpu().float().numpy()
                lg[[pad_id, unk_id]] = -1e9
                preds.append(int(np.argmax(lg)))
        return preds

    batch_cat, batch_num, batch_true = [], [], []
    for _ in range(N_SAMPLES):
        s = sample_one()
        if s is None:
            continue
        xc, xn, yt = s
        batch_cat.append(xc)
        batch_num.append(xn)
        batch_true.append(yt)

        if len(batch_cat) >= BATCH:
            y_pred_list.extend(predict_batch(batch_cat, batch_num))
            y_true_list.extend(batch_true)
            batch_cat, batch_num, batch_true = [], [], []

    # flush remaining
    if len(batch_cat):
        y_pred_list.extend(predict_batch(batch_cat, batch_num))
        y_true_list.extend(batch_true)

    y_true_arr = np.array(y_true_list, dtype=int)
    y_pred_arr = np.array(y_pred_list, dtype=int)

    # filter specials from y_true
    keep = ~np.isin(y_true_arr, [pad_id, unk_id])
    y_true_arr = y_true_arr[keep]
    y_pred_arr = y_pred_arr[keep]

    acc = float((y_true_arr == y_pred_arr).mean()) if len(y_true_arr) else float("nan")
    print(f"[CM-light] samples={len(y_true_arr)} | approx acc={acc:.4f}")

    # Top-N classes + OTHER for readability
    uniq, cnt = np.unique(y_true_arr, return_counts=True)
    order = np.argsort(cnt)[::-1]
    top = uniq[order][:TOP_CLASSES_CM]
    OTHER = -999999  # sentinel id for every class outside the top-N

    def map_other(a):
        """Return a copy of `a` with every id outside `top` replaced by OTHER."""
        out = a.copy()
        out[~np.isin(out, top)] = OTHER
        return out

    yt = map_other(y_true_arr)
    yp = map_other(y_pred_arr)
    labels = list(top) + [OTHER]
    names = [str(itos_act.get(int(i), i)) for i in top] + ["OTHER"]

    # confusion matrix (row-normalized)
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(yt, yp, labels=labels)
    # rows without samples are divided by 1 instead of 0
    cmn = cm.astype(float) / np.maximum(cm.sum(axis=1, keepdims=True), 1.0)

    fig_w = max(8, 0.35 * len(labels))
    fig_h = max(7, 0.35 * len(labels))
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    sns.heatmap(cmn, square=True, cbar=True, xticklabels=names, yticklabels=names, linewidths=0.2, linecolor="white", ax=ax)
    ax.set_xlabel("Predicted next activity")
    ax.set_ylabel("True next activity")
    ax.set_title(f"{log_name} | {backbone} | CM-light (N={len(y_true_arr)}, ctx<= {MAX_CTX})\nrow-normalized, approx acc={acc:.4f}")
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    out_cm = SAVE_DIR / f"cm_light_{log_name}_{backbone}_N{len(y_true_arr)}_ctx{MAX_CTX}_cpu.png"
    plt.savefig(out_cm, dpi=300)
    plt.close(fig)
    print("Saved:", out_cm)
