In [1]:
import sys
import os
import joblib
import pandas as pd
import torch

# Put the six nearest ancestor directories ('..' up to '../../../../../..') on
# the import path. Each entry is inserted at position 0, so the most distant
# ancestor ends up first in sys.path.
for depth in range(1, 7):
    sys.path.insert(0, "/".join([".."] * depth))

from reimplemented_approaches.proactive_conformance_checking.data_prep_split_encode import DeviationLabeling, TrainTestSplit, Undersampling, PrefixDataset

In [2]:
# Input configuration: log name plus the locations of the event log (CSV)
# and the process model (BPMN), both relative to this notebook.
name_event_log = "Repair"
data_root = "../../../../../../../data"
path_event_log = f"{data_root}/artificial/repair_shop_event_log.csv"
path_process_model = f"{data_root}/process_models/Repair.bpmn"

In [3]:
# Read the raw event log into a pandas DataFrame and list the attributes
# (columns) it provides.
df = pd.read_csv(filepath_or_buffer=path_event_log)
print(df.columns)

Index(['case:concept:name', 'concept:name', 'a:id', 'time:timestamp',
       'org:resource', 'type'],
      dtype='object')


In [4]:
# Show the event log DataFrame as this cell's output.
df

Unnamed: 0,case:concept:name,concept:name,a:id,time:timestamp,org:resource,type
0,c2,RECEPTION,0,2020-01-01 08:18:27.697378,44,EventType.ACTIVITY_COMPLETE
1,c1,RECEPTION,0,2020-01-01 09:05:32.572858,38,EventType.ACTIVITY_COMPLETE
2,c0,RECEPTION,0,2020-01-01 10:38:11.101437,34,EventType.ACTIVITY_COMPLETE
3,c0,ACKNOWLEDGEMENT,2,2020-01-01 14:35:43.907968,43,EventType.ACTIVITY_COMPLETE
4,c2,DISASSEMBLY,1,2020-01-01 14:42:37.914360,44,EventType.ACTIVITY_COMPLETE
...,...,...,...,...,...,...
82233,c9998,DISASSEMBLY,1,2025-06-20 19:28:39.870333,30,EventType.ACTIVITY_COMPLETE
82234,c9988,ACKNOWLEDGEMENT,6,2025-06-20 19:32:03.860795,29,EventType.ACTIVITY_COMPLETE
82235,c9996,REPAIR,2,2025-06-20 21:33:48.570152,37,EventType.ACTIVITY_COMPLETE
82236,c9998,ACKNOWLEDGEMENT,2,2025-06-20 22:01:20.039645,7,EventType.ACTIVITY_COMPLETE


In [5]:
# Data preparation and labelling: configure the labeller with the event log
# and process model defined above.
dl = DeviationLabeling(
    log_name=name_event_log,
    path_event_log=path_event_log,
    path_process_model=path_process_model,
    label_strategy='separate',
)

# With the 'separate' strategy the call yields two dicts: the labelled
# dataframes and the matching encoders.
labeling_result = dl.generate_individual_labels(trace_attr=[], conf_runs=1)
dict_labeled_deviations, dict_encoders = labeling_result

  0%|          | 0/1 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/39 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  2.05it/s]


In [6]:
# Report the shape of each labelled dataframe, one per deviation label.
# The loop variable is not named `df`, so the raw event log bound to `df`
# earlier in the notebook is not overwritten by running this cell.
for label, df_labeled in dict_labeled_deviations.items():
    print("Label: ", label)
    print("dataframe: ", df_labeled.shape)

Label:  ('>>', 'ACKNOWLEDGEMENT')
dataframe:  (82238, 5)
Label:  ('>>', 'CREATE_INVOICE')
dataframe:  (82238, 5)
Label:  ('>>', 'DISASSEMBLY')
dataframe:  (82238, 5)
Label:  ('>>', 'QUALITY_CONTROL')
dataframe:  (82238, 5)
Label:  ('>>', 'REPAIR')
dataframe:  (82238, 5)
Label:  ('>>', 'SHIPPING')
dataframe:  (82238, 5)
Label:  ('ACKNOWLEDGEMENT', '>>')
dataframe:  (82238, 5)
Label:  ('QUALITY_CONTROL', '>>')
dataframe:  (82238, 5)
Label:  ('REPAIR', '>>')
dataframe:  (82238, 5)


In [7]:
# Inspect the encoder mappings. Only the entry of the first label is shown;
# NOTE(review): presumably the id mappings are shared across labels - confirm.
first_encoders = next(iter(dict_encoders.values()))

act_ids = first_encoders['activity_ids']
print("Activity ids: ", act_ids)
print("\n")

res_ids = first_encoders['resource_ids']
print("Resource ids: ", res_ids)
print("\n")

months = first_encoders['month_ids']
print("Months: ", months)
print("\n")

deviations = first_encoders['deviations']
print("Deviations: ", deviations)
print("\n")

Activity ids:  {'ACKNOWLEDGEMENT': 1, 'CREATE_INVOICE': 2, 'DISASSEMBLY': 3, 'QUALITY_CONTROL': 4, 'RECEPTION': 5, 'REPAIR': 6, 'SHIPPING': 7}


Reource ids:  {'0': 1, '1': 2, '10': 3, '11': 4, '12': 5, '13': 6, '14': 7, '15': 8, '16': 9, '17': 10, '18': 11, '19': 12, '2': 13, '20': 14, '21': 15, '22': 16, '23': 17, '24': 18, '25': 19, '26': 20, '27': 21, '28': 22, '29': 23, '3': 24, '30': 25, '31': 26, '32': 27, '33': 28, '34': 29, '35': 30, '36': 31, '37': 32, '38': 33, '39': 34, '4': 35, '40': 36, '41': 37, '42': 38, '43': 39, '44': 40, '45': 41, '46': 42, '47': 43, '48': 44, '49': 45, '5': 46, '6': 47, '7': 48, '8': 49, '9': 50}


Months:  {'10_2020': 1, '10_2021': 2, '10_2022': 3, '10_2023': 4, '10_2024': 5, '11_2020': 6, '11_2021': 7, '11_2022': 8, '11_2023': 9, '11_2024': 10, '12_2020': 11, '12_2021': 12, '12_2022': 13, '12_2023': 14, '12_2024': 15, '1_2020': 16, '1_2021': 17, '1_2022': 18, '1_2023': 19, '1_2024': 20, '1_2025': 21, '2_2020': 22, '2_2021': 23, '2_2022': 24, '2_20

In [8]:
# Persist every labelled dataframe as its own CSV file inside `label_dir`
# (created on demand).
label_dir = "./labelled_prefix_deviations"
os.makedirs(label_dir, exist_ok=True)

# NOTE(review): the labels printed earlier are tuples, so the file names
# contain the tuple repr (quotes, parentheses, '>'); such names are not
# valid on every file system - confirm before relying on portability.
for label in dict_labeled_deviations:
    df_label = dict_labeled_deviations[label]
    file_name = f"{label}_prefix_deviations.csv"
    csv_path = os.path.join(label_dir, file_name)
    df_label.to_csv(csv_path, index=False)

In [9]:
# Split the labelled data into train / validation / test dictionaries.
tts = TrainTestSplit(
    df_labled_deviations=dict_labeled_deviations,
    label_strategy="separate",
)
split_result = tts.data_split(val_frac=0.2)
train_dict, val_dict, test_dict = split_result

In [10]:
# Report the shape of each training dataframe after the split.
# The loop variable is not named `df`, so the raw event log bound to `df`
# earlier in the notebook is not overwritten by running this cell.
for label, df_train in train_dict.items():
    print("Label: ", label)
    print("dataframe: ", df_train.shape)

Label:  ('>>', 'ACKNOWLEDGEMENT')
dataframe:  (43860, 5)
Label:  ('>>', 'CREATE_INVOICE')
dataframe:  (43860, 5)
Label:  ('>>', 'DISASSEMBLY')
dataframe:  (43860, 5)
Label:  ('>>', 'QUALITY_CONTROL')
dataframe:  (43860, 5)
Label:  ('>>', 'REPAIR')
dataframe:  (43860, 5)
Label:  ('>>', 'SHIPPING')
dataframe:  (43860, 5)
Label:  ('ACKNOWLEDGEMENT', '>>')
dataframe:  (43860, 5)
Label:  ('QUALITY_CONTROL', '>>')
dataframe:  (43860, 5)
Label:  ('REPAIR', '>>')
dataframe:  (43860, 5)


In [11]:
# Undersample the training data per label.
# NOTE(review): `train_dict` is rebound to the undersampled result, so
# re-running this cell feeds already-undersampled data back in; run it once
# after the split cell.
dynamic_cols = ['activities', 'resources', 'months']
u = Undersampling(
    train_data=train_dict,
    list_dynamic_cols=dynamic_cols,
    label_strategy='separate',
)
undersampling_result = u.one_sided_selection_undersampling()
train_dict, y_no_true_class = undersampling_result

In [12]:
# Report the shape of each training dataframe after undersampling.
# The loop variable is not named `df`, so the raw event log bound to `df`
# earlier in the notebook is not overwritten by running this cell.
for label, df_train in train_dict.items():
    print("Label: ", label)
    print("dataframe: ", df_train.shape)

Label:  ('>>', 'ACKNOWLEDGEMENT')
dataframe:  (28105, 5)
Label:  ('>>', 'CREATE_INVOICE')
dataframe:  (38217, 5)
Label:  ('>>', 'DISASSEMBLY')
dataframe:  (12921, 5)
Label:  ('>>', 'QUALITY_CONTROL')
dataframe:  (40374, 5)
Label:  ('>>', 'REPAIR')
dataframe:  (26041, 5)
Label:  ('>>', 'SHIPPING')
dataframe:  (42559, 5)
Label:  ('ACKNOWLEDGEMENT', '>>')
dataframe:  (42215, 5)
Label:  ('QUALITY_CONTROL', '>>')
dataframe:  (32764, 5)
Label:  ('REPAIR', '>>')
dataframe:  (41858, 5)


In [13]:
# For each label, collect the target columns (names starting with "y_")
# of its training dataframe.
y_columns = {}
for label, train_df in train_dict.items():
    y_columns[label] = [col for col in train_df.columns if col.startswith("y_")]

# Store the target column names in the encoders and persist them to disk.
for label, target_cols in y_columns.items():
    dict_encoders[label]['deviations'] = target_cols
joblib.dump(dict_encoders, "encoders.pkl")

['encoders.pkl']

In [14]:
# Tensor encoding and saving. The device is fixed to the CPU here;
# CUDA auto-detection is not used.
device = torch.device("cpu")

prefix_dataset_kwargs = dict(
    df_train=train_dict,
    df_val=val_dict,
    df_test=test_dict,
    activity_col='activities',
    resource_col='resources',
    month_col='months',
    trace_cols=[],
    y_cols=y_columns,
    label_strategy="separate",
)
dataset_prep = PrefixDataset(**prefix_dataset_kwargs)

# Encode the three splits as tensor datasets, then write them to the
# current directory; the save call's return value is the cell output.
encoded_sets = dataset_prep.tensor_datset_encoding(device=device)
train_set_dict, val_set_dict, test_set_dict = encoded_sets
dataset_prep.save_datasets(
    train_dataset=train_set_dict,
    val_dataset=val_set_dict,
    test_dataset=test_set_dict,
    save_path=".",
)

('./train_set.pkl', './val_set.pkl', './test_set.pkl')

In [15]:
# Reload the saved datasets from the current directory and, per label,
# print the shape of the first tensor of the train and test sets.
train_set_dict, val_set_dict, test_set_dict = dataset_prep.load_datasets(".")
for label, train_set in train_set_dict.items():
    train_shape = train_set.tensors[0].shape
    test_shape = test_set_dict[label].tensors[0].shape
    print(label, train_shape, test_shape)

('>>', 'ACKNOWLEDGEMENT') torch.Size([28105, 16]) torch.Size([27413, 16])
('>>', 'CREATE_INVOICE') torch.Size([38217, 16]) torch.Size([27413, 16])
('>>', 'DISASSEMBLY') torch.Size([12921, 16]) torch.Size([27413, 16])
('>>', 'QUALITY_CONTROL') torch.Size([40374, 16]) torch.Size([27413, 16])
('>>', 'REPAIR') torch.Size([26041, 16]) torch.Size([27413, 16])
('>>', 'SHIPPING') torch.Size([42559, 16]) torch.Size([27413, 16])
('ACKNOWLEDGEMENT', '>>') torch.Size([42215, 16]) torch.Size([27413, 16])
('QUALITY_CONTROL', '>>') torch.Size([32764, 16]) torch.Size([27413, 16])
('REPAIR', '>>') torch.Size([41858, 16]) torch.Size([27413, 16])
