In [None]:
import sys
import os
import joblib
import pandas as pd
import torch

# Put the six nearest ancestor directories ('..' up to '../../../../../..') at the
# front of sys.path so the project package imported below can be resolved.
# The shallowest directory is inserted first, so the deepest ancestor ends up at index 0.
for depth in range(1, 7):
    sys.path.insert(0, '/'.join(['..'] * depth))

from reimplemented_approaches.proactive_conformance_checking.data_prep_split_encode import DeviationLabeling, TrainTestSplit, Undersampling, PrefixDataset

In [None]:
# Data loading:
# Name of the event log plus the relative locations of its CSV file and BPMN process model.
name_event_log = "Sepsis"
# NOTE(review): this path climbs more directory levels than path_process_model below
# and goes through "data/data" — confirm that both resolve from the notebook's
# working directory.
path_event_log = "../../../../../../../../../data/data/Sepsis.csv"
path_process_model = "../../../../../../../data/process_models/Sepsis.bpmn"

In [None]:
# Inspect the attributes available in the log:
# load the event log as a pandas DataFrame and print all of its column names.
df = pd.read_csv(path_event_log)
print(df.columns)

In [None]:
# Preview only the first rows of the event log instead of rendering the whole frame.
df.head()

In [None]:
# Data preparation and labelling:
# build the labeller from the event log and the process model.
dl = DeviationLabeling(
    log_name=name_event_log,
    path_event_log=path_event_log,
    path_process_model=path_process_model,
    label_strategy='separate',
)

# 'separate' strategy: two dictionaries come back (labelled data and encoders).
# No trace attributes are passed (trace_attr=[]).
# NOTE(review): conf_runs=1 presumably sets the number of conformance-checking runs — confirm.
dict_labeled_deviations, dict_encoders = dl.generate_individual_labels(
    trace_attr=[],
    conf_runs=1,
)

In [None]:
# Print the shape of the labelled dataframe stored under each label.
# The loop variable is intentionally not named `df`, so the raw event log loaded
# earlier stays bound to `df` after this cell has run.
for label, df_label in dict_labeled_deviations.items():
    print("Label: ", label)
    print("dataframe: ", df_label.shape)

In [None]:
# Inspect the encoder entry of the first label only; it is looked up once and reused.
# NOTE(review): presumably the id mappings are identical across labels — confirm.
first_encoder = next(iter(dict_encoders.values()))

act_ids = first_encoder['activity_ids']
print("Activity ids: ", act_ids)
print("\n")

res_ids = first_encoder['resource_ids']
print("Resource ids: ", res_ids)
print("\n")

months = first_encoder['month_ids']
print("Months: ", months)
print("\n")

deviations = first_encoder['deviations']
print("Deviations: ", deviations)
print("\n")

In [None]:
# Write every labelled dataframe to "<label>_prefix_deviations.csv" inside label_dir
# (the directory is created if it does not exist yet).
label_dir = "./labelled_prefix_deviations"
os.makedirs(label_dir, exist_ok=True)

for label, df_label in dict_labeled_deviations.items():
    file_name = f"{label}_prefix_deviations.csv"
    csv_path = os.path.join(label_dir, file_name)
    df_label.to_csv(csv_path, index=False)

In [None]:
# Train / validation / test split of the labelled data (val_frac=0.2).
# NOTE: the keyword is spelled `df_labled_deviations` exactly as in the existing call;
# keep it in sync with the TrainTestSplit signature.
tts = TrainTestSplit(
    df_labled_deviations=dict_labeled_deviations,
    label_strategy="separate",
)
train_dict, val_dict, test_dict = tts.data_split(val_frac=0.2)

In [None]:
# Print the shape of the training dataframe stored under each label.
# The loop variable is intentionally not named `df`, so the raw event log loaded
# earlier stays bound to `df` after this cell has run.
for label, df_label in train_dict.items():
    print("Label: ", label)
    print("dataframe: ", df_label.shape)

In [None]:
# Undersample the training data via one_sided_selection_undersampling().
# NOTE(review): train_dict is overwritten with the result, so re-running this cell on
# its own feeds already-undersampled data back in — re-run the split cell first.
u = Undersampling(
    train_data=train_dict,
    list_dynamic_cols=['activities', 'resources', 'months'],
    label_strategy='separate',
)
train_dict, y_no_true_class = u.one_sided_selection_undersampling()

In [None]:
# Print the shape of each training dataframe currently held in train_dict.
# The loop variable is intentionally not named `df`, so the raw event log loaded
# earlier stays bound to `df` after this cell has run.
for label, df_label in train_dict.items():
    print("Label: ", label)
    print("dataframe: ", df_label.shape)

In [None]:
# For every label, collect the columns of its training dataframe whose name starts with "y_".
y_columns = {
    label: [c for c in df_label.columns if c.startswith("y_")]
    for label, df_label in train_dict.items()
}

# Store those columns under 'deviations' in the matching encoder entry,
# then persist all encoders to "encoders.pkl".
for label, target_cols in y_columns.items():
    dict_encoders[label]['deviations'] = target_cols
joblib.dump(dict_encoders, "encoders.pkl")

In [None]:
# Tensor encoding and saving.
# The device is pinned to CPU; to pick a GPU when one is available, use instead:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
dataset_prep = PrefixDataset(
    df_train=train_dict,
    df_val=val_dict,
    df_test=test_dict,
    activity_col='activities',
    resource_col='resources',
    month_col='months',
    trace_cols=[],
    y_cols=y_columns,
    label_strategy="separate",
)

# Encode the three splits on the chosen device and write them to the current directory.
# NOTE: `tensor_datset_encoding` is spelled exactly as the method is called in this notebook.
train_set_dict, val_set_dict, test_set_dict = dataset_prep.tensor_datset_encoding(device=device)
dataset_prep.save_datasets(
    train_dataset=train_set_dict,
    val_dataset=val_set_dict,
    test_dataset=test_set_dict,
    save_path=".",
)

In [None]:
# Reload the saved datasets from the current directory and, per label, print the
# shape of the first tensor of the train and test sets.
train_set_dict, val_set_dict, test_set_dict = dataset_prep.load_datasets(".")
for label in train_set_dict:
    train_shape = train_set_dict[label].tensors[0].shape
    test_shape = test_set_dict[label].tensors[0].shape
    print(label, train_shape, test_shape)