In [None]:
import sys
import os
import joblib
import pandas as pd
import torch

# Make the project packages importable from this notebook by prepending the
# relative paths of the first six parent levels ('..', '../..', ...) to the
# module search path. Every deeper level is inserted in front of the previous
# one, so the outermost level ends up first in sys.path.
for _levels_up in range(1, 7):
    sys.path.insert(0, '/'.join(['..'] * _levels_up))

from reimplemented_approaches.proactive_conformance_checking.data_prep_split_encode import DeviationLabeling, TrainTestSplit, Undersampling, PrefixDataset

In [None]:
# Input configuration for the "Repair" data set.
# Both paths are relative, so they depend on the notebook's working directory.
path_process_model = "../../../../../../../data/process_models/Repair.bpmn"  # BPMN process model
path_event_log = "../../../../../../../data/artificial/repair_shop_event_log.csv"  # event log (CSV)
name_event_log = "Repair"  # log name handed to DeviationLabeling

In [None]:
# Read the event log CSV into a pandas DataFrame and list the attribute
# (column) names it contains.
df = pd.read_csv(filepath_or_buffer=path_event_log)
print(df.columns)

In [None]:
# Show the event log loaded by pd.read_csv as the cell output for inspection.
df

In [None]:
# Data preparation and labelling.
# The labeller is configured with the log name, the event log path, the
# process model path and the 'collective' label strategy.
dl = DeviationLabeling(
    log_name=name_event_log,
    path_event_log=path_event_log,
    path_process_model=path_process_model,
    label_strategy='collective',
)

# Returns the labelled dataframe together with the `encoders` mapping.
# No trace-level attributes are requested (trace_attr=[]); conf_runs is 1.
# NOTE(review): the strategy is 'collective' but the method invoked is
# generate_individual_labels - confirm this is the intended entry point.
df_labeled_deviations, encoders = dl.generate_individual_labels(
    trace_attr=[],
    conf_runs=1,
)

In [None]:
# Show the labelled dataframe returned by dl.generate_individual_labels.
df_labeled_deviations

In [None]:
# Unpack the individual entries of the `encoders` mapping and echo each one.
# Every printout is followed by print("\n"), which emits two line breaks and
# acts as a visual separator between the entries.
act_ids = encoders['activity_ids']
print("Activity ids: ", act_ids)
print("\n")

res_ids = encoders['resource_ids']
print("Resource ids: ", res_ids)
print("\n")

months = encoders['month_ids']
print("Months: ", months)
print("\n")

deviations = encoders['deviations']
print("Deviations: ", deviations)
print("\n")

In [None]:
# Save the prepared (labelled) dataframe as .csv.
# A single `output_dir` drives both the directory creation and the file path,
# so pointing the export somewhere else only requires changing one value.
output_dir = "."
os.makedirs(output_dir, exist_ok=True)  # exist_ok=True: no error if the directory already exists
csv_path_collective = os.path.join(output_dir, "prefix_deviations.csv")
df_labeled_deviations.to_csv(csv_path_collective, index=False)  # index=False: do not write the row index

In [None]:
# Partition the labelled data into train / validation / test dataframes.
# Note: the keyword `df_labled_deviations` is spelled as TrainTestSplit expects it.
tts = TrainTestSplit(
    df_labled_deviations=df_labeled_deviations,
    label_strategy="collective",
)

# val_frac=0.2 is forwarded to data_split; see TrainTestSplit for its exact meaning.
train_df, val_df, test_df = tts.data_split(val_frac=0.2)

In [None]:
# Show the training split returned by tts.data_split.
train_df

In [None]:
# Undersample the training split (validation and test frames are not touched
# in this cell).
u = Undersampling(
    train_data=train_df,
    list_dynamic_cols=['activities', 'resources', 'months'],
    label_strategy='collective',
)

# NOTE: train_df is rebound to the undersampled result, so re-running this cell
# alone feeds already-undersampled data back in. Re-run the split cell first.
train_df, y_no_true_class = u.one_sided_selection_undersampling()

In [None]:
# Show train_df after it was rebound to the result of
# u.one_sided_selection_undersampling().
train_df

In [None]:
# Collect the label columns of the training frame (names starting with "y_")
# and store that list under the 'deviations' key of the encoders mapping,
# replacing the value that was there before.
y_columns = [column for column in train_df.columns if column.startswith("y_")]
cols = y_columns  # second name bound to the very same list object
encoders['deviations'] = y_columns
# Persist the encoders mapping to encoders.pkl in the working directory.
joblib.dump(encoders, "encoders.pkl")

In [None]:
# Tensor encoding and saving of the three data splits.
device = torch.device("cpu")  # CPU device handed to the tensor encoding step

dataset_prep = PrefixDataset(
    # data splits
    df_train=train_df,
    df_val=val_df,
    df_test=test_df,
    # column names
    activity_col='activities',
    resource_col='resources',
    month_col='months',
    trace_cols=[],
    y_cols=encoders['deviations'],
    label_strategy="collective",
)

# Encode the splits as tensor datasets, then write them to the current
# directory (save_path=".").
train_set, val_set, test_set = dataset_prep.tensor_datset_encoding(device=device)
dataset_prep.save_datasets(
    train_dataset=train_set,
    val_dataset=val_set,
    test_dataset=test_set,
    save_path=".",
)

In [None]:
# Read the saved datasets back from the current directory and print the
# `.tensors` attribute of each split, in train / val / test order.
train_set, val_set, test_set = dataset_prep.load_datasets(".")
for _split in (train_set, val_set, test_set):
    print(_split.tensors)