In [1]:
import sys
import os
import joblib
import pandas as pd
import torch

# Prepend the six nearest ancestor directories (relative to the working
# directory) to sys.path so the project package imported below can be found.
# Each insert goes to position 0, so the farthest ancestor ends up first.
for levels_up in range(1, 7):
    sys.path.insert(0, "/".join([".."] * levels_up))

from reimplemented_approaches.proactive_conformance_checking.data_prep_split_encode import DeviationLabeling, TrainTestSplit, Undersampling, PrefixDataset

In [2]:
# Data loading:
# Name of the event log; handed to DeviationLabeling as `log_name` further down.
name_event_log = "Helpdesk"
# Relative path of the event log CSV (resolved against the current working directory).
path_event_log = "../../../../../../../../../data/data/helpdesk.csv"
# Relative path of the BPMN process model used for labelling.
# NOTE(review): this path climbs a different number of directory levels than
# path_event_log and points into `data/` rather than `data/data/` — confirm
# that both locations are intended.
path_process_model = "../../../../../../../data/process_models/Helpdesk.bpmn"

In [3]:
# Define attributes:
# Load the event log as pandas dataframe. Get all attributes in the log:
# The printed column index lists every attribute name available in the CSV.
df = pd.read_csv(path_event_log)
print(df.columns)

Index(['CaseID', 'Activity', 'Resource', 'CompleteTimestamp', 'VariantIndex',
       'seriousness', 'customer', 'product', 'responsible_section',
       'seriousness_2', 'service_level', 'service_type', 'support_section',
       'workgroup'],
      dtype='object')


In [4]:
# Display the raw event log (pandas shows only the first and last rows).
df

Unnamed: 0,CaseID,Activity,Resource,CompleteTimestamp,VariantIndex,seriousness,customer,product,responsible_section,seriousness_2,service_level,service_type,support_section,workgroup
0,Case 1,Assign seriousness,Value 1,2012/10/09 14:50:17.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1
1,Case 1,Take in charge ticket,Value 1,2012/10/09 14:51:01.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1
2,Case 1,Take in charge ticket,Value 2,2012/10/12 15:02:56.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1
3,Case 1,Resolve ticket,Value 1,2012/10/25 11:54:26.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1
4,Case 1,Closed,Value 3,2012/11/09 12:54:39.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21343,Case 4579,Closed,Value 5,2010/09/02 10:11:00.000,1,Value 1,Value 71,Value 3,Value 1,Value 1,Value 3,Value 1,Value 1,Value 1
21344,Case 4580,Take in charge ticket,Value 6,2012/01/03 09:33:43.000,18,Value 1,Value 92,Value 3,Value 1,Value 2,Value 2,Value 2,Value 1,Value 1
21345,Case 4580,Wait,Value 6,2012/01/10 15:30:11.000,18,Value 1,Value 92,Value 3,Value 1,Value 2,Value 2,Value 2,Value 1,Value 1
21346,Case 4580,Resolve ticket,Value 6,2012/01/10 17:07:40.000,18,Value 1,Value 92,Value 3,Value 1,Value 2,Value 2,Value 2,Value 1,Value 1


In [5]:
# Data preparation and labelling
# Build the labeller from the event log and the process model, using the
# 'collective' label strategy.
dl = DeviationLabeling(
    log_name=name_event_log,
    path_event_log=path_event_log,
    path_process_model=path_process_model,
    label_strategy='collective',
)

# conf_runs=1 requests a single run; three columns are passed as trace attributes.
# NOTE(review): an earlier comment spoke of 100 runs, and the method name says
# "individual" while the strategy is 'collective' — confirm both are intended.
df_labeled_deviations, encoders = dl.generate_individual_labels(
    trace_attr=['seriousness', 'customer', 'seriousness_2'],
    conf_runs=1,
)

  0%|          | 0/1 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/226 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:05<00:00,  5.51s/it]


In [6]:
# Display the labelled prefix table returned by generate_individual_labels.
df_labeled_deviations

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
1,"('Case 1', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
2,"('Case 1', 3)","[1, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
3,"('Case 1', 4)","[1, 12, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
4,"('Case 1', 5)","[1, 12, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21343,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[7, 7, 11, 11, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
21344,"('Case 999', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0
21345,"('Case 999', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0
21346,"('Case 999', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 21, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0


In [7]:
# Look up each encoding table returned alongside the labelled dataframe and
# print it, followed by blank lines, so the integer ids can be decoded by eye.
act_ids = encoders['activity_ids']
print("Activity ids: ", act_ids)
print("\n")

res_ids = encoders['resource_ids']
print("Resource ids: ", res_ids)  # label previously misspelled as "Reource"
print("\n")

months = encoders['month_ids']
print("Months: ", months)
print("\n")

deviations = encoders['deviations']
print("Deviations: ", deviations)
print("\n")

Activity ids:  {'Assign seriousness': 1, 'Closed': 2, 'Create SW anomaly': 3, 'DUPLICATE': 4, 'INVALID': 5, 'Insert ticket': 6, 'RESOLVED': 7, 'Require upgrade': 8, 'Resolve SW anomaly': 9, 'Resolve ticket': 10, 'Schedule intervention': 11, 'Take in charge ticket': 12, 'VERIFIED': 13, 'Wait': 14}


Reource ids:  {'Value 1': 1, 'Value 10': 2, 'Value 11': 3, 'Value 12': 4, 'Value 13': 5, 'Value 14': 6, 'Value 15': 7, 'Value 16': 8, 'Value 17': 9, 'Value 18': 10, 'Value 19': 11, 'Value 2': 12, 'Value 20': 13, 'Value 21': 14, 'Value 22': 15, 'Value 3': 16, 'Value 4': 17, 'Value 5': 18, 'Value 6': 19, 'Value 7': 20, 'Value 8': 21, 'Value 9': 22}


Months:  {'10_2010': 1, '10_2011': 2, '10_2012': 3, '10_2013': 4, '11_2010': 5, '11_2011': 6, '11_2012': 7, '11_2013': 8, '12_2010': 9, '12_2011': 10, '12_2012': 11, '12_2013': 12, '1_2010': 13, '1_2011': 14, '1_2012': 15, '1_2013': 16, '1_2014': 17, '2_2010': 18, '2_2011': 19, '2_2012': 20, '2_2013': 21, '3_2010': 22, '3_2011': 23, '3_2012': 24, 

In [8]:
# Save prepared dataframe as .csv
# The target directory is named once so that both the directory creation and
# the path join operate on it; with "." the file lands in the working directory.
output_dir = "."
os.makedirs(output_dir, exist_ok=True)
csv_path_collective = os.path.join(output_dir, "prefix_deviations.csv")
# index=False: the row index is not written to the file.
df_labeled_deviations.to_csv(csv_path_collective, index=False)

In [9]:
# Train and test split
# Hand the labelled prefixes to the splitter; the keyword is spelled
# `df_labled_deviations` in this call and must stay that way.
tts = TrainTestSplit(
    df_labled_deviations=df_labeled_deviations,
    label_strategy="collective",
)

# Three frames come back: training, validation and test (val_frac=0.2).
train_df, val_df, test_df = tts.data_split(val_frac=0.2)

In [10]:
# Display the training split returned by data_split.
train_df

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 4)","[1, 12, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
1,"('Case 1', 5)","[1, 12, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
2,"('Case 10', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
3,"('Case 10', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[18, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
4,"('Case 10', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[18, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381,"('Case 997', 4)","[1, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 5, 5, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[6, 6, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,148,0,0,0,0,0,0,0
11382,"('Case 998', 4)","[1, 12, 14, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 7, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
11383,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[7, 7, 11, 11, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
11384,"('Case 999', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0


In [11]:
# Undersampling train set
# The three list-valued columns are declared as the dynamic columns.
u = Undersampling(
    train_data=train_df,
    list_dynamic_cols=['activities', 'resources', 'months'],
    label_strategy='collective',
)

# NOTE: train_df is rebound to the result, so re-running this cell on its own
# feeds the already-undersampled frame back into Undersampling.
train_df, y_no_true_class = u.one_sided_selection_undersampling()

In [12]:
# Display the training frame again, now after undersampling.
train_df

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 4)","[1, 12, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
1,"('Case 1', 5)","[1, 12, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3, 3, 3, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
2,"('Case 10', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
3,"('Case 10', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[18, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
4,"('Case 10', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[18, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381,"('Case 997', 4)","[1, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 5, 5, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[6, 6, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,148,0,0,0,0,0,0,0
11382,"('Case 998', 4)","[1, 12, 14, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 7, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
11383,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[7, 7, 11, 11, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
11384,"('Case 999', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0


In [13]:
# Gather the label columns (names starting with "y_") from the training frame
# and store them under encoders['deviations'], replacing the earlier entry.
y_columns = [name for name in train_df.columns if name.startswith("y_")]
cols = y_columns  # second name bound to the same list object
encoders['deviations'] = y_columns
# Persist the encoding/decoding tables as a .pkl file.
joblib.dump(encoders, "encoders.pkl")

['encoders.pkl']

In [14]:
# Tensor encoding and saving
device = torch.device("cpu")  # device handed to the tensor encoding step
dataset_prep = PrefixDataset(
    # data splits
    df_train=train_df,
    df_val=val_df,
    df_test=test_df,
    # column names
    activity_col='activities',
    resource_col='resources',
    month_col='months',
    trace_cols=['trace_attr_seriousness', 'trace_attr_customer', 'trace_attr_seriousness_2'],
    y_cols=encoders['deviations'],
    label_strategy="collective",
)

# Encode the splits as tensor datasets, then write them to the working directory.
train_set, val_set, test_set = dataset_prep.tensor_datset_encoding(device=device)
dataset_prep.save_datasets(
    train_dataset=train_set,
    val_dataset=val_set,
    test_dataset=test_set,
    save_path=".",
)

('./train_set.pkl', './val_set.pkl', './test_set.pkl')

In [16]:
# Reload the stored datasets from the working directory and print the tensors
# of each split in train / validation / test order.
train_set, val_set, test_set = dataset_prep.load_datasets(".")
for split in (train_set, val_set, test_set):
    print(split.tensors)

(tensor([[ 1, 12, 12,  ...,  0,  0,  0],
        [ 1, 12, 12,  ...,  0,  0,  0],
        [ 1,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 1, 12, 14,  ...,  0,  0,  0],
        [ 1, 12,  0,  ...,  0,  0,  0],
        [ 1, 12, 10,  ...,  0,  0,  0]]), tensor([[ 1,  1, 12,  ...,  0,  0,  0],
        [ 1,  1, 12,  ...,  0,  0,  0],
        [12,  0,  0,  ...,  0,  0,  0],
        ...,
        [22, 12, 22,  ...,  0,  0,  0],
        [ 1,  4,  0,  ...,  0,  0,  0],
        [ 1,  4,  4,  ...,  0,  0,  0]]), tensor([[ 3,  3,  3,  ...,  0,  0,  0],
        [ 3,  3,  3,  ...,  0,  0,  0],
        [18,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 7,  7, 11,  ...,  0,  0,  0],
        [21, 21,  0,  ...,  0,  0,  0],
        [21, 21, 25,  ...,  0,  0,  0]]), tensor([[  0,   0,   0],
        [  0,   0,   0],
        [  0,   1,   0],
        ...,
        [  0, 353,   1],
        [  0, 386,   1],
        [  0, 386,   1]]), tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0