In [None]:
import sys
import os
import joblib
import pandas as pd
import torch

sys.path.insert(0, '..')
sys.path.insert(0, '../..')
sys.path.insert(0, '../../..')
sys.path.insert(0, '../../../..')
sys.path.insert(0, '../../../../..')

from reimplemented_approaches.proactive_conformance_checking.data_prep_split_encode import DeviationLabeling, TrainTestSplit, Undersampling, CleanDatasets, PrefixDataset

In [2]:
# Data loading:
name_event_log = "Helpdesk"
path_event_log = "../../../../../../../../data/data/helpdesk.csv"
path_process_model = "../../../../../../data/process_models/Helpdesk.bpmn"

In [3]:
# Define attributes:
# Load the event log as pandas dataframe. Get all attributes in the log:
df = pd.read_csv(path_event_log)
print(df.columns)
df

Index(['CaseID', 'Activity', 'Resource', 'CompleteTimestamp', 'VariantIndex',
       'seriousness', 'customer', 'product', 'responsible_section',
       'seriousness_2', 'service_level', 'service_type', 'support_section',
       'workgroup'],
      dtype='object')


Unnamed: 0,CaseID,Activity,Resource,CompleteTimestamp,VariantIndex,seriousness,customer,product,responsible_section,seriousness_2,service_level,service_type,support_section,workgroup
0,Case 1,Assign seriousness,Value 1,2012/10/09 14:50:17.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1
1,Case 1,Take in charge ticket,Value 1,2012/10/09 14:51:01.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1,Value 1
2,Case 1,Take in charge ticket,Value 2,2012/10/12 15:02:56.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1
3,Case 1,Resolve ticket,Value 1,2012/10/25 11:54:26.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1
4,Case 1,Closed,Value 3,2012/11/09 12:54:39.000,12,Value 1,Value 1,Value 1,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21343,Case 4579,Closed,Value 5,2010/09/02 10:11:00.000,1,Value 1,Value 71,Value 3,Value 1,Value 1,Value 3,Value 1,Value 1,Value 1
21344,Case 4580,Take in charge ticket,Value 6,2012/01/03 09:33:43.000,18,Value 1,Value 92,Value 3,Value 1,Value 2,Value 2,Value 2,Value 1,Value 1
21345,Case 4580,Wait,Value 6,2012/01/10 15:30:11.000,18,Value 1,Value 92,Value 3,Value 1,Value 2,Value 2,Value 2,Value 1,Value 1
21346,Case 4580,Resolve ticket,Value 6,2012/01/10 17:07:40.000,18,Value 1,Value 92,Value 3,Value 1,Value 2,Value 2,Value 2,Value 1,Value 1


In [4]:
# Data preparation and labelling
dl = DeviationLabeling(log_name=name_event_log,
                       path_event_log=path_event_log,
                       path_process_model=path_process_model)

df_labeled_deviations, encoders = dl.generate_individual_labels(trace_attr=['seriousness', 'customer', 'seriousness_2'],
                                                                conf_runs=1)
df_labeled_deviations

  0%|          | 0/1 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/226 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:12<00:00, 12.29s/it]


Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Insert ticket', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0,0
1,"('Case 1', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0,0
2,"('Case 1', 3)","[1, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0,0
3,"('Case 1', 4)","[1, 12, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0,0
4,"('Case 1', 5)","[1, 12, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 10, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21343,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[11, 11, 12, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0,0
21344,"('Case 999', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0,0
21345,"('Case 999', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0,0
21346,"('Case 999', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0,0


In [5]:
act_ids = encoders['activity_ids']
print("Activity ids: " ,act_ids)
print("\n")

res_ids = encoders['resource_ids']
print("Reource ids: ", res_ids)
print("\n")

deviations = encoders['deviations']
print("Deviations: ", deviations)
print("\n")

Activity ids:  {'Assign seriousness': 1, 'Closed': 2, 'Create SW anomaly': 3, 'DUPLICATE': 4, 'INVALID': 5, 'Insert ticket': 6, 'RESOLVED': 7, 'Require upgrade': 8, 'Resolve SW anomaly': 9, 'Resolve ticket': 10, 'Schedule intervention': 11, 'Take in charge ticket': 12, 'VERIFIED': 13, 'Wait': 14}


Reource ids:  {'Value 1': 1, 'Value 10': 2, 'Value 11': 3, 'Value 12': 4, 'Value 13': 5, 'Value 14': 6, 'Value 15': 7, 'Value 16': 8, 'Value 17': 9, 'Value 18': 10, 'Value 19': 11, 'Value 2': 12, 'Value 20': 13, 'Value 21': 14, 'Value 22': 15, 'Value 3': 16, 'Value 4': 17, 'Value 5': 18, 'Value 6': 19, 'Value 7': 20, 'Value 8': 21, 'Value 9': 22}


Deviations:  ["('>>', 'Assign seriousness')", "('Create SW anomaly', '>>')", "('Insert ticket', '>>')", "('Require upgrade', '>>')", "('Resolve ticket', '>>')", "('Take in charge ticket', '>>')", "('Wait', '>>')"]




In [6]:
# Save prepared dataframe as .csv
os.makedirs(".", exist_ok=True)
csv_path = os.path.join("prefix_deviations.csv")
df_labeled_deviations.to_csv(csv_path, index=False)

# Save encoding/ decoding key as .pkl
joblib.dump(encoders, "encoders.pkl")

['encoders.pkl']

In [7]:
# Train and test split
tts = TrainTestSplit(df_labled_deviations=df_labeled_deviations)
# sets seed
train_df, test_df = tts.data_split()

In [8]:
train_df

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Insert ticket', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0,0
3,"('Case 1', 4)","[1, 12, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0,0
5,"('Case 10', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0,0
6,"('Case 10', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0,0
9,"('Case 100', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,333,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21342,"('Case 998', 4)","[1, 12, 14, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[11, 11, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0,0
21343,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[11, 11, 12, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0,0
21344,"('Case 999', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0,0
21346,"('Case 999', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0,0


In [9]:
# test_df

In [10]:
# Undersampling train set
u = Undersampling(train_df=train_df, 
                  list_dynamic_cols=['activities', 'resources', 'months'])
train_undersampled_df, y_no_true_class = u.one_sided_selection_undersampling()

In [11]:
train_undersampled_df

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
1,"('Case 10', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
2,"('Case 10', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
3,"('Case 100', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,333,0,0,0,1,0,0,0
4,"('Case 100', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,333,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12562,"('Case 998', 4)","[1, 12, 14, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[11, 11, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
12563,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[11, 11, 12, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
12564,"('Case 999', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0
12565,"('Case 999', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0


In [12]:
print(y_no_true_class)

["y_('Insert ticket', '>>')"]


In [13]:
# Cleaning, removing 
c = CleanDatasets(train_undersmpl_df=train_undersampled_df,
                  test_df=test_df,
                  undersmpl_removed_cols=y_no_true_class)

train_undersmpl_clean_df, test_clean_df = c.clean()

In [14]:
train_undersmpl_clean_df

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
0,"('Case 1', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
1,"('Case 10', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
2,"('Case 10', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
3,"('Case 100', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,333,0,0,0,1,0,0,0
4,"('Case 100', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,333,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12562,"('Case 998', 4)","[1, 12, 14, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[11, 11, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
12563,"('Case 998', 5)","[1, 12, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[22, 12, 22, 22, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[11, 11, 12, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,353,1,0,0,0,0,0,0
12564,"('Case 999', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0
12565,"('Case 999', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,386,1,0,0,0,0,0,0


In [15]:
test_clean_df

Unnamed: 0,case_id,activities,resources,months,trace_attr_seriousness,trace_attr_customer,trace_attr_seriousness_2,"y_('>>', 'Assign seriousness')","y_('Create SW anomaly', '>>')","y_('Require upgrade', '>>')","y_('Resolve ticket', '>>')","y_('Take in charge ticket', '>>')","y_('Wait', '>>')"
1,"('Case 1', 2)","[1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
2,"('Case 1', 3)","[1, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0,0,0,0,0
4,"('Case 1', 5)","[1, 12, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 12, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 10, 10, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0,0,0,0,0,0,0,0,0
7,"('Case 10', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
8,"('Case 10', 4)","[1, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 12, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21331,"('Case 996', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,383,1,0,0,0,0,0,0
21333,"('Case 996', 3)","[1, 12, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 19, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,383,1,0,0,0,0,0,0
21334,"('Case 996', 4)","[1, 12, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[21, 19, 19, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[12, 12, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,383,1,0,0,0,0,0,0
21335,"('Case 997', 1)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,148,0,0,0,0,0,0,0


In [16]:
columns = train_undersampled_df.columns
print(columns)

Index(['case_id', 'activities', 'resources', 'months',
       'trace_attr_seriousness', 'trace_attr_customer',
       'trace_attr_seriousness_2', 'y_('>>', 'Assign seriousness')',
       'y_('Create SW anomaly', '>>')', 'y_('Require upgrade', '>>')',
       'y_('Resolve ticket', '>>')', 'y_('Take in charge ticket', '>>')',
       'y_('Wait', '>>')'],
      dtype='object')


In [1]:
# Tensor encoding and saving — quick literal fix
train_prep = PrefixDataset(
    df=train_undersampled_df,
    activity_col='activities',
    resource_col='resources',
    month_col='months',
    trace_cols=['trace_attr_seriousness', 'trace_attr_customer', 'trace_attr_seriousness_2'],
    y_cols=["y_('Create SW anomaly', '>>')",
            "y_('Require upgrade', '>>')",
            "y_('Resolve ticket', '>>')",
            "y_('Take in charge ticket', '>>')",
            "y_('Wait', '>>')"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = train_prep.to_tensor_dataset(device=device)

NameError: name 'PrefixDataset' is not defined