In [1]:
!pip3 install torch torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.5.1-py3-none-any.whl (890 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.6/890.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.8 torchmetrics-1.5.1


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import TensorDataset, random_split, DataLoader
import torch.optim as optim

In [3]:
from preprocessing import sort_log, debiasing, create_time_features, mapping_case_id, add_soc_eoc
from preprocessing import train_mapping_event_name, test_mapping_event_name, train_standardize, test_standardize
from train_test_split import create_table_without_discard_case, get_train_test_split_point
from create_prefix_suffix import create_log_prefix_tensor, create_trace_prefix_tensor, create_trace_suffix_tensor
from create_model import Encoder, Decoder, Seq2Seq_one_input, Seq2Seq_cat, Seq2Seq_add, Seq2Seq_mul, normalized_DL_distance

# Create training dataloader pipeline
The steps in this section mirror the steps in the function *create_train_valid_dataloader* from *dataloader_pipeline*

The steps are executed separately to allow for inspection of intermediate results.

In [4]:
# Define parameters
csv_path = 'BPIC2017_6000cases.csv'
end_date = '2017-01'
max_duration = 47.81
test_ratio = 0.3
validation_ratio = 0.2
log_prefix_length = 30
trace_prefix_length = 15
trace_suffix_length = 35
num_act = 28 # number of activity labels in training set: 24

In [5]:
set_name = 'train'
log_col_name = ['concept:name', 'log_ts_pre']
trace_prefix_col_name = ['concept:name', 'trace_ts_start', 'trace_ts_pre']
trace_suffix_col_name = ['concept:name', 'trace_ts_pre']
categorical_features = ['concept:name']
continuous_features = ['log_ts_pre', 'trace_ts_pre', 'trace_ts_start']
case_id = 'case:concept:name'
timestamp = 'time:timestamp'
event_name = 'concept:name'
event_idx = 'event_idx'

In [6]:
# 1. Tranform csv to dataframe
df = pd.read_csv(csv_path)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230225 entries, 0 to 230224
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Unnamed: 0         230225 non-null  int64 
 1   case:concept:name  230225 non-null  object
 2   time:timestamp     230225 non-null  object
 3   concept:name       230225 non-null  object
dtypes: int64(1), object(3)
memory usage: 7.0+ MB
None


In [7]:
print("Number of cases:", df[case_id].nunique())
print("Number of activity labels:", df[event_name].nunique())

Number of cases: 6000
Number of activity labels: 25


In [8]:
# 2. Sort dataframe by timestamp
df = sort_log(df,
              timestamp)

In [9]:
# 3. Debiasing and cleaning
df = debiasing(df, end_date, max_duration,
              case_id, timestamp)
print(df.info())
print("Number of retaining cases:", df[case_id].nunique())

  case_stops_df['date'] = case_stops_df[timestamp].dt.to_period('M')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216107 entries, 0 to 216106
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         216107 non-null  int64              
 1   case:concept:name  216107 non-null  object             
 2   time:timestamp     216107 non-null  datetime64[ns, UTC]
 3   concept:name       216107 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 6.6+ MB
None
Number of retaining cases: 5711


In [10]:
# 4. Get rid of discard case
df_no_discard = create_table_without_discard_case(df, test_ratio,
                                                  case_id, timestamp)
print(df_no_discard.info())
print("Number of remaining cases:", df_no_discard[case_id].nunique())
print("Number of activity labels:", df_no_discard[event_name].nunique())

<class 'pandas.core.frame.DataFrame'>
Index: 158994 entries, 0 to 216106
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         158994 non-null  int64              
 1   case:concept:name  158994 non-null  object             
 2   time:timestamp     158994 non-null  datetime64[ns, UTC]
 3   concept:name       158994 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 6.1+ MB
None
Number of remaining cases: 4123
Number of activity labels: 24


In [11]:
# 5. Subset: retain dataframe only before training / test split.
train_test_split_time, train_test_split_idx = get_train_test_split_point(df, test_ratio,
                                                      case_id, timestamp)
print(train_test_split_time)
print(train_test_split_idx)

2016-02-24 16:44:54.829000+00:00
130497


In [12]:
training_df = df_no_discard[df_no_discard[timestamp] < train_test_split_time]
print(training_df.info())
print("Number of cases in training set:", training_df[case_id].nunique())
print("Number of activity labels:", training_df[event_name].nunique())

<class 'pandas.core.frame.DataFrame'>
Index: 95857 entries, 0 to 130450
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   Unnamed: 0         95857 non-null  int64              
 1   case:concept:name  95857 non-null  object             
 2   time:timestamp     95857 non-null  datetime64[ns, UTC]
 3   concept:name       95857 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 3.7+ MB
None
Number of cases in training set: 2409
Number of activity labels: 24


In [13]:
# 6. Create time features
training_df = create_time_features(training_df, case_id,timestamp)
print(training_df.info())
print(training_df.head(20))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95857 entries, 0 to 95856
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   Unnamed: 0         95857 non-null  int64              
 1   case:concept:name  95857 non-null  object             
 2   time:timestamp     95857 non-null  datetime64[ns, UTC]
 3   concept:name       95857 non-null  object             
 4   log_ts_pre         95857 non-null  float64            
 5   trace_ts_pre       95857 non-null  float64            
 6   trace_ts_start     95857 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(3), int64(1), object(2)
memory usage: 5.1+ MB
None
    Unnamed: 0       case:concept:name                   time:timestamp  \
0            0   Application_652823628 2016-01-01 09:51:15.304000+00:00   
1            1   Application_652823628 2016-01-01 09:51:15.352000+00:00   
2            2   Application_6528236

In [14]:
# 7. Standardize time features
training_df, mean_dict, std_dict = train_standardize(training_df,
                                                     continuous_features)
print(training_df.head(20))
print(mean_dict)
print(std_dict)

    Unnamed: 0       case:concept:name                   time:timestamp  \
0            0   Application_652823628 2016-01-01 09:51:15.304000+00:00   
1            1   Application_652823628 2016-01-01 09:51:15.352000+00:00   
2            2   Application_652823628 2016-01-01 09:51:15.774000+00:00   
3            3   Application_652823628 2016-01-01 09:52:36.392000+00:00   
4            4   Application_652823628 2016-01-01 09:52:36.403000+00:00   
5            5   Application_652823628 2016-01-01 09:52:36.413000+00:00   
6            6  Application_1691306052 2016-01-01 10:16:11.500000+00:00   
7            7  Application_1691306052 2016-01-01 10:16:11.549000+00:00   
8            8  Application_1691306052 2016-01-01 10:16:11.740000+00:00   
9            9  Application_1691306052 2016-01-01 10:17:31.573000+00:00   
10          10  Application_1691306052 2016-01-01 10:17:31.584000+00:00   
11          11  Application_1691306052 2016-01-01 10:17:31.594000+00:00   
12          12   Applicat

In [15]:
# 8. Map case ID to numbers
training_df, case_id_dict= mapping_case_id(training_df,
                                     case_id)
print(len(case_id_dict))
print(training_df.head(20))

2409
    Unnamed: 0  case:concept:name                   time:timestamp  \
0            0                  1 2016-01-01 09:51:15.304000+00:00   
1            1                  1 2016-01-01 09:51:15.352000+00:00   
2            2                  1 2016-01-01 09:51:15.774000+00:00   
3            3                  1 2016-01-01 09:52:36.392000+00:00   
4            4                  1 2016-01-01 09:52:36.403000+00:00   
5            5                  1 2016-01-01 09:52:36.413000+00:00   
6            6                  2 2016-01-01 10:16:11.500000+00:00   
7            7                  2 2016-01-01 10:16:11.549000+00:00   
8            8                  2 2016-01-01 10:16:11.740000+00:00   
9            9                  2 2016-01-01 10:17:31.573000+00:00   
10          10                  2 2016-01-01 10:17:31.584000+00:00   
11          11                  2 2016-01-01 10:17:31.594000+00:00   
12          12                  3 2016-01-01 11:19:38.177000+00:00   
13          13 

In [16]:
# 9. Insert SOC and EOC rows
training_df = add_soc_eoc(training_df,
                          case_id, timestamp, event_name)
print(training_df.info())
print(training_df.head(20))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100675 entries, 0 to 100674
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         100675 non-null  int64              
 1   case:concept:name  100675 non-null  int64              
 2   time:timestamp     100675 non-null  datetime64[ns, UTC]
 3   concept:name       100675 non-null  object             
 4   log_ts_pre         100675 non-null  float64            
 5   trace_ts_pre       100675 non-null  float64            
 6   trace_ts_start     100675 non-null  float64            
 7   event_idx          100675 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(3), int64(3), object(1)
memory usage: 6.1+ MB
None
    Unnamed: 0  case:concept:name                   time:timestamp  \
0            0                  1 2016-01-01 09:51:15.304000+00:00   
1            0                  1 2016-01-01 09:5

In [17]:
# 10. Mapping event name to numbers
training_df, train_event_name_dict = train_mapping_event_name(training_df,
                                                                  event_name)
print(training_df.head(20))
print(train_event_name_dict)

    Unnamed: 0  case:concept:name                   time:timestamp  \
0            0                  1 2016-01-01 09:51:15.304000+00:00   
1            0                  1 2016-01-01 09:51:15.304000+00:00   
2            1                  1 2016-01-01 09:51:15.352000+00:00   
3            2                  1 2016-01-01 09:51:15.774000+00:00   
4            3                  1 2016-01-01 09:52:36.392000+00:00   
5            4                  1 2016-01-01 09:52:36.403000+00:00   
6            5                  1 2016-01-01 09:52:36.413000+00:00   
7            6                  2 2016-01-01 10:16:11.500000+00:00   
8            6                  2 2016-01-01 10:16:11.500000+00:00   
9            7                  2 2016-01-01 10:16:11.549000+00:00   
10           8                  2 2016-01-01 10:16:11.740000+00:00   
11           9                  2 2016-01-01 10:17:31.573000+00:00   
12          10                  2 2016-01-01 10:17:31.584000+00:00   
13          11      

In [18]:
# Set printed tensor format
torch.set_printoptions(sci_mode=False, precision=2)

Shape of train_log_prefix_tensor

98266 - Number of events in training set used for creating log prefix (i.e. number of events excluding EOC)

30 - Length of log prefix

29 - dimension of one-hot encoding (i.e. num_act, which is 28) + number of time features (which is 1 for log prefix)




In [19]:
# 11. Create train_log_prefix_tensor, train_trace_prefix_tensor, train_suffix_act_tensor, train_suffix_time_tensor
train_log_prefix_tensor = create_log_prefix_tensor(training_df, log_prefix_length, set_name, test_ratio, num_act,
                                                       log_col_name, categorical_features, case_id, timestamp, event_name)
print(train_log_prefix_tensor.shape)
print(train_log_prefix_tensor[:5])

torch.Size([98266, 30, 29])
tensor([[[     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         ...,
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [     0.00,      0.00,      1.00,  ...,      0.00,      0.00,
              -0.07]],

        [[     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         ...,
         [     0.00,      0.00,      0.00,  ...,      0.00,      0.00,
          -10000.00],
         [  

Shape of train_trace_prefix_tensor

98266 - Number of events in training set used for creating trace prefix (i.e. number of events excluding EOC)

15 - Length of trace prefix

30 - dimension of one-hot encoding (i.e. num_act, which is 28) + number of time features (which is 2 for trace prefix)

In [20]:
train_trace_prefix_tensor = create_trace_prefix_tensor(training_df, trace_prefix_length, set_name, test_ratio, num_act,
                                                           trace_prefix_col_name, categorical_features, event_idx, case_id, timestamp, event_name)
print(train_trace_prefix_tensor.shape)
print(train_trace_prefix_tensor[:5])

torch.Size([98266, 15, 30])
tensor([[[     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         ...,
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      1.00,  ...,      0.00,     -0.90,
              -0.21]],

        [[     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         ...,
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [  

Shape of train_suffix_act_tensor

98266 - Number of events in training set used for creating suffix (i.e. number of events excluding EOC)

35 - Length of suffix

In [21]:
train_suffix_act_tensor, train_suffix_time_tensor = create_trace_suffix_tensor(training_df, trace_suffix_length, set_name, test_ratio,
                                                                                   trace_suffix_col_name, categorical_features, event_idx, case_id, timestamp, event_name)
print(train_suffix_act_tensor.shape)
print(train_suffix_act_tensor[:5])


torch.Size([98266, 35])
tensor([[ 4,  5,  6,  6,  7,  8,  7,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13,
         13, 13, 16, 16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16],
        [ 5,  6,  6,  7,  8,  7,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13,
         13, 16, 16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17],
        [ 6,  6,  7,  8,  7,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13, 13,
         16, 16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17, 16],
        [ 6,  7,  8,  7,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13, 13, 16,
         16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17, 16, 23],
        [ 7,  8,  7,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13, 13, 16, 16,
         17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17, 16, 23, 24]])


In [22]:
print(train_suffix_time_tensor.shape)
print(train_suffix_time_tensor[:5])

torch.Size([98266, 35])
tensor([[-0.21, -0.21, -0.21, -0.21, -0.21, -0.21,  0.35, -0.21, -0.20, -0.21,
         -0.21, -0.21, -0.21, -0.21, -0.21, -0.21, -0.21,  1.93, -0.21,  3.71,
         -0.21, -0.21, -0.21, -0.21, -0.21,  0.24, -0.21, -0.21, -0.21, -0.21,
         -0.16, -0.21, -0.16, -0.21, -0.21],
        [-0.21, -0.21, -0.21, -0.21, -0.21,  0.35, -0.21, -0.20, -0.21, -0.21,
         -0.21, -0.21, -0.21, -0.21, -0.21, -0.21,  1.93, -0.21,  3.71, -0.21,
         -0.21, -0.21, -0.21, -0.21,  0.24, -0.21, -0.21, -0.21, -0.21, -0.16,
         -0.21, -0.16, -0.21, -0.21, -0.21],
        [-0.21, -0.21, -0.21, -0.21,  0.35, -0.21, -0.20, -0.21, -0.21, -0.21,
         -0.21, -0.21, -0.21, -0.21, -0.21,  1.93, -0.21,  3.71, -0.21, -0.21,
         -0.21, -0.21, -0.21,  0.24, -0.21, -0.21, -0.21, -0.21, -0.16, -0.21,
         -0.16, -0.21, -0.21, -0.21, -0.21],
        [-0.21, -0.21, -0.21,  0.35, -0.21, -0.20, -0.21, -0.21, -0.21, -0.21,
         -0.21, -0.21, -0.21, -0.21,  1.93, -0.21, 

In [23]:
# 12. Build TensorDataset
train_dataset = TensorDataset(train_log_prefix_tensor, train_trace_prefix_tensor, train_suffix_act_tensor, train_suffix_time_tensor)

In [24]:
# 13. Split TensorDataset into train_dataset, valid_dataset
train_dataset, valid_dataset = random_split(train_dataset, [1-validation_ratio, validation_ratio])
print(len(train_dataset))
print(len(valid_dataset))

78613
19653


In [25]:
# 14. Build train_dataloader, valid_dataloader
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=True)

# Create test dataloader pipeline
The steps in this section mirror the steps in the function *create_test_dataloader* from *dataloader_pipeline*

The steps are executed separately to allow for inspection of intermediate results.

In [26]:
# Define parameters
csv_path = 'BPIC2017_6000cases.csv'
end_date = '2017-01'
max_duration = 47.81
test_ratio = 0.3
log_prefix_length = 30
trace_prefix_length = 15
trace_suffix_length = 35
num_act = 28 # number of activity labels in training set: 24

In [27]:
set_name = 'test'
log_col_name = ['concept:name', 'log_ts_pre']
trace_prefix_col_name = ['concept:name', 'trace_ts_start', 'trace_ts_pre']
trace_suffix_col_name = ['concept:name', 'trace_ts_pre']
categorical_features = ['concept:name']
continuous_features = ['log_ts_pre', 'trace_ts_pre', 'trace_ts_start']
case_id = 'case:concept:name'
timestamp = 'time:timestamp'
event_name = 'concept:name'
event_idx = 'event_idx'

In [28]:
train_event_name_dict = train_event_name_dict
mean_dict = mean_dict
std_dict = std_dict

In [29]:
# 1. Tranform csv to dataframe
df = pd.read_csv(csv_path)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230225 entries, 0 to 230224
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Unnamed: 0         230225 non-null  int64 
 1   case:concept:name  230225 non-null  object
 2   time:timestamp     230225 non-null  object
 3   concept:name       230225 non-null  object
dtypes: int64(1), object(3)
memory usage: 7.0+ MB
None


In [30]:
# 2. Sort dataframe by timestamp
df = sort_log(df,
              timestamp)

In [31]:
# 3. Debiasing and cleaning
df = debiasing(df,end_date, max_duration,
                   case_id, timestamp)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216107 entries, 0 to 216106
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         216107 non-null  int64              
 1   case:concept:name  216107 non-null  object             
 2   time:timestamp     216107 non-null  datetime64[ns, UTC]
 3   concept:name       216107 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 6.6+ MB
None


  case_stops_df['date'] = case_stops_df[timestamp].dt.to_period('M')


In [32]:
# 4. Create time features
df = create_time_features(df,
                          case_id,timestamp)
print(df.info())
print(df.head(20))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216107 entries, 0 to 216106
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         216107 non-null  int64              
 1   case:concept:name  216107 non-null  object             
 2   time:timestamp     216107 non-null  datetime64[ns, UTC]
 3   concept:name       216107 non-null  object             
 4   log_ts_pre         216107 non-null  float64            
 5   trace_ts_pre       216107 non-null  float64            
 6   trace_ts_start     216107 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(3), int64(1), object(2)
memory usage: 11.5+ MB
None
    Unnamed: 0       case:concept:name                   time:timestamp  \
0            0   Application_652823628 2016-01-01 09:51:15.304000+00:00   
1            1   Application_652823628 2016-01-01 09:51:15.352000+00:00   
2            2   Applica

In [33]:
# 5. Standardize time features
df = test_standardize(df,
                      mean_dict, std_dict, continuous_features)
print(df.head(20))

    Unnamed: 0       case:concept:name                   time:timestamp  \
0            0   Application_652823628 2016-01-01 09:51:15.304000+00:00   
1            1   Application_652823628 2016-01-01 09:51:15.352000+00:00   
2            2   Application_652823628 2016-01-01 09:51:15.774000+00:00   
3            3   Application_652823628 2016-01-01 09:52:36.392000+00:00   
4            4   Application_652823628 2016-01-01 09:52:36.403000+00:00   
5            5   Application_652823628 2016-01-01 09:52:36.413000+00:00   
6            6  Application_1691306052 2016-01-01 10:16:11.500000+00:00   
7            7  Application_1691306052 2016-01-01 10:16:11.549000+00:00   
8            8  Application_1691306052 2016-01-01 10:16:11.740000+00:00   
9            9  Application_1691306052 2016-01-01 10:17:31.573000+00:00   
10          10  Application_1691306052 2016-01-01 10:17:31.584000+00:00   
11          11  Application_1691306052 2016-01-01 10:17:31.594000+00:00   
12          12   Applicat

In [34]:
# 6. Map case ID to numbers
df, case_id_dict = mapping_case_id(df,
                        case_id)
print(len(case_id_dict))
print(df.head(20))

5711
    Unnamed: 0  case:concept:name                   time:timestamp  \
0            0                  1 2016-01-01 09:51:15.304000+00:00   
1            1                  1 2016-01-01 09:51:15.352000+00:00   
2            2                  1 2016-01-01 09:51:15.774000+00:00   
3            3                  1 2016-01-01 09:52:36.392000+00:00   
4            4                  1 2016-01-01 09:52:36.403000+00:00   
5            5                  1 2016-01-01 09:52:36.413000+00:00   
6            6                  2 2016-01-01 10:16:11.500000+00:00   
7            7                  2 2016-01-01 10:16:11.549000+00:00   
8            8                  2 2016-01-01 10:16:11.740000+00:00   
9            9                  2 2016-01-01 10:17:31.573000+00:00   
10          10                  2 2016-01-01 10:17:31.584000+00:00   
11          11                  2 2016-01-01 10:17:31.594000+00:00   
12          12                  3 2016-01-01 11:19:38.177000+00:00   
13          13 

In [35]:
# 7. Insert SOC and EOC rows
df = add_soc_eoc(df,
                  case_id, timestamp, event_name)
print(df.info())
print(df.head(20))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227529 entries, 0 to 227528
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   Unnamed: 0         227529 non-null  int64              
 1   case:concept:name  227529 non-null  int64              
 2   time:timestamp     227529 non-null  datetime64[ns, UTC]
 3   concept:name       227529 non-null  object             
 4   log_ts_pre         227529 non-null  float64            
 5   trace_ts_pre       227529 non-null  float64            
 6   trace_ts_start     227529 non-null  float64            
 7   event_idx          227529 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(3), int64(3), object(1)
memory usage: 13.9+ MB
None
    Unnamed: 0  case:concept:name                   time:timestamp  \
0            0                  1 2016-01-01 09:51:15.304000+00:00   
1            0                  1 2016-01-01 09:

In [36]:
# 8. Mapping event name to numbers
df, test_event_name_dict = test_mapping_event_name(df, train_event_name_dict,
                                                  event_name)
print(df.head(20))
print(test_event_name_dict)

    Unnamed: 0  case:concept:name                   time:timestamp  \
0            0                  1 2016-01-01 09:51:15.304000+00:00   
1            0                  1 2016-01-01 09:51:15.304000+00:00   
2            1                  1 2016-01-01 09:51:15.352000+00:00   
3            2                  1 2016-01-01 09:51:15.774000+00:00   
4            3                  1 2016-01-01 09:52:36.392000+00:00   
5            4                  1 2016-01-01 09:52:36.403000+00:00   
6            5                  1 2016-01-01 09:52:36.413000+00:00   
7            6                  2 2016-01-01 10:16:11.500000+00:00   
8            6                  2 2016-01-01 10:16:11.500000+00:00   
9            7                  2 2016-01-01 10:16:11.549000+00:00   
10           8                  2 2016-01-01 10:16:11.740000+00:00   
11           9                  2 2016-01-01 10:17:31.573000+00:00   
12          10                  2 2016-01-01 10:17:31.584000+00:00   
13          11      

In [37]:
_, split_idx = get_train_test_split_point(df, test_ratio,
                                                case_id=case_id, timestamp=timestamp)
print(split_idx)
# print the table near the splitting point
print(df[split_idx-20:split_idx]) # 20 rows above the splitting point
print(df[split_idx:split_idx+20]) # 20 rows below the splitting point

136903
        Unnamed: 0  case:concept:name                   time:timestamp  \
136883      136270               3996 2016-02-24 16:32:43.087000+00:00   
136884      136271               3996 2016-02-24 16:32:53.130000+00:00   
136885      136272               3996 2016-02-24 16:32:53.146000+00:00   
136886      136273               3996 2016-02-24 16:32:53.153000+00:00   
136887      136274               3996 2016-02-24 16:32:53.156000+00:00   
136888      136275               3996 2016-02-24 16:32:53.158000+00:00   
136889      136276               3997 2016-02-24 16:34:50.220000+00:00   
136890      136276               3997 2016-02-24 16:34:50.220000+00:00   
136891      136277               3997 2016-02-24 16:34:50.229000+00:00   
136892      136278               3997 2016-02-24 16:34:50.234000+00:00   
136893      136279               3997 2016-02-24 16:34:50.237000+00:00   
136894      136280               3996 2016-02-24 16:36:14.290000+00:00   
136895      136281             

Shape of test_log_prefix_tensor

87324 - Number of events in test set used for creating log prefix (i.e. number of events excluding EOC)

30 - Length of log prefix

29 - dimension of one-hot encoding (i.e. num_act, which is 28) + number of time features (which is 1 for log prefix)

In [38]:
# 9. Create test_log_prefix_tensor, test_trace_prefix_tensor, test_suffix_act_tensor, test_suffix_time_tensor
test_log_prefix_tensor = create_log_prefix_tensor(df, log_prefix_length, set_name, test_ratio, num_act,
                                                    log_col_name, categorical_features, case_id, timestamp, event_name)
print(test_log_prefix_tensor.shape)
print(test_log_prefix_tensor[:5])

torch.Size([87324, 30, 29])
tensor([[[ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.04],
         ...,
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         [ 0.00,  0.00,  1.00,  ...,  0.00,  0.00,  0.36]],

        [[ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.04],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00,  0.07],
         ...,
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         [ 0.00,  0.00,  1.00,  ...,  0.00,  0.00,  0.36],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00,  0.36]],

        [[ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.04],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00,  0.07],
         [ 0.00,  0.00,  0.00,  ...,  0.00,  0.00, -0.07],
         ...,
         [ 0.00,  0.00,  1.00,  ...,  0.0

Shape of test_trace_prefix_tensor

87324 - Number of events in test set used for creating trace prefix (i.e. number of events excluding EOC)

15 - Length of trace prefix

30 - dimension of one-hot encoding (i.e. num_act, which is 28) + number of time features (which is 2 for trace prefix)

In [39]:
test_trace_prefix_tensor = create_trace_prefix_tensor(df, trace_prefix_length, set_name, test_ratio, num_act,
                                                      trace_prefix_col_name, categorical_features, event_idx, case_id, timestamp, event_name)
print(test_trace_prefix_tensor.shape)
print(test_trace_prefix_tensor[:5])

torch.Size([87324, 15, 30])
tensor([[[     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         ...,
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      1.00,  ...,      0.00,     -0.90,
              -0.21]],

        [[     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         ...,
         [     0.00,      0.00,      0.00,  ...,      0.00, -10000.00,
          -10000.00],
         [  

Shape of test_suffix_act_tensor

87324 - Number of events in test set used for creating suffix (i.e. number of events excluding EOC)

35 - Length of suffix

In [40]:
test_suffix_act_tensor, test_suffix_time_tensor = create_trace_suffix_tensor(df, trace_suffix_length, set_name, test_ratio,
                                                                                trace_suffix_col_name, categorical_features, event_idx, case_id, timestamp, event_name)
print(test_suffix_act_tensor.shape)
print(test_suffix_act_tensor[:5])

torch.Size([87324, 35])
tensor([[ 4,  5,  6,  6,  7,  8,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13,
         13, 16, 16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17],
        [ 5,  6,  6,  7,  8,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13, 13,
         16, 16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17, 16],
        [ 6,  6,  7,  8,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13, 13, 16,
         16, 17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17, 16, 23],
        [ 6,  7,  8,  7,  9, 10, 11, 12,  7, 13, 13, 14, 13, 13, 13, 13, 16, 16,
         17, 18, 16, 16, 20, 20, 21, 20, 20, 20, 20, 16, 16, 17, 16, 23, 24],
        [13, 13, 22, 15, 13,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])


In [41]:
print(test_suffix_time_tensor.shape)
print(test_suffix_time_tensor[:5])

torch.Size([87324, 35])
tensor([[    -0.21,     -0.21,     -0.21,     -0.21,     -0.21,     -0.21,
             -0.19,     -0.21,     -0.21,     -0.21,     -0.21,     -0.21,
             -0.21,     -0.21,     -0.21,     -0.21,      2.37,     -0.21,
              8.52,     -0.21,     -0.21,     -0.21,     -0.21,     -0.21,
             -0.10,     -0.21,     -0.21,     -0.21,     -0.21,     -0.17,
             -0.21,      0.14,     -0.21,     -0.21,     -0.21],
        [    -0.21,     -0.21,     -0.21,     -0.21,     -0.21,     -0.19,
             -0.21,     -0.21,     -0.21,     -0.21,     -0.21,     -0.21,
             -0.21,     -0.21,     -0.21,      2.37,     -0.21,      8.52,
             -0.21,     -0.21,     -0.21,     -0.21,     -0.21,     -0.10,
             -0.21,     -0.21,     -0.21,     -0.21,     -0.17,     -0.21,
              0.14,     -0.21,     -0.21,     -0.21,     -0.21],
        [    -0.21,     -0.21,     -0.21,     -0.21,     -0.19,     -0.21,
             -0.21,  

In [42]:
# 10. Build TensorDataset
test_dataset = TensorDataset(test_log_prefix_tensor, test_trace_prefix_tensor, test_suffix_act_tensor, test_suffix_time_tensor)
print(len(test_dataset))

87324


In [43]:
# 11. Build test_dataloader, valid_dataloader
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

# Run experiment

In [44]:
# Define parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_act = 28

learning_rate = 0.01

num_epochs = 10

num_layers = 2

enc_hidden_size = 50
log_enc_input_size = num_act + 1
trace_enc_input_size = num_act + 2

dec_hidden_size = 50 # For seq2seq_one_input, seq2seq_add, seq2seq_mul
dec_cat_hidden_size = 100 # For seq2seq_cat

act_dec_input_size = num_act
act_dec_output_size = num_act

time_dec_input_size = 1
time_dec_output_size = 1

In [45]:
# Define encoder and decoder
log_encoder = Encoder(input_size=log_enc_input_size,
                      hidden_size=enc_hidden_size,
                      num_layers=num_layers).to(device)

trace_encoder = Encoder(input_size=trace_enc_input_size,
                        hidden_size=enc_hidden_size,
                        num_layers=num_layers).to(device)

act_decoder = Decoder(input_size=act_dec_input_size,
                      hidden_size=dec_hidden_size,
                      output_size=act_dec_output_size,
                      num_layers=num_layers).to(device)

time_decoder = Decoder(input_size=time_dec_input_size,
                      hidden_size=dec_hidden_size,
                      output_size=time_dec_output_size,
                      num_layers=num_layers).to(device)

act_cat_decoder = Decoder(input_size=act_dec_input_size,
                      hidden_size=dec_cat_hidden_size,
                      output_size=act_dec_output_size,
                      num_layers=num_layers).to(device)

time_cat_decoder = Decoder(input_size=time_dec_input_size,
                      hidden_size=dec_cat_hidden_size,
                      output_size=time_dec_output_size,
                      num_layers=num_layers).to(device)

In [46]:
# Define training loop
def training(model, dataloader, teacher_forcing_ratio=0.5):

    model.train()

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Define loss function
    act_criterion = nn.CrossEntropyLoss(ignore_index=0) # target value 0 is ignored and does not contribute to the input gradient
    time_criterion = nn.L1Loss()

    for epoch in range(num_epochs):

        epoch_loss = 0

        for batch in dataloader:

            train_log_prefix, train_trace_prefix, train_suffix_act, train_suffix_time = batch
            train_log_prefix = train_log_prefix.float().to(device)
            # train_log_prefix shape: (batch_size, log_prefix_len, num_act + 1)
            train_trace_prefix = train_trace_prefix.float().to(device)
            # train_trace_prefix shape: (batch_size, trace_prefix_len, num_act + 2)
            train_suffix_act = train_suffix_act.to(torch.long).to(device)
            # train_suffix_act shape: (batch_size, suffix_len)
            train_suffix_time = train_suffix_time.float().to(device)
            # train_suffi_time shape: (batch_size, suffix_len)

            optimizer.zero_grad()

            if model == model_log:
                act_predictions, time_predictions = model(train_log_prefix, train_suffix_act, train_suffix_time, teacher_forcing_ratio)

            elif model == model_trace:
                act_predictions, time_predictions = model(train_trace_prefix, train_suffix_act, train_suffix_time, teacher_forcing_ratio)

            else:
                act_predictions, time_predictions = model(train_log_prefix, train_trace_prefix, train_suffix_act, train_suffix_time, teacher_forcing_ratio)

            act_predictions = act_predictions.to(device)
            # act_predictions shape: (batch_size, suffix_len, num_act)
            time_predictions = time_predictions.to(device)
            # time_predictions shape: (batch_size, suffix_len, 1)

            # Mask padding (-10000) in the timestamp suffix so that they do not contribute to the input gradient
            train_suffix_time = train_suffix_time.unsqueeze(-1) # To match the dimension of time_predictions
            # train_suffix_time shape: (batch_size, suffix_len, 1)
            mask = (train_suffix_time != -10000).to(device)
            masked_train_suffix_time = torch.masked_select(train_suffix_time, mask) # The result is a 1D tensor
            masked_time_predictions = torch.masked_select(time_predictions, mask) # The result is a 1D tensor

            # nn.CrossEntropyLoss requires input shape (batch_size, num_act), target shape (batch_size)
            act_predictions = act_predictions.view(-1, act_predictions.size(-1))
            # act_predictions shape: (batch_size * seq_length, num_act)
            train_suffix_act = train_suffix_act.view(-1)
            # train_suffix_act shape: (batch_size * seq_length)

            act_loss = act_criterion(act_predictions, train_suffix_act)
            time_loss = time_criterion(masked_time_predictions, masked_train_suffix_time)

            loss = 0.5 * act_loss + 0.5 * time_loss

            loss.backward()

            optimizer.step()

            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(dataloader)

        if (epoch + 1) % 5 == 0:
          print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss:.4f}")

In [61]:
def evaluation(model, dataloader, teacher_forcing_ratio=0):

    model.eval()
    epoch_act_loss = 0
    epoch_time_loss = 0

    time_criterion = nn.L1Loss()

    with torch.no_grad():

        for batch in dataloader:

            test_log_prefix, test_trace_prefix, test_suffix_act, test_suffix_time = batch
            test_log_prefix = test_log_prefix.float().to(device)
            # test_log_prefix shape: (batch_size, log_prefix_len, num_act + 1)
            test_trace_prefix = test_trace_prefix.float().to(device)
            # test_trace_prefix shape: (batch_size, trace_prefix_len, num_act + 2)
            test_suffix_act = test_suffix_act.to(torch.long).to(device)
            # test_suffix_act: (batch_size, suffix_len)
            test_suffix_time = test_suffix_time.float().to(device)
            # test_suffi_time: (batch_size, suffix_len)

            if model == model_log:
                act_predictions, time_predictions = model(test_log_prefix, test_suffix_act, test_suffix_time, teacher_forcing_ratio)

            elif model == model_trace:
                act_predictions, time_predictions = model(test_trace_prefix, test_suffix_act, test_suffix_time, teacher_forcing_ratio)

            else:
                act_predictions, time_predictions = model(test_log_prefix, test_trace_prefix, test_suffix_act, test_suffix_time, teacher_forcing_ratio)

            act_predictions = act_predictions.to(device)
            # act_predictions shape: (batch_size, suffix_len, num_act)
            time_predictions = time_predictions.to(device)
            # time_predictions shape: (batch_size, suffix_len, 1)

            # Mask padding (-10000) in the timestamp suffix so that they do not contribute to the loss
            test_suffix_time = test_suffix_time.unsqueeze(-1)
            # train_suffix_time shape: batch_size, suffix_len, 1
            mask = (test_suffix_time != -10000).to(device)
            masked_test_suffix_time = torch.masked_select(test_suffix_time, mask) # The result is a 1D tensor
            masked_time_predictions = torch.masked_select(time_predictions, mask) # The result is a 1D tensor

            act_loss = normalized_DL_distance(act_predictions, test_suffix_act)
            time_loss = time_criterion(masked_time_predictions, masked_test_suffix_time)

            epoch_act_loss += act_loss
            epoch_time_loss += time_loss.item()

        avg_act_loss = epoch_act_loss / len(dataloader)
        avg_time_loss = epoch_time_loss / len(dataloader)

        return avg_act_loss, avg_time_loss

In [48]:
model_log = Seq2Seq_one_input(num_act=num_act,
                              encoder=log_encoder,
                              act_decoder=act_decoder,
                              time_decoder=time_decoder).to(device)

In [49]:
training(model_log, train_dataloader)

Epoch 5/10, Average Loss: 0.4898
Epoch 10/10, Average Loss: 0.4718


In [50]:
torch.save(model_log.state_dict(), 'model_log_checkpoint.pth')

In [51]:
model_log.load_state_dict(torch.load('model_log_checkpoint.pth'))

  model_log.load_state_dict(torch.load('model_log_checkpoint.pth'))


<All keys matched successfully>

In [62]:
act_loss, time_loss = evaluation(model_log, test_dataloader)
print("DL distance for activity label suffix prediction:", act_loss)
print("MAE for timestamp suffix prediction:", time_loss)

DL distance for activity label suffix prediction: 0.6711827658578029
MAE for timestamp suffix prediction: 0.31808089867179645


In [63]:
model_trace = Seq2Seq_one_input(num_act=num_act,
                                encoder=trace_encoder,
                                act_decoder=act_decoder,
                                time_decoder=time_decoder).to(device)

In [64]:
training(model_trace, train_dataloader)

Epoch 5/10, Average Loss: 0.4367
Epoch 10/10, Average Loss: 0.4317


In [65]:
torch.save(model_trace.state_dict(), 'model_trace_checkpoint.pth')

In [66]:
model_trace.load_state_dict(torch.load('model_trace_checkpoint.pth'))

  model_trace.load_state_dict(torch.load('model_trace_checkpoint.pth'))


<All keys matched successfully>

In [67]:
act_loss, time_loss = evaluation(model_trace, test_dataloader)
print("DL distance for activity label suffix prediction:", act_loss)
print("MAE for timestamp suffix prediction:", time_loss)

DL distance for activity label suffix prediction: 0.6502332185086889
MAE for timestamp suffix prediction: 0.2663606502852597


In [68]:
model_cat = Seq2Seq_cat(num_act=num_act,
                        log_encoder=log_encoder,
                        trace_encoder=trace_encoder,
                        act_cat_decoder=act_cat_decoder,
                        time_cat_decoder=time_cat_decoder).to(device)

In [69]:
training(model_cat, train_dataloader)

Epoch 5/10, Average Loss: 0.4616
Epoch 10/10, Average Loss: 0.4481


In [70]:
torch.save(model_cat.state_dict(), 'model_cat_checkpoint.pth')

In [71]:
model_cat.load_state_dict(torch.load('model_cat_checkpoint.pth'))

  model_cat.load_state_dict(torch.load('model_cat_checkpoint.pth'))


<All keys matched successfully>

In [72]:
act_loss, time_loss = evaluation(model_cat, test_dataloader)
print("DL distance for activity label suffix prediction:", act_loss)
print("MAE for timestamp suffix prediction:", time_loss)

DL distance for activity label suffix prediction: 0.6329113824984581
MAE for timestamp suffix prediction: 0.28600139231472227


In [73]:
model_add = Seq2Seq_add(num_act=num_act,
                        log_encoder=log_encoder,
                        trace_encoder=trace_encoder,
                        act_decoder=act_decoder,
                        time_decoder=time_decoder).to(device)

In [74]:
training(model_add, train_dataloader)

Epoch 5/10, Average Loss: 0.4278
Epoch 10/10, Average Loss: 0.5166


In [75]:
torch.save(model_add.state_dict(), 'model_add_checkpoint.pth')

In [76]:
model_add.load_state_dict(torch.load('model_add_checkpoint.pth'))

  model_add.load_state_dict(torch.load('model_add_checkpoint.pth'))


<All keys matched successfully>

In [77]:
act_loss, time_loss = evaluation(model_add, test_dataloader)
print("DL distance for activity label suffix prediction:", act_loss)
print("MAE for timestamp suffix prediction:", time_loss)

DL distance for activity label suffix prediction: 0.6319158765888663
MAE for timestamp suffix prediction: 0.3025603502006321


In [78]:
model_mul = Seq2Seq_mul(num_act=num_act,
                        log_encoder=log_encoder,
                        trace_encoder=trace_encoder,
                        act_decoder=act_decoder,
                        time_decoder=time_decoder).to(device)

In [79]:
training(model_mul, train_dataloader)

Epoch 5/10, Average Loss: 0.4714
Epoch 10/10, Average Loss: 0.4440


In [80]:
torch.save(model_mul.state_dict(), 'model_mul_checkpoint.pth')

In [81]:
model_mul.load_state_dict(torch.load('model_mul_checkpoint.pth'))

  model_mul.load_state_dict(torch.load('model_mul_checkpoint.pth'))


<All keys matched successfully>

In [82]:
act_loss, time_loss = evaluation(model_mul, test_dataloader)
print("DL distance for activity label suffix prediction:", act_loss)
print("MAE for timestamp suffix prediction:", time_loss)

DL distance for activity label suffix prediction: 0.5929331144964682
MAE for timestamp suffix prediction: 0.28569949521468235
