# VAE Approach

In this notebook, we adapt the AVATAR measure for generalization that was introduced by Theis and Darabi in 2020  in the paper: "Adversarial System Variant Approximation to Quantify Process Model Generalization" (doi: https://doi.org/10.1109/ACCESS.2020.3033450. We use the same idea, but exchange the sequential generative adversial network (SGAN) utilised in the approach by a sequence variational autoencoder.<br>
We use two different input scenarios. In this notebook, we sample from a process model created by the original log.
<br>
The measure is defined as follows:
<br>
For event log $E$ and process model $M$, generalization is defined as:
$$Generalization_S(E,M) = 2*\frac{precision(E,M)*fitness(E,M)}{precision(E,M)+fitness(E,M)}$$

In [2]:
import warnings
warnings.filterwarnings('ignore')
from ocpa.objects.log.importer.ocel import factory as ocel_import_factory
from ocpa.algo.discovery.ocpn import algorithm as ocpn_discovery_factory
from src.utils import get_happy_path_log, create_flower_model, generate_variant_model, sample_traces, process_log
from ocpa.objects.log.importer.csv import factory as ocel_import_factory_csv
from models.VAE_measure import get_text_data, decode_sequence, create_lstm_vae, VAE_generalization
from tqdm import tqdm
import numpy as np

# Order Process

In [3]:
filename = "../src/data/jsonocel/order_process.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})
train_log = sample_traces(ocel, ocpn, 10000, save_path='../src/data/playout/ocpn_data_order.txt')


Check the arcs: 100%|██████████| 46/46 [00:00<00:00, 45785.00it/s]
Generate the traces: 100%|██████████| 10000/10000 [00:00<00:00, 23912.34it/s]


In [4]:
train_log

[['FuelCar',
  'PlaceOrder',
  'ConfirmOrder',
  'PaymentReminder',
  'PickItem',
  'PayOrder',
  'Itemoutofstock',
  'ReorderItem',
  'LoadCargo',
  'PickItem',
  'StartRoute',
  'EndRoute'],
 ['PlaceOrder',
  'FuelCar',
  'ConfirmOrder',
  'PayOrder',
  'Itemoutofstock',
  'ReorderItem',
  'PickItem',
  'LoadCargo',
  'StartRoute',
  'EndRoute'],
 ['PlaceOrder',
  'FuelCar',
  'ConfirmOrder',
  'PaymentReminder',
  'PickItem',
  'PayOrder',
  'LoadCargo',
  'Itemoutofstock',
  'StartRoute',
  'ReorderItem',
  'PickItem',
  'EndRoute'],
 ['PlaceOrder',
  'ConfirmOrder',
  'PaymentReminder',
  'Itemoutofstock',
  'FuelCar',
  'PaymentReminder',
  'PaymentReminder',
  'ReorderItem',
  'PickItem',
  'PaymentReminder',
  'PaymentReminder',
  'PaymentReminder',
  'LoadCargo',
  'PaymentReminder',
  'StartRoute',
  'EndRoute',
  'PaymentReminder',
  'PayOrder'],
 ['FuelCar',
  'PlaceOrder',
  'ConfirmOrder',
  'PayOrder',
  'PickItem',
  'Itemoutofstock',
  'ReorderItem',
  'PickItem',
  'L

In [5]:
timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder = get_text_data(num_samples=10000,
                                                                                      data_path='../src/data/playout/ocpn_data_order.txt')

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 13
Max sequence length for inputs: 26
(10000, 26, 13) Creating model...


In [6]:
input_dim, timesteps = x.shape[-1], x.shape[-2]
batch_size, latent_dim = 1, 191
intermediate_dim, epochs = 353, 20

vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                         batch_size=batch_size,
                                         intermediate_dim=intermediate_dim,
                                         latent_dim=latent_dim,
                                        )
print("Training model...")

vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 13)]   0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 353)          518204      ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 191)          67614       ['lstm[0][0]']                   
                                                                                                  
 dense_1 (Dense)                (None, 191)          67614       ['lstm[0][0]']                   
                                                                                              

<keras.callbacks.History at 0x1838683b100>

In [7]:
print("Fitted, predicting...")
#rearrange the input data and get the max amount of characters
input_data = [' '.join(inner_list) for inner_list in train_log]
max_length = max(len(string) for string in input_data)

def decode(s):
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    id_from = np.random.randint(0, x.shape[0] - 1)

    m_from, std_from = enc.predict([[x[id_from]]])

    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    #print(decode(seq_from))
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:14<00:00, 34.16it/s]


In [8]:
log

[['placeorder confirmorder itemoutofstock reorderitem payorder fuelcar pickitem loadcargo startroute endroute <end> '],
 ['placeorder confirmorder paymentreminder payorder itemoutofstock pickitem reorderitem fuelcar loadcargo pickitem loadcargo startroute endroute <end> '],
 ['placeorder fuelcar confirmorder payorder pickitem loadcargo startroute itemoutofstock reorderitem endroute <end> '],
 ['placeorder fuelcar confirmorder paymentreminder itemoutofstock payorder pickitem loadcargo startroute endroute <end> '],
 ['fuelcar placeorder confirmorder paymentreminder payorder itemoutofstock reorderitem pickitem loadcargo startroute endroute <end> '],
 ['placeorder fuelcar confirmorder pickitem loadcargo payorder startroute itemoutofstock reorderitem pickitem loadcargo startroute endroute <end> '],
 ['placeorder fuelcar confirmorder payorder itemoutofstock pickitem reorderitem loadcargo startroute endroute <end> '],
 ['fuelcar placeorder confirmorder paymentreminder pickitem itemoutofstock 

In [9]:
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/order_process_sampled_function.csv')

In [10]:
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,delivery,item,order
0,0,Place Order,1,2022-01-01 00:19:47.251967,[],[item1],[order1]
1,1,Confirm Order,1,2022-01-01 00:20:47.251967,[],[item1],[order1]
2,2,Item out of stock,1,2022-01-01 00:21:47.251967,[],[item1],[]
3,3,Reorder Item,1,2022-01-01 00:22:47.251967,[],[item1],[]
4,4,Pay Order,1,2022-01-01 00:23:47.251967,[],[],[order1]
...,...,...,...,...,...,...,...
5707,5707,Pick Item,500,2022-03-05 16:49:41.868876,[],[item500],[]
5708,5708,Item out of stock,500,2022-03-05 16:50:41.868876,[],[item500],[]
5709,5709,Load Cargo,500,2022-03-05 16:51:41.868876,[delivery500],[item500],[]
5710,5710,Start Route,500,2022-03-05 16:52:41.868876,[delivery500],[item500],[]


In [11]:
object_types = ["order","item","delivery"]
parameters = {"obj_names": object_types,
              "val_names": [],
              "act_name": "event_activity",
              "time_name": "event_timestamp",
              "sep": ","}
ocel_gen = ocel_import_factory_csv.apply(file_path='../src/data/VAE_generated/order_process_sampled_function.csv', parameters=parameters)

# OCPN Model

In [12]:
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.7729
Fitness of IM-discovered net:  0.6481
VAE Generalization= 0.705


# Happy Path Order

In [13]:
happy_path__ocel = get_happy_path_log(filename)

In [14]:
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [15]:
happy_path__ocel = get_happy_path_log(filename)
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  0.9897
Fitness of IM-discovered net:  0.2678
VAE Generalization= 0.4215


# Flower Model Order

In [16]:
filename = "../src/data/jsonocel/order_process.jsonocel"
ots = ["order","item","delivery"]
flower_ocpn = create_flower_model(filename,ots)

In [17]:
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.2994
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.4608


# Variant OCPN

In [18]:
filename = "../src/data/jsonocel/order_process.jsonocel"
ots = ["order","item","delivery"]
ocel = ocel_import_factory.apply(filename)
variant_ocpn = generate_variant_model(ocel,save_path_logs='../src/data/csv/order_process_variants/order_process_variant',object_types = ots,save_path_visuals=f"../reports/figures/order_variant_total.svg" )

Generating Variant Models: 100%|██████████| 12/12 [00:01<00:00,  8.96it/s]
Processing Variant Nets: 100%|██████████| 12/12 [00:00<00:00, 14004.35it/s]


#########Start generating Object-Centric Petri Net#########
#########Finished generating Object-Centric Petri Net#########


In [19]:
for transition in variant_ocpn.transitions:
    split_string = transition.name.split("_")
    transition.name = split_string[0]

In [20]:
generalization = VAE_generalization(ocel_gen, variant_ocpn)

Precision of IM-discovered net:  0.7149
Fitness of IM-discovered net:  0.4306
VAE Generalization= 0.5375


# P2P Process

In [21]:
filename = "../src/data/jsonocel/p2p-normal.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})
train_log = sample_traces(ocel, ocpn, 10000, save_path='../src/data/playout/ocpn_data_p2p.txt')


Check the arcs: 100%|██████████| 40/40 [00:00<?, ?it/s]
Generate the traces: 100%|██████████| 10000/10000 [00:00<00:00, 29228.86it/s]


In [22]:
timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder = get_text_data(num_samples=10000,
                                                                                      data_path='../src/data/playout/ocpn_data_p2p.txt')

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 11
Max sequence length for inputs: 11
(10000, 11, 11) Creating model...


In [23]:
input_dim, timesteps = x.shape[-1], x.shape[-2]
batch_size, latent_dim = 1, 191
intermediate_dim, epochs = 353, 20

vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                         batch_size=batch_size,
                                         intermediate_dim=intermediate_dim,
                                         latent_dim=latent_dim,
                                        )
print("Training model...")

vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, None, 11)]   0           []                               
                                                                                                  
 lstm_2 (LSTM)                  (None, 353)          515380      ['input_6[0][0]']                
                                                                                                  
 dense_4 (Dense)                (None, 191)          67614       ['lstm_2[0][0]']                 
                                                                                                  
 dense_5 (Dense)                (None, 191)          67614       ['lstm_2[0][0]']                 
                                                                                            

<keras.callbacks.History at 0x1838577fb50>

In [24]:
print("Fitted, predicting...")
#rearrange the input data and get the max amount of characters
input_data = [' '.join(inner_list) for inner_list in train_log]
max_length = max(len(string) for string in input_data)

def decode(s):
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    id_from = np.random.randint(0, x.shape[0] - 1)

    m_from, std_from = enc.predict([[x[id_from]]])

    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    #print(decode(seq_from))
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:12<00:00, 40.74it/s]


In [25]:
log

[['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial plangoodsissue receiveinvoice clearinvoice goodsissue '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial plangoodsissue receiveinvoice clearinvoice goodsissue '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt receiveinvoice clearinvoice plangoodsissue verifymaterial goodsissue '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial receiveinvoice plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial plangoodsissue goodsissue receiveinvoice clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt plangoodsissue receiveinvoice clearinvoice verifymaterial goodsissue '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt re

In [26]:
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/p2p_process_sampled_function.csv')

In [27]:
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,PURCHORD,INVOICE,PURCHREQ,GDSRCPT,MATERIAL
0,0,Create Purchase Requisition,1,2022-01-01 08:33:43.886025,[],[],[PURCHREQ1],[],[MATERIAL1]
1,1,Create Purchase Order,1,2022-01-01 08:34:43.886025,[PURCHORD1],[],[PURCHREQ1],[],[MATERIAL1]
2,2,Receive Goods,1,2022-01-01 08:35:43.886025,[PURCHORD1],[],[],[GDSRCPT1],[MATERIAL1]
3,3,Issue Goods Receipt,1,2022-01-01 08:36:43.886025,[PURCHORD1],[],[],[GDSRCPT1],[MATERIAL1]
4,4,Verify Material,1,2022-01-01 08:37:43.886025,[],[],[],[],[MATERIAL1]
...,...,...,...,...,...,...,...,...,...
4495,4495,Verify Material,500,2022-02-15 22:47:00.630500,[],[],[],[],[MATERIAL500]
4496,4496,Plan Goods Issue,500,2022-02-15 22:48:00.630500,[],[],[],[],[MATERIAL500]
4497,4497,Receive Invoice,500,2022-02-15 22:49:00.630500,[PURCHORD500],[INVOICE500],[],[],[]
4498,4498,Goods Issue,500,2022-02-15 22:50:00.630500,[],[],[],[],[MATERIAL500]


In [28]:
object_types = ["PURCHORD","INVOICE","PURCHREQ","MATERIAL","GDSRCPT"]
parameters = {"obj_names": object_types,
              "val_names": [],
              "act_name": "event_activity",
              "time_name": "event_timestamp",
              "sep": ","}
ocel_gen = ocel_import_factory_csv.apply(file_path='../src/data/VAE_generated/p2p_process_sampled_function.csv', parameters=parameters)

# OCPN Model

In [29]:
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.8519
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.92


# Happy Path Order

In [30]:
happy_path__ocel = get_happy_path_log(filename)

In [31]:
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [32]:
happy_path__ocel = get_happy_path_log(filename)
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  0.8759
Fitness of IM-discovered net:  0.8396
VAE Generalization= 0.8573


# Flower Model Order

In [33]:
filename = "../src/data/jsonocel/p2p-normal.jsonocel"
ots = ["PURCHORD","INVOICE","PURCHREQ","MATERIAL","GDSRCPT"]
flower_ocpn = create_flower_model(filename,ots)

In [34]:
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.1699
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.2905


# Variant OCPN

In [35]:
filename = "../src/data/jsonocel/p2p-normal.jsonocel"
ots = ["PURCHORD","INVOICE","PURCHREQ","MATERIAL","GDSRCPT"]
ocel = ocel_import_factory.apply(filename)
variant_ocpn = generate_variant_model(ocel,save_path_logs='../src/data/csv/p2p-normal_variants/p2p-normal_variant',object_types = ots ,save_path_visuals=f"../reports/figures/p2p_variant_total.svg" )

Generating Variant Models: 100%|██████████| 20/20 [00:02<00:00,  8.10it/s]
Processing Variant Nets: 100%|██████████| 20/20 [00:00<00:00, 6751.94it/s]


#########Start generating Object-Centric Petri Net#########
#########Finished generating Object-Centric Petri Net#########


In [36]:
for transition in variant_ocpn.transitions:
    split_string = transition.name.split("_")
    transition.name = split_string[0]

In [37]:
generalization = VAE_generalization(ocel_gen, variant_ocpn)

Precision of IM-discovered net:  0.521
Fitness of IM-discovered net:  0.8271
VAE Generalization= 0.6393


# BPI Challenge

In [38]:
filename = "../src/data/jsonocel/BPI2017-Final.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})
train_log = sample_traces(ocel, ocpn, 10000, save_path='../src/data/playout/ocpn_data_BPI.txt')

Check the arcs: 100%|██████████| 120/120 [00:00<00:00, 125296.61it/s]
Generate the traces: 100%|██████████| 10000/10000 [00:00<00:00, 29482.58it/s]


In [39]:
timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder = get_text_data(num_samples=10000,
                                                                                      data_path='../src/data/playout/ocpn_data_BPI.txt')

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 17
Max sequence length for inputs: 27
(10000, 27, 17) Creating model...


In [40]:
input_dim, timesteps = x.shape[-1], x.shape[-2]
batch_size, latent_dim = 1, 191
intermediate_dim, epochs = 353, 20

vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                         batch_size=batch_size,
                                         intermediate_dim=intermediate_dim,
                                         latent_dim=latent_dim,
                                        )
print("Training model...")

vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, None, 17)]   0           []                               
                                                                                                  
 lstm_4 (LSTM)                  (None, 353)          523852      ['input_11[0][0]']               
                                                                                                  
 dense_8 (Dense)                (None, 191)          67614       ['lstm_4[0][0]']                 
                                                                                                  
 dense_9 (Dense)                (None, 191)          67614       ['lstm_4[0][0]']                 
                                                                                            

<keras.callbacks.History at 0x18392499880>

In [41]:
print("Fitted, predicting...")
#rearrange the input data and get the max amount of characters
input_data = [' '.join(inner_list) for inner_list in train_log]
max_length = max(len(string) for string in input_data)

def decode(s):
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    id_from = np.random.randint(0, x.shape[0] - 1)

    m_from, std_from = enc.predict([[x[id_from]]])

    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    #print(decode(seq_from))
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:37<00:00, 13.32it/s]


In [42]:
log

[['createapplication submit accept <end> '],
 ['createapplication submit accept <end> '],
 ['createapplication submit assesspotentialfraud accept <end> '],
 ['createapplication submit assesspotentialfraud accept <end> '],
 ['createapplication submit accept <end> '],
 ['createapplication submit accept <end> '],
 ['createapplication submit assesspotentialfraud callincompletefiles callincompletefiles deny accept <end> '],
 ['createapplication submit assesspotentialfraud call accept <end> '],
 ['createapplication submit accept <end> '],
 ['createapplication submit accept <end> '],
 ['createapplication submit assesspotentialfraud accept <end> '],
 ['createapplication submit assesspotentialfraud deny refuseoffer accept <end> '],
 ['createapplication submit assesspotentialfraud cancelapplication accept <end> '],
 ['createapplication submit accept <end> '],
 ['createapplication submit assesspotentialfraud cancelapplication accept <end> '],
 ['createapplication submit accept <end> '],
 ['create

In [43]:
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/BPI_process_sampled_function.csv')

In [44]:
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,application,offer
0,0,Create application,1,2022-01-01 05:55:03.714751,[application1],
1,1,Submit,1,2022-01-01 05:56:03.714751,[application1],
2,2,Accept,1,2022-01-01 05:57:03.714751,[application1],
3,3,Create application,2,2022-01-04 04:20:19.249937,[application2],
4,4,Submit,2,2022-01-04 04:21:19.249937,[application2],
...,...,...,...,...,...,...
3311,3311,Cancel application,500,2022-01-25 14:06:23.979455,[application500],[offer500]
3312,3312,Cancel application,500,2022-01-25 14:07:23.979455,[application500],[offer500]
3313,3313,Deny,500,2022-01-25 14:08:23.979455,[application500],[offer500]
3314,3314,Accept,500,2022-01-25 14:09:23.979455,[application500],[]


In [45]:
object_types = ["application","offer"]
parameters = {"obj_names": object_types,
              "val_names": [],
              "act_name": "event_activity",
              "time_name": "event_timestamp",
              "sep": ","}
ocel_gen = ocel_import_factory_csv.apply(file_path='../src/data/VAE_generated/BPI_process_sampled_function.csv', parameters=parameters)

# OCPN Model

In [46]:
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.5088
Fitness of IM-discovered net:  0.4689
VAE Generalization= 0.4881


# Happy Path

In [47]:
happy_path__ocel = get_happy_path_log(filename)

In [48]:
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [49]:
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  1.0
Fitness of IM-discovered net:  0.377
VAE Generalization= 0.5475


# Flower Model 

In [50]:
filename = "../src/data/jsonocel/BPI2017-Final.jsonocel"
ots = ["application","offer"]
flower_ocpn = create_flower_model(filename,ots)

In [51]:
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.1131
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.2032


# Variant OCPN

In [52]:
import pickle

In [53]:
with open("../src/data/csv/bpi_variant_ocpn.pickle", "rb") as file:
    variant_ocpn = pickle.load(file)

In [54]:
for transition in variant_ocpn.transitions:
    split_string = transition.name.split("_")
    transition.name = split_string[0]

In [55]:
generalization = VAE_generalization(ocel_gen, variant_ocpn)

Precision of IM-discovered net:  1.0
Fitness of IM-discovered net:  0.1508
VAE Generalization= 0.2621


# DS3 Log

In [3]:
filename = "../src/data/jsonocel/DS3.jsonocel"
ocel = ocel_import_factory.apply(filename)
ots = ["incident","customer"]
ocpn = create_flower_model(filename,ots)
train_log = sample_traces(ocel, ocpn, 10000, save_path='../src/data/playout/ocpn_data_ds3.txt')


Check the arcs: 100%|██████████| 20/20 [00:00<?, ?it/s]
Generate the traces: 100%|██████████| 10000/10000 [23:45<00:00,  7.01it/s]


In [4]:
timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder = get_text_data(num_samples=10000,
                                                                                      data_path='../src/data/playout/ocpn_data_ds3.txt')

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 10
Max sequence length for inputs: 50
(10000, 50, 10) Creating model...


In [5]:
input_dim, timesteps = x.shape[-1], x.shape[-2]
batch_size, latent_dim = 1, 191
intermediate_dim, epochs = 353, 20

vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                         batch_size=batch_size,
                                         intermediate_dim=intermediate_dim,
                                         latent_dim=latent_dim,
                                        )
print("Training model...")

vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 10)]   0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 353)          513968      ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 191)          67614       ['lstm[0][0]']                   
                                                                                                  
 dense_1 (Dense)                (None, 191)          67614       ['lstm[0][0]']                   
                                                                                              

<keras.callbacks.History at 0x2002c1ffd90>

In [6]:
from nltk.tokenize import word_tokenize

# vectorize the data
input_texts = []
input_characters = set(["\t"])

with open('../src/data/playout/ocpn_data_ds3.txt', "r", encoding="utf-8") as f:
    lines = f.read().lower().split("\n")

for line in lines[: min(10000, len(lines) - 1)]:

    #input_text, _ = line.split("\t")
    input_text = word_tokenize(line)
    input_text.append("<end>")

    input_texts.append(input_text)


In [7]:
print("Fitted, predicting...")
#rearrange the input data and get the max amount of characters
#input_data = [' '.join(inner_list) for inner_list in train_log]

input_data = [' '.join(inner_list) for inner_list in input_texts]
max_length = max(len(string) for string in input_data)

def decode(s):
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    id_from = np.random.randint(0, x.shape[0] - 1)

    m_from, std_from = enc.predict([[x[id_from]]])

    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    #print(decode(seq_from))
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:33<00:00, 14.75it/s]


In [8]:
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/DS3_process_sampled_function.csv')

In [9]:
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,customer,incident
0,0,Awaiting Vendor,1,2022-01-01 02:04:09.221997,[],[incident1]
1,1,Awaiting Evidence,1,2022-01-01 02:05:09.221997,[],[incident1]
2,2,Awaiting Vendor,1,2022-01-01 02:06:09.221997,[],[incident1]
3,3,Awaiting Vendor,1,2022-01-01 02:07:09.221997,[],[incident1]
4,4,Awaiting Vendor,1,2022-01-01 02:08:09.221997,[],[incident1]
...,...,...,...,...,...,...
23995,23995,Closed Incident,500,2022-08-29 01:07:20.292880,[customer500],[incident500]
23996,23996,Closed Incident,500,2022-08-29 01:08:20.292880,[customer500],[incident500]
23997,23997,Closed Incident,500,2022-08-29 01:09:20.292880,[customer500],[incident500]
23998,23998,Awaiting User Info,500,2022-08-29 01:10:20.292880,[],[incident500]


In [10]:
object_types = ["incident","customer"]
parameters = {"obj_names": object_types,
              "val_names": [],
              "act_name": "event_activity",
              "time_name": "event_timestamp",
              "sep": ","}
ocel_gen = ocel_import_factory_csv.apply(file_path='../src/data/VAE_generated/DS3_process_sampled_function.csv', parameters=parameters)

# OCPN Model

In [11]:
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.1387
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.2436


# Happy Path Order

In [12]:
happy_path__ocel = get_happy_path_log(filename)

In [13]:
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [14]:
happy_path__ocel = get_happy_path_log(filename)
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  0
Fitness of IM-discovered net:  0.0


ZeroDivisionError: float division by zero

# Flower Model Order

In [15]:
filename = "../src/data/jsonocel/DS3.jsonocel"
ots = ["incident","customer"]
flower_ocpn = create_flower_model(filename,ots)

In [16]:
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.1387
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.2436


# Variant OCPN

In [17]:
import pickle

In [18]:
with open("../src/data/csv/DS3_variant_ocpn.pickle", "rb") as file:
    variant_ocpn = pickle.load(file)

In [19]:
for transition in variant_ocpn.transitions:
    split_string = transition.name.split("_")
    transition.name = split_string[0]

In [None]:
generalization = VAE_generalization(ocel_gen, variant_ocpn)

# DS4 Log

In [3]:
filename = "../src/data/jsonocel/DS4.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})
train_log = sample_traces(ocel, ocpn, 10000, save_path='../src/data/playout/ocpn_data_ds4.txt')


Check the arcs: 100%|██████████| 364/364 [00:00<00:00, 363939.61it/s]
Generate the traces: 100%|██████████| 10000/10000 [00:07<00:00, 1303.01it/s]


In [4]:
timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder = get_text_data(num_samples=10000,
                                                                                      data_path='../src/data/playout/ocpn_data_ds4.txt')

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 52
Max sequence length for inputs: 119
(10000, 119, 52) Creating model...


In [5]:
input_dim, timesteps = x.shape[-1], x.shape[-2]
batch_size, latent_dim = 1, 191
intermediate_dim, epochs = 353, 20

vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                         batch_size=batch_size,
                                         intermediate_dim=intermediate_dim,
                                         latent_dim=latent_dim,
                                        )
print("Training model...")

vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 52)]   0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 353)          573272      ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 191)          67614       ['lstm[0][0]']                   
                                                                                                  
 dense_1 (Dense)                (None, 191)          67614       ['lstm[0][0]']                   
                                                                                              

<keras.callbacks.History at 0x23322eb7a60>

In [6]:
print("Fitted, predicting...")
#rearrange the input data and get the max amount of characters
input_data = [' '.join(inner_list) for inner_list in train_log]
max_length = max(len(string) for string in input_data)

def decode(s):
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    id_from = np.random.randint(0, x.shape[0] - 1)

    m_from, std_from = enc.predict([[x[id_from]]])

    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    #print(decode(seq_from))
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [01:51<00:00,  4.48it/s]


In [7]:
log

[['paymentapplicationmailvalid entitlementapplicationmailvalid controlsummaryinitialize entitlementapplicationmailvalid geoparceldocumentinitialize geoparceldocumentbeginediting controlsummarybeginediting geoparceldocumentcalculateprotocol geoparceldocumentinsertdocument inspectionplan entitlementapplicationmailvalid entitlementapplicationmailvalid entitlementapplicationinitialize referencealignmentsave paymentapplicationfinishediting inspectionsave referencealignmentinsertdocument paymentapplicationbeginediting controlsummaryfinishediting paymentapplicationcheckadmissibility paymentapplicationsave geoparceldocumentchangedepartment controlsummarychangedepartment paymentapplicationfinishpayment geoparceldocumentcreate geoparceldocumentcalculateprotocol geoparceldocumentinsertdocument paymentapplicationremovedocument paymentapplicationbeginpayment paymentapplicationbeginediting paymentapplicationremovedocument geoparceldocumentcalculateprotocol geoparceldocumentchangedepartment geoparcel

In [8]:
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/DS4_process_sampled_function.csv')

In [9]:
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,Geo parcel document,Inspection,Reference alignment,Entitlement application,Payment application,Control summary
0,0,Payment application mail valid,1,2022-01-01 01:38:01.880153,[],[],[],[],[Payment application1],[]
1,1,Entitlement application mail valid,1,2022-01-01 01:39:01.880153,[],[],[],[Entitlement application1],[],[]
2,2,Control summary initialize,1,2022-01-01 01:40:01.880153,[],[],[],[],[Payment application1],[Control summary1]
3,3,Entitlement application mail valid,1,2022-01-01 01:41:01.880153,[],[],[],[Entitlement application1],[],[]
4,4,Geo parcel document initialize,1,2022-01-01 01:42:01.880153,[Geo parcel document1],[],[],[],[Payment application1],[]
...,...,...,...,...,...,...,...,...,...,...
57846,57846,Geo parcel document finish editing,500,2023-07-31 12:43:26.148391,[Geo parcel document500],[],[],[],[],[]
57847,57847,Geo parcel document remove document,500,2023-07-31 12:44:26.148391,[Geo parcel document500],[],[],[],[],[]
57848,57848,Geo parcel document change department,500,2023-07-31 12:45:26.148391,[Geo parcel document500],[],[],[],[],[]
57849,57849,Reference alignment insert document,500,2023-07-31 12:46:26.148391,[],[],[Reference alignment500],[],[],[]


In [10]:
object_types = ["Payment application","Control summary","Entitlement application","Geo parcel document","Inspection","Reference alignment"]

parameters = {"obj_names": object_types,
              "val_names": [],
              "act_name": "event_activity",
              "time_name": "event_timestamp",
              "sep": ","}
ocel_gen = ocel_import_factory_csv.apply(file_path='../src/data/VAE_generated/DS4_process_sampled_function.csv', parameters=parameters)

# OCPN Model

In [11]:
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.524
Fitness of IM-discovered net:  0.0839
VAE Generalization= 0.1446


# Happy Path Order

In [12]:
happy_path__ocel = get_happy_path_log(filename)

In [13]:
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [15]:
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  1.0
Fitness of IM-discovered net:  0.0032
VAE Generalization= 0.0063


# Flower Model Order

In [17]:
filename = "../src/data/jsonocel/DS4.jsonocel"
ots =  ["Payment application","Control summary","Entitlement application","Geo parcel document","Inspection","Reference alignment"]
flower_ocpn = create_flower_model(filename,ots)

In [18]:
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.0267
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.0521


# Variant OCPN

In [None]:
import pickle

In [None]:
with open("../src/data/csv/DS4_variant_ocpn.pickle", "rb") as file:
    variant_ocpn = pickle.load(file)

In [None]:
for transition in variant_ocpn.transitions:
    split_string = transition.name.split("_")
    transition.name = split_string[0]

In [None]:
generalization = VAE_generalization(ocel_gen, variant_ocpn)