# VAE Approach Original Log

In this notebook, we adapt the AVATAR measure for generalization that was introduced by Theis and Darabi in 2020 in the paper "Adversarial System Variant Approximation to Quantify Process Model Generalization" (doi: https://doi.org/10.1109/ACCESS.2020.3033450). We use the same idea, but replace the sequential generative adversarial network (SGAN) utilised in the approach with a sequence variational autoencoder.<br>
We use two different input scenarios. In this notebook, we process the original log.
<br>
The measure is defined as follows:
<br>
For event log $E$ and process model $M$, generalization is defined as:
$$Generalization_S(E,M) = 2*\frac{precision(E,M)*fitness(E,M)}{precision(E,M)+fitness(E,M)}$$ 

In [2]:
import warnings
# Silence every warning for the rest of the session so library noise does not clutter
# the cell outputs. Note that this also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

In [3]:
from ocpa.objects.log.importer.ocel import factory as ocel_import_factory
from ocpa.algo.discovery.ocpn import algorithm as ocpn_discovery_factory
from src.utils import get_happy_path_log, create_flower_model, generate_variant_model, sample_traces, process_log
from ocpa.objects.log.importer.csv import factory as ocel_import_factory_csv
from models.VAE_measure import get_text_data, decode_sequence, create_lstm_vae, VAE_generalization, create_VAE_input
from ocpa.algo.util.filtering.log import case_filtering
from tqdm import tqdm
import numpy as np

# Order Process

In [3]:
# Import the order-process log and discover an object-centric Petri net (OCPN) from
# the complete, unfiltered log.
filename = "../src/data/jsonocel/order_process.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})
# Filter out the most frequent trace here, because it distorts the results.
# NOTE(review): the slice [1:] drops the first entry of ocel.process_executions; that
# only removes the most frequent trace if this entry corresponds to it - confirm.
# The filtered log replaces `ocel`; `ocpn` above was discovered before filtering.
ocel = case_filtering.filter_process_executions(ocel, ocel.process_executions[1:])

In [4]:
# Build the VAE training traces from the log. The second argument is a text-file path
# (presumably where the traces are written; the same path is handed to get_text_data).
train_log = create_VAE_input(ocel,'../src/data/VAE_input/order_process.txt')

In [5]:
# Show the training traces returned by create_VAE_input.
train_log

['PlaceOrder ConfirmOrder Itemoutofstock Itemoutofstock PayOrder ReorderItem ReorderItem PlaceOrder ConfirmOrder Itemoutofstock Itemoutofstock ReorderItem ReorderItem PayOrder PickItem PickItem PickItem PickItem LoadCargo LoadCargo FuelCar StartRoute EndRoute FuelCar StartRoute EndRoute',
 'PlaceOrder ConfirmOrder Itemoutofstock PayOrder PickItem ReorderItem PlaceOrder ConfirmOrder Itemoutofstock ReorderItem PickItem PayOrder LoadCargo PickItem PickItem LoadCargo FuelCar StartRoute EndRoute FuelCar StartRoute EndRoute',
 'PlaceOrder ConfirmOrder PaymentReminder PickItem PickItem LoadCargo PaymentReminder PaymentReminder PaymentReminder PaymentReminder FuelCar StartRoute EndRoute PaymentReminder PaymentReminder PayOrder',
 'PlaceOrder ConfirmOrder Itemoutofstock Itemoutofstock PaymentReminder ReorderItem ReorderItem PaymentReminder PickItem PickItem LoadCargo PayOrder FuelCar StartRoute EndRoute',
 'PlaceOrder ConfirmOrder Itemoutofstock Itemoutofstock PaymentReminder ReorderItem Reorde

In [6]:
# Encode the trace file for the VAE and unpack everything get_text_data returns.
# num_samples presumably caps how many traces are read - confirm in get_text_data.
text_data = get_text_data(
    num_samples=10000,
    data_path='../src/data/VAE_input/order_process.txt',
)
(timesteps_max, enc_tokens, characters,
 char2id, id2char, x, x_decoder) = text_data

print(x.shape, "Creating model...")

Number of samples: 47
Number of unique input tokens: 13
Max sequence length for inputs: 28
(47, 28, 13) Creating model...


In [7]:
# Model dimensions come from the encoded data: the last axis of x is the input
# dimension, the second-to-last the number of timesteps.
input_dim = x.shape[-1]
timesteps = x.shape[-2]

# Hyperparameters for the LSTM VAE and its training run.
batch_size = 1
latent_dim = 191
intermediate_dim = 353
epochs = 20

vae, enc, gen, stepper = create_lstm_vae(
    input_dim,
    batch_size=batch_size,
    intermediate_dim=intermediate_dim,
    latent_dim=latent_dim,
)
print("Training model...")

# NOTE(review): fit() receives no batch_size argument, so the batch_size above is only
# passed to create_lstm_vae - confirm this is intended.
# Kept as the last expression so the returned History object is displayed.
vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 13)]   0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 353)          518204      ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 191)          67614       ['lstm[0][0]']                   
                                                                                                  
 dense_1 (Dense)                (None, 191)          67614       ['lstm[0][0]']                   
                                                                                              

<keras.callbacks.History at 0x197390a4910>

In [17]:
print("Fitted, predicting...")
# Longest training trace, measured in characters (len() of each trace string).
# NOTE(review): get_text_data reports sequence lengths in tokens, not characters -
# confirm which unit decode_sequence expects for its length limit.
max_length = max(len(string) for string in train_log)

def decode(s):
    """Decode the latent sample `s` into a trace string via decode_sequence."""
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    # Pick a random training sequence. The upper bound of np.random.randint is
    # exclusive, so x.shape[0] (not x.shape[0] - 1) is required for the last
    # sequence to be selectable.
    id_from = np.random.randint(0, x.shape[0])

    m_from, std_from = enc.predict([[x[id_from]]])

    # Perturb the encoding with standard-normal noise scaled by the second output.
    # NOTE(review): this treats the encoder's second output as a standard deviation -
    # confirm it is not a log-variance.
    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    # Each sampled trace is stored as a single-element list.
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:10<00:00, 46.79it/s]


In [18]:
# Show the sampled traces (one single-element list per sample).
log

[['placeorder confirmorder itemoutofstock pickitem pickitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder itemoutofstock pickitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder itemoutofstock pickitem pickitem reorderitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder itemoutofstock pickitem pickitem reorderitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder itemoutofstock itemoutofstock pickitem reorderitem reorderitem reorderitem pickitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder confirmorder itemoutofstock itemoutofstock reorderitem reorderitem reorderitem reorderitem pickitem pickitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder pickitem pickitem pickitem loadcargo fuelcar startroute endroute <end> '],
 ['placeorder confirmorder pickitem pickitem pickitem 

In [19]:
# Turn the sampled traces into an event table, using the original log and net. The
# last argument is a CSV path; the same path is imported as a log further down.
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/order_process_original_sampled.csv')

In [20]:
# Show the generated event table.
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,delivery,item,order
0,0,Place Order,1,2022-01-01 09:56:38.617386,[],[item1],[order1]
1,1,Confirm Order,1,2022-01-01 09:57:38.617386,[],[item1],[order1]
2,2,Item out of stock,1,2022-01-01 09:58:38.617386,[],[item1],[]
3,3,Pick Item,1,2022-01-01 09:59:38.617386,[],[item1],[]
4,4,Pick Item,1,2022-01-01 10:00:38.617386,[],[item1],[]
...,...,...,...,...,...,...,...
5387,5387,Pick Item,500,2022-02-23 04:04:50.398410,[],[item500],[]
5388,5388,Load Cargo,500,2022-02-23 04:05:50.398410,[delivery500],[item500],[]
5389,5389,Fuel Car,500,2022-02-23 04:06:50.398410,[delivery500],[],[]
5390,5390,Start Route,500,2022-02-23 04:07:50.398410,[delivery500],[item500],[]


In [21]:
# Re-import the sampled CSV as an event log: declare the object-type columns plus the
# activity and timestamp columns for the CSV importer.
object_types = ["order", "item", "delivery"]
parameters = {
    "obj_names": object_types,
    "val_names": [],
    "act_name": "event_activity",
    "time_name": "event_timestamp",
    "sep": ",",
}
ocel_gen = ocel_import_factory_csv.apply(
    file_path='../src/data/VAE_generated/order_process_original_sampled.csv',
    parameters=parameters,
)

# OCPN Model

In [22]:
# Evaluate the discovered OCPN on the VAE-generated log. The call prints precision,
# fitness and the resulting generalization value.
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.8366
Fitness of IM-discovered net:  0.2528
VAE Generalization= 0.3883


# Happy Path Order

In [23]:
# Derive the happy-path log from the order-process file.
happy_path__ocel = get_happy_path_log(filename)

In [24]:
# Discover an OCPN from the happy-path log.
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [25]:
# Evaluate the happy-path net on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  0.8333
Fitness of IM-discovered net:  0.2164
VAE Generalization= 0.3435


# Flower Model Order

In [26]:
# Build the flower model for the order process over its three object types.
filename = "../src/data/jsonocel/order_process.jsonocel"
ots = ["order","item","delivery"]
flower_ocpn = create_flower_model(filename,ots)

In [27]:
# Evaluate the flower model on the VAE-generated log. The printed labels still read
# "IM-discovered net" although a flower model is passed here.
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.1795
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.3044


# Variant OCPN

In [28]:
# Build the variant model from the complete order-process log.
# The log is re-imported here, which replaces the filtered `ocel` bound earlier.
filename = "../src/data/jsonocel/order_process.jsonocel"
ots = ["order","item","delivery"]
ocel = ocel_import_factory.apply(filename)
variant_ocpn = generate_variant_model(
    ocel,
    save_path_logs='../src/data/csv/order_process_variants/order_process_variant',
    object_types=ots,
    save_path_visuals="../reports/figures/order_variant_total.svg",
)

Generating Variant Models: 100%|██████████| 12/12 [00:01<00:00,  6.24it/s]
Processing Variant Nets: 100%|██████████| 12/12 [00:00<00:00, 6786.90it/s]


#########Start generating Object-Centric Petri Net#########
#########Finished generating Object-Centric Petri Net#########


In [29]:
# Keep only the part of each transition name that precedes the first underscore.
for transition in variant_ocpn.transitions:
    transition.name = transition.name.split("_")[0]

In [30]:
# Evaluate the variant net (with renamed transitions) on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, variant_ocpn)

Precision of IM-discovered net:  0.6187
Fitness of IM-discovered net:  0.2219
VAE Generalization= 0.3267


# P2P Process

In [31]:
# Import the P2P log and discover an OCPN from it. Unlike the order-process section,
# no process execution is filtered out here.
filename = "../src/data/jsonocel/p2p-normal.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})

In [32]:
# Build the VAE training traces from the P2P log. The second argument is a text-file
# path (presumably where the traces are written; get_text_data is given the same path).
train_log = create_VAE_input(ocel,'../src/data/VAE_input/p2p_process.txt')

In [33]:
# Encode the P2P trace file for the VAE and unpack everything get_text_data returns.
# num_samples presumably caps how many traces are read - confirm in get_text_data.
text_data = get_text_data(
    num_samples=10000,
    data_path='../src/data/VAE_input/p2p_process.txt',
)
(timesteps_max, enc_tokens, characters,
 char2id, id2char, x, x_decoder) = text_data

print(x.shape, "Creating model...")

Number of samples: 80
Number of unique input tokens: 11
Max sequence length for inputs: 11
(80, 11, 11) Creating model...


In [34]:
# Model dimensions come from the encoded data: the last axis of x is the input
# dimension, the second-to-last the number of timesteps.
input_dim = x.shape[-1]
timesteps = x.shape[-2]

# Hyperparameters for the LSTM VAE and its training run.
batch_size = 1
latent_dim = 191
intermediate_dim = 353
epochs = 20

vae, enc, gen, stepper = create_lstm_vae(
    input_dim,
    batch_size=batch_size,
    intermediate_dim=intermediate_dim,
    latent_dim=latent_dim,
)
print("Training model...")

# NOTE(review): fit() receives no batch_size argument, so the batch_size above is only
# passed to create_lstm_vae - confirm this is intended.
# Kept as the last expression so the returned History object is displayed.
vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, None, 11)]   0           []                               
                                                                                                  
 lstm_2 (LSTM)                  (None, 353)          515380      ['input_6[0][0]']                
                                                                                                  
 dense_4 (Dense)                (None, 191)          67614       ['lstm_2[0][0]']                 
                                                                                                  
 dense_5 (Dense)                (None, 191)          67614       ['lstm_2[0][0]']                 
                                                                                            

<keras.callbacks.History at 0x19746d42370>

In [39]:
print("Fitted, predicting...")
# Longest training trace, measured in characters (len() of each trace string).
# NOTE(review): get_text_data reports sequence lengths in tokens, not characters -
# confirm which unit decode_sequence expects for its length limit.
max_length = max(len(string) for string in train_log)

def decode(s):
    """Decode the latent sample `s` into a trace string via decode_sequence."""
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    # Pick a random training sequence. The upper bound of np.random.randint is
    # exclusive, so x.shape[0] (not x.shape[0] - 1) is required for the last
    # sequence to be selectable.
    id_from = np.random.randint(0, x.shape[0])

    m_from, std_from = enc.predict([[x[id_from]]])

    # Perturb the encoding with standard-normal noise scaled by the second output.
    # NOTE(review): this treats the encoder's second output as a standard deviation -
    # confirm it is not a log-variance.
    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    # Each sampled trace is stored as a single-element list.
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:10<00:00, 47.26it/s]


In [40]:
# Show the sampled P2P traces (one single-element list per sample).
log

[['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial verifymaterial plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial verifymaterial plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial verifymaterial plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial verifymaterial plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial verifymaterial plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt verifymaterial verifymaterial plangoodsissue goodsissue clearinvoice '],
 ['createpurchaserequisition createpurchaseorder receivegoods issuegoodsreceipt ve

In [41]:
# Turn the sampled traces into an event table, using the original log and net. The
# last argument is a CSV path; the same path is imported as a log further down.
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/p2p_process_original_sampled.csv')

In [42]:
# Show the generated P2P event table.
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,PURCHORD,PURCHREQ,INVOICE,MATERIAL,GDSRCPT
0,0,Create Purchase Requisition,1,2022-01-01 12:06:01.404541,[],[PURCHREQ1],[],[MATERIAL1],[]
1,1,Create Purchase Order,1,2022-01-01 12:07:01.404541,[PURCHORD1],[PURCHREQ1],[],[MATERIAL1],[]
2,2,Receive Goods,1,2022-01-01 12:08:01.404541,[PURCHORD1],[],[],[MATERIAL1],[GDSRCPT1]
3,3,Issue Goods Receipt,1,2022-01-01 12:09:01.404541,[PURCHORD1],[],[],[MATERIAL1],[GDSRCPT1]
4,4,Verify Material,1,2022-01-01 12:10:01.404541,[],[],[],[MATERIAL1],[]
...,...,...,...,...,...,...,...,...,...
4495,4495,Verify Material,500,2022-02-15 03:50:59.183281,[],[],[],[MATERIAL500],[]
4496,4496,Verify Material,500,2022-02-15 03:51:59.183281,[],[],[],[MATERIAL500],[]
4497,4497,Plan Goods Issue,500,2022-02-15 03:52:59.183281,[],[],[],[MATERIAL500],[]
4498,4498,Goods Issue,500,2022-02-15 03:53:59.183281,[],[],[],[MATERIAL500],[]


In [43]:
# Re-import the sampled CSV as an event log: declare the object-type columns plus the
# activity and timestamp columns for the CSV importer.
object_types = ["PURCHORD", "INVOICE", "PURCHREQ", "MATERIAL", "GDSRCPT"]
parameters = {
    "obj_names": object_types,
    "val_names": [],
    "act_name": "event_activity",
    "time_name": "event_timestamp",
    "sep": ",",
}
ocel_gen = ocel_import_factory_csv.apply(
    file_path='../src/data/VAE_generated/p2p_process_original_sampled.csv',
    parameters=parameters,
)

# OCPN Model

In [44]:
# Evaluate the discovered P2P OCPN on the VAE-generated log. The call prints
# precision, fitness and the resulting generalization value.
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.9
Fitness of IM-discovered net:  0.5556
VAE Generalization= 0.687


# Happy Path

In [45]:
# Derive the happy-path log from the P2P file.
happy_path__ocel = get_happy_path_log(filename)

In [46]:
# Discover an OCPN from the happy-path log.
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [47]:
# Evaluate the happy-path net on the VAE-generated log. The happy-path log itself is
# not an input to this call, so it is not recomputed here.
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  1.0
Fitness of IM-discovered net:  0.5556
VAE Generalization= 0.7143


# Flower Model

In [48]:
# Build the flower model for the P2P process over its five object types.
filename = "../src/data/jsonocel/p2p-normal.jsonocel"
ots = ["PURCHORD","INVOICE","PURCHREQ","MATERIAL","GDSRCPT"]
flower_ocpn = create_flower_model(filename,ots)

In [49]:
# Evaluate the flower model on the VAE-generated log. The printed labels still read
# "IM-discovered net" although a flower model is passed here.
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.1576
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.2723


# Variant OCPN

In [50]:
# Build the variant model from the P2P log (the log is re-imported from file first).
filename = "../src/data/jsonocel/p2p-normal.jsonocel"
ots = ["PURCHORD","INVOICE","PURCHREQ","MATERIAL","GDSRCPT"]
ocel = ocel_import_factory.apply(filename)
variant_ocpn = generate_variant_model(
    ocel,
    save_path_logs='../src/data/csv/p2p-normal_variants/p2p-normal_variant',
    object_types=ots,
    save_path_visuals="../reports/figures/p2p_variant_total.svg",
)

Generating Variant Models: 100%|██████████| 20/20 [00:02<00:00,  9.98it/s]
Processing Variant Nets: 100%|██████████| 20/20 [00:00<00:00, 9997.15it/s]


#########Start generating Object-Centric Petri Net#########
#########Finished generating Object-Centric Petri Net#########


In [51]:
# Keep only the part of each transition name that precedes the first underscore.
for transition in variant_ocpn.transitions:
    transition.name = transition.name.split("_")[0]

In [52]:
# Evaluate the variant net (with renamed transitions) on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, variant_ocpn)

Precision of IM-discovered net:  0.625
Fitness of IM-discovered net:  0.4444
VAE Generalization= 0.5195


# BPI Challenge

In [53]:
# Import the BPI 2017 log and discover an OCPN from it (no filtering is applied).
filename = "../src/data/jsonocel/BPI2017-Final.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})

In [54]:
# Build the VAE training traces from the BPI log. The second argument is a text-file
# path (presumably where the traces are written; get_text_data is given the same path).
train_log = create_VAE_input(ocel,'../src/data/VAE_input/BPI_process.txt')

In [55]:
# Show the training traces returned by create_VAE_input.
train_log

['Createapplication Accept Createoffer Createoffer Canceloffer Send(mailandonline) Complete Call Createoffer Canceloffer Createoffer Send(mailandonline) Return Validate Createoffer Send(mailandonline) Createoffer Send(mailandonline) Createoffer Send(mailandonline) Return Callincompletefiles Validate Validate Callincompletefiles Callincompletefiles Validate Callincompletefiles Createoffer Send(mailandonline) Createoffer Send(online) Validate Acceptoffer Pending Canceloffer Canceloffer Canceloffer Canceloffer Canceloffer Canceloffer',
 'Createapplication Submit Complete Accept Createoffer Createoffer Send(mailandonline) Send(mailandonline) Call Canceloffer Canceloffer Createoffer Createoffer Createoffer Createoffer Canceloffer Canceloffer Createoffer Createoffer Send(mailandonline) Send(mailandonline) Validate Callincompletefiles Validate Callincompletefiles Createoffer Send(mailandonline) Createoffer Send(online) Validate Callincompletefiles Validate Return Callincompletefiles Acceptoff

In [56]:
# Encode the BPI trace file for the VAE and unpack everything get_text_data returns.
# num_samples presumably caps how many traces are read - confirm in get_text_data.
text_data = get_text_data(
    num_samples=10000,
    data_path='../src/data/VAE_input/BPI_process.txt',
)
(timesteps_max, enc_tokens, characters,
 char2id, id2char, x, x_decoder) = text_data

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 26
Max sequence length for inputs: 70
(10000, 70, 26) Creating model...


In [57]:
# Model dimensions come from the encoded data: the last axis of x is the input
# dimension, the second-to-last the number of timesteps.
input_dim = x.shape[-1]
timesteps = x.shape[-2]

# Hyperparameters for the LSTM VAE and its training run.
batch_size = 1
latent_dim = 191
intermediate_dim = 353
epochs = 20

vae, enc, gen, stepper = create_lstm_vae(
    input_dim,
    batch_size=batch_size,
    intermediate_dim=intermediate_dim,
    latent_dim=latent_dim,
)
print("Training model...")

# NOTE(review): fit() receives no batch_size argument, so the batch_size above is only
# passed to create_lstm_vae - confirm this is intended.
# Kept as the last expression so the returned History object is displayed.
vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, None, 26)]   0           []                               
                                                                                                  
 lstm_4 (LSTM)                  (None, 353)          536560      ['input_11[0][0]']               
                                                                                                  
 dense_8 (Dense)                (None, 191)          67614       ['lstm_4[0][0]']                 
                                                                                                  
 dense_9 (Dense)                (None, 191)          67614       ['lstm_4[0][0]']                 
                                                                                            

<keras.callbacks.History at 0x1987a449730>

In [58]:
print("Fitted, predicting...")
# Longest training trace, measured in characters (len() of each trace string).
# NOTE(review): get_text_data reports sequence lengths in tokens, not characters -
# confirm which unit decode_sequence expects for its length limit.
max_length = max(len(string) for string in train_log)

def decode(s):
    """Decode the latent sample `s` into a trace string via decode_sequence."""
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    # Pick a random training sequence. The upper bound of np.random.randint is
    # exclusive, so x.shape[0] (not x.shape[0] - 1) is required for the last
    # sequence to be selectable.
    id_from = np.random.randint(0, x.shape[0])

    m_from, std_from = enc.predict([[x[id_from]]])

    # Perturb the encoding with standard-normal noise scaled by the second output.
    # NOTE(review): this treats the encoder's second output as a standard deviation -
    # confirm it is not a log-variance.
    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    # Each sampled trace is stored as a single-element list.
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [00:25<00:00, 19.31it/s]


In [59]:
# Show the sampled BPI traces (one single-element list per sample).
log

[['createapplication submit accept createoffer send ( mailandonline ) complete call return validate callincompletefiles validate callincompletefiles validate callincompletefiles acceptoffer pending <end> '],
 ['createapplication submit accept createoffer send ( mailandonline ) complete call validate return callincompletefiles validate callincompletefiles acceptoffer pending <end> '],
 ['createapplication submit accept createoffer send ( mailandonline ) complete call return validate callincompletefiles validate callincompletefiles validate acceptoffer pending <end> '],
 ['createapplication submit accept createoffer createoffer send ( mailandonline ) send ( mailandonline ) complete call createoffer send ( mailandonline ) cancelapplication canceloffer canceloffer canceloffer <end> '],
 ['createapplication submit accept createoffer send ( mailandonline ) complete call return validate callincompletefiles validate callincompletefiles validate callincompletefiles acceptoffer pending <end> '],

In [60]:
# Turn the sampled traces into an event table, using the original log and net. The
# last argument is a CSV path; the same path is imported as a log further down.
# NOTE(review): the sampled traces contain "send ( mailandonline )" split into separate
# tokens - verify that process_log maps these back to the original activity name.
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/BPI_process_sampled.csv')

In [61]:
# Show the generated BPI event table.
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,offer,application
0,0,Create application,1,2022-01-01 04:29:23.607489,[],[application1]
1,1,Submit,1,2022-01-01 04:30:23.607489,[],[application1]
2,2,Accept,1,2022-01-01 04:31:23.607489,[],[application1]
3,3,Create offer,1,2022-01-01 04:32:23.607489,[offer1],[application1]
4,4,Complete,1,2022-01-01 04:33:23.607489,[],[application1]
...,...,...,...,...,...,...
6970,6970,Call,500,2022-03-13 15:50:49.123708,[offer500],[application500]
6971,6971,Validate,500,2022-03-13 15:51:49.123708,[],[application500]
6972,6972,Call,500,2022-03-13 15:52:49.123708,[offer500],[application500]
6973,6973,Accept,500,2022-03-13 15:53:49.123708,[],[application500]


In [62]:
# Re-import the sampled CSV as an event log: declare the object-type columns plus the
# activity and timestamp columns for the CSV importer.
object_types = ["application", "offer"]
parameters = {
    "obj_names": object_types,
    "val_names": [],
    "act_name": "event_activity",
    "time_name": "event_timestamp",
    "sep": ",",
}
ocel_gen = ocel_import_factory_csv.apply(
    file_path='../src/data/VAE_generated/BPI_process_sampled.csv',
    parameters=parameters,
)

# OCPN Model

In [63]:
# Evaluate the discovered BPI OCPN on the VAE-generated log. The call prints
# precision, fitness and the resulting generalization value.
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.2698
Fitness of IM-discovered net:  0.5697
VAE Generalization= 0.3662


# Happy Path

In [64]:
# Derive the happy-path log from the BPI file.
happy_path__ocel = get_happy_path_log(filename)

In [65]:
# Discover an OCPN from the happy-path log.
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [66]:
# Evaluate the happy-path net on the VAE-generated log. The happy-path log itself is
# not an input to this call, so it is not recomputed here.
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  0.9052
Fitness of IM-discovered net:  0.2658
VAE Generalization= 0.4109


# Flower Model 

In [67]:
# Build the flower model for the BPI process over its two object types.
filename = "../src/data/jsonocel/BPI2017-Final.jsonocel"
ots = ["application","offer"]
flower_ocpn = create_flower_model(filename,ots)

In [68]:
# Evaluate the flower model on the VAE-generated log. The printed labels still read
# "IM-discovered net" although a flower model is passed here.
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.0909
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.1667


# Variant OCPN

In [69]:
import pickle

In [70]:
# Load the precomputed BPI variant net from disk.
# SECURITY: pickle.load can execute arbitrary code - only load files you created yourself.
with open("../src/data/csv/bpi_variant_ocpn.pickle", "rb") as pickle_file:
    variant_ocpn = pickle.load(pickle_file)

In [71]:
# Keep only the part of each transition name that precedes the first underscore.
for transition in variant_ocpn.transitions:
    transition.name = transition.name.split("_")[0]

In [72]:
# Evaluate the variant net (with renamed transitions) on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, variant_ocpn)

Precision of IM-discovered net:  1.0
Fitness of IM-discovered net:  0.0717
VAE Generalization= 0.1338


# DS3 Log

In [4]:
# Import the DS3 log and discover an OCPN from it (no filtering is applied).
filename = "../src/data/jsonocel/DS3.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})

In [5]:
# Build the VAE training traces from the DS3 log. The second argument is a text-file
# path (presumably where the traces are written; get_text_data is given the same path).
train_log = create_VAE_input(ocel,'../src/data/VAE_input/DS3.txt')

In [6]:
# Show the training traces returned by create_VAE_input.
train_log

['Resolved ClosedIncident NewIncident NewIncident AwaitingUserInfo AwaitingUserInfo AwaitingUserInfo Resolved ClosedIncident ClosedIncident Resolved Active Active Active Active ClosedIncident Resolved Resolved Resolved Active NewIncident Active NewIncident NewIncident NewIncident Active NewIncident AwaitingUserInfo AwaitingUserInfo Resolved ClosedIncident AwaitingUserInfo ClosedIncident Resolved AwaitingUserInfo Active AwaitingUserInfo AwaitingUserInfo AwaitingUserInfo NewIncident NewIncident Active Resolved ClosedIncident NewIncident Resolved ClosedIncident Resolved ClosedIncident NewIncident NewIncident NewIncident NewIncident NewIncident NewIncident NewIncident Resolved ClosedIncident Active ClosedIncident Resolved Active Active Active Active Resolved Active Active NewIncident ClosedIncident NewIncident NewIncident ClosedIncident Resolved NewIncident NewIncident NewIncident ClosedIncident NewIncident NewIncident Active Resolved Resolved ClosedIncident NewIncident NewIncident NewInci

In [6]:
# Encode the DS3 trace file for the VAE and unpack everything get_text_data returns.
# num_samples presumably caps how many traces are read - confirm in get_text_data.
text_data = get_text_data(
    num_samples=10000,
    data_path='../src/data/VAE_input/DS3.txt',
)
(timesteps_max, enc_tokens, characters,
 char2id, id2char, x, x_decoder) = text_data

print(x.shape, "Creating model...")

Number of samples: 4825
Number of unique input tokens: 10
Max sequence length for inputs: 261
(4825, 261, 10) Creating model...


In [7]:
# Model dimensions come from the encoded data: the last axis of x is the input
# dimension, the second-to-last the number of timesteps.
input_dim = x.shape[-1]
timesteps = x.shape[-2]

# Hyperparameters for the LSTM VAE and its training run.
batch_size = 1
latent_dim = 191
intermediate_dim = 353
epochs = 20

vae, enc, gen, stepper = create_lstm_vae(
    input_dim,
    batch_size=batch_size,
    intermediate_dim=intermediate_dim,
    latent_dim=latent_dim,
)
print("Training model...")

# NOTE(review): fit() receives no batch_size argument, so the batch_size above is only
# passed to create_lstm_vae - confirm this is intended.
# Kept as the last expression so the returned History object is displayed.
vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 10)]   0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 353)          513968      ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 191)          67614       ['lstm[0][0]']                   
                                                                                                  
 dense_1 (Dense)                (None, 191)          67614       ['lstm[0][0]']                   
                                                                                              

<keras.callbacks.History at 0x15737edefd0>

In [8]:
print("Fitted, predicting...")
# Longest training trace, measured in characters (len() of each trace string).
# NOTE(review): get_text_data reports sequence lengths in tokens, not characters -
# confirm which unit decode_sequence expects for its length limit.
max_length = max(len(string) for string in train_log)

def decode(s):
    """Decode the latent sample `s` into a trace string via decode_sequence."""
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    # Pick a random training sequence. The upper bound of np.random.randint is
    # exclusive, so x.shape[0] (not x.shape[0] - 1) is required for the last
    # sequence to be selectable.
    id_from = np.random.randint(0, x.shape[0])

    m_from, std_from = enc.predict([[x[id_from]]])

    # Perturb the encoding with standard-normal noise scaled by the second output.
    # NOTE(review): this treats the encoder's second output as a standard deviation -
    # confirm it is not a log-variance.
    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    # Each sampled trace is stored as a single-element list.
    log.append([decode(seq_from)])

Fitted, predicting...


Sample Traces: 100%|██████████| 500/500 [08:18<00:00,  1.00it/s]


In [9]:
# Show the sampled DS3 traces (one single-element list per sample).
log

[['newincident newincident newincident newincident newincident newincident newincident resolved closedincident closedincident resolved active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active active acti

In [10]:
# Turn the sampled traces into an event table, using the original log and net. The
# last argument is a CSV path; the same path is imported as a log further down.
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/DS3_process_sampled.csv')

In [11]:
# Show the generated DS3 event table.
df_log

Unnamed: 0,event_id,event_activity,event_execution,event_timestamp,customer,incident
0,0,New Incident,1,2022-01-01 10:50:16.544534,[customer1],[incident1]
1,1,New Incident,1,2022-01-01 10:51:16.544534,[customer1],[incident1]
2,2,New Incident,1,2022-01-01 10:52:16.544534,[customer1],[incident1]
3,3,New Incident,1,2022-01-01 10:53:16.544534,[customer1],[incident1]
4,4,New Incident,1,2022-01-01 10:54:16.544534,[customer1],[incident1]
...,...,...,...,...,...,...
210995,210995,Active,500,2027-10-12 22:25:14.658055,[],[incident500]
210996,210996,Active,500,2027-10-12 22:26:14.658055,[],[incident500]
210997,210997,Active,500,2027-10-12 22:27:14.658055,[],[incident500]
210998,210998,Active,500,2027-10-12 22:28:14.658055,[],[incident500]


In [4]:
# Re-import the sampled CSV as an event log: declare the object-type columns plus the
# activity and timestamp columns for the CSV importer.
object_types = ["incident", "customer"]
parameters = {
    "obj_names": object_types,
    "val_names": [],
    "act_name": "event_activity",
    "time_name": "event_timestamp",
    "sep": ",",
}
ocel_gen = ocel_import_factory_csv.apply(
    file_path='../src/data/VAE_generated/DS3_process_sampled.csv',
    parameters=parameters,
)

# OCPN Model

In [None]:
# Evaluate the discovered DS3 OCPN on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, ocpn)

# Happy Path

In [None]:
# Derive the happy-path log from the DS3 file.
happy_path__ocel = get_happy_path_log(filename)

In [None]:
# Discover an OCPN from the happy-path log.
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [None]:
# Evaluate the happy-path net on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

# Flower Model

In [None]:
# Build the flower model for the DS3 process over its two object types.
filename = "../src/data/jsonocel/DS3.jsonocel"
ots = ["incident","customer"]
flower_ocpn = create_flower_model(filename,ots)

In [None]:
# Evaluate the flower model on the VAE-generated log.
generalization = VAE_generalization(ocel_gen, flower_ocpn)

# Variant OCPN

In [None]:
import pickle

In [None]:
# Load the pickled variant model for DS3 from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only use it on
# pickles produced by this project.
with open("../src/data/csv/DS3_variant_ocpn.pickle", "rb") as file:
    variant_ocpn = pickle.load(file)

In [None]:
# Keep only the part of every transition name that precedes the first underscore.
# NOTE(review): presumably this strips a per-variant suffix so labels match the
# activity names in the log — confirm against generate_variant_model.
for transition in variant_ocpn.transitions:
    transition.name = transition.name.split("_")[0]

In [None]:
# Score the variant model (with the shortened transition names) against the sampled log.
generalization = VAE_generalization(ocel_gen, variant_ocpn)

# DS4 Log

In [3]:
# Load the DS4 event log and discover the object-centric Petri net that is
# evaluated in the cells below.
filename = "../src/data/jsonocel/DS4.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})

In [4]:
# Build the VAE training traces from the DS4 log.
# NOTE(review): presumably also writes them to the given text file, which
# get_text_data reads from the same path below — confirm in models.VAE_measure.
train_log = create_VAE_input(ocel,'../src/data/VAE_input/DS4.txt')

In [9]:
# Preview only the first few traces; displaying the whole list floods the notebook
# with thousands of tokens per trace.
train_log[:3]

['Paymentapplicationmailvalid Geoparceldocumentinitialize Geoparceldocumentbeginediting Geoparceldocumentcreate Controlsummaryinitialize Controlsummarybeginediting Controlsummaryfinishediting Paymentapplicationmailincome Geoparceldocumentcreate Inspectionplan Inspectioninitialize Inspectionsave Inspectionsave Inspectionsave Inspectionsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentfinishpre-check Geoparceldocumentfinishediting Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentinsertdocument Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocumentsave Geoparceldocu

In [15]:
# Read up to 2000 traces from the DS4 input file. Besides the vocabulary mappings,
# this yields the arrays `x` and `x_decoder` that are passed to the model for training.
(
    timesteps_max,
    enc_tokens,
    characters,
    char2id,
    id2char,
    x,
    x_decoder,
) = get_text_data(
    num_samples=2000,
    data_path='../src/data/VAE_input/DS4.txt',
)

print(x.shape, "Creating model...")

Number of samples: 2000
Number of unique input tokens: 68
Max sequence length for inputs: 2975
(2000, 2975, 68) Creating model...


The model was trained on GPU resources, because training on a CPU would take too long.

In [17]:
# Dimensions come from the last two axes of `x`
# (sequence length and number of distinct tokens, see the shape printed above).
timesteps = x.shape[-2]
input_dim = x.shape[-1]

# Hyperparameters of the LSTM-VAE.
batch_size = 1
latent_dim = 191
intermediate_dim = 353
epochs = 20

# create_lstm_vae returns the full autoencoder plus the encoder, generator and
# stepper models that are used for sampling after training.
vae, enc, gen, stepper = create_lstm_vae(
    input_dim,
    batch_size=batch_size,
    intermediate_dim=intermediate_dim,
    latent_dim=latent_dim,
)
print("Training model...")

# The autoencoder is trained to reconstruct `x` from the pair (x, x_decoder).
vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_26 (InputLayer)          [(None, None, 68)]   0           []                               
                                                                                                  
 lstm_10 (LSTM)                 (None, 353)          595864      ['input_26[0][0]']               
                                                                                                  
 dense_20 (Dense)               (None, 191)          67614       ['lstm_10[0][0]']                
                                                                                                  
 dense_21 (Dense)               (None, 191)          67614       ['lstm_10[0][0]']                
                                                                                           

KeyboardInterrupt: 

In [None]:
print("Fitted, predicting...")
# Decoding limit: length in characters of the longest training trace.
max_length = max(len(string) for string in train_log)

def decode(s):
    """Decode the latent vector `s` with the trained generator/stepper models.

    Thin wrapper around `decode_sequence` that fixes the models, the vocabulary
    mappings and `max_length` defined in this notebook.
    """
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, max_length)

# One single-element list per sampled trace.
log = []

for _ in tqdm(range(500), desc="Sample Traces"):

    # np.random.randint excludes its upper bound, so x.shape[0] itself is passed.
    # The former `x.shape[0] - 1` could never pick the last training sample and
    # raised for a data set with a single sample.
    id_from = np.random.randint(0, x.shape[0])

    # Encode the chosen training trace.
    # NOTE(review): the two outputs are presumably the latent mean and standard
    # deviation — confirm against create_lstm_vae.
    m_from, std_from = enc.predict([[x[id_from]]])

    # Draw a latent point around the encoded trace: m + std * standard-normal noise.
    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    log.append([decode(seq_from)])

In [None]:
# Preview only the first few sampled traces; displaying all 500 floods the notebook.
log[:3]

In [None]:
# Convert the sampled traces in `log` into an event table.
# NOTE(review): process_log presumably also writes the table to the given CSV path,
# since that same path is re-imported as an OCEL further below — confirm in src.utils.
df_log = process_log(log, ocel, ocpn, '../src/data/VAE_generated/DS4_process_sampled.csv')

In [None]:
# Inspect the generated event table for DS4.
df_log

In [6]:
# Sampling did not generate all original object types, hence the reduced list.
object_types = [
    "Payment application",
    "Control summary",
    "Geo parcel document",
    "Reference alignment",
]

# Settings handed to the CSV importer: which columns hold objects, the activity
# label and the timestamp, plus the field separator.
parameters = dict(
    obj_names=object_types,
    val_names=[],
    act_name="event_activity",
    time_name="event_timestamp",
    sep=",",
)

# Re-import the sampled log from CSV as an object-centric event log.
ocel_gen = ocel_import_factory_csv.apply(
    file_path='../src/data/VAE_generated/DS4_process_sampled.csv',
    parameters=parameters,
)

# OCPN Model

In [8]:
# Score the net discovered from the DS4 log against the VAE-sampled log.
# The call prints precision, fitness and the resulting generalization value.
generalization = VAE_generalization(ocel_gen, ocpn)

Precision of IM-discovered net:  0.157
Fitness of IM-discovered net:  0.537
VAE Generalization= 0.243


# Happy Path Order

In [9]:
# Happy-path version of the log stored at `filename` (selection logic lives in src.utils).
happy_path__ocel = get_happy_path_log(filename)

In [10]:
# Discover an object-centric Petri net from the happy-path log only.
happy_path_ocpn = ocpn_discovery_factory.apply(happy_path__ocel, parameters={"debug": False})

In [11]:
# Score the happy-path model against the VAE-sampled log.
# `happy_path_ocpn` already exists from the discovery cell above, so the happy-path
# log is not recomputed here; the earlier recomputation was never used.
generalization = VAE_generalization(ocel_gen, happy_path_ocpn)

Precision of IM-discovered net:  0.4111
Fitness of IM-discovered net:  0.537
VAE Generalization= 0.4657


# Flower Model Order

In [12]:
# Build the flower model for DS4 over the six object types listed below.
filename = "../src/data/jsonocel/DS4.jsonocel"
ots = [
    "Payment application",
    "Control summary",
    "Entitlement application",
    "Geo parcel document",
    "Inspection",
    "Reference alignment",
]
flower_ocpn = create_flower_model(filename, ots)

In [13]:
# Score the flower model against the VAE-sampled log.
generalization = VAE_generalization(ocel_gen, flower_ocpn)

Precision of IM-discovered net:  0.0286
Fitness of IM-discovered net:  1.0
VAE Generalization= 0.0555


# Variant OCPN

In [14]:
import pickle

In [15]:
# Load the pickled variant model for DS4 from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only use it on
# pickles produced by this project.
with open("../src/data/csv/DS4_variant_ocpn.pickle", "rb") as file:
    variant_ocpn = pickle.load(file)

In [16]:
# Keep only the part of every transition name that precedes the first underscore.
# NOTE(review): presumably this strips a per-variant suffix so labels match the
# activity names in the log — confirm against generate_variant_model.
for transition in variant_ocpn.transitions:
    transition.name = transition.name.split("_")[0]

In [None]:
# Score the variant model (with the shortened transition names) against the sampled log.
generalization = VAE_generalization(ocel_gen, variant_ocpn)