# VAE Tests

In [2]:
import warnings
# Suppress every warning for the rest of the session so cell output stays readable.
warnings.filterwarnings('ignore')

In [3]:
from ocpa.objects.log.importer.ocel import factory as ocel_import_factory
from ocpa.algo.discovery.ocpn import algorithm as ocpn_discovery_factory
from src.utils import get_happy_path_log, create_flower_model, generate_variant_model, sample_traces

In [4]:
from tensorflow.python.framework.ops import disable_eager_execution
# Turn eager execution off before any model is built.
# NOTE(review): presumably required because the custom VAE loss defined further down
# closes over layer outputs (z_mean / z_log_sigma) — confirm before removing.
disable_eager_execution()

In [23]:
import argparse
import json
from keras import backend as K
from keras.losses import categorical_crossentropy
from keras.layers import Layer
from keras.layers import Input, LSTM, TimeDistributed
from keras.layers.core import Dense, Lambda
from keras.models import Model
from nltk.tokenize import word_tokenize
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import os
from tqdm import tqdm

In [6]:
# Import the order-process object-centric event log, discover an object-centric
# Petri net from it, and draw the traces used as training data.
# NOTE(review): sample_traces is a project helper (src.utils); the third argument
# looks like the number of traces to draw — confirm against its definition.
filename = "../src/data/jsonocel/order_process.jsonocel"
ocel = ocel_import_factory.apply(filename)
ocpn = ocpn_discovery_factory.apply(ocel, parameters={"debug": False})
train_log = sample_traces(ocel, ocpn, 10000)


Check the arcs: 100%|██████████| 46/46 [00:00<?, ?it/s]
Generate the traces: 100%|██████████| 10000/10000 [00:00<00:00, 29019.02it/s]


In [7]:
train_log

[['Place Order',
  'Confirm Order',
  'Item out of stock',
  'Fuel Car',
  'Reorder Item',
  'Pick Item',
  'Payment Reminder',
  'Pay Order',
  'Load Cargo',
  'Start Route',
  'End Route'],
 ['Fuel Car',
  'Place Order',
  'Confirm Order',
  'Payment Reminder',
  'Payment Reminder',
  'Item out of stock',
  'Reorder Item',
  'Pay Order',
  'Pick Item',
  'Load Cargo',
  'Start Route',
  'End Route'],
 ['Fuel Car',
  'Place Order',
  'Confirm Order',
  'Payment Reminder',
  'Pick Item',
  'Payment Reminder',
  'Payment Reminder',
  'Load Cargo',
  'Pay Order',
  'Start Route',
  'End Route'],
 ['Place Order',
  'Fuel Car',
  'Confirm Order',
  'Pick Item',
  'Payment Reminder',
  'Load Cargo',
  'Item out of stock',
  'Reorder Item',
  'Pay Order',
  'Pick Item',
  'Load Cargo',
  'Start Route',
  'End Route'],
 ['Fuel Car',
  'Place Order',
  'Confirm Order',
  'Payment Reminder',
  'Pick Item',
  'Load Cargo',
  'Payment Reminder',
  'Start Route',
  'Pay Order',
  'Item out of stoc

In [30]:
# Text file that receives the sampled training traces, one trace per line.
data_path = '../src/data/sampled_data_order.txt'

In [31]:
# Persist the sampled traces: one line per trace, activities separated by single spaces.
# NOTE(review): activity names themselves contain spaces, so activity boundaries are
# not recoverable from this file.
with open(data_path, "w", encoding="utf-8") as file:
    file.writelines(" ".join(sentence) + "\n" for sentence in train_log)

In [10]:
def create_lstm_vae(input_dim,
                    batch_size,  # kept for backward compatibility, see docstring
                    intermediate_dim,
                    latent_dim):
    """
    Creates an LSTM Variational Autoencoder (VAE).

    # Arguments
        input_dim: int, width of the one-hot token vectors.
        batch_size: int, accepted for backward compatibility only. The sampling
            noise is shaped from the runtime batch, so the model works with any
            mini-batch size.
        intermediate_dim: int, output shape of LSTM.
        latent_dim: int, latent z-layer shape.

    # Returns
        Tuple `(vae, encoder, generator, stepper)`:
        - vae: end-to-end model `[x, decoder_words_input] -> one-hot predictions`,
          compiled with Adam and the VAE loss.
        - encoder: `x -> [z_mean, z_sigma]`, where `z_sigma` is the standard
          deviation that multiplies the unit-normal noise, i.e.
          `z = z_mean + z_sigma * epsilon`.
        - generator: `[decoder_words_input, z] -> [one-hot, h, c]`, the first
          decoding step seeded from a latent vector.
        - stepper: `[decoder_words_input, h, c] -> [one-hot, h, c]`, one further
          decoding step from explicit LSTM states.

    # References
        - [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html)
        - [Generating sentences from a continuous space](https://arxiv.org/abs/1511.06349)
    """
    x = Input(shape=(None, input_dim,))

    # LSTM encoding
    h = LSTM(units=intermediate_dim)(x)

    # VAE Z layer. `z_log_sigma` is the log-variance: that is how the KL term in
    # `vae_loss` below interprets it.
    z_mean = Dense(units=latent_dim)(h)
    z_log_sigma = Dense(units=latent_dim)(h)

    # Standard deviation derived from the log-variance, so that sampling and the
    # KL term agree on what the second encoder head means.
    z_sigma = Lambda(lambda log_var: K.exp(0.5 * log_var), output_shape=(latent_dim,))(z_log_sigma)

    def sampling(args):
        mean, sigma = args
        # Draw one noise row per sample actually present in the batch instead of
        # broadcasting a fixed-size noise tensor over the whole batch.
        epsilon = K.random_normal(shape=(K.shape(mean)[0], latent_dim), mean=0., stddev=1.0)
        return mean + sigma * epsilon

    # note that "output_shape" isn't necessary with the TensorFlow backend
    # so you could write `Lambda(sampling)([z_mean, z_sigma])`
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_sigma])

    z_reweighting = Dense(units=intermediate_dim, activation="linear")
    z_reweighted = z_reweighting(z)

    # "next-word" data for prediction
    decoder_words_input = Input(shape=(None, input_dim,))

    # decoded LSTM layer
    decoder_h = LSTM(intermediate_dim, return_sequences=True, return_state=True)

    # todo: not sure if this initialization is correct
    h_decoded, _, _ = decoder_h(decoder_words_input, initial_state=[z_reweighted, z_reweighted])
    decoder_dense = TimeDistributed(Dense(input_dim, activation="softmax"))
    decoded_onehot = decoder_dense(h_decoded)

    # end-to-end autoencoder
    vae = Model([x, decoder_words_input], decoded_onehot)

    # encoder, from inputs to latent space (mean and standard deviation)
    encoder = Model(x, [z_mean, z_sigma])

    # generator, from latent space to reconstructed inputs -- for inference's first step
    decoder_state_input = Input(shape=(latent_dim,))
    _z_rewighted = z_reweighting(decoder_state_input)
    _h_decoded, _decoded_h, _decoded_c = decoder_h(decoder_words_input, initial_state=[_z_rewighted, _z_rewighted])
    _decoded_onehot = decoder_dense(_h_decoded)
    generator = Model([decoder_words_input, decoder_state_input], [_decoded_onehot, _decoded_h, _decoded_c])

    # RNN for inference
    input_h = Input(shape=(intermediate_dim,))
    input_c = Input(shape=(intermediate_dim,))
    __h_decoded, __decoded_h, __decoded_c = decoder_h(decoder_words_input, initial_state=[input_h, input_c])
    __decoded_onehot = decoder_dense(__h_decoded)
    stepper = Model([decoder_words_input, input_h, input_c], [__decoded_onehot, __decoded_h, __decoded_c])

    def vae_loss(x, x_decoded_onehot):
        # Reconstruction term plus KL divergence to the unit Gaussian prior.
        xent_loss = categorical_crossentropy(x, x_decoded_onehot)
        kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
        loss = xent_loss + kl_loss
        return loss

    vae.compile(optimizer="adam", loss=vae_loss)
    vae.summary()

    return vae, encoder, generator, stepper

In [11]:
def decode_sequence(states_value, decoder_adapter_model, rnn_decoder_model, num_decoder_tokens, token2id, id2token, max_seq_length):
    """
    Greedily decode one token sequence from a latent vector.

    Decoding adapted from this example:
    https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

    :param states_value: latent vector fed to the first decoding step
    :param decoder_adapter_model: reads text representation, makes the first prediction, yields states after the first RNN's step
    :param rnn_decoder_model: reads previous states and makes one RNN step
    :param num_decoder_tokens: size of the one-hot token vocabulary
    :param token2id: dict mapping words to ids; must contain the start token "\\t"
    :param id2token: dict mapping ids to words
    :param max_seq_length: the maximum number of tokens to emit
    :return: the decoded tokens, each followed by a single space; the "<end>"
        token is included when it was produced
    """

    # generate empty target sequence of length 1
    target_seq = np.zeros((1, 1, num_decoder_tokens))

    # populate the first token of the target sequence with the start character
    target_seq[0, 0, token2id["\t"]] = 1.0

    # sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1)
    decoded_tokens = []
    h, c = None, None

    while True:

        if not decoded_tokens:
            # feeding in states sampled with the mean and std provided by encoder
            # and getting current LSTM states to feed in to the decoder at the next step
            output_tokens, h, c = decoder_adapter_model.predict([target_seq, states_value])
        else:
            # reading output token
            output_tokens, h, c = rnn_decoder_model.predict([target_seq, h, c])

        # pick the most likely token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = id2token[sampled_token_index]
        decoded_tokens.append(sampled_token)

        # exit condition: either hit max length or find stop token.
        # The cap counts tokens, not characters of the rendered sentence.
        if sampled_token == "<end>" or len(decoded_tokens) >= max_seq_length:
            break

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

    return "".join(token + " " for token in decoded_tokens)

In [12]:
def get_vectors(word2id=None, vectors_path="../src/data/glove.6B.50d.txt", dim=50):
    """
    Build an embedding matrix for the vocabulary in `word2id`.

    :param word2id: dict mapping words to row indices; `None` means an empty vocabulary
    :param vectors_path: text file with one `word v1 v2 ...` entry per line
    :param dim: number of components per vector in `vectors_path`
    :return: array of shape `(len(word2id), dim)`; rows of words found in the
        file hold their vector, all other rows keep a uniform random
        initialisation in [0, 1)
    """
    if word2id is None:
        word2id = {}

    encoder_input_data = np.random.random((len(word2id), dim))

    # Explicit encoding: vector files contain non-ASCII tokens, and the platform
    # default is not UTF-8 everywhere. The context manager closes the handle.
    with open(vectors_path, encoding="utf-8") as vectors_file:
        for line in vectors_file:
            word, *values = line.strip().split(" ")
            if word in word2id:
                encoder_input_data[word2id[word]] = np.array([float(n) for n in values])

    return encoder_input_data

In [13]:
def get_text_data(data_path, num_samples=1000):
    """
    Read a text file of traces and one-hot encode it for the LSTM VAE.

    Every line is lower-cased, tokenised with `word_tokenize` and closed with an
    "<end>" token. The vocabulary is the sorted set of all tokens plus the start
    token "\\t".

    :param data_path: UTF-8 text file, one trace per line
    :param num_samples: upper bound on the number of lines to use
    :return: tuple `(max_seq_length, vocab_size, vocabulary, token2id, id2token,
        encoder_input, decoder_input)`; both arrays are float32 of shape
        `(samples, max_seq_length, vocab_size)`, and `decoder_input` is the
        encoder input shifted right by one step behind the start token
    """
    with open(data_path, "r", encoding="utf-8") as f:
        lines = f.read().lower().split("\n")

    input_texts = []
    input_characters = {"\t"}

    # The last split element is skipped (the remainder after the final newline).
    for line in lines[: min(num_samples, len(lines) - 1)]:
        tokens = word_tokenize(line)
        tokens.append("<end>")
        input_texts.append(tokens)
        input_characters.update(tokens)

    input_characters = sorted(input_characters)
    num_encoder_tokens = len(input_characters)
    # One extra step leaves room for the start token in the decoder input.
    max_encoder_seq_length = max(len(txt) for txt in input_texts) + 1

    print("Number of samples:", len(input_texts))
    print("Number of unique input tokens:", num_encoder_tokens)
    print("Max sequence length for inputs:", max_encoder_seq_length)

    input_token_index = {token: idx for idx, token in enumerate(input_characters)}
    reverse_input_char_index = {idx: token for token, idx in input_token_index.items()}

    tensor_shape = (len(input_texts), max_encoder_seq_length, num_encoder_tokens)
    encoder_input_data = np.zeros(tensor_shape, dtype="float32")
    decoder_input_data = np.zeros(tensor_shape, dtype="float32")

    for row, tokens in enumerate(input_texts):
        decoder_input_data[row, 0, input_token_index["\t"]] = 1.0

        for step, token in enumerate(tokens):
            column = input_token_index[token]
            encoder_input_data[row, step, column] = 1.0
            decoder_input_data[row, step + 1, column] = 1.0

    return (max_encoder_seq_length, num_encoder_tokens, input_characters, input_token_index,
            reverse_input_char_index, encoder_input_data, decoder_input_data)

In [16]:
#     from argparse import ArgumentParser

#     p = ArgumentParser()
#     p.add_argument("--input", default="data/fra.txt", type=str)
#     p.add_argument("--num_samples", default=3000, type=int)
#     p.add_argument("--batch_size", default=1, type=int)
#     p.add_argument("--epochs", default=40, type=int)
#     p.add_argument("--latent_dim", default=191, type=int)
#     p.add_argument("--inter_dim", default=353, type=int)
#     p.add_argument("--samples", default=5, type=int)
#     args = p.parse_args()

# Tokenise and one-hot encode the sampled traces written earlier.
(timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder) = get_text_data(
    data_path='../src/data/sampled_data_order.txt',
    num_samples=10000,
)

print(x.shape, "Creating model...")

Number of samples: 10000
Number of unique input tokens: 21
Max sequence length for inputs: 52
(10000, 52, 21) Creating model...


In [21]:
# One-hot width and padded sequence length are read off the training tensor.
timesteps = x.shape[-2]
input_dim = x.shape[-1]

# Model and training hyper-parameters.
batch_size = 1
latent_dim = 191
intermediate_dim = 353
epochs = 10

vae, enc, gen, stepper = create_lstm_vae(
    input_dim,
    batch_size=batch_size,
    intermediate_dim=intermediate_dim,
    latent_dim=latent_dim,
)
print("Training model...")

# NOTE(review): `batch_size` is handed to the model factory but not to fit(),
# so training runs with whatever mini-batch size fit() defaults to.
vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)

Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_26 (InputLayer)          [(None, None, 21)]   0           []                               
                                                                                                  
 lstm_10 (LSTM)                 (None, 353)          529500      ['input_26[0][0]']               
                                                                                                  
 dense_20 (Dense)               (None, 191)          67614       ['lstm_10[0][0]']                
                                                                                                  
 dense_21 (Dense)               (None, 191)          67614       ['lstm_10[0][0]']                
                                                                                           

<keras.callbacks.History at 0x20311e19b50>

In [26]:
print("Fitted, predicting...")


def decode(s):
    """Decode the latent vector `s` with the trained generator and stepper models."""
    return decode_sequence(s, gen, stepper, input_dim, char2id, id2char, timesteps_max)


log = []

for _ in tqdm(range(10000), desc="Sample Traces"):

    # np.random.randint excludes its upper bound, so pass the sample count
    # itself; with `x.shape[0] - 1` the last sample could never be drawn.
    id_from = np.random.randint(0, x.shape[0])
    id_to = np.random.randint(0, x.shape[0])

    m_from, std_from = enc.predict([[x[id_from]]])
    m_to, std_to = enc.predict([[x[id_to]]])

    # Latent samples around both encodings. To interpolate between them, call
    # decode(v * seq_to + (1 - v) * seq_from) for v in np.linspace(0, 1, 7).
    seq_from = np.random.normal(size=(latent_dim,))
    seq_from = m_from + std_from * seq_from

    seq_to = np.random.normal(size=(latent_dim,))
    seq_to = m_to + std_to * seq_to

    # NOTE(review): the entry appended below is the training sample x[id_to]
    # rendered back to text (everything before the first " <end>"). The latent
    # samples above are never decoded, so `log` does not contain model output.
    parts = " ".join([id2char[j] for j in np.argmax(x[id_to], axis=1)]).split(' <end>')
    log.append([parts[0]])

Fitted, predicting...


Sample Traces: 100%|██████████| 10000/10000 [04:52<00:00, 34.23it/s]


In [27]:
log

[['fuel car place order confirm order pay order item out of stock pick item reorder item pick item load cargo start route end route'],
 ['place order confirm order item out of stock pay order pick item load cargo reorder item fuel car start route pick item load cargo end route'],
 ['fuel car place order confirm order pay order item out of stock reorder item pick item load cargo start route end route'],
 ['fuel car place order confirm order pay order item out of stock pick item reorder item load cargo pick item start route end route'],
 ['place order confirm order pick item pay order fuel car load cargo item out of stock start route reorder item pick item load cargo start route end route'],
 ['place order confirm order payment reminder payment reminder item out of stock reorder item pick item pay order load cargo fuel car start route end route'],
 ['place order fuel car confirm order pick item load cargo pay order item out of stock start route end route'],
 ['fuel car place order confir

In [28]:
# Text file that receives the entries collected in `log`, one per line.
data_path_gen = '../src/data/gen_data_order.txt'

In [33]:
# Persist the collected entries, one line each; the parts of an entry are joined by spaces.
with open(data_path_gen, "w", encoding="utf-8") as file:
    file.writelines(" ".join(sentence) + "\n" for sentence in log)

In [35]:
# Re-read the generated file and tokenise it: one token list per line, each
# closed with "<end>"; the vocabulary starts out with the start token "\t".
with open(data_path_gen, "r", encoding="utf-8") as f:
    lines = f.read().lower().split("\n")

input_texts = []
input_characters = {"\t"}

# The last split element is skipped (the remainder after the final newline).
for line in lines[: min(10000, len(lines) - 1)]:
    input_text = word_tokenize(line)
    input_text.append("<end>")

    input_texts.append(input_text)
    input_characters.update(input_text)

In [36]:
input_texts

[['fuel',
  'car',
  'place',
  'order',
  'confirm',
  'order',
  'pay',
  'order',
  'item',
  'out',
  'of',
  'stock',
  'pick',
  'item',
  'reorder',
  'item',
  'pick',
  'item',
  'load',
  'cargo',
  'start',
  'route',
  'end',
  'route',
  '<end>'],
 ['place',
  'order',
  'confirm',
  'order',
  'item',
  'out',
  'of',
  'stock',
  'pay',
  'order',
  'pick',
  'item',
  'load',
  'cargo',
  'reorder',
  'item',
  'fuel',
  'car',
  'start',
  'route',
  'pick',
  'item',
  'load',
  'cargo',
  'end',
  'route',
  '<end>'],
 ['fuel',
  'car',
  'place',
  'order',
  'confirm',
  'order',
  'pay',
  'order',
  'item',
  'out',
  'of',
  'stock',
  'reorder',
  'item',
  'pick',
  'item',
  'load',
  'cargo',
  'start',
  'route',
  'end',
  'route',
  '<end>'],
 ['fuel',
  'car',
  'place',
  'order',
  'confirm',
  'order',
  'pay',
  'order',
  'item',
  'out',
  'of',
  'stock',
  'pick',
  'item',
  'reorder',
  'item',
  'load',
  'cargo',
  'pick',
  'item',
  'start'

In [None]:
import os, time, argparse
from datetime import datetime

from pm4py.objects.log.importer.csv import factory as csv_importer
from pm4py.objects.log.exporter.xes import factory as xes_exporter
from pm4py.objects.log.importer.xes import factory as xes_importer
from pm4py.objects.petri.importer import pnml as pnml_importer

from pm4py.evaluation.replay_fitness import factory as replay_factory
from pm4py.evaluation.precision import factory as precision_factory

from conf.settings import DATA_PATH

# Absolute path of the current working directory; the script falls back to
# "<WORK_PATH>/data" as its data root when DATA_PATH is None.
WORK_PATH = os.path.abspath(os.getcwd())

def _collect_rows(path, label, traces, seen, unique):
    """Append the rows of `path` to `traces` and return the number of skipped duplicates."""
    with open(path, encoding="utf-8") as file:
        rows = file.read().split("\n")
    print("Number of " + label + " traces are:", str(len(rows)))

    if not unique:
        traces.extend(rows)
        return 0

    skipped = 0
    for row in rows:
        # Set membership keeps de-duplication linear; `traces` keeps first-seen order.
        if row in seen:
            skipped += 1
        else:
            seen.add(row)
            traces.append(row)
    return skipped


def readFile(f_name1, f_name2, unique=False):
    """
    Read the training and the generated trace files and merge them into one list.

    Both files hold one space-separated trace per line. Empty tokens and tokens
    containing "<" (e.g. "<end>") are dropped; traces left without tokens are
    discarded.

    :param f_name1: path of the training trace file
    :param f_name2: path of the generated trace file
    :param unique: when True, keep only the first occurrence of each line
        across both files
    :return: list of traces, each a list of token strings
    """
    traces = []
    seen = set()

    skipped = _collect_rows(f_name1, "train", traces, seen, unique)
    skipped += _collect_rows(f_name2, "generated", traces, seen, unique)

    f_traces = []
    for trace in traces:
        f_trace = [token for token in trace.split(" ") if token != "" and "<" not in token]
        if f_trace:
            f_traces.append(f_trace)

    print("Number of traces are:", str(len(f_traces)))
    print("Number of skipped traces are:", str(skipped))
    return f_traces

def writeToFile(file, lst):
    """Write the string form of every entry of `lst` to `file`, one per line (overwrites)."""
    with open(file, 'w') as outfile:
        outfile.writelines(text + "\n" for text in map(str, lst))

def convertToCsv(traces, to_path):
    """
    Write `traces` as an event-log CSV to `to_path`.

    Each event becomes one row `<event>_,<case>,<timestamp>`: the case id is the
    position of the trace in `traces`, and the timestamp advances by one second
    per event across the whole log, starting one second after the epoch (local
    time, as produced by `datetime.fromtimestamp`).

    :param traces: iterable of traces, each an iterable of events
    :param to_path: destination path of the CSV file
    """
    rows = ["concept:name,case:concept:name,time:timestamp"]

    tick = 0
    for case_id, trace in enumerate(traces):
        for event in trace:
            tick += 1
            stamp = datetime.fromtimestamp(tick)
            rows.append(f"{event!s}_,{case_id!s},{stamp!s}")

    writeToFile(str(to_path), rows)

if __name__ == "__main__":
    # Command-line entry point: merge the training and generated traces, replay
    # them on a Petri net and report the harmonic mean of fitness and precision.
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--system', help='Which system (e.g. pb_system_5_3)', required=True)
    parser.add_argument('-sfx', '--suffix', help='Suffix (chosen epoch, e.g. 1981)', required=True)
    parser.add_argument('-j', '--job', help='Job (0/1)', required=True)
    parser.add_argument('-pn', '--pn', help='Petri net file to evaluate', required=True)
    parser.add_argument('-strategy', '--strategy', help='naive/mh', required=True)
    args = parser.parse_args()

    system = args.system
    suffix = int(args.suffix)
    job = args.job
    pn = args.pn
    strategy = args.strategy

    # Fail fast: reject an unknown strategy before any file is read or written,
    # not after the whole evaluation has already run.
    if strategy not in ("mh", "naive"):
        raise ValueError("Unknown strategy.")

    # Without a configured DATA_PATH, fall back to "<cwd>/data".
    data_root = os.path.join(WORK_PATH, "data") if DATA_PATH is None else DATA_PATH
    variants_dir = os.path.join(data_root, "avatar", "variants")
    run_stem = system + "_relgan_" + str(suffix) + "_j" + str(job) + "_" + strategy

    train_file = os.path.join(data_root, "variants", system + "_train.txt")
    gen_file = os.path.join(variants_dir, run_stem + ".txt")
    csv_file = os.path.join(variants_dir, run_stem + "_generalization.csv")
    xes_file = os.path.join(variants_dir, run_stem + "_generalization.xes")
    pn_file = os.path.join(data_root, "pns", system, pn)

    # READ FILES AND CONVERT TO XES
    traces = readFile(train_file, gen_file, unique=True)
    convertToCsv(traces=traces, to_path=csv_file)
    time.sleep(1)

    log = csv_importer.import_event_log(csv_file)
    xes_exporter.export_log(log, xes_file)
    time.sleep(1)

    # PERFORM MEASUREMENT ON PN AND XES
    log = xes_importer.import_log(xes_file)
    net, initial_marking, final_marking = pnml_importer.import_net(pn_file)
    # TODO: take the fitness from object-centric process mining instead
    fitness = replay_factory.apply(log, net, initial_marking, final_marking)
    print("Fitness=", fitness)

    # TODO: take the precision from object-centric process mining instead
    precision = precision_factory.apply(log, net, initial_marking, final_marking)
    print("Precision=", precision)

    fitness = fitness["log_fitness"]
    # Harmonic mean of fitness and precision; defined as 0 when both are 0 so
    # the script reports a value instead of dying with ZeroDivisionError.
    denominator = fitness + precision
    generalization = 2 * ((fitness * precision) / denominator) if denominator else 0.0

    print("**** ", str(system), " Job ", str(job), " on PN ", str(pn_file),
          " using " + strategy.upper() + " SAMPLING on suffix ", str(suffix), " ***")
    print("AVATAR Generalization=", generalization)