In [2]:
import pandas as pd
#from pm4py import pm4py
from pm4py.objects.conversion.log import converter as xes_converter
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.objects.log.obj import EventLog, Trace, Event
import math
import csv
from bidict import bidict
from datetime import datetime, timedelta
import numpy as np
import itertools
import networkx as nx
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics
from keras.models import Model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Input
from keras.optimizers import Nadam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import BatchNormalization
import pre_processing
import utility_functions
import mappers
import vectorize_features
import train
import prediction
import post_processing
import evaluation
import constants
import preprocess_graph_attributes
import pickle as pkl
import torch

In [3]:
# Load the raw CSV and convert it into a pm4py event log.

# Columns to keep. The original expression appended `constants.case_attribute`
# twice, which made pandas select those columns twice; the list is
# de-duplicated here while preserving the original column order.
# NOTE(review): later cells read `constants.case_attribute_features`; if the
# second operand was meant to be that list, add it here -- confirm against
# the constants module.
selected_columns = list(dict.fromkeys(
    constants.required_attributes
    + constants.event_attribute_features
    + constants.case_attribute
))

# Import only the required attributes from the CSV file
df = pd.read_csv('.//logistic_CSS9D_CSS9E.csv')[selected_columns]

# Convert the DataFrame into the event log structure
log = xes_converter.apply(df, variant=xes_converter.Variants.TO_EVENT_LOG)

# Convert the timestamp from string to datetime format
log = pre_processing.convert_timestamp(log)

# Adjust the arrived time (currently disabled)
#log = pre_processing.adjust_arrived_time(log)

# The cell previously ended with an indented `return df.to_dict('records')`;
# a `return` outside a function does not compile, so it was removed.


In [4]:
# Dataset tag; it is embedded in the names of the pickle files written by the last cell
data_name = "logistic"

In [5]:

"""
global variables using the entire event log
"""
# NOTE(review): every value in this cell is computed from the full `log`,
# i.e. before the folds are created, and the three averages are later passed
# to the vectorization calls for every split -- confirm that including
# test-fold cases in these statistics is intended.

# Average dwell time over the whole log; seconds, per the variable name
# (value recorded by the original author: 245.68552766191863)
average_dwell_time_in_sec = utility_functions.average_dwell_time(log)

# Average time since case start over the whole log; seconds, per the variable name
# (value recorded by the original author: 5737.639253954513)
average_time_since_start_in_sec = utility_functions.average_time_since_case_start(log)

# Average time until case end over the whole log
# (presumably seconds as well -- TODO confirm in utility_functions)
average_time_till_case_end = utility_functions.average_time_till_case_end(log)

# Mapper of average dwell times; keyed by EQTYP according to the original
# author's note -- verify against mappers.build_average_dwell_time_mapper
dwell_time_mapper = mappers.build_average_dwell_time_mapper(log)

In [7]:
"""
Fold creation
"""
# Partition the event log into three folds
folds = pre_processing.create_folds(log, 3)

# Merge the cases of the first two folds into one training-and-validation log
train_and_validation = EventLog([trace for fold in folds[:2] for trace in fold])

# The first 80% of those cases are used for training, the remainder for validation
split_index = int(len(train_and_validation)*0.8)
training_log = train_and_validation[:split_index]
validation_log = train_and_validation[split_index:]

# The remaining (third) fold is held out for testing
testing_log = EventLog([trace for fold in folds[2:] for trace in fold])

In [8]:
"""
Global variables using only the training log
"""
# EID mapper, built from the training log
eid_mapper = mappers.build_eid_mapper(training_log)

# Longest case in the training log, plus one extra position for the end signal
max_case_len = max(len(trace) for trace in training_log) + 1

# One mapper per event attribute feature, built from the training log
event_attribute_mappers = {
    feature: mappers.build_event_attr_mapper(training_log, feature)
    for feature in constants.event_attribute_features
}

# One mapper per case attribute feature, built from the training log
case_attribute_mappers = {
    feature: mappers.build_case_attr_mapper(training_log, feature)
    for feature in constants.case_attribute_features
}

# Routing network of the logistic log, read from the GML file
network = preprocess_graph_attributes.preprocess_graph_attributes('.//20230906-logistic_nw.gml')

In [9]:
# Vectorize the training, validation and test logs.


def _vectorize_split(split_log):
    """Return the ``(X, Y)`` pair for one log split.

    ``X`` is the result of ``vectorize_features.vectorize_features`` and ``Y``
    the result of ``vectorize_features.vectorize_rtm_prediction``. Every split
    is vectorized with the same notebook-level statistics, mappers, maximum
    case length and network, so the three calls cannot drift apart the way
    three hand-copied argument lists can.
    """
    features = vectorize_features.vectorize_features(
        split_log,
        average_time_since_start_in_sec,
        eid_mapper,
        average_dwell_time_in_sec,
        event_attribute_mappers,
        case_attribute_mappers,
        max_case_len,
        network,
        average_time_till_case_end,
    )
    targets = vectorize_features.vectorize_rtm_prediction(split_log, average_time_till_case_end)
    return features, targets


# Same order as before: features first, then targets, split by split
X_train, Y_train = _vectorize_split(training_log)
X_valid, Y_valid = _vectorize_split(validation_log)
X_test, Y_test = _vectorize_split(testing_log)

In [10]:
# Persist each vectorized split as an [X, Y] list in its own pickle file
train_path = "GGNN_"+data_name+"_train.pkl"
valid_path = "GGNN_"+data_name+"_valid.pkl"
test_path = "GGNN_"+data_name+"_test.pkl"

# Training split
with open(train_path, "wb") as f:
    pkl.dump([X_train, Y_train], f)

# Validation split
with open(valid_path, "wb") as f:
    pkl.dump([X_valid, Y_valid], f)

# Test split
with open(test_path, "wb") as f:
    pkl.dump([X_test, Y_test], f)