In [41]:
from __future__ import print_function, division
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN, Input, BatchNormalization
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from collections import Counter
import csv
import numpy as np
import random
import sys
import os
import copy
import time
from itertools import zip_longest as izip
from datetime import datetime
from math import log

eventlog = "helpdesk.csv"
data_path = os.path.join(os.getcwd(), 'data', eventlog)

########################################################################################
#
# this part of the code opens the file, reads it into three following variables
#

lines = []  # these are all the activity seq
timeseqs = []  # time sequences (differences between two events)
timeseqs2 = []  # time sequences (differences between the current and first)

# helper variables
lastcase = ''
line = ''
firstLine = True
times = []
times2 = []
numlines = 0
casestarttime = None
lasteventtime = None

with open(data_path, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    ascii_offset = 161

    for row in spamreader:  # the rows are "CaseID,ActivityID,CompleteTimestamp"
        try:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
        except ValueError:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
        if row[0] != lastcase:  # 'lastcase' is to save the last executed case for the loop
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not firstLine:
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
            line = ''
            times = []
            times2 = []
            numlines += 1
        line += chr(int(row[1]) + ascii_offset)
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        lasteventtime = t
        firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
numlines += 1

########################################

divisor = np.mean([item for sublist in timeseqs for item in sublist])  # average time between events
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])  # average time between current and first events
print('divisor2: {}'.format(divisor2))

#########################################################################################################

# separate training data into 3 parts

elems_per_fold = int(round(numlines / 3))
fold1 = lines[:elems_per_fold]
fold1_t = timeseqs[:elems_per_fold]
fold1_t2 = timeseqs2[:elems_per_fold]

fold2 = lines[elems_per_fold:2 * elems_per_fold]
fold2_t = timeseqs[elems_per_fold:2 * elems_per_fold]
fold2_t2 = timeseqs2[elems_per_fold:2 * elems_per_fold]

fold3 = lines[2 * elems_per_fold:]
fold3_t = timeseqs[2 * elems_per_fold:]
fold3_t2 = timeseqs2[2 * elems_per_fold:]

# leave away fold3 for now
lines = fold1 + fold2
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2

step = 1
sentences = []
softness = 0
next_chars = []
lines = list(map(lambda x: x + '!', lines))  # put delimiter symbol
maxlen = max(map(lambda x: len(x), lines))  # find maximum line size

# next lines here to get all possible characters for events and annotate them with numbers
chars = list(map(lambda x: set(x), lines))
chars = list(set().union(*chars))
chars.sort()
target_chars = copy.copy(chars)
chars.remove('!')
print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
print(indices_char)

with open(data_path, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    timeseqs = []
    timeseqs2 = []
    timeseqs3 = []
    timeseqs4 = []
    times = []
    times2 = []
    times3 = []
    times4 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None
    for row in spamreader:
        try:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
        except ValueError:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
        if row[0] != lastcase:
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not firstLine:
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
                timeseqs4.append(times4)
            line = ''
            times = []
            times2 = []
            times3 = []
            times4 = []
            numlines += 1
        line += chr(int(row[1]) + ascii_offset)
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        timediff3 = timesincemidnight.seconds  # this leaves only time even occured after midnight
        timediff4 = datetime.fromtimestamp(time.mktime(t)).weekday()  # day of the week
        times.append(timediff)
        times2.append(timediff2)
        times3.append(timediff3)
        times4.append(timediff4)
        lasteventtime = t
        firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
timeseqs4.append(times4)
numlines += 1

elems_per_fold = int(round(numlines / 3))
fold1 = lines[:elems_per_fold]
fold1_t = timeseqs[:elems_per_fold]
fold1_t2 = timeseqs2[:elems_per_fold]
fold1_t3 = timeseqs3[:elems_per_fold]
fold1_t4 = timeseqs4[:elems_per_fold]
with open('output_files/folds/fold1.csv', 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row, timeseq in izip(fold1, fold1_t):
        spamwriter.writerow([f'{s}#{t}' for s, t in izip(row, timeseq)])

fold2 = lines[elems_per_fold:2 * elems_per_fold]
fold2_t = timeseqs[elems_per_fold:2 * elems_per_fold]
fold2_t2 = timeseqs2[elems_per_fold:2 * elems_per_fold]
fold2_t3 = timeseqs3[elems_per_fold:2 * elems_per_fold]
fold2_t4 = timeseqs4[elems_per_fold:2 * elems_per_fold]
with open('output_files/folds/fold2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row, timeseq in izip(fold2, fold2_t):
        spamwriter.writerow([f'{s}#{t}' for s, t in izip(row, timeseq)])

fold3 = lines[2 * elems_per_fold:]
fold3_t = timeseqs[2 * elems_per_fold:]
fold3_t2 = timeseqs2[2 * elems_per_fold:]
fold3_t3 = timeseqs3[2 * elems_per_fold:]
fold3_t4 = timeseqs4[2 * elems_per_fold:]
with open('output_files/folds/fold3.csv', 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row, timeseq in izip(fold3, fold3_t):
        spamwriter.writerow([f'{s}#{t}' for s, t in izip(row, timeseq)])

lines = fold1 + fold2
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2
lines_t3 = fold1_t3 + fold2_t3
lines_t4 = fold1_t4 + fold2_t4

step = 1
sentences = []
softness = 0
next_chars = []
lines = list(map(lambda x: x + '!', lines))

sentences_t = []
sentences_t2 = []
sentences_t3 = []
sentences_t4 = []
next_chars_t = []
next_chars_t2 = []
next_chars_t3 = []
next_chars_t4 = []
for line, line_t, line_t2, line_t3, line_t4 in izip(lines, lines_t, lines_t2, lines_t3, lines_t4):
    for i in range(0, len(line), step):
        if i == 0:
            continue

        # we add iteratively, first symbol of the line, then two first, three...

        sentences.append(line[0: i])
        sentences_t.append(line_t[0:i])
        sentences_t2.append(line_t2[0:i])
        sentences_t3.append(line_t3[0:i])
        sentences_t4.append(line_t4[0:i])
        next_chars.append(line[i])
        if i == len(line) - 1:  # special case to deal time of end character
            next_chars_t.append(0)
            next_chars_t2.append(0)
            next_chars_t3.append(0)
            next_chars_t4.append(0)
        else:
            next_chars_t.append(line_t[i])
            next_chars_t2.append(line_t2[i])
            next_chars_t3.append(line_t3[i])
            next_chars_t4.append(line_t4[i])
print('nb sequences:', len(sentences))

print('Vectorization...')
num_features = len(chars) + 5
print('num features: {}'.format(num_features))
X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
y_t = np.zeros((len(sentences)), dtype=np.float32)
for i, sentence in enumerate(sentences):
    leftpad = maxlen - len(sentence)
    next_t = next_chars_t[i]
    sentence_t = sentences_t[i]
    sentence_t2 = sentences_t2[i]
    sentence_t3 = sentences_t3[i]
    sentence_t4 = sentences_t4[i]
    for t, char in enumerate(sentence):
        multiset_abstraction = Counter(sentence[:t + 1])
        for c in chars:
            if c == char:  # this will encode present events to the right places
                X[i, t + leftpad, char_indices[c]] = 1
        X[i, t + leftpad, len(chars)] = t + 1
        X[i, t + leftpad, len(chars) + 1] = sentence_t[t] / divisor
        X[i, t + leftpad, len(chars) + 2] = sentence_t2[t] / divisor2
        X[i, t + leftpad, len(chars) + 3] = sentence_t3[t] / 86400
        X[i, t + leftpad, len(chars) + 4] = sentence_t4[t] / 7
    for c in target_chars:
        if c == next_chars[i]:
            y_a[i, target_char_indices[c]] = 1 - softness
        else:
            y_a[i, target_char_indices[c]] = softness / (len(target_chars) - 1)
    y_t[i] = next_t / divisor

# build the model: 
print('Build model...')
main_input = Input(shape=(maxlen, num_features), name='main_input')
# train a 2-layer LSTM with one shared layer
l1 = LSTM(100, kernel_initializer='glorot_uniform', return_sequences=True, dropout=0.2)(main_input)  # the shared layer
b1 = BatchNormalization()(l1)
l2_1 = LSTM(100, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1)  # the layer specialized in activity prediction
b2_1 = BatchNormalization()(l2_1)
l2_2 = LSTM(100, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1)  # the layer specialized in time prediction
b2_2 = BatchNormalization()(l2_2)
act_output = Dense(len(target_chars), activation='softmax', kernel_initializer='glorot_uniform', name='act_output')(b2_1)
time_output = Dense(1, kernel_initializer='glorot_uniform', name='time_output')(b2_2)

model = Model(inputs=[main_input], outputs=[act_output, time_output])

opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clipvalue=3)


model.compile(loss={'act_output': 'categorical_crossentropy', 'time_output': 'mae'}, optimizer=opt)
early_stopping = EarlyStopping(monitor='val_loss', patience=42)
model_checkpoint = ModelCheckpoint('output_files/models/model_{epoch:02d}-{val_loss:.2f}.keras', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

model.fit(X, {'act_output': y_a, 'time_output': y_t}, validation_split=0.2, verbose=2, callbacks=[early_stopping, model_checkpoint, lr_reducer], batch_size=maxlen, epochs=500)


divisor: 210915.5579868709
divisor2: 409875.1028446389
total chars: 9, target chars: 10
{0: '¢', 1: '£', 2: '¤', 3: '¥', 4: '¦', 5: '§', 6: '¨', 7: '©', 8: 'ª'}
nb sequences: 9181
Vectorization...
num features: 14
Build model...
Epoch 1/500
490/490 - 15s - 30ms/step - loss: 2.1459 - val_loss: 1.6761 - learning_rate: 0.0020
Epoch 2/500
490/490 - 7s - 15ms/step - loss: 1.7420 - val_loss: 1.6248 - learning_rate: 0.0020
Epoch 3/500
490/490 - 7s - 15ms/step - loss: 1.7002 - val_loss: 1.5479 - learning_rate: 0.0020
Epoch 4/500
490/490 - 7s - 15ms/step - loss: 1.6705 - val_loss: 1.5443 - learning_rate: 0.0020
Epoch 5/500
490/490 - 7s - 15ms/step - loss: 1.6772 - val_loss: 1.5844 - learning_rate: 0.0020
Epoch 6/500
490/490 - 8s - 16ms/step - loss: 1.6645 - val_loss: 1.5831 - learning_rate: 0.0020
Epoch 7/500
490/490 - 8s - 16ms/step - loss: 1.6564 - val_loss: 1.5322 - learning_rate: 0.0020
Epoch 8/500
490/490 - 8s - 15ms/step - loss: 1.6477 - val_loss: 1.5493 - learning_rate: 0.0020
Epoch 9/50

<keras.src.callbacks.history.History at 0x1dd57bcd0c0>

In [44]:
'''
This script takes as input the LSTM or RNN weights found by train.py
change the path in line 176 of this script to point to the keras file
with LSTM or RNN weights generated by train.py

Author: Niek Tax
'''

from __future__ import division
from tensorflow.keras.models import load_model
import csv
import copy
import numpy as np
import distance
from itertools import zip_longest as izip
from jellyfish._jellyfish import damerau_levenshtein_distance
from sklearn import metrics
from math import sqrt
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import Counter
import os

eventlog = "helpdesk.csv"
data_path = os.path.join(os.getcwd(), 'data', eventlog)

with open(data_path, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    ascii_offset = 161

    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    caseids = []
    timeseqs = []
    timeseqs2 = []
    times = []
    times2 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None
    for row in spamreader:
        try:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
        except ValueError:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
        if row[0] != lastcase:
            caseids.append(row[0])
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not firstLine:
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
            line = ''
            times = []
            numlines += 1
        line += chr(int(row[1]) + ascii_offset)
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        lasteventtime = t
        firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
numlines += 1

divisor = np.mean([item for sublist in timeseqs for item in sublist])
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
print('divisor2: {}'.format(divisor2))

elems_per_fold = int(round(numlines / 3))
fold1 = lines[:elems_per_fold]
fold1_c = caseids[:elems_per_fold]
fold1_t = timeseqs[:elems_per_fold]
fold1_t2 = timeseqs2[:elems_per_fold]

fold2 = lines[elems_per_fold:2 * elems_per_fold]
fold2_c = caseids[elems_per_fold:2 * elems_per_fold]
fold2_t = timeseqs[elems_per_fold:2 * elems_per_fold]
fold2_t2 = timeseqs2[elems_per_fold:2 * elems_per_fold]

lines = fold1 + fold2
caseids = fold1_c + fold2_c
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2

step = 1
sentences = []
softness = 0
next_chars = []
lines = list(map(lambda x: x + '!', lines))
maxlen = max(map(lambda x: len(x), lines))

chars = list(map(lambda x: set(x), lines))
chars = list(set().union(*chars))
chars.sort()
target_chars = copy.copy(chars)
chars.remove('!')
print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
print(indices_char)

lastcase = ''
line = ''
firstLine = True
lines = []
caseids = []
timeseqs = []  # relative time since previous event
timeseqs2 = [] # relative time since case start
timeseqs3 = [] # absolute time of previous event
times = []
times2 = []
times3 = []
numlines = 0
casestarttime = None
lasteventtime = None

with open(data_path, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    for row in spamreader:
        try:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
        except ValueError:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
        if row[0] != lastcase:
            caseids.append(row[0])
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not firstLine:
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
            line = ''
            times = []
            numlines += 1
        line += chr(int(row[1]) + ascii_offset)
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        times3.append(datetime.fromtimestamp(time.mktime(t)))
        lasteventtime = t
        firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
numlines += 1

fold3 = lines[2 * elems_per_fold:]
fold3_c = caseids[2 * elems_per_fold:]
fold3_t = timeseqs[2 * elems_per_fold:]
fold3_t2 = timeseqs2[2 * elems_per_fold:]
fold3_t3 = timeseqs3[2 * elems_per_fold:]

lines = fold3
caseids = fold3_c
lines_t = fold3_t
lines_t2 = fold3_t2
lines_t3 = fold3_t3

# set parameters
predict_size = 1

# load model, set this to the model generated by train.py
model = load_model('output_files/models/model_62-1.51.keras')

# define helper functions
def encode(sentence, times, times3, maxlen=maxlen):
    num_features = len(chars) + 5
    X = np.zeros((1, maxlen, num_features), dtype=np.float32)
    leftpad = maxlen - len(sentence)
    times2 = np.cumsum(times)
    for t, char in enumerate(sentence):
        midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = times3[t] - midnight
        multiset_abstraction = Counter(sentence[:t + 1])
        for c in chars:
            if c == char:
                X[0, t + leftpad, char_indices[c]] = 1
        X[0, t + leftpad, len(chars)] = t + 1
        X[0, t + leftpad, len(chars) + 1] = times[t] / divisor
        X[0, t + leftpad, len(chars) + 2] = times2[t] / divisor2
        X[0, t + leftpad, len(chars) + 3] = timesincemidnight.seconds / 86400
        X[0, t + leftpad, len(chars) + 4] = times3[t].weekday() / 7
    return X

def getSymbol(predictions):
    maxPrediction = 0
    symbol = ''
    i = 0
    for prediction in predictions:
        if prediction >= maxPrediction:
            maxPrediction = prediction
            symbol = target_indices_char[i]
        i += 1
    return symbol

one_ahead_gt = []
one_ahead_pred = []

two_ahead_gt = []
two_ahead_pred = []

three_ahead_gt = []
three_ahead_pred = []

# make predictions
results_path = os.path.join('output_files', 'results', f'next_activity_and_time_{eventlog}')
with open(results_path, 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["CaseID", "Prefix length", "Ground truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE"])
    for prefix_size in range(2, maxlen):
        print(prefix_size)
        for line, caseid, times, times3 in izip(lines, caseids, lines_t, lines_t3):
            times.append(0)
            cropped_line = ''.join(line[:prefix_size])
            cropped_times = times[:prefix_size]
            cropped_times3 = times3[:prefix_size]
            if '!' in cropped_line:
                continue  # make no prediction for this case, since this case has ended already
            ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
            ground_truth_t = times[prefix_size:prefix_size + predict_size]
            predicted = ''
            predicted_t = []
            for i in range(predict_size):
                if len(ground_truth) <= i:
                    continue
                enc = encode(cropped_line, cropped_times, cropped_times3)
                y = model.predict(enc, verbose=0)
                y_char = y[0][0]
                y_t = y[1][0][0]
                prediction = getSymbol(y_char)
                cropped_line += prediction
                if y_t < 0:
                    y_t = 0
                cropped_times.append(y_t)
                y_t = y_t * divisor
                cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                predicted_t.append(y_t)
                if i == 0:
                    if len(ground_truth_t) > 0:
                        one_ahead_pred.append(y_t)
                        one_ahead_gt.append(ground_truth_t[0])
                if i == 1:
                    if len(ground_truth_t) > 1:
                        two_ahead_pred.append(y_t)
                        two_ahead_gt.append(ground_truth_t[1])
                if i == 2:
                    if len(ground_truth_t) > 2:
                        three_ahead_pred.append(y_t)
                        three_ahead_gt.append(ground_truth_t[2])
                if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                    print('! predicted, end case')
                    break
                predicted += prediction
            output = []
            if len(ground_truth) > 0:
                output.append(caseid)
                output.append(prefix_size)
                output.append(ground_truth)
                output.append(predicted)
                output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                dls = 1 - (damerau_levenshtein_distance(predicted, ground_truth) / max(len(predicted), len(ground_truth)))
                if dls < 0:
                    dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                output.append(dls)
                output.append(1 - distance.jaccard(predicted, ground_truth))
                output.append('; '.join(str(x) for x in ground_truth_t))
                output.append('; '.join(str(x) for x in predicted_t))
                if len(predicted_t) > len(ground_truth_t):  # if predicted more events than length of case, only use needed number of events for time evaluation
                    predicted_t = predicted_t[:len(ground_truth_t)]
                if len(ground_truth_t) > len(predicted_t):  # if predicted less events than length of case, put 0 as placeholder prediction
                    predicted_t.extend(range(len(ground_truth_t) - len(predicted_t)))
                if len(ground_truth_t) > 0 and len(predicted_t) > 0:
                    output.append('')
                    output.append(metrics.mean_absolute_error([ground_truth_t[0]], [predicted_t[0]]))
                    #output.append(metrics.median_absolute_error([ground_truth_t[0]], [predicted_t[0]]))
                else:
                    output.append('')
                    output.append('')
                    output.append('')
                spamwriter.writerow(output)


divisor: 210915.5579868709
divisor2: 409875.1028446389
total chars: 9, target chars: 10
{0: '¢', 1: '£', 2: '¤', 3: '¥', 4: '¦', 5: '§', 6: '¨', 7: '©', 8: 'ª'}
2
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
3
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case

In [46]:
'''
This script takes as input the LSTM or RNN weights found by train.py
change the path in line 178 of this script to point to the keras file
with LSTM or RNN weights generated by train.py

Author: Niek Tax
'''

from __future__ import division
from tensorflow.keras.models import load_model
import csv
import copy
import numpy as np
import distance
from itertools import zip_longest as izip
from jellyfish._jellyfish import damerau_levenshtein_distance
from sklearn import metrics
from math import sqrt
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import Counter
import os

eventlog = "helpdesk.csv"
data_path = os.path.join(os.getcwd(), 'data', eventlog)

with open(data_path, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    ascii_offset = 161

    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    caseids = []
    timeseqs = []
    timeseqs2 = []
    times = []
    times2 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None
    for row in spamreader:
        try:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
        except ValueError:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
        if row[0] != lastcase:
            caseids.append(row[0])
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not firstLine:
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
            line = ''
            times = []
            times2 = []
            numlines += 1
        line += chr(int(row[1]) + ascii_offset)
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        lasteventtime = t
        firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
numlines += 1

divisor = np.mean([item for sublist in timeseqs for item in sublist])
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
print('divisor2: {}'.format(divisor2))
divisor3 = np.mean(list(map(lambda x: np.mean(list(map(lambda y: x[len(x)-1]-y, x))), timeseqs2)))
print('divisor3: {}'.format(divisor3))

elems_per_fold = int(round(numlines / 3))
fold1 = lines[:elems_per_fold]
fold1_c = caseids[:elems_per_fold]
fold1_t = timeseqs[:elems_per_fold]
fold1_t2 = timeseqs2[:elems_per_fold]

fold2 = lines[elems_per_fold:2 * elems_per_fold]
fold2_c = caseids[elems_per_fold:2 * elems_per_fold]
fold2_t = timeseqs[elems_per_fold:2 * elems_per_fold]
fold2_t2 = timeseqs2[elems_per_fold:2 * elems_per_fold]

lines = fold1 + fold2
caseids = fold1_c + fold2_c
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2

step = 1
sentences = []
softness = 0
next_chars = []
lines = list(map(lambda x: x + '!', lines))
maxlen = max(map(lambda x: len(x), lines))

chars = list(map(lambda x: set(x), lines))
chars = list(set().union(*chars))
chars.sort()
target_chars = copy.copy(chars)
chars.remove('!')
print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
print(indices_char)

lastcase = ''
line = ''
firstLine = True
lines = []
caseids = []
timeseqs = []  # relative time since previous event
timeseqs2 = [] # relative time since case start
timeseqs3 = [] # absolute time of previous event
times = []
times2 = []
times3 = []
numlines = 0
casestarttime = None
lasteventtime = None

with open(data_path, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    for row in spamreader:
        try:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
        except ValueError:
            t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
        if row[0] != lastcase:
            caseids.append(row[0])
            casestarttime = t
            lasteventtime = t
            lastcase = row[0]
            if not firstLine:
                lines.append(line)
                timeseqs.append(times)
                timeseqs2.append(times2)
                timeseqs3.append(times3)
            line = ''
            times = []
            times2 = []
            times3 = []
            numlines += 1
        line += chr(int(row[1]) + ascii_offset)
        timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
        timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
        midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = datetime.fromtimestamp(time.mktime(t)) - midnight
        timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
        timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
        times.append(timediff)
        times2.append(timediff2)
        times3.append(datetime.fromtimestamp(time.mktime(t)))
        lasteventtime = t
        firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
numlines += 1

fold3 = lines[2 * elems_per_fold:]
fold3_c = caseids[2 * elems_per_fold:]
fold3_t = timeseqs[2 * elems_per_fold:]
fold3_t2 = timeseqs2[2 * elems_per_fold:]
fold3_t3 = timeseqs3[2 * elems_per_fold:]

lines = fold3
caseids = fold3_c
lines_t = fold3_t
lines_t2 = fold3_t2
lines_t3 = fold3_t3

# set parameters
predict_size = maxlen

# load model, set this to the model generated by train.py
model = load_model('output_files/models/model_62-1.51.keras')

# define helper functions
def encode(sentence, times, times3, maxlen=maxlen):
    num_features = len(chars) + 5
    X = np.zeros((1, maxlen, num_features), dtype=np.float32)
    leftpad = maxlen - len(sentence)
    times2 = np.cumsum(times)
    for t, char in enumerate(sentence):
        midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = times3[t] - midnight
        multiset_abstraction = Counter(sentence[:t + 1])
        for c in chars:
            if c == char:
                X[0, t + leftpad, char_indices[c]] = 1
        X[0, t + leftpad, len(chars)] = t + 1
        X[0, t + leftpad, len(chars) + 1] = times[t] / divisor
        X[0, t + leftpad, len(chars) + 2] = times2[t] / divisor2
        X[0, t + leftpad, len(chars) + 3] = timesincemidnight.seconds / 86400
        X[0, t + leftpad, len(chars) + 4] = times3[t].weekday() / 7
    return X

def getSymbol(predictions):
    maxPrediction = 0
    symbol = ''
    i = 0
    for prediction in predictions:
        if prediction >= maxPrediction:
            maxPrediction = prediction
            symbol = target_indices_char[i]
        i += 1
    return symbol

one_ahead_gt = []
one_ahead_pred = []

two_ahead_gt = []
two_ahead_pred = []

three_ahead_gt = []
three_ahead_pred = []

# make predictions
results_path = os.path.join('output_files', 'results', f'suffix_and_remaining_time_{eventlog}')
with open(results_path, 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["CaseID", "Prefix length", "Ground truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE"])
    for prefix_size in range(2, maxlen):
        print(prefix_size)
        for line, caseid, times, times2, times3 in izip(lines, caseids, lines_t, lines_t2, lines_t3):
            times.append(0)
            cropped_line = ''.join(line[:prefix_size])
            cropped_times = times[:prefix_size]
            cropped_times3 = times3[:prefix_size]
            if len(times2) < prefix_size:
                continue  # make no prediction for this case, since this case has ended already
            ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
            ground_truth_t = times2[prefix_size - 1]
            case_end_time = times2[len(times2) - 1]
            ground_truth_t = case_end_time - ground_truth_t
            predicted = ''
            total_predicted_time = 0
            for i in range(predict_size):
                enc = encode(cropped_line, cropped_times, cropped_times3)
                y = model.predict(enc, verbose=0)  # make predictions
                # split predictions into separate activity and time predictions
                y_char = y[0][0]
                y_t = y[1][0][0]
                prediction = getSymbol(y_char)  # undo one-hot encoding
                cropped_line += prediction
                if y_t < 0:
                    y_t = 0
                cropped_times.append(y_t)
                if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                    one_ahead_pred.append(total_predicted_time)
                    one_ahead_gt.append(ground_truth_t)
                    print('! predicted, end case')
                    break
                y_t = y_t * divisor3
                cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                total_predicted_time = total_predicted_time + y_t
                predicted += prediction
            output = []
            if len(ground_truth) > 0:
                output.append(caseid)
                output.append(prefix_size)
                output.append(ground_truth)
                output.append(predicted)
                output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                dls = 1 - (damerau_levenshtein_distance(predicted, ground_truth) / max(len(predicted), len(ground_truth)))
                if dls < 0:
                    dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                output.append(dls)
                output.append(1 - distance.jaccard(predicted, ground_truth))
                output.append(ground_truth_t)
                output.append(total_predicted_time)
                output.append('')
                output.append(metrics.mean_absolute_error([ground_truth_t], [total_predicted_time]))
                #output.append(metrics.median_absolute_error([ground_truth_t], [total_predicted_time]))
                spamwriter.writerow(output)


divisor: 210915.5579868709
divisor2: 409875.1028446389
divisor3: 405646.7917099549
total chars: 9, target chars: 10
{0: '¢', 1: '£', 2: '¤', 3: '¥', 4: '¦', 5: '§', 6: '¨', 7: '©', 8: 'ª'}
2
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end 

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicted, end case
! predicte

In [52]:
'''
This script takes as input the output of evaluate_suffix_and_remaining_time.py
therefore, the latter needs to be executed first

Author: Niek Tax
'''

from __future__ import division
import csv
import os

eventlog = "helpdesk.csv"
csvfile_path = 'output_files/results/suffix_and_remaining_time_%s' % eventlog

# Check if the file exists
if not os.path.exists(csvfile_path):
    raise FileNotFoundError(f"The file {csvfile_path} does not exist. Please ensure evaluate_suffix_and_remaining_time.py has been executed successfully.")

with open(csvfile_path, 'r', encoding='utf-8') as csvfile:
    r = csv.reader(csvfile)
    next(r)  # skip header

    vals = dict()
    for row in r:
        if row[0] in vals:
            l = vals[row[0]]
        else:
            l = []
        
        if len(row[2]) == 0 or len(row[3]) == 0:  # Handle empty ground truth or prediction
            l.append(0)
        else:
            l.append(int(row[2][0] == row[3][0]))  # Compare first char of ground truth and predicted
        
        vals[row[0]] = l

# Calculate per-case accuracy and overall accuracy
total_correct = 0
total_cases = 0
case_accuracies = []
for k, v in vals.items():
    case_accuracy = sum(v) / len(v)
    case_accuracies.append((k, case_accuracy))
    total_correct += sum(v)
    total_cases += len(v)

overall_accuracy = total_correct / total_cases if total_cases > 0 else 0

# Display the per-case accuracies and overall accuracy
case_accuracies, overall_accuracy


([('3087', 1.0),
  ('3088', 0.75),
  ('3089', 0.5),
  ('3090', 1.0),
  ('3091', 0.5),
  ('3092', 1.0),
  ('3093', 1.0),
  ('3094', 1.0),
  ('3095', 1.0),
  ('3096', 1.0),
  ('3098', 1.0),
  ('3099', 0.5),
  ('3100', 0.6666666666666666),
  ('3101', 1.0),
  ('3102', 0.5),
  ('3104', 1.0),
  ('3105', 0.5),
  ('3106', 0.5),
  ('3107', 1.0),
  ('3109', 1.0),
  ('3110', 0.5),
  ('3111', 1.0),
  ('3114', 0.5),
  ('3115', 0.5),
  ('3116', 1.0),
  ('3117', 1.0),
  ('3119', 0.6666666666666666),
  ('3120', 1.0),
  ('3122', 0.5),
  ('3123', 1.0),
  ('3124', 1.0),
  ('3125', 1.0),
  ('3127', 1.0),
  ('3129', 1.0),
  ('3130', 1.0),
  ('3131', 0.6666666666666666),
  ('3132', 1.0),
  ('3133', 1.0),
  ('3134', 1.0),
  ('3135', 0.5),
  ('3136', 1.0),
  ('3137', 0.0),
  ('3138', 0.3333333333333333),
  ('3140', 1.0),
  ('3141', 0.5),
  ('3143', 0.5),
  ('3144', 1.0),
  ('3146', 1.0),
  ('3147', 1.0),
  ('3149', 1.0),
  ('3150', 0.75),
  ('3151', 1.0),
  ('3152', 0.6666666666666666),
  ('3153', 0.666666666

In [56]:
from __future__ import division
import shap
import numpy as np
import os
import csv
import copy
import time
from datetime import datetime, timedelta
from collections import Counter
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt

# File paths
eventlog = "helpdesk.csv"
data_path = os.path.join('/mnt/data', eventlog)
model_path = '/mnt/data/model_62-1.51.keras'

# Function to load data
def load_data(data_path):
    with open(data_path, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        ascii_offset = 161

        lastcase = ''
        line = ''
        firstLine = True
        lines = []
        caseids = []
        timeseqs = []
        timeseqs2 = []
        times = []
        times2 = []
        numlines = 0
        casestarttime = None
        lasteventtime = None
        for row in spamreader:
            try:
                t = time.strptime(row[2], "%m/%d/%Y %H:%M")  # updated to match the format in your data
            except ValueError:
                t = time.strptime(row[2], "%m/%d/%Y %H:%M:%S")  # for cases where seconds are present
            if row[0] != lastcase:
                caseids.append(row[0])
                casestarttime = t
                lasteventtime = t
                lastcase = row[0]
                if not firstLine:
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                line = ''
                times = []
                times2 = []
                numlines += 1
            line += chr(int(row[1]) + ascii_offset)
            timesincelastevent = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(lasteventtime))
            timesincecasestart = datetime.fromtimestamp(time.mktime(t)) - datetime.fromtimestamp(time.mktime(casestarttime))
            timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
            timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
            times.append(timediff)
            times2.append(timediff2)
            lasteventtime = t
            firstLine = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    numlines += 1

    return lines, caseids, timeseqs, timeseqs2

lines, caseids, timeseqs, timeseqs2 = load_data(data_path)

# Load the pre-trained model
model = load_model(model_path)

# Define helper functions for encoding input data
chars = list(set().union(*map(set, lines)))
chars.sort()
target_chars = copy.copy(chars)
if '!' in chars:
    chars.remove('!')
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
target_char_indices = {c: i for i, c in enumerate(target_chars)}
target_indices_char = {i, c for i, c in enumerate(target_chars)}

divisor = np.mean([item for sublist in timeseqs for item in sublist])
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])

def encode(sentence, times, maxlen, chars, char_indices, divisor, divisor2):
    num_features = len(chars) + 5
    X = np.zeros((1, maxlen, num_features), dtype=np.float32)
    leftpad = maxlen - len(sentence)
    times2 = np.cumsum(times)
    for t, char in enumerate(sentence):
        midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = times3[t] - midnight
        for c in chars:
            if c == char:
                X[0, t + leftpad, char_indices[c]] = 1
        X[0, t + leftpad, len(chars)] = t + 1
        X[0, t + leftpad, len(chars) + 1] = times[t] / divisor
        X[0, t + leftpad, len(chars) + 2] = times2[t] / divisor2
        X[0, t + leftpad, len(chars) + 3] = timesincemidnight.seconds / 86400
        X[0, t + leftpad, len(chars) + 4] = times3[t].weekday() / 7
    return X

# Prepare data for SHAP
maxlen = max(map(lambda x: len(x), lines))
X_encoded = []
for line, times in zip(lines, timeseqs):
    X_encoded.append(encode(line, times, maxlen, chars, char_indices, divisor, divisor2)[0])

X_encoded = np.array(X_encoded)

# Create a SHAP explainer
explainer = shap.KernelExplainer(model.predict, X_encoded[:100])  # Use a subset for the explainer to save time

# Calculate SHAP values
shap_values = explainer.shap_values(X_encoded[:10])  # Use a subset for visualization to save time

# Plot SHAP values
shap.summary_plot(shap_values, X_encoded[:10], plot_type="bar", feature_names=[*chars, "position", "time_since_last_event", "time_since_case_start", "time_since_midnight", "day_of_week"])

plt.show()


SyntaxError: did you forget parentheses around the comprehension target? (1667541550.py, line 87)