In [3]:
#import pm4py
import keras
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import xml.etree.ElementTree as ET
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [4]:
#local directory with logs (can be changed as per need)
# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\S", "\M", "\D", "\P" and "\T" are invalid escapes in a normal string
# (Python warns about them and they are slated to become errors). The resulting
# value is exactly the same as before.
train_directory = r"D:\Siddhant\Masters Project\Dataset\Process Discovery Contest 2023_1_all\Training Logs"



#extracting concept names
def extract_concept_names_without_ns(file_path):
    """Return the ``concept:name`` value of every event in one XES log.

    The file is parsed with ``xml.etree.ElementTree``. For each ``trace``
    element, each ``event`` below it, and each ``string`` element below that
    event, the ``value`` attribute is collected when ``key == 'concept:name'``.
    Values are returned in document order as one flat list, so trace
    boundaries are not preserved. ``concept:name`` attributes that sit
    directly on a trace (outside any event) are not collected.

    Args:
        file_path: path of the ``.xes`` file to parse.

    Returns:
        list[str]: activity names, one per matching ``string`` element.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        KeyError: if a ``concept:name`` string element has no ``value`` attribute.
    """
    root = ET.parse(file_path).getroot()

    # ElementTree spells namespace-qualified tags as '{uri}local'. When the
    # root declares a default namespace, a bare 'trace' would match nothing and
    # the function would silently return []. Reuse the root's namespace prefix
    # so both flavours of file work; for files without a namespace the prefix
    # is empty and the lookups are exactly the bare tag names.
    ns_prefix = ''
    if root.tag.startswith('{'):
        ns_prefix = root.tag[:root.tag.index('}') + 1]

    return [
        attribute.attrib['value']
        for trace in root.iter(ns_prefix + 'trace')
        for event in trace.iter(ns_prefix + 'event')
        for attribute in event.iter(ns_prefix + 'string')
        if attribute.attrib.get('key') == 'concept:name'
    ]

# Function to read and combine logs from all files in the directory
def read_and_combine_logs(directory):
    """Concatenate the event activity names of every ``.xes`` file in a directory.

    Only direct entries of ``directory`` whose name ends with ``.xes`` are
    read (no recursion). Files are processed in sorted filename order and each
    file's activities are appended to a single flat list.

    Args:
        directory: path of the folder holding the XES logs.

    Returns:
        list[str]: activity names of all logs, file after file.
    """
    combined_activities = []
    # os.listdir() returns entries in arbitrary order. The combined list is
    # later consumed as one ordered stream, so sort the names to make the
    # result identical from run to run and from machine to machine.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.xes'):
            log_path = os.path.join(directory, filename)
            combined_activities.extend(extract_concept_names_without_ns(log_path))
    return combined_activities

In [5]:
# Parse every .xes file in train_directory and keep all event activity names
# in one flat list; the count printed below is the total number of events read.
combined_activities = read_and_combine_logs(train_directory)
print(f"Number of activities extracted: {len(combined_activities)}")

Number of activities extracted: 6674778


In [6]:
# Convert activities to categorical codes
# Build the categorical Series once and read both views from it, instead of
# constructing and categorising the full activity list twice.
activities_categorical = pd.Series(combined_activities).astype('category')
# Integer code per event (a Series aligned with combined_activities; codes start at 0).
activity_codes = activities_categorical.cat.codes
# Index of distinct activity names; position k is the name behind code k.
unique_activities = activities_categorical.cat.categories

In [7]:
# Two independent empty lists at module level.
# NOTE(review): sequence_generator creates its own local lists with these same
# names, so these globals look unused in the visible cells - confirm before removing.
sequences, next_activities = [], []

In [8]:
# Settings shared by the batch generator and the model.
batch_size = 32                       # samples per generated batch
max_sequence_length = 100             # activity codes fed to the model per sample
num_classes = len(unique_activities)  # one class per distinct activity name

#generator function to yield batches of sequences and labels
def sequence_generator(activity_codes, batch_size, max_sequence_length, num_classes):
    """Endlessly yield ``(X, y)`` next-activity batches from one code stream.

    For every position ``i >= 1`` one sample is produced: the input row holds
    the codes that precede ``i`` (at most the last ``max_sequence_length`` of
    them, left-padded with 0) and the label is the one-hot encoding of the
    code at ``i``. Samples are grouped into full batches in order; a trailing
    partial batch is dropped and the stream then restarts from the beginning.

    The arrays hold the same values that
    ``pad_sequences(prefixes, maxlen=max_sequence_length, padding='pre')`` and
    ``to_categorical(targets, num_classes=num_classes)`` produce, but only the
    window that survives truncation is sliced. Copying the whole prefix for
    every sample, as a naive implementation does, is quadratic in the stream
    length.

    NOTE(review): 0 is used as the pad value and is also a valid activity code
    when the codes start at 0, so padding is indistinguishable from that
    activity. Fixing this needs a coordinated change to the code assignment
    and to the model's Embedding size.

    Args:
        activity_codes: 1-D sequence (pandas Series, ndarray or list) of
            integer codes in ``[0, num_classes)``; accessed by position.
        batch_size: number of samples per yielded batch (>= 1).
        max_sequence_length: number of columns of ``X``.
        num_classes: number of columns of ``y``.

    Yields:
        tuple: ``X`` as int32 array of shape ``(batch_size, max_sequence_length)``
        and ``y`` as float32 one-hot array of shape ``(batch_size, num_classes)``.

    Raises:
        ValueError: on first iteration, if ``batch_size < 1`` or the stream
            holds fewer than ``batch_size`` samples (the loop would otherwise
            spin forever without ever yielding).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Positional ndarray view: independent of any pandas index labels and much
    # cheaper to slice than a Series.
    codes = np.asarray(activity_codes)

    # Position 0 has no history, so there are len(codes) - 1 samples.
    full_batches = (len(codes) - 1) // batch_size
    if full_batches == 0:
        raise ValueError(
            f"need at least {batch_size} samples to fill one batch, "
            f"got {max(len(codes) - 1, 0)}"
        )

    row_ids = np.arange(batch_size)
    while True:
        for batch_index in range(full_batches):
            first = 1 + batch_index * batch_size

            X = np.zeros((batch_size, max_sequence_length), dtype='int32')
            for row in range(batch_size):
                i = first + row
                # Only the last max_sequence_length codes before i are kept.
                window = codes[max(0, i - max_sequence_length):i]
                # Right-align the window; the zeros on the left are the padding.
                X[row, max_sequence_length - len(window):] = window

            targets = codes[first:first + batch_size]
            y = np.zeros((batch_size, num_classes), dtype='float32')
            y[row_ids, targets] = 1.0

            yield X, y

In [None]:
#model
# Next-activity classifier: embedding of the activity codes -> single LSTM ->
# softmax over all num_classes activities.
model = Sequential([
    Embedding(input_dim=num_classes, output_dim=128, input_length=max_sequence_length),
    LSTM(100),
    Dense(num_classes, activation='softmax'),
])

# Labels from the generator are one-hot, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Training
# The generator yields one sample per position i >= 1, i.e. len(activity_codes) - 1
# samples per pass. Count steps from that number so one epoch is exactly one
# pass over the full batches; using len(activity_codes) is one step too many
# whenever the length is an exact multiple of batch_size.
model.fit(sequence_generator(activity_codes, batch_size, max_sequence_length, num_classes),
          steps_per_epoch=(len(activity_codes) - 1) // batch_size,
          epochs=5)




Epoch 1/5


 39306/208586 [====>.........................] - ETA: 9:25:59 - loss: 1.0370 - accuracy: 0.5186

In [None]:
# Evaluation
# NOTE(review): this scores the model on the same activity_codes it was trained
# on, so the reported loss/accuracy are training metrics, not held-out performance.
# steps is derived from the len(activity_codes) - 1 samples the generator
# produces per pass, so evaluation covers exactly one pass of full batches.
loss, accuracy = model.evaluate(sequence_generator(activity_codes, batch_size, max_sequence_length, num_classes),
                                steps=(len(activity_codes) - 1) // batch_size)
print(f'Loss: {loss}, Accuracy: {accuracy}')