In [None]:
#import pm4py
import keras
import tensorflow as tf
import pandas as pd
import random
import numpy as np
import os
import xml.etree.ElementTree as ET
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Local directory that holds the training logs (the '.xes' files read below);
# change it to match your machine.
# A raw string is used because the Windows path is full of backslashes; in a
# normal literal sequences such as "\S" or "\M" are invalid escapes and trigger
# a DeprecationWarning/SyntaxWarning on current Python versions.
train_directory = r"D:\Siddhant\Masters Project\Dataset\Process Discovery Contest 2023_1_all\Training Logs"



#extracting concept names
def extract_concept_names(file_path):
    """Return the activity names of one XES log as a flat list.

    Every ``<event>`` below a ``<trace>`` contributes the ``value`` of each of
    its ``<string key="concept:name">`` attributes, in document order. After
    the events of each trace a literal ``'<END>'`` marker is appended so that
    case boundaries survive the flattening.

    Tags are matched on their local name, so logs that declare a default XML
    namespace (where ElementTree reports tags as ``{uri}trace``) are read the
    same way as logs without a namespace.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XES (XML) file.

    Returns
    -------
    list of str
        Activity names interleaved with one ``'<END>'`` per trace.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    KeyError
        If a ``concept:name`` string element has no ``value`` attribute.
    """
    def local_name(element):
        # ElementTree spells namespaced tags as '{uri}name'; keep only 'name'.
        tag = element.tag
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else None

    root = ET.parse(file_path).getroot()

    concept_names = []
    for trace in root.iter():
        if local_name(trace) != 'trace':
            continue
        for event in trace.iter():
            if local_name(event) != 'event':
                continue
            # Only strings nested in an event count; trace-level attributes
            # (for example the case id) are deliberately not collected.
            for attribute in event.iter():
                if (local_name(attribute) == 'string'
                        and attribute.attrib.get('key') == 'concept:name'):
                    concept_names.append(attribute.attrib['value'])
        # end of case token
        concept_names.append('<END>')

    return concept_names

# Function to read and combine logs from all files in the directory
def read_and_combine_logs(directory):
    """Concatenate the activity streams of every '.xes' file in a directory.

    Files are processed in sorted name order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the combined stream could differ
    between machines or runs.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan (not recursive). Entries whose name does not end in
        '.xes' are ignored.

    Returns
    -------
    list of str
        The per-file results of ``extract_concept_names`` joined end to end.
    """
    combined_activities = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.xes'):
            continue
        log_path = os.path.join(directory, filename)
        combined_activities.extend(extract_concept_names(log_path))
    return combined_activities

In [None]:
# Parse every training log into one flat list of activity labels. The list
# also holds one '<END>' marker per trace, so the count printed below includes
# those markers.
combined_activities = read_and_combine_logs(directory=train_directory)
print(f"Number of activities extracted: {len(combined_activities)}")

In [None]:
# Convert activities to categorical codes.
# The categorical is built once and both views are read from it, instead of
# encoding the whole event list twice.
activity_categorical = pd.Series(combined_activities).astype('category')
activity_codes = activity_categorical.cat.codes            # integer code per event
unique_activities = activity_categorical.cat.categories    # code -> activity label

In [None]:
# Settings shared by the batch generator and the model.
batch_size = 16                        # samples per generated batch
max_sequence_length = 25               # width of every (left-padded) input row
num_classes = len(unique_activities)   # one class per distinct label in the stream


def sequence_generator(activity_codes, batch_size, max_sequence_length, num_classes,
                       end_code=None):
    """Yield shuffled ``(X, y)`` batches for next-activity prediction, forever.

    A sample is built for every position of the stream that is neither an
    end-of-case marker nor the first event of its case. The input row is the
    run of events that precede the position *inside the same case* (at most
    the last ``max_sequence_length`` of them, left-padded with zeros); the
    target is the activity at the position, one-hot encoded.

    Each pass over the data visits the usable positions in a fresh random
    order (``random.shuffle``) and drops the final incomplete batch.

    Parameters
    ----------
    activity_codes : sequence of int
        Integer-encoded event stream. A pandas Series, NumPy array or list is
        accepted; values are read positionally.
    batch_size : int
        Number of samples per yielded batch.
    max_sequence_length : int
        Width of each input row.
    num_classes : int
        Width of each one-hot target row.
    end_code : int, optional
        Integer code of the ``'<END>'`` marker. When omitted, it is looked up
        in the notebook-level ``unique_activities`` vocabulary; if that name
        does not exist or has no ``'<END>'`` entry, the stream is treated as a
        single case and no position is skipped as an end marker.

    Yields
    ------
    X : numpy.ndarray, shape (batch_size, max_sequence_length), dtype int32
    y : numpy.ndarray, shape (batch_size, num_classes), dtype float32

    Raises
    ------
    ValueError
        On the first ``next()`` if the stream offers fewer than ``batch_size``
        usable samples (a batch could never be filled).
    IndexError
        If a target code is not smaller than ``num_classes``.

    Notes
    -----
    NOTE(review): rows are padded with 0. If the encoding also assigns code 0
    to a real activity (pandas category codes start at 0), padding and that
    activity are indistinguishable to the model. Fixing this needs a +1 shift
    of the codes together with a larger embedding ``input_dim`` -- confirm and
    change both places together.
    """
    codes = np.asarray(activity_codes, dtype=np.int64)

    if end_code is None:
        # The notebook's call sites pass only the integer codes, so recover the
        # marker's code from the vocabulary built alongside them.
        vocabulary = list(globals().get('unique_activities', ()))
        # -1 never equals a valid code, i.e. "no end marker known".
        end_code = vocabulary.index('<END>') if '<END>' in vocabulary else -1

    # The stream holds integer codes, so the marker must be compared by code;
    # comparing with the string '<END>' would never match.
    positions = np.arange(len(codes))
    is_end = codes == end_code
    # case_start[i]: first position of the case that position i belongs to,
    # i.e. one past the most recent end marker at or before i.
    case_start = np.maximum.accumulate(np.where(is_end, positions, -1)) + 1
    # Usable targets: not an end marker and preceded by at least one event of
    # the same case.
    candidates = np.flatnonzero(~is_end & (positions > case_start)).tolist()

    if len(candidates) < batch_size:
        raise ValueError(
            f"need at least batch_size={batch_size} usable samples, "
            f"found {len(candidates)}"
        )

    while True:
        random.shuffle(candidates)

        # Full batches only; the remainder of each pass is dropped.
        for first in range(0, len(candidates) - batch_size + 1, batch_size):
            targets = candidates[first:first + batch_size]

            X = np.zeros((batch_size, max_sequence_length), dtype=np.int32)
            for row, i in enumerate(targets):
                # Slice only the needed window instead of copying the whole
                # stream prefix for every sample.
                window = codes[max(case_start[i], i - max_sequence_length):i]
                X[row, max_sequence_length - len(window):] = window

            y = np.zeros((batch_size, num_classes), dtype=np.float32)
            y[np.arange(batch_size), codes[targets]] = 1.0

            yield X, y

In [None]:
#model

# Feed-forward classifier over the embedded, flattened event window:
# Embedding -> Flatten -> Dense(256) -> Dense(128) -> Dense(64) -> softmax.
model = Sequential()
model.add(Embedding(input_dim=num_classes, output_dim=64, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Hold out roughly 10% of the cases for validation. Drawing validation batches
# from the same stream as the training batches would make `val_loss` track the
# training fit, so early stopping could not detect over-fitting.
# The split is made per case (a case = events up to and including its '<END>'
# marker) so no case is shared between the two sets, and it is seeded so that
# re-running the notebook reproduces it.
VALIDATION_FRACTION = 0.1
all_codes = activity_codes.to_numpy()
is_end_event = all_codes == unique_activities.get_loc('<END>')
# Number of end markers strictly before each event = index of its case.
case_ids = np.cumsum(is_end_event) - is_end_event
split_rng = np.random.default_rng(42)
is_val_case = split_rng.random(int(case_ids[-1]) + 1) < VALIDATION_FRACTION
is_val_event = is_val_case[case_ids]
train_codes = pd.Series(all_codes[~is_val_event])
val_codes = pd.Series(all_codes[is_val_event])

# Training the model
model.fit(sequence_generator(train_codes, batch_size, max_sequence_length, num_classes),
          steps_per_epoch=max(1, len(train_codes) // batch_size),
          epochs=10,
          validation_data=sequence_generator(val_codes, batch_size, max_sequence_length, num_classes),
          validation_steps=max(1, len(val_codes) // batch_size),
          callbacks=[early_stopping])


In [None]:
# Evaluation
# NOTE(review): the batches are drawn from the full `activity_codes` stream,
# which contains the events the model was fitted on, so the numbers printed
# here are not a held-out estimate.
eval_batches = sequence_generator(activity_codes, batch_size, max_sequence_length, num_classes)
eval_steps = len(activity_codes) // batch_size
loss, accuracy = model.evaluate(eval_batches, steps=eval_steps)
print(f'Loss: {loss}, Accuracy: {accuracy}')