*The prediction pipeline is based on [Comuzzi, M., Kim, S., Ko, J., Salamov, M., Cappiello, C., & Pernici, B. (2024). On the Impact of Low-Quality Activity Labels in Predictive Process Monitoring. In ICPM 2024 Workshop "ML4PM - Leveraging Machine Learning in Process Mining"].*

In this notebook, we run an LSTM-based remaining-time prediction pipeline on several event logs while ignoring the activity labels, in order to assess how much the labels contribute to the task. The first section keeps the activity feature but replaces every label with the same default value. The second section drops the activity feature from the training and test inputs altogether.

# Setup

## Imports

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
import csv
import numpy as np
import time
import pickle
import statistics
import random

from scipy.spatial import distance as scipy_distance

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, confusion_matrix

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

Mounted at /gdrive
/gdrive/My Drive/Polimi/S12/Thesis/data


# No labels when training

## Functions

In [None]:
def model_evaluate(df):
  """Train an LSTM remaining-time regressor with the activity label neutralised.

  Every event receives the same constant activity index (0), so the only
  informative input feature is the time elapsed, in whole days, since the
  previous event of the same case.

  Args:
    df: event log with at least the columns 'NUMPRO' (grouping key, one
      sequence per value) and 'DATAEV' (event timestamp, anything
      ``pd.to_datetime`` accepts). The frame is copied before any
      processing, so the caller's object is left untouched.

  Returns:
    The fitted Keras ``Sequential`` model.
  """
  start_time = time.time()

  total_epochs = 300
  max_sequence_length = 100

  # Work on a private copy: the steps below add columns and re-sort the rows,
  # which must not leak into the frame owned by the caller.
  df = df.copy()

  df['DATAEV'] = pd.to_datetime(df['DATAEV'])
  df.sort_values(['NUMPRO', 'DATAEV'], inplace=True)
  df['case_end'] = df.groupby('NUMPRO')['DATAEV'].transform('max')
  # Remaining time until the last event of the case, truncated to whole days.
  df['remaining_time'] = ((df['case_end'] - df['DATAEV']).dt.total_seconds() / 86400).astype(int)
  # Constant placeholder that stands in for the (ignored) activity label.
  df['event_idx'] = 0
  # Days since the previous event of the same case; 0 for the first event.
  df['time_diff'] = df.groupby('NUMPRO')['DATAEV'].diff().dt.total_seconds() / 86400
  df['time_diff'] = df['time_diff'].fillna(0).astype(int)

  # One sequence of (event_idx, time_diff) pairs per case, without normalisation.
  sequences = df.groupby('NUMPRO').apply(lambda x: list(zip(x['event_idx'], x['time_diff'])))
  # Padded timesteps are filled with -1 in every feature so Masking can skip them.
  sequences_padded = pad_sequences(sequences.tolist(), maxlen=max_sequence_length, padding='post', dtype='float32', value=-1)

  # Targets: the (un-normalised) remaining-time sequence of each case, zero padded.
  y_sequences = df.groupby('NUMPRO')['remaining_time'].apply(list)
  y_padded = pad_sequences(y_sequences.tolist(), maxlen=max_sequence_length, padding='post', value=0)

  # Keep 80% of the cases for fitting; the held-out 20% is not used in this function.
  X_train, _, y_train, _ = train_test_split(sequences_padded, y_padded, test_size=0.2, random_state=42)

  # Make the dtypes uniform before handing the arrays to Keras.
  X_train = np.array(X_train, dtype='float32')
  y_train = np.array(y_train, dtype='float32')

  # NOTE(review): y_train holds max_sequence_length target values per case,
  # while the network emits a single value per case (return_sequences=False
  # followed by Dense(1)). Confirm this shape difference is intended - TODO.
  model = Sequential([
      Masking(mask_value=-1, input_shape=(max_sequence_length, 2)), # input has 2 features per timestep
      LSTM(256, return_sequences=False),
      Dense(1, activation='relu'),
  ])

  def lr_scheduler(epoch, lr):
      # Multiply the learning rate by 0.1 every 90 epochs (never at epoch 0).
      decay_rate = 0.1
      decay_step = 90
      if epoch % decay_step == 0 and epoch:
          return lr * decay_rate
      return lr

  callbacks = [
      LearningRateScheduler(lr_scheduler, verbose=0),
      EarlyStopping(monitor='val_loss', patience=100, verbose=0),
  ]
  # Compile the model
  model.compile(optimizer='Adam', loss='mse', metrics=['mae'])

  model.fit(X_train, y_train, epochs=total_epochs, batch_size=32, validation_split=0.2, verbose=0, callbacks=callbacks)

  print("Training time:", (round((time.time() - start_time)/60,3)), "minutes")

  return model

In [None]:
def load_and_prepare_test_set(test_set):
    """Expand a test event log into one padded sample per case prefix.

    Args:
        test_set: event log with a case identifier, an event identifier and an
            event timestamp. When the columns 'NUMPRO' and 'DATAEV' are
            present they are used as case identifier and timestamp (the same
            names ``model_evaluate`` relies on) and the first other column is
            the event identifier. Otherwise the first three columns are read
            positionally as (case id, event id, timestamp).

    Returns:
        DataFrame with one row per prefix and the columns
        'case_id', 'event_ids' (identifiers of the events in the prefix),
        'x_test' (array of shape (100, 2) holding (event_idx, time_diff) pairs,
        padded with -1) and 'y_test' (remaining time in days at the last event
        of the prefix). The input frame is not modified.
    """
    test_df = test_set.copy()
    max_sequence_length = 100

    # Resolve the columns by name when possible so that the column order of the
    # incoming frame cannot silently swap the timestamp and the event id.
    columns = list(test_df.columns)
    if 'NUMPRO' in columns and 'DATAEV' in columns:
        case_id_col, event_date_col = 'NUMPRO', 'DATAEV'
        other_cols = [c for c in columns if c not in (case_id_col, event_date_col)]
        event_id_col = other_cols[0]
    else:
        case_id_col, event_id_col, event_date_col = columns[:3]

    # Prepare the test data
    test_df[event_date_col] = pd.to_datetime(test_df[event_date_col])
    test_df.sort_values([case_id_col, event_date_col], inplace=True)
    test_df['case_end'] = test_df.groupby(case_id_col)[event_date_col].transform('max')
    test_df['remaining_time'] = ((test_df['case_end'] - test_df[event_date_col]).dt.total_seconds() / 86400).astype(float) # in days

    # Constant placeholder that stands in for the (ignored) activity label.
    test_df['event_idx'] = 0
    test_df['time_diff'] = test_df.groupby(case_id_col)[event_date_col].diff().dt.total_seconds() / 86400 # in days
    test_df['time_diff'] = test_df['time_diff'].fillna(0).astype(int)

    # Generate X_test and y_test for every prefix of every case.
    rows = []
    # sort=False keeps the cases in order of appearance in the sorted frame and
    # avoids re-scanning the whole frame once per case.
    for cid, case_data in test_df.groupby(case_id_col, sort=False):
        events = list(zip(case_data['event_idx'], case_data['time_diff']))
        remaining_times = case_data['remaining_time'].values
        event_ids = case_data[event_id_col].values

        for i in range(1, len(events) + 1):
            # Pad with -1 in *both* features: the model's Masking(mask_value=-1)
            # layer only skips a timestep when every feature equals -1, which is
            # also how the training sequences are padded.
            x_test = pad_sequences([events[:i]], maxlen=max_sequence_length, padding='post', dtype='float32', value=-1)
            rows.append({
                'case_id': cid,
                'event_ids': event_ids[:i],
                'x_test': x_test[0],
                'y_test': remaining_times[i-1]
            })

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['case_id', 'event_ids', 'x_test', 'y_test'])

In [None]:
def generate_predictions(test_set, model):
    """Predict the remaining time for every case prefix of a test event log.

    Args:
        test_set: raw test event log, forwarded to ``load_and_prepare_test_set``.
        model: object exposing ``predict(X, verbose=...)``.

    Returns:
        DataFrame with the columns 'case_id', 'event_ids', 'y_test' and
        'prediction', one row per prefix. The elapsed time is printed, not
        returned.
    """
    started_at = time.time()

    def _last_nonzero(values):
        # Entry at the last non-zero position, or 0 when every entry is zero.
        positions = np.nonzero(values)[0]
        if positions.size > 0:
            return values[positions[-1]]
        return 0

    # Build the per-prefix samples and stack them into a single input batch.
    prepared = load_and_prepare_test_set(test_set)
    batch = np.stack(prepared['x_test'].values).astype('float32')
    raw_output = model.predict(batch, verbose=1)

    prepared['prediction'] = [_last_nonzero(values) for values in raw_output]

    elapsed_minutes = round((time.time() - started_at)/60,3)
    print("Evaluation time:", elapsed_minutes, "minutes")

    return prepared[['case_id', 'event_ids', 'y_test', 'prediction']]

In [None]:
def calculate_mae(results_df): # in days
    """Return the mean absolute error between the 'y_test' and 'prediction' columns.

    The result is in the unit of those columns (days in this notebook).
    """
    return mean_absolute_error(
        results_df['y_test'].values,
        results_df['prediction'].values,
    )

## Tests

In [None]:
# For every event log: train the constant-label model three times, score each
# run on the same test set and print the average MAE and the average
# wall-clock time (minutes) per run.
for dataset in ['BPIC11_f1', 'BPIC15_1_f2', 'Credit', 'Pub', 'BPIC12', 'BPIC17']:
  # `subset` is part of every path below, so it has to be bound before the
  # test set is read (previously it was only assigned inside the inner loop).
  subset = dataset
  score = np.array([])
  time_scores = np.array([])
  df_test = pd.read_csv(f"./{dataset}/{subset}/{subset}-TEST-CLEAN.csv")
  # Pass the columns in (case id, event id, timestamp) order, the positional
  # layout load_and_prepare_test_set expects in this section.
  df_test = df_test[["NUMPRO", "NUMGIU", "DATAEV"]]
  for i in range(3):
    start_time = time.time()
    # Re-read the training log for every run so each run starts from raw data.
    df = pd.read_csv(f"./{dataset}/{subset}/{subset}_prepared/{subset}-TRAIN-CLEAN.csv")
    df = df[["NUMPRO", "DATAEV", "NUMGIU"]]
    model = model_evaluate(df)
    result_df = generate_predictions(df_test, model)
    score = np.append(score, calculate_mae(result_df))
    time_scores = np.append(time_scores, (round((time.time() - start_time)/60,3)))
  score_avg = np.average(score)
  time_avg = np.average(time_scores)
  print(dataset, ':', score_avg)
  print(dataset, ':', time_avg, "minutes")
  print()

Training time: 1.426 minutes
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.02 minutes
Training time: 1.373 minutes
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Evaluation time: 0.027 minutes
Training time: 1.442 minutes
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Evaluation time: 0.021 minutes
BPIC11_f1 : 176.34802106423373
BPIC11_f1 : 1.4366666666666668 minutes

Training time: 1.248 minutes
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.027 minutes
Training time: 1.291 minutes
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Evaluation time: 0.029 minutes
Training time: 1.321 minutes
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.027 minutes
BPIC15_1_f2 : 44.23689610005821
BPIC15_1_f2 : 1.3150000000000002 minutes

Training time: 2.817 minutes
[1m396/396[0m 

# No labels

## Functions

In [None]:
def model_evaluate(df):
  """Train an LSTM remaining-time regressor without any activity feature.

  The only input feature per event is the time elapsed, in whole days, since
  the previous event of the same case.

  Args:
    df: event log with at least the columns 'NUMPRO' (grouping key, one
      sequence per value) and 'DATAEV' (event timestamp, anything
      ``pd.to_datetime`` accepts). The frame is copied before any
      processing, so the caller's object is left untouched.

  Returns:
    The fitted Keras ``Sequential`` model.
  """
  start_time = time.time()

  total_epochs = 300
  max_sequence_length = 100

  # Work on a private copy: the steps below add columns and re-sort the rows,
  # which must not leak into the frame owned by the caller.
  df = df.copy()

  df['DATAEV'] = pd.to_datetime(df['DATAEV'])
  df.sort_values(['NUMPRO', 'DATAEV'], inplace=True)
  df['case_end'] = df.groupby('NUMPRO')['DATAEV'].transform('max')
  # Remaining time until the last event of the case, truncated to whole days.
  df['remaining_time'] = ((df['case_end'] - df['DATAEV']).dt.total_seconds() / 86400).astype(int)
  # Days since the previous event of the same case; 0 for the first event.
  df['time_diff'] = df.groupby('NUMPRO')['DATAEV'].diff().dt.total_seconds() / 86400
  df['time_diff'] = df['time_diff'].fillna(0).astype(int)

  # One sequence of time_diff values per case, without normalisation.
  sequences = df.groupby('NUMPRO').apply(lambda x: list(x['time_diff']))
  # Padded timesteps are filled with -1 so the Masking layer can skip them.
  sequences_padded = pad_sequences(sequences.tolist(), maxlen=max_sequence_length, padding='post', dtype='float32', value=-1)

  # Targets: the (un-normalised) remaining-time sequence of each case, zero padded.
  y_sequences = df.groupby('NUMPRO')['remaining_time'].apply(list)
  y_padded = pad_sequences(y_sequences.tolist(), maxlen=max_sequence_length, padding='post', value=0)

  # Keep 80% of the cases for fitting; the held-out 20% is not used in this function.
  X_train, _, y_train, _ = train_test_split(sequences_padded, y_padded, test_size=0.2, random_state=42)

  # Make the dtypes uniform before handing the arrays to Keras.
  X_train = np.array(X_train, dtype='float32')
  y_train = np.array(y_train, dtype='float32')

  # NOTE(review): y_train holds max_sequence_length target values per case,
  # while the network emits a single value per case (return_sequences=False
  # followed by Dense(1)). Confirm this shape difference is intended - TODO.
  model = Sequential([
      Masking(mask_value=-1, input_shape=(max_sequence_length, 1)),
      LSTM(256, return_sequences=False),
      Dense(1, activation='relu'),
  ])

  def lr_scheduler(epoch, lr):
      # Multiply the learning rate by 0.1 every 90 epochs (never at epoch 0).
      decay_rate = 0.1
      decay_step = 90
      if epoch % decay_step == 0 and epoch:
          return lr * decay_rate
      return lr

  callbacks = [
      LearningRateScheduler(lr_scheduler, verbose=0),
      EarlyStopping(monitor='val_loss', patience=100, verbose=0),
  ]
  # Compile the model
  model.compile(optimizer='Adam', loss='mse', metrics=['mae'])

  model.fit(X_train, y_train, epochs=total_epochs, batch_size=32, validation_split=0.2, verbose=0, callbacks=callbacks)

  print("Training time:", (round((time.time() - start_time)/60,3)), "minutes")

  return model

In [None]:
def load_and_prepare_test_set(test_set):
    """Expand a test event log into one padded sample per case prefix.

    Args:
        test_set: event log whose first column is the case identifier and whose
            second column is the event timestamp (anything ``pd.to_datetime``
            accepts). Further columns are ignored.

    Returns:
        DataFrame with one row per prefix and the columns 'case_id',
        'x_test' (array of 100 time_diff values, padded with -1) and 'y_test'
        (remaining time in days at the last event of the prefix). The input
        frame is not modified.
    """
    test_df = test_set.copy()
    max_sequence_length = 100

    # Columns are read by position.
    case_id_col = test_df.columns[0]
    event_date_col = test_df.columns[1]

    # Prepare the test data
    test_df[event_date_col] = pd.to_datetime(test_df[event_date_col])
    test_df.sort_values([case_id_col, event_date_col], inplace=True)
    test_df['case_end'] = test_df.groupby(case_id_col)[event_date_col].transform('max')
    test_df['remaining_time'] = ((test_df['case_end'] - test_df[event_date_col]).dt.total_seconds() / 86400).astype(float) # in days

    # Whole days since the previous event of the same case; 0 for the first event.
    test_df['time_diff'] = test_df.groupby(case_id_col)[event_date_col].diff().dt.total_seconds() / 86400 # in days
    test_df['time_diff'] = test_df['time_diff'].fillna(0).astype(int)

    # Generate X_test and y_test for every prefix of every case.
    rows = []
    # sort=False keeps the cases in order of appearance in the sorted frame and
    # avoids re-scanning the whole frame once per case.
    for cid, case_data in test_df.groupby(case_id_col, sort=False):
        events = list(case_data['time_diff'])
        remaining_times = case_data['remaining_time'].values

        for i in range(1, len(events) + 1):
            # Pad with -1, the value the training sequences are padded with and
            # the one the model's Masking(mask_value=-1) layer skips. Padding
            # with 0 would be read as real events with a zero-day gap.
            x_test = pad_sequences([events[:i]], maxlen=max_sequence_length, padding='post', dtype='float32', value=-1)
            rows.append({
                'case_id': cid,
                'x_test': x_test[0],
                'y_test': remaining_times[i-1]
            })

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['case_id', 'x_test', 'y_test'])

In [None]:
def generate_predictions(test_set, model):
    """Predict the remaining time for every case prefix of a test event log.

    Args:
        test_set: raw test event log, forwarded to ``load_and_prepare_test_set``.
        model: object exposing ``predict(X, verbose=...)``.

    Returns:
        DataFrame with the columns 'case_id', 'y_test' and 'prediction', one
        row per prefix. The elapsed time is printed, not returned.
    """
    started_at = time.time()

    def _last_nonzero(values):
        # Entry at the last non-zero position, or 0 when every entry is zero.
        positions = np.nonzero(values)[0]
        if positions.size > 0:
            return values[positions[-1]]
        return 0

    # Build the per-prefix samples and stack them into a single input batch.
    prepared = load_and_prepare_test_set(test_set)
    batch = np.stack(prepared['x_test'].values).astype('float32')
    raw_output = model.predict(batch, verbose=1)

    prepared['prediction'] = [_last_nonzero(values) for values in raw_output]

    elapsed_minutes = round((time.time() - started_at)/60,3)
    print("Evaluation time:", elapsed_minutes, "minutes")

    return prepared[['case_id', 'y_test', 'prediction']]

In [None]:
def calculate_mae(results_df): # in days
    """Return the mean absolute error between the 'y_test' and 'prediction' columns.

    The result is in the unit of those columns (days in this notebook).
    """
    return mean_absolute_error(
        results_df['y_test'].values,
        results_df['prediction'].values,
    )

## Tests

In [None]:
# For every event log: train the label-free model three times, score each run
# on the same test set and print the average MAE and the average wall-clock
# time (minutes) per run.
for dataset in ['BPIC11_f1', 'BPIC15_1_f2', 'Credit', 'Pub', 'BPIC12', 'BPIC17']:
  # The sub-folder and file prefix share the dataset name.
  subset = dataset
  score = np.array([])        # MAE of each run
  time_scores = np.array([])  # duration of each run, in minutes
  # The test set is read once and reused for all three runs.
  df_test = pd.read_csv(f"./{dataset}/{subset}/{subset}-TEST-CLEAN.csv")
  # Column order matters here: the test-set helper of this section reads the
  # first column as case id and the second as timestamp.
  df_test = df_test[["NUMPRO", "DATAEV", "NUMGIU"]]
  for i in range(3):
    start_time = time.time()
    # The training log is re-read on every run.
    df = pd.read_csv(f"./{dataset}/{subset}/{subset}_prepared/{subset}-TRAIN-CLEAN.csv")
    df = df[["NUMPRO", "DATAEV", "NUMGIU"]]
    model = model_evaluate(df)
    result_df = generate_predictions(df_test, model)
    score = np.append(score, calculate_mae(result_df))
    # Timed span covers reading the training log, training and prediction.
    time_scores = np.append(time_scores, (round((time.time() - start_time)/60,3)))
  score_avg = np.average(score)
  time_avg = np.average(time_scores)
  print(dataset, ':', score_avg)
  print(dataset, ':', time_avg, "minutes")
  print()

Training time: 0.528 minutes
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.019 minutes
Training time: 1.413 minutes
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Evaluation time: 0.027 minutes
Training time: 1.401 minutes
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.019 minutes
BPIC11_f1 : 176.9116745080565
BPIC11_f1 : 1.136 minutes

Training time: 1.184 minutes
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.026 minutes
Training time: 1.274 minutes
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Evaluation time: 0.025 minutes
Training time: 1.3 minutes
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Evaluation time: 0.024 minutes
BPIC15_1_f2 : 44.00802639552467
BPIC15_1_f2 : 1.2803333333333333 minutes

Training time: 1.227 minutes
[1m396/396[0m [32m━━━━━━━━━━━