# DBL Process Mining

## Imports and Class definitions

### Performance Measure

In [None]:
# CPU usage

import psutil
# Handle for the current process; used for per-process CPU measurements.
p = psutil.Process()
# Prime the per-process CPU counter: with interval=None each call reports the
# utilisation since the previous call, so this first call only sets a baseline.
p.cpu_percent(interval=None)
# Put all of the code you want to measure below this point and call
# p.cpu_percent(interval=None) again at the end.
# Multithreaded code can report more than 100% usage.


# Define global variables
util_counter = 0  # number of samples accumulated by mem_report()
memory_stacked = 0  # running sum of the sampled RAM percentages
cpu_stacked = 0  # running sum of the sampled CPU percentages

# Index 3 of psutil.virtual_memory() is the `used` field (bytes, converted to GB here),
# index 2 is the `percent` field.
memory_initial = round(psutil.virtual_memory()[3]/1024/1024/1024, 3)
memory_initial_perc = psutil.virtual_memory()[2]

# System-wide CPU utilisation measured over a blocking 4-second interval.
cpu_initial_perc = psutil.cpu_percent(4)
# Define function

def reset_mem_report():
    """Reset the running counters used by ``mem_report`` to zero.

    The counters live at module level, so they must be declared ``global``;
    without the declaration the assignments only create throw-away locals
    and the averages keep accumulating across measurement sections.
    """
    global util_counter, memory_stacked, cpu_stacked

    util_counter = 0
    memory_stacked = 0
    cpu_stacked = 0

def mem_report():
    """Print initial, current and running-average RAM and CPU utilisation.

    Every call takes one sample, adds it to the module-level accumulators
    (``util_counter``, ``memory_stacked``, ``cpu_stacked``) and prints the
    averages over all samples taken since the counters were last zeroed.

    Note: ``psutil.cpu_percent(4)`` blocks for four seconds per call.
    """
    # Only the accumulators are rebound; the initial values are merely read.
    global util_counter, memory_stacked, cpu_stacked

    # Take ONE memory snapshot so the absolute and relative figures describe
    # the same instant, and use the named fields instead of tuple indices.
    memory = psutil.virtual_memory()
    ram = memory.used
    ram_perc = memory.percent
    cpu_perc = psutil.cpu_percent(4)

    # RAM usage
    print('Initial RAM memory usage:', memory_initial, 'GB')
    print('Current RAM memory usage:', round(ram/1024/1024/1024, 3), 'GB\n')

    print('Initial RAM memory % used:', memory_initial_perc, '%')
    print('Current RAM memory % used:', ram_perc, '%\n')

    # Add the new sample to the running sums
    util_counter += 1
    memory_stacked += ram_perc
    cpu_stacked += cpu_perc

    print('Averaged RAM memory % used', memory_stacked/util_counter, '%\n')

    # CPU usage
    print('Initial CPU % used: ', cpu_initial_perc, '%')
    print('Current CPU % used: ', cpu_perc, '%\n')
    print('Averaged CPU % used', cpu_stacked/util_counter, '%')

### Imports

In [None]:
import os
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from Utils.LogFile import LogFile 
import tensorflow as tf
import multiprocessing as mp
import copy
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense, BatchNormalization, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Nadam
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


In [None]:
# List the GPUs TensorFlow can see; an empty list means no GPU is visible to it.
tf.config.list_physical_devices('GPU')

## Loading in the data

### Two supported ways of loading the data:
- A single CSV file, which still has to be split into training and test data
- Two CSV files, which are already split into training and test data

In [None]:
# Define attribute columns here
case_attr = "Case ID"
act_attr = "concept:name"
time_attr = "Complete Timestamp"
path = "data/BPI_Challenge_2012_end.csv"
time_format = '%Y-%m-%d %H:%M:%S.%f'
cycle_attr = 'lifecycle:transition' # Optional


In [None]:

# Load the event log using the column names configured above.
# NOTE(review): the positional arguments (",", 0, None) are presumably the
# delimiter, header row and row limit — confirm against Utils.LogFile.
# k=20 is exposed as logfile.k, which later cells use as the number of
# `<attr>_Prev<i>` history columns.
logfile = LogFile(path, ",", 0, None, time_attr=time_attr, trace_attr=case_attr,
                   activity_attr=act_attr, time_format=time_format, cycle_attr=cycle_attr, convert=False, k=20)
#logfile = logfile.create_subset(40)
#logfile.add_end_events


In [None]:
# Average number of events per case: total rows divided by the number of distinct case identifiers.
len(logfile.data) / len(logfile.data[logfile.trace].unique())

In [None]:
# Display the loaded event log data.
logfile.data

## Visualization

In [None]:
# Derive a separate log object that is only used for visualising the split.
split_log = logfile.create_split_df()
# NOTE(review): the meaning of range(67, 73) and type='normal' is defined by
# LogFile.split_train_test, which is not shown here — confirm.
split_train, split_test = split_log.split_train_test(range(67, 73), type='normal')

In [None]:
# Largest timestamp and largest case identifier of the training part; they are
# drawn as the dashed split lines in the plot.
# NOTE(review): if 'Complete Timestamp' holds strings, max() is lexicographic;
# that matches chronological order for a '%Y-%m-%d ...' format — confirm dtype.
split_date = split_train.data['Complete Timestamp'].max()
split_case = split_train.data['Case ID'].max()

In [None]:
# Two stacked scatter plots sharing the time axis.
fig, ax = plt.subplots(nrows=2, sharex=True, figsize=(20, 22))

# Top: all events coloured by activity. Bottom: training and test events
# drawn as two separate series on the same axes.
sns.scatterplot(data=split_log.data, x=split_log.time, y=split_log.trace, hue=split_log.activity, ax=ax[0])
sns.scatterplot(data=split_train.data, x=split_log.time, y=split_log.trace, ax=ax[1])
sns.scatterplot(data=split_test.data, x=split_log.time, y=split_log.trace, ax=ax[1])

fig.suptitle('Visualization of train-test split', size=25, weight='bold', y=1.01)
fig.tight_layout()

ax[0].set_xlabel('')
ax[0].set_ylabel('Case ID')
ax[1].set_xlabel('Date')
ax[1].set_ylabel('Case ID')

# Dashed lines mark the last training timestamp (vertical) and the largest
# training case identifier (horizontal) on both plots.
ax[0].axvline(x=split_date, color = '#404040', linestyle='--', linewidth=2)
ax[0].axhline(y=split_case, color = '#404040', linestyle='--', linewidth=2)
ax[1].axvline(x=split_date, color = '#404040', linestyle='--', linewidth=2)
ax[1].axhline(y=split_case, color = '#404040', linestyle='--', linewidth=2)

plt.show()

#### Preprocessing

In [None]:
# Restrict the log to the case, timestamp, activity and lifecycle columns.
logfile.keep_attributes([logfile.trace, logfile.time, logfile.activity, logfile.cycle_attr])
# The returned mapping is used further down to translate integer activity
# codes back into activity labels.
activity_map = logfile.int_convert()
# NOTE(review): presumably adds the 'Start Date' column that later cells read — confirm in LogFile.
logfile.add_start_date()
# NOTE(review): presumably builds logfile.contextdata with the `<attr>_Prev<i>`
# history columns used by the models — confirm in LogFile.
logfile.create_k_context()
log_train, log_test = logfile.split_train_test(range(67, 73), type='normal')

## Baseline prediction

### Preprocessing

In [None]:
def convert_time(dataset):
    """Add a 'time and date' column holding the parsed event timestamps.

    The strings in the ``logfile.time`` column are parsed with
    ``logfile.time_format``. The dataset is modified in place and nothing
    is returned.
    """
    fmt = logfile.time_format
    dataset['time and date'] = [
        datetime.strptime(stamp, fmt) for stamp in dataset[logfile.time]
    ]
mem_report()

### Add actual next event and time

In [None]:
def add_actual_next(df_case):
    """Add the actual next activity and the time to the next event to a case.

    Two columns are added to ``df_case`` in place:

    * ``'Next event'`` - the activity of the following row; ``'-'`` for the
      last row, which has no successor.
    * ``'Time to next event'`` - seconds between a row's ``'time and date'``
      value and that of the following row; ``0.0`` for the last row.

    Parameters:
        df_case: events of a single case, in order, with the
            ``logfile.activity`` and ``'time and date'`` columns present.

    Returns:
        The number of events in the case.
    """
    events = list(df_case[logfile.activity])
    times = list(df_case['time and date'])

    # An empty case has no "last event" to pad with; add empty columns.
    if not times:
        df_case['Next event'] = []
        df_case['Time to next event'] = []
        return 0

    # Shift one position up; the last event gets the '-' placeholder.
    next_events = events[1:] + ['-']

    # Pad with the case's own last timestamp so the final difference is zero.
    # Taking it from the unshifted list keeps single-event cases working
    # (the shifted list is empty for them).
    next_times = times[1:] + [times[-1]]

    time_diff = [
        (following - current).total_seconds()
        for current, following in zip(times, next_times)
    ]

    df_case['Next event'] = next_events
    df_case['Time to next event'] = time_diff

    return len(df_case)
mem_report()

### Predicted next event and time

In [None]:
def get_position_time(df_case, count_dict, time_dict):
    """Accumulate per-position statistics of one case into two dictionaries.

    Both dictionaries are updated in place; nothing is returned.

    * ``count_dict`` maps activity -> {position: number of occurrences}.
    * ``time_dict`` maps position -> {'sum': total 'Time to next event',
      'count': number of events seen at that position}.

    The position is the row's index label in ``df_case``.
    """
    for position, event in df_case.iterrows():
        activity = event[logfile.activity]
        elapsed = event['Time to next event']

        # How often this activity has been seen at this position
        per_position = count_dict.setdefault(activity, {})
        per_position[position] = per_position.get(position, 0) + 1

        # Running sum and count of the time to the next event per position
        if position not in time_dict:
            time_dict[position] = {'sum': elapsed, 'count': 1}
        else:
            totals = time_dict[position]
            totals['sum'] += elapsed
            totals['count'] += 1

def get_position_rank(max_trace_len, count_dict):
    """Return the most frequent activity for every position in a trace.

    Parameters:
        max_trace_len: number of positions (0 .. max_trace_len - 1) to rank.
        count_dict: activity -> {position: occurrence count}.

    Returns:
        A dict mapping each position to the activity with the highest count
        at that position. Ties go to the activity that comes first in
        ``count_dict``; positions at which no activity was counted map to 0.
    """
    pos_rank_dict = {}
    for position in range(max_trace_len):
        best_count = 0
        best_task = 0
        for task, counts in count_dict.items():
            # A missing position simply means "never seen there"; use .get()
            # instead of a bare except that would hide unrelated errors.
            count = counts.get(position, 0)
            if count > best_count:
                best_count = count
                best_task = task

        pos_rank_dict[position] = best_task

    return pos_rank_dict

def get_mean_time(total_time_dict):
    """Turn per-position sums and counts into per-position means.

    ``total_time_dict`` maps position -> {'sum': ..., 'count': ...}; the
    result maps each position to ``sum / count``.
    """
    return {
        position: totals['sum'] / totals['count']
        for position, totals in total_time_dict.items()
    }
mem_report()

In [None]:
def create_event_pred(df_case, pos_rank_dict, mean_time_dict):
    """Add the baseline's next-event and time predictions to one case.

    Adds two columns to ``df_case`` in place:

    * ``'Event prediction'`` - for each row, the activity ranked for the
      following position; ``'-'`` for the last row.
    * ``'Time prediction'`` - the mean time looked up for the row's own
      position.

    Both lookups index the dictionaries directly, so a case longer than the
    positions they contain raises ``KeyError``.
    """
    positions = range(len(df_case))

    # Look up every position first, then shift the activities one step up so
    # that each row carries the prediction for its successor.
    ranked_activities = [pos_rank_dict[position] for position in positions]
    next_activity = ranked_activities[1:] + ['-']

    expected_time = [mean_time_dict[position] for position in positions]

    df_case['Event prediction'] = next_activity
    df_case['Time prediction'] = expected_time
mem_report()

### Training and testing functions

In [None]:
def train_baseline(dataframe, maximum=None):
    """Fit the position-based baseline and annotate the training data with it.

    Parameters:
        dataframe: event table; it is modified in place by ``convert_time``
            (a 'time and date' column is added).
        maximum: only the first ``maximum`` cases (in order of first
            appearance) are used; ``None`` uses all cases.

    Returns:
        A tuple ``(df_predicted, pos_rank_dict, mean_time_dict)``:
        the per-case frames (actual and predicted columns) concatenated,
        the most frequent activity per position, and the mean time to the
        next event per position.
    """
    dataset = dataframe
    convert_time(dataset)

    cases = list(dataset[logfile.trace].unique())[:maximum]
    # Group once instead of filtering the whole table for every case.
    # NOTE: groupby drops missing case identifiers by default.
    by_case = dataset.groupby(logfile.trace, sort=False)

    # First pass: add the actual next event/time and gather the statistics.
    max_trace_len = 0
    pos_count_dict = {}
    time_dict = {}
    case_frames = []
    for case in cases:
        df_case = by_case.get_group(case).copy().reset_index(drop=True)
        trace_len = add_actual_next(df_case)
        get_position_time(df_case, pos_count_dict, time_dict)
        case_frames.append(df_case)
        max_trace_len = max(max_trace_len, trace_len)

    # Creating the predictions
    pos_rank_dict = get_position_rank(max_trace_len, pos_count_dict)
    mean_time_dict = get_mean_time(time_dict)

    # Second pass: the per-case frames are already independent copies, so the
    # prediction columns can be added to them directly.
    for df_case in case_frames:
        create_event_pred(df_case, pos_rank_dict, mean_time_dict)

    # Concatenate once; growing a DataFrame inside the loop copies all
    # previously collected rows on every iteration.
    df_predicted = pd.concat(case_frames) if case_frames else pd.DataFrame()

    return df_predicted, pos_rank_dict, mean_time_dict
mem_report()

In [None]:
def test_baseline(dataframe, train_pos, train_time):
    """Create the test dataset including the baseline predictions.

    Parameters:
        dataframe: event table; it is modified in place by ``convert_time``
            (a 'time and date' column is added).
        train_pos: position -> predicted activity, as returned by
            ``train_baseline``.
        train_time: position -> mean time to next event, as returned by
            ``train_baseline``.

    Returns:
        The per-case frames, each with the actual and predicted next
        event/time columns, concatenated in order of first appearance.
    """
    dataset = dataframe
    convert_time(dataset)

    cases = list(dataset[logfile.trace].unique())
    # Group once instead of filtering the whole table for every case.
    # NOTE: groupby drops missing case identifiers by default.
    by_case = dataset.groupby(logfile.trace, sort=False)

    case_frames = []
    for case in cases:
        df_case = by_case.get_group(case).copy().reset_index(drop=True)
        add_actual_next(df_case)
        create_event_pred(df_case, train_pos, train_time)
        case_frames.append(df_case)

    # Concatenate once rather than growing a DataFrame inside the loop.
    return pd.concat(case_frames) if case_frames else pd.DataFrame()
mem_report()

### Accuracy measurement

In [None]:
def get_accuracy(dataset):
    """Return event accuracy and time error for the baseline columns.

    Returns a tuple of
    (fraction of rows where 'Next event' equals 'Event prediction',
    mean absolute difference between 'Time to next event' and
    'Time prediction' divided by 86400, i.e. expressed in days).
    """
    seconds_per_day = 86400

    hits = dataset['Next event'] == dataset['Event prediction']
    abs_error = abs(dataset['Time to next event'] - dataset['Time prediction'])

    event_accuracy = np.mean(hits)
    time_accuracy = np.mean(abs_error) / seconds_per_day  # Mean Absolute Error in days

    return event_accuracy, time_accuracy

def get_sample_weight(dataset):
    """Count how often each activity occurs in ``dataset``.

    Parameters:
        dataset: table with an ``act_attr`` column.

    Returns:
        A dict mapping every value of the ``act_attr`` column to its number
        of occurrences. (Previously the counts were built but never
        returned, so the function always yielded ``None``.)
    """
    sample_dict = {}
    for event in dataset[act_attr]:
        sample_dict[event] = sample_dict.get(event, 0) + 1

    return sample_dict


def get_balanced_accuracy(actual_event, event_pred, actual_time, time_pred):
    """Return the adjusted balanced event accuracy and the time error in days.

    The event score is ``balanced_accuracy_score(..., adjusted=True)``; the
    time score is the mean absolute difference between ``actual_time`` and
    ``time_pred`` divided by 86400.
    """
    seconds_per_day = 86400

    # possibly use sample weight
    event_balanced_accuracy = balanced_accuracy_score(actual_event, event_pred, adjusted=True)

    absolute_error = abs(actual_time - time_pred)
    time_accuracy = np.mean(absolute_error) / seconds_per_day  # Mean Absolute Error in days

    return event_balanced_accuracy, time_accuracy
mem_report()

### Predictions

In [None]:
# The baseline works directly on the event tables of the two split logs.
# NOTE: train_baseline/test_baseline call convert_time on these objects, which
# adds a 'time and date' column to them in place.
train_df = log_train.data
test_df = log_test.data
mem_report()

In [None]:
# Fit the position-based baseline on the training events, then apply its
# lookup tables to the test events.
train_df, train_pos, train_time = train_baseline(train_df)
test_df = test_baseline(test_df, train_pos, train_time)

mem_report()
# Process CPU utilisation since the previous p.cpu_percent() call; the value
# is not shown because this is not the last expression of the cell.
p.cpu_percent(interval=None)
reset_mem_report()

In [None]:
# Show the CPU utilisation of this process since the previous p.cpu_percent() call.
p.cpu_percent(interval=None)

# Random Forest Classifier

## Random Forest event prediction

In [None]:
# Build the random-forest features on a copy of the training context table.
df_X = log_train.contextdata.copy()
# Rows without a previous event (Prev0 timestamp == 0) use the case start date instead.
df_X.loc[df_X['Complete Timestamp_Prev0'] == 0, "Complete Timestamp_Prev0"] = df_X['Start Date']
df_X['Complete Timestamp_Prev0'] = pd.to_datetime(df_X['Complete Timestamp_Prev0'])
df_X['Complete Timestamp'] = pd.to_datetime(df_X['Complete Timestamp'])
df_X['Start Date'] = pd.to_datetime(df_X['Start Date'])
# Seconds between the case start and the previous event.
df_X['time_since_start'] = (df_X['Complete Timestamp_Prev0'] - df_X['Start Date']).dt.total_seconds()
# Weekday (Monday = 0) and hour of the previous event.
df_X['day_previous_event'] = df_X['Complete Timestamp_Prev0'].dt.weekday
df_X['hour_previous_event'] = df_X['Complete Timestamp_Prev0'].dt.hour
# Regression target: seconds from the previous event to the row's own event.
df_X['time_to_next_event'] = (df_X['Complete Timestamp'] - df_X['Complete Timestamp_Prev0']).dt.total_seconds()
mem_report()

In [None]:
# Same feature construction as for the training table, applied to the test context table.
df_X_test = log_test.contextdata.copy()
# Rows without a previous event (Prev0 timestamp == 0) use the case start date instead.
df_X_test.loc[df_X_test['Complete Timestamp_Prev0'] == 0, "Complete Timestamp_Prev0"] = df_X_test['Start Date']
df_X_test['Complete Timestamp_Prev0'] = pd.to_datetime(df_X_test['Complete Timestamp_Prev0'])
df_X_test['Complete Timestamp'] = pd.to_datetime(df_X_test['Complete Timestamp'])
df_X_test['Start Date'] = pd.to_datetime(df_X_test['Start Date'])
# Seconds between the case start and the previous event.
df_X_test['time_since_start'] = (df_X_test['Complete Timestamp_Prev0'] - df_X_test['Start Date']).dt.total_seconds()
# Weekday (Monday = 0) and hour of the previous event.
df_X_test['day_previous_event'] = df_X_test['Complete Timestamp_Prev0'].dt.weekday
df_X_test['hour_previous_event'] = df_X_test['Complete Timestamp_Prev0'].dt.hour
# Regression target: seconds from the previous event to the row's own event.
df_X_test['time_to_next_event'] = (df_X_test['Complete Timestamp'] - df_X_test['Complete Timestamp_Prev0']).dt.total_seconds()
mem_report()

In [None]:
# Target: the activity of the row's own event.
y = df_X[logfile.activity]
# Features: the three time features plus the activity and lifecycle values of
# the logfile.k previous events.
columns = ['time_since_start', 'day_previous_event', 'hour_previous_event']
columns.extend(["%s_Prev%i" % (logfile.activity, i) for i in range(logfile.k)])
columns.extend(["%s_Prev%i" % (logfile.cycle_attr, i) for i in range(logfile.k)])
X = df_X[columns]
# NOTE(review): no random_state is set here (the regressor below uses 42), so
# this result is not reproducible between runs.
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X, y)

df_X['rf_prediction'] = rf.predict(df_X[columns])
df_X_test['rf_prediction'] = rf.predict(df_X_test[columns])


# Plain (unbalanced) accuracy on the test rows.
accuracy_event = np.mean(df_X_test['rf_prediction'] == df_X_test[logfile.activity])
mem_report()
print(accuracy_event)

## Random Forest time prediction

In [None]:
# Target: seconds from the previous event to the row's own event.
y2 = df_X['time_to_next_event']
# Same feature columns as for the event classifier.
columns = ['time_since_start', 'day_previous_event', 'hour_previous_event']
columns.extend(["%s_Prev%i" % (logfile.activity, i) for i in range(logfile.k)])
columns.extend(["%s_Prev%i" % (logfile.cycle_attr, i) for i in range(logfile.k)])
X2 = df_X[columns]


rf2 = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf2 = rf2.fit(X2, y2)

df_X['rf_time_prediction'] = rf2.predict(df_X[columns])
df_X_test['rf_time_prediction'] = rf2.predict(df_X_test[columns])

# Mean absolute error on the test rows, converted from seconds to days.
time_mae = np.mean(abs(df_X_test['time_to_next_event'] - df_X_test['rf_time_prediction'])) / 86400

mem_report()
# The value of this call is not shown because it is not the last expression of the cell.
p.cpu_percent(interval=None)
reset_mem_report()


print(time_mae)

In [None]:
# Show the CPU utilisation of this process since the previous p.cpu_percent() call.
p.cpu_percent(interval=None)

# LSTM Model

### Preprocessing

In [None]:
def transform_log(log):
    """Convert ``log.contextdata`` into the LSTM input and target arrays.

    Returns:
        X: float32 array of shape (rows, log.k, len(activities) + 9). Along
            the second axis position 0 holds the oldest history event
            (``_Prev<k-1>``) and the last position holds ``_Prev0``.
            Along the last axis:
              * index <activity code>  - one-hot flag of the history event's activity
              * len(activities) + 2    - count of consecutive repeats of that activity
              * len(activities) + 3    - lifecycle value of the history event
              * len(activities) + 4    - position counter within the window (0 .. k-1)
              * len(activities) + 5    - seconds since the next-older history event
              * len(activities) + 6    - hour of day
              * len(activities) + 7    - day of the week
              * len(activities) + 8    - seconds since the case start
            Index len(activities) + 1 is never written explicitly.
        y_a: float32 one-hot array (rows, len(activities) + 1) of the row's
            own activity.
        y_t: float32 array (rows,) with the seconds between the row's event
            and its ``_Prev0`` timestamp (0 when there is no previous event).

    NOTE(review): timestamps are parsed with the global ``logfile.time_format``
    rather than a format taken from ``log``. They go through
    ``time.strptime``/``time.mktime``, so sub-second parts are dropped and the
    local time zone is applied — confirm this is intended.
    """

    activities = np.unique(log.data[log.activity])
    X = np.zeros((len(log.contextdata), log.k, len(activities)+ 9), dtype=np.float32)
    y_a = np.zeros((len(log.contextdata), len(activities) + 1), dtype=np.float32)
    y_t = np.zeros((len(log.contextdata)), dtype=np.float32)
    j = 0  # index of the current row in the output arrays
    time_diff = 0
    for row in log.contextdata.iterrows():
        
            # row is an (index, Series) pair; row[1] holds the column values.
            act = getattr(row[1], log.activity)
            event_str = getattr(row[1], log.time)
            prev_str = getattr(row[1], "%s_Prev0" % (log.time))
            start_str = getattr(row[1], "Start Date")
            event_time = time.strptime(event_str, logfile.time_format)
            start_time = time.strptime(start_str, logfile.time_format)

            # Time target: seconds since the previous event, 0 if there is none.
            if prev_str != 0:
                prev_time = time.strptime(prev_str, logfile.time_format)
                diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                                          - datetime.fromtimestamp(time.mktime(prev_time))
                diff = diff_prev_event.total_seconds()

            else: 
                diff = 0

                        
    
            y_a[j, act] = 1
            y_t[j] = diff            

            k = 0  # position counter inside the window
            act_count = 0
            last_act = None
            # Walk the history from the oldest (_Prev<k-1>) to the newest (_Prev0) event.
            for i in range(log.k -1, -1, -1):
                
                if getattr(row[1], "%s_Prev%i" % (log.activity, i)) != 0: # 0 indicates no activity (first activity is encoded to 1)
                    X[j, log.k - i - 1, getattr(row[1], "%s_Prev%i" % (log.activity, i))] = 1
                    if getattr(row[1], "%s_Prev%i" % (log.activity, i)) != last_act:
                        last_act = getattr(row[1], "%s_Prev%i" % (log.activity, i))
                        act_count = 0
                    else:
                        act_count += 1
                else:
                    last_act = 0
                    act_count = 0

                X[j, log.k - i - 1, len(activities)+4] = k 
                X[j, log.k - i - 1, len(activities)+2] = act_count # How many times one activity occurs in a row 

                if getattr(row[1], "%s_Prev%i" % (log.cycle_attr, i)) != 0:
                   X[j, log.k - i - 1, len(activities)+3] = getattr(row[1], "%s_Prev%i" % (log.cycle_attr, i)) # Lifecycle transition state of the event

 
                str_time = getattr(row[1], "%s_Prev%i" % (log.time, i))
                if str_time != 0:
                    # NOTE: this rebinds event_time, which is reused for the
                    # difference to the next-older event below.
                    event_time = time.strptime(str_time, logfile.time_format)
                    time_since_start = datetime.fromtimestamp(time.mktime(event_time)) \
                                        - datetime.fromtimestamp(time.mktime(start_time))
                    X[j, log.k - i - 1, len(activities) + 6] = event_time.tm_hour # Hour of day
                    X[j, log.k - i - 1, len(activities) + 7] = event_time.tm_wday # Day of the week
                    X[j, log.k - i - 1, len(activities) + 8] = time_since_start.total_seconds() # Seconds since start
                else: 
                    X[j, log.k - i - 1, len(activities) + 6] = 0 
                    X[j, log.k - i - 1, len(activities) + 7] = 0 
                    X[j, log.k - i - 1, len(activities) + 8] = 0
                    
                # NOTE(review): for i == log.k - 1 this reads a `_Prev<k>` timestamp
                # column, one beyond the `_Prev0 .. _Prev<k-1>` range used
                # elsewhere — confirm that contextdata provides it.
                prev_str = getattr(row[1], "%s_Prev%i" % (log.time, i + 1))
                if prev_str != 0:
                    
                    # NOTE(review): if str_time was 0 above, event_time still holds
                    # the value from an earlier step — presumably unreachable
                    # because padding sits at the older end; confirm.
                    prev_time = time.strptime(prev_str, logfile.time_format)
                    diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                                        - datetime.fromtimestamp(time.mktime(prev_time))
                    time_diff = diff_prev_event.total_seconds() 
                    X[j, log.k - i - 1, len(activities) + 5] = time_diff
                else:
                     X[j, log.k - i - 1, len(activities) + 5] = 0

                        

                k += 1

            j += 1

    return X, y_a, y_t

mem_report()

In [None]:
def train_LSTM(log, epochs=4, early_stop=42):
    """Build and train the two-headed LSTM on ``log``.

    The network has one shared LSTM layer followed by two specialised LSTM
    layers: one feeds a softmax head predicting the activity
    (``act_output``), the other a single-unit head predicting the time
    (``time_output``).

    Parameters:
        log: log object passed to ``transform_log``; ``log.k`` is the window
            length and ``log.data[log.activity]`` determines the number of
            activities.
        epochs: maximum number of training epochs.
        early_stop: patience (in epochs) of the early-stopping callback.

    Returns:
        The trained Keras model.
    """
    print("Transforming log...")
    X, y_a, y_t = transform_log(log)

    # Computed once; the input and output widths below must agree with the
    # array layout produced by transform_log.
    num_activities = len(np.unique(log.data[log.activity]))

    # build the model:
    print('Build model...')
    main_input = Input(shape=(log.k, num_activities + 9), name='main_input')
    # train a 2-layer LSTM with one shared layer
    l1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, dropout=0.2)(main_input) # the shared layer
    b1 = BatchNormalization()(l1)
    l2_1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in activity prediction
    b2_1 = BatchNormalization()(l2_1)
    l2_2 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in time prediction
    b2_2 = BatchNormalization()(l2_2)

    act_output = Dense(num_activities + 1, activation='softmax', kernel_initializer='glorot_uniform', name='act_output')(b2_1)
    time_output = Dense(1, kernel_initializer='glorot_uniform', name='time_output')(b2_2)

    model = Model(inputs=[main_input], outputs=[act_output, time_output])

    # NOTE(review): `schedule_decay` is not accepted by every Keras Nadam
    # implementation — confirm against the installed TensorFlow version.
    opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)

    model.compile(loss={'act_output':'categorical_crossentropy', 'time_output': 'mae'}, optimizer=opt)
    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stop)
    # A ModelCheckpoint callback used to be constructed here but was never
    # passed to fit(), so no checkpoint was ever written; it has been dropped.
    lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

    # Hold out 20% for validation unless there are too few samples.
    # NOTE(review): with split == 0 there is no val_loss for the callbacks to monitor.
    split = 0.2 if len(y_a) > 10 else 0

    model.fit(X, {'act_output': y_a, 'time_output': y_t}, validation_split=split, verbose=2, callbacks=[early_stopping, lr_reducer], batch_size=log.k, epochs=epochs)

    return model
mem_report()

In [None]:
def test(model, log):
    """Evaluate ``model`` on ``log``.

    Returns a tuple of
    (predicted activity codes, predicted times as a flat array,
    fraction of rows whose predicted activity equals the target,
    mean absolute time error divided by 86400, i.e. in days).
    """
    features, activity_targets, time_targets = transform_log(log)
    activity_probs, time_output = model.predict(features)

    # The most probable class is the predicted / expected activity code.
    predict_vals = np.argmax(activity_probs, axis=1)
    expected_vals = np.argmax(activity_targets, axis=1)
    pred_time = time_output.reshape(-1)

    activity_acc = np.mean(expected_vals == predict_vals)
    mae_time = np.mean(abs(time_targets - pred_time)) / 86400
    return predict_vals, pred_time, activity_acc, mae_time
mem_report()

In [None]:
# Train the LSTM on the training log for at most 6 epochs, with an early-stopping patience of 2.
model = train_LSTM(log_train, epochs=6, early_stop=2)
mem_report()

In [None]:
# Evaluate on the test log: predicted activity codes, predicted times,
# activity accuracy and the time error in days.
pred_act, pred_time, acc_act, mae_time = test(model, log_test)
mem_report()

In [None]:
# A bare expression is only echoed when it is the last statement of a notebook
# cell; print the LSTM activity accuracy so it is actually shown.
print(acc_act)
mem_report()

In [None]:
# A bare expression is only echoed when it is the last statement of a notebook
# cell; print the LSTM time error (in days) so it is actually shown.
print(mae_time)
mem_report()

## Final Dataset Compiling and Accuracy

In [None]:
# A bare expression is only echoed when it is the last statement of a notebook
# cell; print the first rows so they are actually shown.
print(test_df.head())
mem_report()

In [None]:
# Translate the integer activity codes back to their labels. Assign the result
# instead of calling replace(..., inplace=True) on a selected column: the
# chained in-place form works on an intermediate object and does not update
# the DataFrame under pandas copy-on-write.
test_df['Next event'] = test_df['Next event'].replace(activity_map)
test_df['Event prediction'] = test_df['Event prediction'].replace(activity_map)
test_df[log_test.activity] = test_df[log_test.activity].replace(activity_map)
mem_report()

In [None]:
# Translate the predicted integer codes into activity labels.
LSTM_act = [activity_map[act] for act in pred_act]
# Shift one row up so each row carries the prediction made for the following
# row; the last row has no successor and gets '-'.
# NOTE(review): assumes test_df rows are in the same order as the rows of
# log_test.contextdata that produced pred_act — confirm.
LSTM_act = LSTM_act[1:]
LSTM_act.append('-')
test_df['LSTM event prediction'] = LSTM_act
# "End" rows have no next event.
test_df.loc[test_df[log_test.activity] == "End", "LSTM event prediction"] = "-"
mem_report()

In [None]:
# Shift the predicted times one row up (same alignment as the event
# predictions); the last row gets the '-' placeholder.
# NOTE: the '-' strings make this column hold mixed types.
LSTM_time = list(pred_time[1:])
LSTM_time.append('-')
test_df['LSTM time prediction'] = LSTM_time
# "End" rows have no next event.
test_df.loc[test_df[log_test.activity] == 'End', 'LSTM time prediction'] = '-'
mem_report()

In [None]:
# Translate the random-forest predictions into activity labels.
rf_act = [activity_map[act] for act in df_X_test['rf_prediction']]
# Shift one row up so each row carries the prediction made for the following
# row; the last row gets '-'.
# NOTE(review): assumes test_df rows are in the same order as df_X_test — confirm.
rf_act = rf_act[1:]
rf_act.append('-')
test_df['RF event prediction'] = rf_act
# "End" rows have no next event.
test_df.loc[test_df[log_test.activity] == "End", "RF event prediction"] = "-"
mem_report()

In [None]:
# Shift the random-forest time predictions one row up; the last row gets '-'.
# NOTE: the '-' strings make this column hold mixed types.
rf_time = list(df_X_test['rf_time_prediction'][1:])
rf_time.append('-')
test_df['RF time prediction'] = rf_time
# "End" rows have no next event.
test_df.loc[test_df[log_test.activity] == 'End', 'RF time prediction'] = '-'
mem_report()

In [None]:
# Drop the artificial "End" rows. Take an explicit copy: later cells assign
# new columns to test_df, and assigning to a filtered slice triggers pandas'
# SettingWithCopyWarning.
test_df = test_df[test_df[log_test.activity] != 'End'].copy()
mem_report()

In [None]:
# Show the first rows of the compiled test table.
test_df.head()

In [None]:
# Adjusted balanced event accuracy and time error (in days) of the baseline,
# the LSTM and the random forest, all measured against the actual next event
# and the actual time to the next event.
base_event_acc, base_time_mae = get_balanced_accuracy(test_df['Next event'], test_df['Event prediction'], test_df['Time to next event'], test_df['Time prediction'])
LSTM_event_acc, LSTM_time_mae = get_balanced_accuracy(test_df['Next event'], test_df['LSTM event prediction'], test_df['Time to next event'], test_df['LSTM time prediction'])
RF_event_acc, RF_time_mae = get_balanced_accuracy(test_df['Next event'], test_df['RF event prediction'], test_df['Time to next event'], test_df['RF time prediction'])
mem_report()

In [None]:
# Event scores in the order: baseline, LSTM, random forest.
print(base_event_acc, LSTM_event_acc, RF_event_acc)
mem_report()

In [None]:
# Time errors (in days) in the order: baseline, LSTM, random forest.
print(base_time_mae, LSTM_time_mae, RF_time_mae)
mem_report()

In [None]:
# Replace the actual time to the next event (seconds) by an absolute
# timestamp: the event's own timestamp plus that many seconds.
test_df['Time to next event'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['Time to next event'], unit='s')
mem_report()

In [None]:
# Turn the three predicted durations (seconds) into absolute timestamps in the
# same way: the event's own timestamp plus the predicted seconds.
test_df['Time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['Time prediction'], unit='s')
test_df['LSTM time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['LSTM time prediction'], unit='s')
test_df['RF time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['RF time prediction'], unit='s')
mem_report()
# Show the CPU utilisation of this process since the previous p.cpu_percent() call.
p.cpu_percent(interval=None)

In [None]:
# Optional export of the compiled table (DataFrame.to_csv has no `axis`
# argument; `index=False` omits the index column).
#test_df.to_csv('output_log.csv', index=False)

### Deeper look into the data

#### LSTM

In [None]:
# Prediction accuracy per class
bad_pred_list = []
for pred in test_df['Next event'].unique():
    df_pred = test_df.loc[test_df['Next event'] == pred] 
    LSTM_acc = round(np.mean(df_pred['Next event'] == df_pred['LSTM event prediction']),3) * 100
    if LSTM_acc < 20: # Save classes where prediction accuracy is less than 20%
        bad_pred_list.append(pred)
    print("%s: %s%%" % (pred, LSTM_acc))

In [None]:
# For the poorly predicted classes, count what the LSTM predicted instead.
df_bad_pred = test_df.loc[test_df['Next event'].isin(bad_pred_list)][['Next event', 'LSTM event prediction']]
predictions, counts = np.unique(df_bad_pred['LSTM event prediction'], return_counts=True)
for i, pred in enumerate(predictions):
    print("%s: %s" % (pred, counts[i]))

In [None]:
# Compare how often 'W_Nabellen offertes' is the actual next event with how
# often the LSTM predicts it.
print("Actual next event = W_Nabellen offertes: %s \nLSTM prediction = W_Nabellen offertes: %s" %
(len(test_df.loc[test_df['Next event'] == 'W_Nabellen offertes']), len(test_df.loc[test_df['LSTM event prediction'] == 'W_Nabellen offertes'])))

In [None]:
# Actual versus LSTM-predicted next events for case 202656.
test_df.loc[test_df['Case ID'] == 202656][['Case ID', 'Next event', 'LSTM event prediction']]

In [None]:
# The problem with many occurrences of the same activity: actual versus
# LSTM-predicted next events for case 202659.
test_df.loc[test_df['Case ID'] == 202659][['Case ID', 'Next event', 'LSTM event prediction']]