# DBL Process Mining

## Imports and Class definitions

### Imports

In [139]:
import os
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from Utils.LogFile import LogFile 
import tensorflow as tf
import multiprocessing as mp
import copy
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense, BatchNormalization, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Nadam
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


In [48]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Loading in the data

### Two different methods: 
- One csv file, which still has to be split into training and test data
- Two csv files, which are already split into training and test data

In [22]:
# Define attribute columns here
case_attr = "Case ID"
act_attr = "concept:name"
time_attr = "Complete Timestamp"
path = "data/BPI_Challenge_2012_end.csv"
time_format = '%Y-%m-%d %H:%M:%S.%f'


In [23]:

logfile = LogFile(path, ",", 0, None, time_attr=time_attr, trace_attr=case_attr,
                   activity_attr=act_attr, time_format=time_format, convert=False, k=50)
#logfile = logfile.create_subset(40)
#logfile.add_end_events


## Visualization

In [None]:
split_log = logfile.create_split_df()
split_train, split_test = split_log.split_train_test(range(67, 73), type='normal')

In [None]:
split_date = split_train.data['Complete Timestamp'].max()
split_case = split_train.data['Case ID'].max()

In [None]:
fig, ax = plt.subplots(nrows=2, sharex=True, figsize=(20, 22))

sns.scatterplot(data=split_log.data, x=split_log.time, y=split_log.trace, hue=split_log.activity, ax=ax[0])
sns.scatterplot(data=split_train.data, x=split_log.time, y=split_log.trace, ax=ax[1])
sns.scatterplot(data=split_test.data, x=split_log.time, y=split_log.trace, ax=ax[1])

fig.suptitle('Visualization of train-test split', size=25, weight='bold', y=1.01)
fig.tight_layout()

ax[0].set_xlabel('')
ax[0].set_ylabel('Case ID')
ax[1].set_xlabel('Date')
ax[1].set_ylabel('Case ID')

ax[0].axvline(x=split_date, color = '#404040', linestyle='--', linewidth=2)
ax[0].axhline(y=split_case, color = '#404040', linestyle='--', linewidth=2)
ax[1].axvline(x=split_date, color = '#404040', linestyle='--', linewidth=2)
ax[1].axhline(y=split_case, color = '#404040', linestyle='--', linewidth=2)

plt.show()

#### Preprocessing

In [24]:
logfile.keep_attributes([logfile.trace, logfile.time, logfile.activity])
activity_map = logfile.int_convert()
logfile.add_start_date()
logfile.create_k_context()
log_train, log_test = logfile.split_train_test(range(67, 73), type='normal')

Create k-context: 50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[self.time] = pd.to_datetime(data[self.time])


Train data lost due to overlap: 0.0832651595267075/n Best Split: 71


## Baseline prediction

### Preprocessing

In [25]:
def convert_time(dataset):
    """Adds a new column to a dataset with the converted timestamp to datetime"""

    date_list = []

    for time in dataset[logfile.time]:
        date = datetime.strptime(time, logfile.time_format)
        date_list.append(date)

    dataset['time and date'] = date_list

### Add actual next event and time

In [26]:
def add_actual_next(df_case):
    """Adds the actual next activity and time to next event to the final dataframe"""

    # Create a list for all the actual next events for an case
    event_lst = [event for event in df_case[logfile.activity]] # Gets a list of all events for a specific trace
    event_lst = event_lst[1:] # Erase the first activity from the list (thus the second activity becomes first in the list)
    event_lst.append('-') # Append a '-' to the end of the list (the last activity does not have a next activity)
    
    # Create a list for time of the next event
    nexttime_lst1 = [time for time in df_case['time and date']]
    nexttime_lst = nexttime_lst1[1:]
    nexttime_lst.append(nexttime_lst[-1])

    # Create the time difference list
    time_diff = []
    for i in range(len(nexttime_lst)):
        time_diff.append((nexttime_lst[i] - nexttime_lst1[i]).total_seconds())

    # Append columns to the case dataframe
    df_case['Next event'] = event_lst
    df_case['Time to next event'] = time_diff

    trace_len = len(df_case)

    return trace_len

### Predicted next event and time

In [27]:
def get_position_time(df_case, count_dict, time_dict):
    for index, row in df_case.iterrows():
        
        # Get the amount of times an action occured in a certain position {action : {position_1 : count_1, position_2: count_2}}
        if row[logfile.activity] in count_dict:
            if index in count_dict[row[logfile.activity]]:
                count_dict[row[logfile.activity]][index] += 1
            else:
                count_dict[row[logfile.activity]].update({index: 1})
        else:
            count_dict[row[logfile.activity]] = {index: 1}
        
        # Summation of the times to next action per position (index) {position: {"sum": summation_of_time, "count": amount_of_times_occured (to calculate mean)}}
        if index in time_dict:
            time_dict[index]['sum'] += row['Time to next event']
            time_dict[index]['count'] += 1
        else:
            time_dict[index] = {'sum': row['Time to next event'], 'count': 1}

def get_position_rank(max_trace_len, count_dict):
    pos_rank_dict = {}
    for i in range(max_trace_len):
        init = 0
        task = 0
        for key in count_dict.keys():
            try:
                new = count_dict[key][i]
            except:
                new = 0
            if new > init:
                init = new
                task = key

        pos_rank_dict.update({i: task})
    
    return pos_rank_dict

def get_mean_time(total_time_dict):
    mean_time_dict = {}
    for position in total_time_dict.keys():
        mean_time = total_time_dict[position]['sum'] / total_time_dict[position]['count']
        mean_time_dict[position] = mean_time
    
    return mean_time_dict

In [28]:
def create_event_pred(df_case, pos_rank_dict, mean_time_dict):
    
    # Prediction for the action
    pred_act_lst = [pos_rank_dict[i] for i in range(len(df_case))]
    pred_act_lst = pred_act_lst[1:]
    pred_act_lst.append('-')

    # Prediction for time
    pred_time_lst = [mean_time_dict[i] for i in range(len(df_case))]

    df_case['Event prediction'] = pred_act_lst 
    df_case['Time prediction'] = pred_time_lst

### Training and testing functions

In [29]:
def train_baseline(dataframe, maximum=None):
    """Returns the training dataset with predictions and 2 dictionaries which predict next action and nexttime based on position"""
    
    dataset = dataframe
    convert_time(dataset)

    df_actual = pd.DataFrame()


    # Creating a dataframe with the actual events

    cases = list(dataset[logfile.trace].unique())  
    max_trace_len = 0  
    pos_count_dict = {}
    time_dict = {}
    for case in cases[:maximum]:
        df_case = dataset[dataset[logfile.trace] == case].copy().reset_index(drop=True)
        trace_len = add_actual_next(df_case)
        get_position_time(df_case, pos_count_dict, time_dict)
        df_actual = pd.concat([df_actual, df_case])

        if trace_len > max_trace_len:
            max_trace_len = trace_len
    


    # Creating the predicitions
    df_predicted = pd.DataFrame()
    
    pos_rank_dict = get_position_rank(max_trace_len, pos_count_dict)
    mean_time_dict = get_mean_time(time_dict)

    for case in cases[:maximum]:
        df_case = df_actual[df_actual[logfile.trace] == case].copy().reset_index(drop=True)
        create_event_pred(df_case, pos_rank_dict, mean_time_dict)
        df_predicted = pd.concat([df_predicted,df_case])



    return df_predicted, pos_rank_dict, mean_time_dict

In [30]:
def test_baseline(dataframe, train_pos, train_time):
    """Creates the test dataset including the predictions based on the training dataset"""
    
    dataset = dataframe
    convert_time(dataset)

    df_predict = pd.DataFrame()
    cases = list(dataset[logfile.trace].unique())  
    for case in cases:
        df_case = dataset[dataset[logfile.trace] == case].copy().reset_index(drop=True)
        _ = add_actual_next(df_case)
        create_event_pred(df_case, train_pos, train_time)
        df_predict = pd.concat([df_predict, df_case])
    
    return df_predict


### Accuracy measurement

In [31]:
def get_accuracy(dataset):
    event_accuracy = np.mean(dataset['Next event'] ==  dataset['Event prediction'])
    time_accuracy = np.mean(abs(dataset['Time to next event'] - dataset['Time prediction'])) / 86400  # Mean Absolute Error in days
    
    return event_accuracy, time_accuracy

def get_sample_weight(dataset):
    sample_dict = {}
    for event in dataset[act_attr]:
        if event in sample_dict:
            sample_dict[event] += 1
        else:
            sample_dict[event] = 1


def get_balanced_accuracy(actual_event, event_pred, actual_time, time_pred):
    event_balanced_accuracy = balanced_accuracy_score(actual_event, event_pred, adjusted=True) #possibly use sample weight
    time_accuracy = np.mean(abs(actual_time - time_pred)) / 86400  # Mean Absolute Error in days
    return event_balanced_accuracy, time_accuracy

### Predictions

In [32]:
train_df = log_train.data
test_df = log_test.data

In [33]:
train_df, train_pos, train_time = train_baseline(train_df)
test_df = test_baseline(test_df, train_pos, train_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['time and date'] = date_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['time and date'] = date_list


# Random Forest Classifier

## Random Forest event prediction

In [34]:
df_X = log_train.contextdata.copy()
df_X.loc[df_X['Complete Timestamp_Prev0'] == 0, "Complete Timestamp_Prev0"] = df_X['Start Date']
df_X['Complete Timestamp_Prev0'] = pd.to_datetime(df_X['Complete Timestamp_Prev0'])
df_X['Complete Timestamp'] = pd.to_datetime(df_X['Complete Timestamp'])
df_X['Start Date'] = pd.to_datetime(df_X['Start Date'])
df_X['time_since_start'] = (df_X['Complete Timestamp_Prev0'] - df_X['Start Date']).dt.total_seconds()
df_X['day_previous_event'] = df_X['Complete Timestamp_Prev0'].dt.weekday
df_X['hour_previous_event'] = df_X['Complete Timestamp_Prev0'].dt.hour
df_X['time_to_next_event'] = (df_X['Complete Timestamp'] - df_X['Complete Timestamp_Prev0']).dt.total_seconds()

In [35]:
df_X_test = log_test.contextdata.copy()
df_X_test.loc[df_X_test['Complete Timestamp_Prev0'] == 0, "Complete Timestamp_Prev0"] = df_X_test['Start Date']
df_X_test['Complete Timestamp_Prev0'] = pd.to_datetime(df_X_test['Complete Timestamp_Prev0'])
df_X_test['Complete Timestamp'] = pd.to_datetime(df_X_test['Complete Timestamp'])
df_X_test['Start Date'] = pd.to_datetime(df_X_test['Start Date'])
df_X_test['time_since_start'] = (df_X_test['Complete Timestamp_Prev0'] - df_X_test['Start Date']).dt.total_seconds()
df_X_test['day_previous_event'] = df_X_test['Complete Timestamp_Prev0'].dt.weekday
df_X_test['hour_previous_event'] = df_X_test['Complete Timestamp_Prev0'].dt.hour
df_X_test['time_to_next_event'] = (df_X_test['Complete Timestamp'] - df_X_test['Complete Timestamp_Prev0']).dt.total_seconds()

In [36]:
y = df_X[logfile.activity]
columns = ['time_since_start', 'day_previous_event', 'hour_previous_event']
columns.extend(["%s_Prev%i" % (logfile.activity, i) for i in range(logfile.k)])
X = df_X[columns]
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X, y)

df_X['rf_prediction'] = rf.predict(df_X[columns])
df_X_test['rf_prediction'] = rf.predict(df_X_test[columns])


accuracy_event = np.mean(df_X_test['rf_prediction'] == df_X_test[logfile.activity])

print(accuracy_event)

0.8387142598858383


## Random Forest time prediction

In [37]:
y2 = df_X['time_to_next_event']
columns = ['time_since_start', 'day_previous_event', 'hour_previous_event']
columns.extend(["%s_Prev%i" % (logfile.activity, i) for i in range(logfile.k)])
X2 = df_X[columns]


rf2 = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf2 = rf2.fit(X2, y2)

df_X['rf_time_prediction'] = rf2.predict(df_X[columns])
df_X_test['rf_time_prediction'] = rf2.predict(df_X_test[columns])

time_mae = np.mean(abs(df_X_test['time_to_next_event'] - df_X_test['rf_time_prediction'])) / 86400
print(time_mae)

0.38749239118167006


# LSTM Model

### Preprocessing

In [38]:
def transform_log(log):

    activities = np.unique(log.data[log.activity])
    X = np.zeros((len(log.contextdata), log.k, len(activities)+ 7), dtype=np.float32)
    y_a = np.zeros((len(log.contextdata), len(activities) + 1), dtype=np.float32)
    y_t = np.zeros((len(log.contextdata)), dtype=np.float32)
    j = 0
    time_diff = 0
    for row in log.contextdata.iterrows():
        
            act = getattr(row[1], log.activity)
            event_str = getattr(row[1], log.time)
            prev_str = getattr(row[1], "%s_Prev0" % (log.time))
            start_str = getattr(row[1], "Start Date")
            event_time = time.strptime(event_str, logfile.time_format)
            start_time = time.strptime(start_str, logfile.time_format)

            if prev_str != 0:
                prev_time = time.strptime(prev_str, logfile.time_format)
                diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                                          - datetime.fromtimestamp(time.mktime(prev_time))
                diff = diff_prev_event.total_seconds()

            else: 
                diff = 0

                        
    
            y_a[j, act] = 1
            y_t[j] = diff            

            k = 0
            
            for i in range(log.k -1, -1, -1):
                
                if getattr(row[1], "%s_Prev%i" % (log.activity, i)) != 0: # 0 indicates no activity (first activity is encoded to 1)
                    X[j, log.k - i - 1, getattr(row[1], "%s_Prev%i" % (log.activity, i))] = 1
                X[j, log.k - i - 1, len(activities)+2] = k
                #X[j, log.k - i - 1, len(activities) + 3] = time_diff # Diff in seconds

 
                str_time = getattr(row[1], "%s_Prev%i" % (log.time, i))
                if str_time != 0:
                    event_time = time.strptime(str_time, logfile.time_format)
                    time_since_start = datetime.fromtimestamp(time.mktime(event_time)) \
                                        - datetime.fromtimestamp(time.mktime(start_time))
                    X[j, log.k - i - 1, len(activities) + 4] = event_time.tm_hour # Hour of day
                    X[j, log.k - i - 1, len(activities) + 5] = event_time.tm_wday # Day of the week
                    X[j, log.k - i - 1, len(activities) + 6] = time_since_start.total_seconds() # Seconds since start
                else: 
                    X[j, log.k - i - 1, len(activities) + 4] = 0 
                    X[j, log.k - i - 1, len(activities) + 5] = 0 
                    X[j, log.k - i - 1, len(activities) + 6] = 0
                    
                prev_str = getattr(row[1], "%s_Prev%i" % (log.time, i + 1))
                if prev_str != 0:
                    
                    prev_time = time.strptime(prev_str, logfile.time_format)
                    diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                                        - datetime.fromtimestamp(time.mktime(prev_time))
                    time_diff = diff_prev_event.total_seconds() 
                    X[j, log.k - i - 1, len(activities) + 3] = time_diff
                else:
                     X[j, log.k - i - 1, len(activities) + 3] = 0

                        

                k += 1

            j += 1

    return X, y_a, y_t

In [39]:
def train_LSTM(log, epochs=4, early_stop=42):


    print("Transforming log...")
    X, y_a, y_t = transform_log(log)

    # build the model:
    print('Build model...')
    main_input = Input(shape=(log.k, len(np.unique(log.data[log.activity]))+7), name='main_input')
    # train a 2-layer LSTM with one shared layer
    l1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, dropout=0.2)(main_input) # the shared layer
    b1 = BatchNormalization()(l1)
    l2_1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in activity prediction
    b2_1 = BatchNormalization()(l2_1)
    l2_2 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in time prediction
    b2_2 = BatchNormalization()(l2_2)

    act_output = Dense(len(np.unique(log.data[log.activity])) + 1, activation='softmax', kernel_initializer='glorot_uniform', name='act_output')(b2_1)
    time_output = Dense(1, kernel_initializer='glorot_uniform', name='time_output')(b2_2)


    model = Model(inputs=[main_input], outputs=[act_output, time_output])

    opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)

    model.compile(loss={'act_output':'categorical_crossentropy', 'time_output': 'mae'}, optimizer=opt)
    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stop)
    model_checkpoint = ModelCheckpoint(os.path.join("model", 'model_{epoch:03d}-{val_loss:.2f}.h5'), monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
    if len(y_a) > 10:
        split = 0.2
    else:
        split = 0

    model.fit(X, {'act_output': y_a, 'time_output': y_t}, validation_split=split, verbose=2, callbacks=[early_stopping, lr_reducer], batch_size=log.k, epochs=epochs)

    return model

In [40]:
def test(model, log):
    X, y_a, y_t = transform_log(log)
    pred_act, pred_time = model.predict(X)
    predict_vals = np.argmax(pred_act, axis=1)
    pred_time = pred_time.reshape(-1)
    #predict_probs = predictions[np.arange(predictions.shape[0]), predict_vals]
    expected_vals = np.argmax(y_a, axis=1)
    #expected_probs = predictions[np.arange(predictions.shape[0]), expected_vals]
    activity_acc = np.mean(expected_vals ==  predict_vals)
    mae_time = np.mean(abs(y_t - pred_time)) / 86400
    return predict_vals, pred_time, activity_acc, mae_time

In [41]:
model = train_LSTM(log_train, epochs=6, early_stop=2)

Transforming log...
Build model...
Epoch 1/6
2591/2591 - 94s - loss: 34465.8594 - act_output_loss: 1.4536 - time_output_loss: 34464.4375 - val_loss: 24634.1211 - val_act_output_loss: 1.3321 - val_time_output_loss: 24632.7754 - lr: 0.0020 - 94s/epoch - 36ms/step
Epoch 2/6
2591/2591 - 85s - loss: 34432.8008 - act_output_loss: 1.2684 - time_output_loss: 34431.5156 - val_loss: 24622.6797 - val_act_output_loss: 1.2917 - val_time_output_loss: 24621.3945 - lr: 0.0020 - 85s/epoch - 33ms/step
Epoch 3/6
2591/2591 - 94s - loss: 34414.9492 - act_output_loss: 1.2218 - time_output_loss: 34413.6641 - val_loss: 24621.8867 - val_act_output_loss: 1.2157 - val_time_output_loss: 24620.6582 - lr: 0.0020 - 94s/epoch - 36ms/step
Epoch 4/6
2591/2591 - 100s - loss: 34404.8477 - act_output_loss: 1.1962 - time_output_loss: 34403.7383 - val_loss: 24617.0762 - val_act_output_loss: 1.3138 - val_time_output_loss: 24615.7441 - lr: 0.0020 - 100s/epoch - 38ms/step
Epoch 5/6
2591/2591 - 103s - loss: 34379.6797 - act_out

In [49]:
pred_act, pred_time, acc_act, mae_time = test(model, log_test)

## Final Dataset Compiling and Accuracy

In [66]:
test_df.head()

Unnamed: 0,Case ID,Complete Timestamp,concept:name,Start Date,time and date,Next event,Time to next event,Event prediction,Time prediction,LSTM event prediction
0,202653,2012-01-20 17:24:44.730,A_SUBMITTED,2012-01-20 17:24:44.730,2012-01-20 17:24:44.730,A_PARTLYSUBMITTED,0.354,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED
1,202653,2012-01-20 17:24:45.084,A_PARTLYSUBMITTED,2012-01-20 17:24:44.730,2012-01-20 17:24:45.084,W_Afhandelen leads,9.69,W_Afhandelen leads,35.273265,W_Afhandelen leads
2,202653,2012-01-20 17:24:54.774,W_Afhandelen leads,2012-01-20 17:24:44.730,2012-01-20 17:24:54.774,W_Afhandelen leads,1237.301,W_Afhandelen leads,7242.202085,W_Afhandelen leads
3,202653,2012-01-20 17:45:32.075,W_Afhandelen leads,2012-01-20 17:24:44.730,2012-01-20 17:45:32.075,A_DECLINED,98.595,W_Completeren aanvraag,7792.050159,A_DECLINED
4,202653,2012-01-20 17:47:10.670,A_DECLINED,2012-01-20 17:24:44.730,2012-01-20 17:47:10.670,W_Afhandelen leads,3.162,W_Completeren aanvraag,1165.050331,W_Completeren aanvraag


In [56]:
test_df['Next event'].replace(activity_map, inplace=True)
test_df['Event prediction'].replace(activity_map, inplace=True)
test_df[log_test.activity].replace(activity_map, inplace=True)

In [64]:
LSTM_act = [activity_map[act] for act in pred_act]
LSTM_act = LSTM_act[1:]
LSTM_act.append('-')
test_df['LSTM event prediction'] = LSTM_act
test_df.loc[test_df[log_test.activity] == "End", "LSTM event prediction"] = "-"

In [67]:
LSTM_time = list(pred_time[1:])
LSTM_time.append('-')
test_df['LSTM time prediction'] = LSTM_time
test_df.loc[test_df[log_test.activity] == 'End', 'LSTM time prediction'] = '-'

In [73]:
rf_act = [activity_map[act] for act in df_X_test['rf_prediction']]
rf_act = rf_act[1:]
rf_act.append('-')
test_df['RF event prediction'] = rf_act
test_df.loc[test_df[log_test.activity] == "End", "RF event prediction"] = "-"

In [78]:
rf_time = list(df_X_test['rf_time_prediction'][1:])
rf_time.append('-')
test_df['RF time prediction'] = rf_time
test_df.loc[test_df[log_test.activity] == 'End', 'RF time prediction'] = '-'

In [80]:
test_df = test_df[test_df[log_test.activity] != 'End']

In [82]:
test_df.head()

Unnamed: 0,Case ID,Complete Timestamp,concept:name,Start Date,time and date,Next event,Time to next event,Event prediction,Time prediction,LSTM event prediction,LSTM time prediction,RF event prediction,RF time prediction
0,202653,2012-01-20 17:24:44.730,A_SUBMITTED,2012-01-20 17:24:44.730,2012-01-20 17:24:44.730,A_PARTLYSUBMITTED,0.354,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,98.187439,A_PARTLYSUBMITTED,0.356812
1,202653,2012-01-20 17:24:45.084,A_PARTLYSUBMITTED,2012-01-20 17:24:44.730,2012-01-20 17:24:45.084,W_Afhandelen leads,9.69,W_Afhandelen leads,35.273265,W_Afhandelen leads,24.644608,A_DECLINED,28.1235
2,202653,2012-01-20 17:24:54.774,W_Afhandelen leads,2012-01-20 17:24:44.730,2012-01-20 17:24:54.774,W_Afhandelen leads,1237.301,W_Afhandelen leads,7242.202085,W_Afhandelen leads,9562.004883,W_Afhandelen leads,2651.11963
3,202653,2012-01-20 17:45:32.075,W_Afhandelen leads,2012-01-20 17:24:44.730,2012-01-20 17:45:32.075,A_DECLINED,98.595,W_Completeren aanvraag,7792.050159,A_DECLINED,256.912781,A_PREACCEPTED,2393.26746
4,202653,2012-01-20 17:47:10.670,A_DECLINED,2012-01-20 17:24:44.730,2012-01-20 17:47:10.670,W_Afhandelen leads,3.162,W_Completeren aanvraag,1165.050331,W_Completeren aanvraag,477.817291,W_Afhandelen leads,46694.18548


In [108]:
base_event_acc, base_time_mae = get_balanced_accuracy(test_df['Next event'], test_df['Event prediction'], test_df['Time to next event'], test_df['Time prediction'])
LSTM_event_acc, LSTM_time_mae = get_balanced_accuracy(test_df['Next event'], test_df['LSTM event prediction'], test_df['Time to next event'], test_df['LSTM time prediction'])
RF_event_acc, RF_time_mae = get_balanced_accuracy(test_df['Next event'], test_df['RF event prediction'], test_df['Time to next event'], test_df['RF time prediction'])

In [93]:
test_df.loc[test_df[log_test.activity] == 'A_SUBMITTED']

Unnamed: 0,Case ID,Complete Timestamp,concept:name,Start Date,time and date,Next event,Time to next event,Event prediction,Time prediction,LSTM event prediction,LSTM time prediction,RF event prediction,RF time prediction
0,202653,2012-01-20 17:24:44.730,A_SUBMITTED,2012-01-20 17:24:44.730,2012-01-20 17:24:44.730,A_PARTLYSUBMITTED,0.354,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,98.187439,A_PARTLYSUBMITTED,0.356812
0,202656,2012-01-20 17:34:09.591,A_SUBMITTED,2012-01-20 17:34:09.591,2012-01-20 17:34:09.591,A_PARTLYSUBMITTED,0.198,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,98.187439,A_PARTLYSUBMITTED,0.356812
0,202659,2012-01-20 17:35:39.051,A_SUBMITTED,2012-01-20 17:35:39.051,2012-01-20 17:35:39.051,A_PARTLYSUBMITTED,0.279,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,98.187439,A_PARTLYSUBMITTED,0.356812
0,202662,2012-01-20 17:38:44.874,A_SUBMITTED,2012-01-20 17:38:44.874,2012-01-20 17:38:44.874,A_PARTLYSUBMITTED,0.237,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,98.187439,A_PARTLYSUBMITTED,0.356812
0,202665,2012-01-20 17:46:59.963,A_SUBMITTED,2012-01-20 17:46:59.963,2012-01-20 17:46:59.963,A_PARTLYSUBMITTED,0.285,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,98.187439,A_PARTLYSUBMITTED,0.356812
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,214364,2012-02-29 23:22:24.570,A_SUBMITTED,2012-02-29 23:22:24.570,2012-02-29 23:22:24.570,A_PARTLYSUBMITTED,0.087,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,91.826683,A_PARTLYSUBMITTED,0.829388
0,214367,2012-02-29 23:28:41.098,A_SUBMITTED,2012-02-29 23:28:41.098,2012-02-29 23:28:41.098,A_PARTLYSUBMITTED,0.100,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,91.826683,A_PARTLYSUBMITTED,0.829388
0,214370,2012-02-29 23:28:55.349,A_SUBMITTED,2012-02-29 23:28:55.349,2012-02-29 23:28:55.349,A_PARTLYSUBMITTED,0.130,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,91.826683,A_PARTLYSUBMITTED,0.829388
0,214373,2012-02-29 23:43:09.766,A_SUBMITTED,2012-02-29 23:43:09.766,2012-02-29 23:43:09.766,A_PARTLYSUBMITTED,0.133,A_PARTLYSUBMITTED,0.567291,A_PARTLYSUBMITTED,91.826683,A_PARTLYSUBMITTED,0.829388


In [122]:
print(len(test_df.loc[test_df['LSTM event prediction'] == "W_Nabellen offertes"]), len(test_df.loc[test_df['Next event'] == "W_Nabellen offertes"]))

# LSTM predicitons of "W_Nabellen offertes" = 23758 
# Next events of "W_Nabellen offertes" = 13308
# Due to the LSTM continously predicting a long sequence of "W_Nabellen offertes"

23758 13308


In [127]:
test_df.loc[test_df['Next event'] == "End"].groupby('LSTM event prediction')['Case ID'].count()

LSTM event prediction
A_FINALIZED                        136
End                                 89
O_CREATED                            8
O_SENT                             105
W_Afhandelen leads                 458
W_Completeren aanvraag            1496
W_Nabellen incomplete dossiers     575
W_Nabellen offertes                713
W_Valideren aanvraag               216
Name: Case ID, dtype: int64

In [123]:
print(len(test_df.loc[test_df['LSTM event prediction'] == "End"]), len(test_df.loc[test_df['Next event'] == "End"]))
# Diff

263 3796


In [128]:
test_df.loc[test_df['Next event'] == "End"].groupby('LSTM event prediction')['Case ID'].count() # What does the LSTM predict when the next event = "End"

LSTM event prediction
A_FINALIZED                        136
End                                 89
O_CREATED                            8
O_SENT                             105
W_Afhandelen leads                 458
W_Completeren aanvraag            1496
W_Nabellen incomplete dossiers     575
W_Nabellen offertes                713
W_Valideren aanvraag               216
Name: Case ID, dtype: int64

In [130]:
print(len(test_df.loc[test_df['LSTM event prediction'] == "W_Completeren aanvraag"]), len(test_df.loc[test_df['Next event'] == "W_Completeren aanvraag"]))
# Conclusion! LSTM very bad at predicting when trace contains a loop.

21794 16685


In [104]:
for pred in test_df['Next event'].unique():
    df_pred = test_df.loc[test_df['Next event'] == pred] 
    LSTM_acc = round(np.mean(df_pred['Next event'] == df_pred['LSTM event prediction']),3) * 100
    print("%s: %s%%" % (pred, LSTM_acc))

A_PARTLYSUBMITTED: 100.0%
W_Afhandelen leads: 69.89999999999999%
A_DECLINED: 18.099999999999998%
End: 2.3%
A_PREACCEPTED: 12.5%
W_Completeren aanvraag: 86.3%
A_ACCEPTED: 10.7%
O_SELECTED: 6.9%
A_FINALIZED: 26.8%
O_CREATED: 28.7%
O_SENT: 52.2%
W_Nabellen offertes: 90.7%
O_CANCELLED: 0.0%
A_CANCELLED: 0.0%
O_SENT_BACK: 0.0%
W_Valideren aanvraag: 11.1%
O_DECLINED: 0.0%
W_Nabellen incomplete dossiers: 65.7%
O_ACCEPTED: 0.0%
A_APPROVED: 0.0%
A_REGISTERED: 0.0%
A_ACTIVATED: 0.0%
W_Beoordelen fraude: 0.5%
W_Wijzigen contractgegevens: 0.0%


In [113]:
for pred in test_df['Next event'].unique():
    df_pred = test_df.loc[test_df['Next event'] == pred] 
    RF_acc = round(np.mean(df_pred['Next event'] == df_pred['RF event prediction']), 3) * 100
    print("%s: %s%%" % (pred, RF_acc))

A_PARTLYSUBMITTED: 100.0%
W_Afhandelen leads: 75.8%
A_DECLINED: 30.4%
End: 89.60000000000001%
A_PREACCEPTED: 35.199999999999996%
W_Completeren aanvraag: 94.1%
A_ACCEPTED: 29.299999999999997%
O_SELECTED: 65.0%
A_FINALIZED: 72.39999999999999%
O_CREATED: 100.0%
O_SENT: 100.0%
W_Nabellen offertes: 97.0%
O_CANCELLED: 45.300000000000004%
A_CANCELLED: 31.6%
O_SENT_BACK: 45.6%
W_Valideren aanvraag: 86.7%
O_DECLINED: 16.5%
W_Nabellen incomplete dossiers: 92.30000000000001%
O_ACCEPTED: 22.2%
A_APPROVED: 28.999999999999996%
A_REGISTERED: 50.3%
A_ACTIVATED: 56.2%
W_Beoordelen fraude: 71.1%
W_Wijzigen contractgegevens: 0.0%


In [109]:
print(base_event_acc, LSTM_event_acc, RF_event_acc)

0.11313203545422935 0.20966951113994864 0.5807263008248222


In [110]:
print(base_time_mae, LSTM_time_mae, RF_time_mae)

0.6361256933360404 0.39611450599707637 0.40746741925040647


In [148]:
test_df['Time to next event'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['Time to next event'], unit='ms')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Time to next event'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['Time to next event'], unit='s')


In [150]:
test_df['Time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['Time prediction'], unit='ms')
test_df['LSTM time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['LSTM time prediction'], unit='ms')
test_df['RF time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['RF time prediction'], unit='ms')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['Time prediction'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['LSTM time prediction'] = pd.to_datetime(test_df['Complete Timestamp']) + pd.to_timedelta(test_df['LSTM time prediction'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

In [None]:
test_df.to_csv('output_log.csv', axis=False)