In [229]:
import pandas as pd
from scipy.stats import mode
from datetime import datetime
import os
from sklearn.model_selection import train_test_split

# Splitting the data into training and test sets

In [230]:
# Load the two event-log CSV files
df_1 = pd.read_csv('BPI_Challenge_2012-training.csv')
df_2 = pd.read_csv('BPI_Challenge_2012-test.csv')

# Stack both files into one frame with a fresh 0..n-1 index
frames = [df_1, df_2]
df = pd.concat(frames, ignore_index=True)

# Parse the timestamp strings (day-month-year, with fractional seconds) into datetime values
df['event time:timestamp'] = pd.to_datetime(df['event time:timestamp'], format='%d-%m-%Y %H:%M:%S.%f')

# Sort chronologically. A stable sort is required: events that share a timestamp
# must keep their original file order, because the position of an event inside
# its case is later derived from the row order.
df = df.sort_values('event time:timestamp', kind='mergesort')

# Chronological split: the earliest 80% of the events form the training set and
# the latest 20% the test set (shuffle=False keeps the time order intact)
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)

# Give each subset its own 0..n-1 index (reassigned rather than mutated in place,
# so the frames returned by train_test_split are never modified)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Case prediction model

In [231]:
def _with_position(events):
    """Return `events` with a 'position' column: the 0-based rank of each row within its case.

    The rank follows the row order of `events`. The previous index is kept as the
    'level_0' column and 'position' is placed right after it.
    """
    position = events.groupby('case concept:name').cumcount().rename('position')
    return events.set_index(position, append=True).reset_index()


# Assign a position number to each event.
# NOTE(review): positions are counted inside each subset separately, so a case whose
# events straddle the train/test boundary restarts at position 0 in the test frame.
# Confirm this is intended.
df_common_train = _with_position(df_train)
df_common_test = _with_position(df_test)

# df_result holds, for every position, the most frequent event name in the training set.
# Series.mode() returns the modes in sorted order, so ties resolve to the smallest name;
# it also works on string data, which scipy.stats.mode no longer accepts.
df_result = (
    df_common_train
    .groupby('position')['event concept:name']
    .agg(lambda names: names.mode().iloc[0])
    .reset_index()
)
df_result



Unnamed: 0,position,event concept:name
0,0,A_SUBMITTED
1,1,A_PARTLYSUBMITTED
2,2,W_Afhandelen leads
3,3,W_Afhandelen leads
4,4,W_Completeren aanvraag
5,5,W_Completeren aanvraag
6,6,W_Completeren aanvraag
7,7,W_Completeren aanvraag
8,8,W_Completeren aanvraag
9,9,W_Completeren aanvraag


# Error measurement 
Accuracy rate = number of correctly predicted events / total number of events

In [246]:
# Error measurement: accuracy rate = number of correctly predicted events / total number of events


def _count_correct_predictions(events, most_frequent):
    """Count the rows of `events` whose event name equals the prediction for their position.

    `most_frequent` supplies one predicted event name per position. A position that
    has no entry in `most_frequent` yields no prediction and is counted as a miss
    instead of raising a KeyError.
    """
    prediction_by_position = most_frequent.set_index('position')['event concept:name']
    predicted = events['position'].map(prediction_by_position)
    return int((predicted == events['event concept:name']).sum())


# Total number of events per subset (one row per event)
total_cases_train = df_train.shape[0]
total_cases_test = df_test.shape[0]

# Number of correct predictions and accuracy for the training dataset
nr_correct_prediction_train = _count_correct_predictions(df_common_train, df_result)
accuracy_rate_train = nr_correct_prediction_train / total_cases_train

# Number of correct predictions and accuracy for the test dataset
nr_correct_prediction_test = _count_correct_predictions(df_common_test, df_result)
accuracy_rate_test = nr_correct_prediction_test / total_cases_test

# Result
accuracy_rate_train, accuracy_rate_test

(0.485602593440122, 0.37993897787948133)