In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
# from scipy.stats import mode
from statistics import mode
from statistics import mean
import operator
import functools
import datetime 
import numpy as np

# Load the training and test event logs and stack them into a single frame
# with a fresh 0..n-1 row index.
df1 = pd.read_csv('datasets/BPI Challenge 2017-training.csv')
df2 = pd.read_csv('datasets/BPI Challenge 2017-test.csv')
frames = [df1, df2]
df = pd.concat(frames, ignore_index=True)

In [2]:
# One entry per case: the case's 'case concept:name' value repeated once per
# event, so the length of each entry equals the number of events in that case.
trace_list = [
    case_events['case concept:name'].tolist()
    for _, case_events in df.groupby(["case concept:name"])
]

In [3]:
# Helper: length of every element of a list of sequences.
def find_list_features(list):
    """Return the length of each element of ``list``, in the same order.

    The parameter name shadows the built-in ``list`` inside this function;
    it is kept unchanged so existing keyword callers keep working.
    """
    lengths = []
    for item in list:
        lengths.append(len(item))
    return lengths

# Number of events in every trace.
features = find_list_features(trace_list)

# Threshold for extreme traces: the 95th percentile of the trace lengths.
limit = np.percentile(features, q=95)

In [4]:
# Count how often each value occurs across all traces (one occurrence per event).
import collections
import itertools

freq = collections.defaultdict(int)  # missing keys start at 0
for case_id in itertools.chain.from_iterable(trace_list):
    freq[case_id] += 1

# Keep only the entries whose count is strictly below the limit.
filtered_dict = {case_id: count for case_id, count in freq.items() if count < limit}

# The surviving keys, as a plain list.
allowed_traces = [case_id for case_id in filtered_dict]

In [5]:
# Keep only the events whose case is in the allow-list.
is_allowed_case = df['case concept:name'].isin(allowed_traces)
df = df[is_allowed_case]

In [6]:
# Work on an independent copy: `df` is the result of a boolean filter, and
# writing a column into such a slice triggers pandas' SettingWithCopyWarning.
df = df.copy()

# Parse the timestamp strings (day-month-year, fractional seconds) into datetimes
df['event time:timestamp'] = pd.to_datetime(df['event time:timestamp'], format='%d-%m-%Y %H:%M:%S.%f')

# Sort data by timestamp in ascending order. A stable sort keeps events that
# share a timestamp in their existing relative order, so the per-case position
# numbers assigned below are deterministic.
df = df.sort_values(['event time:timestamp'], axis=0, kind='stable')

# Chronological split into train set and test set (80/20, no shuffling)
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)
# Remove test events whose case also appears in the training set
df_test = df_test[~df_test['case concept:name'].isin(df_train['case concept:name'].unique())]

# Reset index (rebinding instead of mutating in place keeps the cell re-runnable)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Assign position number to each event: the 0-based position of the event
# within its case becomes a second index level
df_sort = df_train.set_index(df_train.groupby('case concept:name').cumcount(), append=True)

In [7]:
# Weekday of each event as an integer (Monday=0 ... Sunday=6).
df_sort['day of the week'] = df_sort['event time:timestamp'].dt.weekday

In [8]:
# Average weekday per event position (second index level). The built-in
# group mean replaces a Python-level lambda that was called once per group.
avg_day = df_sort.groupby(level=1)['day of the week'].mean()

In [9]:

# Attach the per-position average weekday to every event. The join key is the
# event's position (second index level). Both sides carry a 'day of the week'
# column, so pandas suffixes them '_x' (the event) and '_y' (the average).
res = df_sort.merge(
    avg_day,
    how='left',
    left_on=df_sort.index.get_level_values(1).values,
    right_index=True,
)

In [10]:
# Raw prediction: the event's own weekday plus the average weekday for its position.
res['predicted_week'] = res['day of the week_x'] + res['day of the week_y']

In [11]:
# Truncate the predicted weekday to a whole number.
res['predicted_week'] = res['predicted_week'].astype(int)

In [12]:
# Wrap the predicted weekday back into 0..6 (Monday=0 ... Sunday=6).
# There are seven weekdays, so the wrap-around is modulo 7 (7 -> 0, 8 -> 1, ...);
# subtracting 6 mapped 7 to 1 and skipped a day.
res['predicted_week'] = res['predicted_week'] % 7

In [13]:
# Idea is to find most common event per day of the week (0=Monday ... 6=Sunday)
dayList = res['day of the week_x'].tolist()
eventList = res['event concept:name'].tolist()

# Bucket the events per weekday in one pass. Every weekday gets its own list,
# so the mode of one day is never influenced by the events of earlier days
# (previously the shared accumulator was never reset, and the built-in `list`
# was overwritten instead).
eventsPerDay = {day: [] for day in range(7)}
for day, event in zip(dayList, eventList):
    if day in eventsPerDay:  # skips missing / out-of-range weekday values
        eventsPerDay[day].append(event)

# Fallback for a weekday on which no event occurred: the overall most common event
overallMode = mode(eventList) if eventList else None

commonEventPerDay = [
    mode(eventsPerDay[day]) if eventsPerDay[day] else overallMode
    for day in range(7)
]

commonEventPerDay

['W_Call after offers',
 'W_Call after offers',
 'W_Call after offers',
 'W_Validate application',
 'W_Validate application',
 'W_Call after offers',
 'W_Call after offers']

In [14]:
# Predicted Event = Most Common Event for that Day of the Week
# Build the column through a weekday -> event lookup instead of first copying
# the integer weekday column and then overwriting it with strings via .loc;
# writing strings into an integer column is deprecated silent upcasting in
# recent pandas. A weekday with no entry in the lookup maps to NaN.
eventForDay = dict(enumerate(commonEventPerDay))
res['predicted_event'] = res['predicted_week'].map(eventForDay)

In [15]:
# Display the training events together with the 'predicted_week' and
# 'predicted_event' columns added above.
res

Unnamed: 0,Unnamed: 1,eventID,case LoanGoal,case ApplicationType,case concept:name,case RequestedAmount,event Action,event org:resource,event concept:name,event EventOrigin,event EventID,event lifecycle:transition,event time:timestamp,day of the week_x,day of the week_y,predicted_week,predicted_event
0,0,0,Existing loan takeover,New credit,Application_652823628,20000.0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 10:51:15.304,4,2.413361,6,W_Call after offers
1,1,1,Existing loan takeover,New credit,Application_652823628,20000.0,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 10:51:15.352,4,2.413361,6,W_Call after offers
2,2,2,Existing loan takeover,New credit,Application_652823628,20000.0,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 10:51:15.774,4,2.413361,6,W_Call after offers
3,3,3,Existing loan takeover,New credit,Application_652823628,20000.0,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 10:52:36.392,4,2.342081,6,W_Call after offers
4,4,4,Existing loan takeover,New credit,Application_652823628,20000.0,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 10:52:36.403,4,2.342081,6,W_Call after offers
5,5,5,Existing loan takeover,New credit,Application_652823628,20000.0,statechange,User_1,A_Concept,Application,ApplState_642383566,complete,2016-01-01 10:52:36.413,4,2.361628,6,W_Call after offers
6,0,4294967296,Home improvement,New credit,Application_1691306052,10000.0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 11:16:11.500,4,2.413361,6,W_Call after offers
7,1,4294967297,Home improvement,New credit,Application_1691306052,10000.0,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 11:16:11.549,4,2.413361,6,W_Call after offers
8,2,4294967298,Home improvement,New credit,Application_1691306052,10000.0,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 11:16:11.740,4,2.413361,6,W_Call after offers
9,3,4294967299,Home improvement,New credit,Application_1691306052,10000.0,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 11:17:31.573,4,2.342081,6,W_Call after offers
