In [2]:
# This cell should be run in order to execute the predictions. It will ask for an input. 
# Also, a name for the output file with the predictions should be given. It will be stored in the same folder as the
# tool itself and the datasets. The accuracy and error measurement of both prediction models for event and time 
# can be found in the poster.

import pandas as pd
import os
from sklearn.model_selection import train_test_split
from scipy.stats import mode
import operator
import functools
import datetime 
import numpy as np
import collections, itertools

def process_data():
    # Read input
    dataset = input("Please enter the path of the CSV file: ")
    output_name = input("Please enter the name (and path) of the output file: ")
    df = pd.read_csv(dataset)
    
    #preprocessing
    trace_list = [] # list of traces

    for name, group in df.groupby(["case concept:name"]):
        trace_list.append(group['case concept:name'].tolist())

    #define the function#
    def find_list_features(list):
        list_len = [len(i) for i in list]
        return list_len

    features = find_list_features(trace_list)

    # Setting the limit for extreme traces
    limit = np.percentile(features, 95)


    # Calculating frequency of each trace
    freq = collections.defaultdict(int)  # 0 by default
    for x in itertools.chain.from_iterable(trace_list):
        freq[x] += 1

    # Filtering the frequency dictionary
    filtered_dict = {k:v for k,v in freq.items() if v < limit}

    # Making a list out of the keys
    allowed_traces = [*filtered_dict]

    df = df[df['case concept:name'].isin(allowed_traces)]

    # Parse the timestamp and convert it into y-m-d form
    df['event time:timestamp'] = pd.to_datetime(df['event time:timestamp'], format = '%d-%m-%Y %H:%M:%S.%f')

    # Sort data by timestamp in ascending order
    df.sort_values(['event time:timestamp'], axis=0, inplace=True)
    
    # split into train set and test set (80/20)
    df_train, df_test = train_test_split(df, test_size=0.2, shuffle = False)
    
    # remove cases started in the training set
    df_test = df_test[df_test['case concept:name'].isin(df_train['case concept:name'].values)]

    # Reset index
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    
    return (df_train, df_test, output_name)

# Functions for event and time prediction, either in one def like last time or split 

def prediction(df_train, df_test):
    # Event prediction
    df_sort = df_train.set_index(df_train.groupby('case concept:name').cumcount(), append = True)

    df_sort['day of the week'] = df_sort['event time:timestamp'].dt.dayofweek

    avg_day = df_sort.groupby(level=1)['day of the week'].apply(
        lambda x: x.mean()
    )
    
    
    # Time prediction
    # Assign position number to each event
    df_train = df_train.set_index(df_train.groupby('case concept:name').cumcount(), append = True)
    df_test = df_test.set_index(df_test.groupby('case concept:name').cumcount(), append = True)
    
    #calculate time since started for each case
    df_train = df_train.assign(time_since_started=df_train.groupby('case concept:name')['event time:timestamp'].apply(lambda x: x - x.iloc[0]))
    
    #groupby case concept and calculate average for each position
    avg_timespan = df_train.groupby(level=1)['time_since_started'].apply(
        lambda x: x.astype('timedelta64[s]').mean()
    )

    # apply on test set
    result = df_test.merge(avg_timespan, left_on = df_test.index.get_level_values(1).values, right_index = True, how = 'left')
    
    result['time_since_started']=result['time_since_started'].astype('timedelta64[s]')
    result['time_prediction']= result.groupby('case concept:name')['event time:timestamp'].transform(lambda x: x.min())+ result['time_since_started']
    
    result = df_test.merge(avg_day, left_on = df_sort.index.get_level_values(1).values, right_index = True, how = 'left')
    
    result['day of the week'] = df_sort['event time:timestamp'].dt.dayofweek

    # Adding a new column for the predicted week
    result['predicted_week'] = result.groupby(level=1)['day of the week_x'].transform(lambda x: x) + result['day of the week_y']
    # Converting float to int
    result.predicted_week = result.predicted_week.astype(int)
    # Making sure that no values are above 6 since we consider 0,...,6 as days of the week
    result['predicted_week'] = np.where(result['predicted_week'] > 6, result['predicted_week'] - 6, result['predicted_week'])

    # Finding the most common event per day of the week
    l = len(result['event concept:name'])
    lst = []
    commonEventPerDay = []
    dayList = result['day of the week_x'].tolist()
    eventList = result['event concept:name'].tolist()

    for j in range (0, 7):
        for i in range (0, l):
            if dayList[i] == j:
                lst.append(eventList[i])
        findMode = mode(lst)
        commonEventPerDay.append(findMode)
        list = []
        
    # Predicted Event = Most Common Event for that Day of the Week
    result['predicted_event'] = result['predicted_week']
    for i in range (0, 7):
        result.loc[result['predicted_week'] == i, 'predicted_event'] = commonEventPerDay[i]
    
    result = result.drop('Unnamed: 0', axis=1)
    
    return result

def save_results(output_name):
    result.to_csv(output_name + ".csv")

In [3]:
df_train, df_test, output_name = process_data()
result = prediction(df_train, df_test)
save_results(output_name)

Please enter the path of the CSV file: C:\Users\HP\Desktop\2IOI0 DBL Process Mining\2IOI0-DBL-Process-Mining-main\datasets\BPI Challenge 2017-training.csv
Please enter the name (and path) of the output file: resultTool2


KeyError: 'Column not found: day of the week_x'

In [None]:
final = pd.read_csv('try1.csv')
final[final['case concept:name']==205433]