In [1]:
import pandas as pd
import os
import datetime
import matplotlib.pyplot as plt
from scipy.stats import mode
import operator
import functools 
import statistics

In [2]:
def df_filter_cols(df_original):
    """Return the event-log columns used for prediction plus each event's position in its case.

    The position (0 for a case's first row, 1 for the next, ...) is obtained
    with a per-case cumulative count, appended as a second index level and then
    turned into the ``level_1`` column by ``reset_index``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Event log with at least the columns ``"eventID "`` (trailing space),
        ``"case concept:name"``, ``"event concept:name"`` and
        ``"event lifecycle:transition"``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``level_1``, ``eventID ``, ``case concept:name``,
        ``event concept:name`` and ``event lifecycle:transition``.
    """
    wanted_columns = [
        "level_1",
        "eventID ",
        "case concept:name",
        "event concept:name",
        "event lifecycle:transition",
    ]
    # 0-based row counter inside every case, aligned with df_original's index.
    position_in_case = df_original.groupby('case concept:name').cumcount()
    with_position = df_original.set_index(position_in_case, append=True).reset_index()
    return with_position[wanted_columns]

In [3]:
def create_case_dictionary(df):
    """Collect the event names of every case into a sentinel-prefixed list.

    Rows are consumed in the order they appear in ``df``. Each trace starts
    with the placeholder ``"X"`` so that real events begin at index 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"level_1"`` (position of the event inside its case,
        0 for the first event), ``"case concept:name"`` and
        ``"event concept:name"``.

    Returns
    -------
    dict
        ``{case name: ["X", first event, second event, ...]}``.
    """
    case_events = {}
    # Iterating the three columns directly avoids building a Series per row
    # (which is what ``iterrows`` does) and yields the same values.
    rows = zip(df["level_1"], df["case concept:name"], df["event concept:name"])
    for position, case, event in rows:
        if position == 0:
            # Position 0 marks the first event of a case: (re)start its trace.
            case_events[case] = ["X"]
        # setdefault keeps this working even if a case's position-0 row is
        # absent from ``df`` instead of failing on a missing key.
        case_events.setdefault(case, ["X"]).append(event)
    return case_events

In [4]:
def _osa_distance(first, second):
    """Optimal-string-alignment distance between two sentinel-prefixed traces.

    Index 0 of both sequences is a placeholder and is never compared; the real
    events are ``first[1:]`` and ``second[1:]``. Allowed edits, each of cost 1:
    insertion, deletion, substitution and swapping two adjacent events.
    """
    rows = max(len(first), 1)
    cols = max(len(second), 1)
    # d[i][j] = distance between first[1..i] and second[1..j].
    d = [[0] * cols for _ in range(rows)]
    for i in range(rows):
        d[i][0] = i  # delete i events
    for j in range(cols):
        d[0][j] = j  # insert j events

    for i in range(1, rows):
        for j in range(1, cols):
            cost = 0 if first[i] == second[j] else 1
            best = min(d[i - 1][j] + 1,         # deletion
                       d[i][j - 1] + 1,         # insertion
                       d[i - 1][j - 1] + cost)  # match / substitution
            if i > 1 and j > 1 and first[i] == second[j - 1] and first[i - 1] == second[j]:
                best = min(best, d[i - 2][j - 2] + 1)  # adjacent transposition
            d[i][j] = best
    return d[rows - 1][cols - 1]


def calc_distances(event_dictionary, case_number):
    """Distance from one case's trace to the trace of every other case.

    Parameters
    ----------
    event_dictionary : dict
        ``{case: ["X", event, event, ...]}``; element 0 of every list is a
        placeholder that is ignored when comparing.
    case_number : hashable
        Key of the reference case. If it is not present in
        ``event_dictionary`` the reference is treated as an empty trace, so
        each distance equals the number of events in the other trace.

    Returns
    -------
    dict
        ``{other case: distance}`` for every key except ``case_number``.

    Notes
    -----
    The reference trace is used directly as the sequence being compared.
    Wrapping it in a one-element list (``a.append(trace)``) made ``len(a) == 1``,
    skipped the whole dynamic-programming loop and returned the other trace's
    length as the "distance" regardless of the reference case.
    The input lists are never modified.
    """
    reference = event_dictionary.get(case_number)
    if reference is None:
        reference = ["X"]  # unknown case -> empty trace (sentinel only)

    trace_distances = {}
    for key, value in event_dictionary.items():
        if key == case_number:
            continue
        trace_distances[key] = _osa_distance(reference, value)
    return trace_distances

In [5]:
def filter_df_similar_maxdist(df_original, case_events, case_number, maxdist):
    """Select the rows of cases whose distance to ``case_number`` is at most ``maxdist``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Event log containing ``"case concept:name"`` and ``"event concept:name"``.
    case_events : dict
        Per-case traces, passed through to ``calc_distances``.
    case_number : hashable
        Reference case the distances are measured from.
    maxdist : int
        Inclusive upper bound on the distance.

    Returns
    -------
    pandas.DataFrame
        The ``case concept:name`` / ``event concept:name`` columns of every row
        that belongs to a sufficiently close case.
    """
    distances = calc_distances(case_events, case_number)
    nearby_cases = [case for case, dist in distances.items() if dist <= maxdist]
    belongs_to_nearby = df_original["case concept:name"].isin(nearby_cases)
    return df_original[belongs_to_nearby][["case concept:name", "event concept:name"]]

def filter_df_similar_closestx(df_original, case_events, case_number, x):
    """Select the rows of the ``x`` cases closest to ``case_number``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Event log containing ``"case concept:name"`` and ``"event concept:name"``.
    case_events : dict
        Per-case traces, passed through to ``calc_distances``.
    case_number : hashable
        Reference case the distances are measured from.
    x : int
        Number of nearest cases to keep.

    Returns
    -------
    pandas.DataFrame
        The ``case concept:name`` / ``event concept:name`` columns of every row
        that belongs to one of the selected cases.
    """
    distance_dict = calc_distances(case_events, case_number)
    # Ascending order puts the smallest distances first; sorting with
    # reverse=True kept the x *farthest* cases instead of the closest ones.
    closest_cases = sorted(distance_dict, key=distance_dict.get)[:x]
    is_closest = df_original["case concept:name"].isin(closest_cases)
    return df_original[is_closest][["case concept:name", "event concept:name"]]

In [6]:
from collections import Counter
def get_first_mode(a):
    """Return the most frequent element of ``a``; ties go to the one seen first.

    Raises ``ValueError`` when ``a`` is empty, because there is no count to
    take the maximum of.
    """
    tallies = Counter(a)
    highest = max(tallies.values())
    # Walk the input in its own order so the earliest top-scoring element wins.
    return next(item for item in a if tallies[item] == highest)

In [7]:
def _next_event_predictions(df_filtered):
    """Map each event name to the event that most often directly follows it.

    Only two adjacent rows of ``df_filtered`` that belong to the same case
    count as a "current -> next" pair. Ties are resolved by ``get_first_mode``
    (first follower encountered wins).
    """
    cases = df_filtered["case concept:name"].tolist()
    events = df_filtered["event concept:name"].tolist()
    followers = {}
    for idx in range(len(events) - 1):
        if cases[idx] == cases[idx + 1]:
            followers.setdefault(events[idx], []).append(events[idx + 1])
    return {event: get_first_mode(nexts) for event, nexts in followers.items()}


def add_predicted_events(df_2012):
    """Predict the next event for every row of an event log.

    Asks on stdin which similarity filter to use (``maxdist`` or ``closestx``)
    and its integer parameter. For every case, the traces selected by that
    filter are scanned once and, for each event name, the most frequent direct
    successor is recorded; every row then receives the successor recorded for
    its own event name.

    Parameters
    ----------
    df_2012 : pandas.DataFrame
        Event log with the columns required by ``df_filter_cols``.

    Returns
    -------
    pandas.DataFrame
        ``eventID ``, ``case concept:name``, ``event concept:name`` and the new
        ``event_prediction`` column, concatenated column-wise with all columns
        of ``df_2012``. Rows whose event has no observed successor get ``'x'``.

    Raises
    ------
    KeyboardInterrupt
        When the filter method typed by the user is neither ``maxdist`` nor
        ``closestx`` (kept as-is for compatibility with existing usage).
    """
    filter_method = input("Please enter how you would like to filter on cases(maxdist/closestx):")
    if filter_method == "maxdist":
        n = int(input("Please enter the maximum Damerau-Levenshtein distance for traces to be used in predictions:"))
        select_similar = filter_df_similar_maxdist
    elif filter_method == "closestx":
        n = int(input("Please enter the the number of similar traces used when predicting an event:"))
        select_similar = filter_df_similar_closestx
    else:
        print("Invalid method entered. Please enter either 'maxdist' or 'closestx'")
        raise KeyboardInterrupt
    print("Now predicting events.. Please be patient as this can take a while depending on the size of the input file.")

    df_cols = df_filter_cols(df_2012)
    case_events = create_case_dictionary(df_cols)
    # Explicit copy: a new column is added below and must not be written
    # through a slice of another frame.
    df_cols = df_cols[["eventID ", "case concept:name", "event concept:name"]].copy()

    no_prediction = 'x'  # emitted when no successor was observed
    predictions_by_case = {}
    predicted = []

    # The case is read by column name. With itertuples(index=True) position 1
    # is the "eventID " column, so using it as the case made every lookup miss
    # the trace dictionary and defeated the per-case cache.
    for currentcase, currentevent in zip(df_cols["case concept:name"], df_cols["event concept:name"]):
        if currentcase not in predictions_by_case:
            df_filtered = select_similar(df_2012, case_events, currentcase, n)
            # Cache only the small event -> successor table, not the filtered frame.
            predictions_by_case[currentcase] = _next_event_predictions(df_filtered)
        # The placeholder is a fallback only; it no longer takes part in the
        # vote, where it used to win every tie against real events.
        predicted.append(predictions_by_case[currentcase].get(currentevent, no_prediction))

    df_cols["event_prediction"] = predicted
    result = pd.concat([df_cols, df_2012], axis=1)
    return result

In [8]:
def save_results(output_name, result):
    """Write ``result`` to ``<output_name>.csv`` using its ``to_csv`` method.

    Parameters
    ----------
    output_name : str
        Target path without extension; ``".csv"`` is appended.
    result : pandas.DataFrame
        Frame to export.
    """
    csv_path = output_name + ".csv"
    result.to_csv(csv_path)

In [None]:
# Driver cell: read the event log, predict the next event per row, export to CSV.
# Both values are typed interactively; save_results appends ".csv" to output_name.
input_path = input("Please enter the path of the input file:")
output_name = input("Please enter a name and path for the output:")
# add_predicted_events itself prompts for the filter method and its integer parameter.
result = add_predicted_events(pd.read_csv(input_path))
save_results(output_name, result)

Please enter the path of the input file: BPI_Challenge_2012-training.csv
Please enter a name and path for the output: DM_out_2012train_maxdist20
Please enter how you would like to filter on cases(maxdist/closestx): maxdist
Please enter the maximum Damerau-Levenshtein distance for traces to be used in predictions: 20


Now predicting events.. Please be patient as this can take a while depending on the size of the input file.
