In [2]:
import pandas as pd
from scipy.stats import mode
from datetime import datetime
from statistics import mode
from statistics import mean 
import pm4py
import numpy as np

In [3]:
events = pm4py.read_csv("BPI_Challenge_2012-training.csv", sep=',', quotechar=None, encoding=None, nrows=None, timest_format=None)


  """Entry point for launching an IPython kernel.


In [13]:
def predict_model(events):
    # Input : csv data which is open by pm4py.read_csv
    # Code flow : data preprocessing -> prediction of the next event -> prediction of the time taken
    # Output : dictionary(key: event name, value:[the next activity, duration])
    
    # Notice : the time unit is second.
    
    
    """Data preprocessing """
    # Create a pivot table of the start (minimum) and end (maximum) timestamps associated with each case:
    case_starts_ends = events.pivot_table(index='case concept:name', aggfunc={'event time:timestamp': ['min', 'max']}) 
    case_starts_ends = case_starts_ends.reset_index() 
    case_starts_ends.columns = ['case concept:name', 'caseend', 'casestart'] 

    # Merge with the main event log data so that for each row we have the start and end times.
    events = events.merge(case_starts_ends, on='case concept:name') 

    # Calculate the relative time by subtracting the process start time from the event timestamp
    events['relativetime'] = events['event time:timestamp'] - events['casestart']

    # Convert relative times to more friendly measures
    ## seconds
    events['relativetime_s'] = events['relativetime'].dt.seconds + 86400*events['relativetime'].dt.days 
    
    # Combine activity and its status
    events['concept:name'] = events['event concept:name'] + "-" + (events['event lifecycle:transition'])

    
    """ Predict the next event based on the most frequent next event"""
    trace_list = [] # list of traces

    for name, group in events.groupby(["case concept:name"]):
        trace_list.append(group['concept:name'].tolist())
    
    events_list = list(set(events["concept:name"])) # list of unique events
    
    
    # Get the most frequent next event of each event
    most_common_next_event = []
    for event in events_list:
        lst = []
    
        for trace in trace_list:
            if event in trace:
                index = trace.index(event)

                if index < len(trace)-1:
                    next_event = trace[index+1]
                    lst.append(next_event)
        most_frequent = mode(lst)
        most_common_next_event.append(most_frequent)
    
    # Create the dictionary(key : event name, value : the most frequent next event)
    zip_iterator = zip(events_list, most_common_next_event)
    result_dic =  dict(zip_iterator)
    
    """Predict the time based on the average time between events"""
    duration_list = []

    for key in result_dic:
        time_between = []
        for name, group in events.groupby(["case concept:name"]):
            for i in range(len(group)-2):
                if (group["concept:name"].iloc[i] == key and group["concept:name"].iloc[i+1] == result_dic[key]):
                    time_diff = group.iloc[i+1]["relativetime_s"] - group.iloc[i]["relativetime_s"]
                    time_between.append(time_diff)
        duration_list.append(np.mean(time_between))
    
    """Make a final dictionary(key:given event, value:[next event, duration])"""

    result = dict((z[0], list(z[1:])) for z in zip(events_list, most_common_next_event, duration_list))
    
    return(result)




predict_model(events)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


{'W_Afhandelen leads-COMPLETE': ['W_Completeren aanvraag-START',
  70653.21656976744],
 'A_CANCELLED-COMPLETE': ['O_CANCELLED-COMPLETE', 0.0],
 'A_DECLINED-COMPLETE': ['W_Afhandelen leads-COMPLETE', nan],
 'A_SUBMITTED-COMPLETE': ['A_PARTLYSUBMITTED-COMPLETE', 0.3229534817078995],
 'A_REGISTERED-COMPLETE': ['A_ACTIVATED-COMPLETE', 0.0],
 'W_Wijzigen contractgegevens-SCHEDULE': ['W_Wijzigen contractgegevens-SCHEDULE',
  26.666666666666668],
 'W_Nabellen offertes-COMPLETE': ['W_Nabellen offertes-START',
  544092.2720892929],
 'O_CANCELLED-COMPLETE': ['O_SELECTED-COMPLETE', 0.0],
 'W_Nabellen incomplete dossiers-START': ['W_Nabellen incomplete dossiers-COMPLETE',
  -436.5411718648049],
 'O_SENT_BACK-COMPLETE': ['W_Valideren aanvraag-SCHEDULE',
  0.33926031294452347],
 'W_Completeren aanvraag-COMPLETE': ['W_Completeren aanvraag-START',
  154314.22377126655],
 'A_PARTLYSUBMITTED-COMPLETE': ['W_Afhandelen leads-SCHEDULE',
  27.531626120358514],
 'A_ACTIVATED-COMPLETE': ['W_Valideren aanvraag