In [None]:
# Run this cell (it defines the functions) and then the cell below to execute the predictions.
# You will be prompted for the path of the input CSV file and for a name for the output file that will hold
# the predictions; ".csv" is appended to that name. If only a bare name is given, the file is stored in the
# same folder as the tool itself and the datasets. The accuracy and error measurements of both prediction
# models (event and time) can be found in the poster.

import pandas as pd
import os
from sklearn.model_selection import train_test_split
from scipy.stats import mode
import operator
import functools
import datetime 

def process_data():
    """Prompt for the file names, load the event log and split it chronologically.

    The user is asked for the path of the input CSV and for the base name of
    the output file. The CSV must contain the columns
    ``'event time:timestamp'`` (formatted as ``%d-%m-%Y %H:%M:%S.%f``) and
    ``'case concept:name'``.

    The events are sorted by timestamp and split 80/20 without shuffling, so
    the training set holds the oldest events. Every case that already has
    events in the training set is then removed from the test set, leaving
    only cases that lie completely inside the test period.

    Returns:
        tuple: ``(df_train, df_test, output_name)`` where both frames have a
        fresh ``0..n-1`` index and ``output_name`` is the string typed by the
        user.
    """
    # Read input
    dataset = input("Please enter the path of the CSV file: ")
    output_name = input("Please enter the name (and path) of the output file: ")
    df = pd.read_csv(dataset)

    # Parse the timestamp strings into real datetimes
    df['event time:timestamp'] = pd.to_datetime(
        df['event time:timestamp'], format='%d-%m-%Y %H:%M:%S.%f'
    )

    # Sort chronologically; a stable sort keeps the file order of events that
    # share the same timestamp, so the split is deterministic.
    df = df.sort_values('event time:timestamp', kind='mergesort')

    # Split into train set and test set (80/20) without shuffling
    df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)

    # Remove from the test set every case that started in the training set.
    # (The mask must be negated: `isin` alone would KEEP exactly those cases.)
    started_in_train = df_test['case concept:name'].isin(df_train['case concept:name'])
    df_test = df_test[~started_in_train]

    # Reset index (reassign instead of mutating a slice in place)
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return (df_train, df_test, output_name)

# Functions for event and time prediction, either in one def like last time or split 

def prediction(df_train, df_test):
    """Predict event timestamps from the average elapsed time per position.

    For every event position within a case (0 for the first event, 1 for the
    second, ...) the average time elapsed since the first event of the case
    is computed over the training set, in whole seconds. The prediction for a
    test event is the earliest timestamp of its case in the test set plus the
    training average for the event's position.

    Args:
        df_train: Training events with the columns ``'case concept:name'``
            and ``'event time:timestamp'`` (datetime).
        df_test: Test events with the same two columns.

    Returns:
        DataFrame: ``df_test`` (indexed additionally by event position) with
        the extra columns ``'time_since_started'`` (average elapsed time) and
        ``'time_prediction'``. Both are ``NaT`` for positions that never
        occur in the training set. A column ``'Unnamed: 0'`` is dropped when
        present.
    """
    case_col = 'case concept:name'
    time_col = 'event time:timestamp'

    # Assign position number to each event (appended as second index level)
    df_train = df_train.set_index(df_train.groupby(case_col).cumcount(), append=True)
    df_test = df_test.set_index(df_test.groupby(case_col).cumcount(), append=True)

    # Time since the first event of each case. `transform` returns a result
    # aligned with df_train's index on every pandas version, whereas
    # `groupby.apply` prepends the group key to the index since pandas 2.0.
    case_start = df_train.groupby(case_col)[time_col].transform('first')
    elapsed_seconds = (df_train[time_col] - case_start).dt.total_seconds() // 1

    # Average per position, truncated to whole seconds
    mean_seconds = elapsed_seconds.groupby(level=1).mean() // 1
    avg_timespan = pd.to_timedelta(mean_seconds, unit='s').rename('time_since_started')

    # Attach the training average to every test event via its position
    result = df_test.merge(
        avg_timespan,
        left_on=df_test.index.get_level_values(1).values,
        right_index=True,
        how='left',
    )

    # Predicted timestamp = first timestamp of the case in the test set + average elapsed time
    first_seen = result.groupby(case_col)[time_col].transform('min')
    result['time_prediction'] = first_seen + result['time_since_started']

    # The stray index column only exists when the CSV was written with an
    # unnamed index; do not fail when it is absent.
    result = result.drop(columns='Unnamed: 0', errors='ignore')

    return result

def save_results(output_name, result=None):
    """Write the prediction table to ``<output_name>.csv``.

    Args:
        output_name: Path of the output file without extension; ``".csv"``
            is always appended.
        result: DataFrame to write. When omitted, the module-level variable
            ``result`` is used, which keeps calls that pass only the file
            name working.

    Raises:
        NameError: If ``result`` is omitted and no module-level ``result``
            exists.
    """
    if result is None:
        # Backward compatibility: older callers rely on a global `result`.
        try:
            result = globals()['result']
        except KeyError:
            raise NameError("name 'result' is not defined") from None
    result.to_csv(output_name + ".csv")

In [None]:
# Driver cell: run the whole pipeline.
# 1. Prompt for the input CSV path and the output name, then load and split the data.
df_train, df_test, output_name = process_data()
# 2. Compute the time predictions for the test events. The name `result` must stay
#    at module level because save_results(output_name) is called without the frame.
result = prediction(df_train, df_test)
# 3. Write the predictions to "<output_name>.csv".
save_results(output_name)