In [1]:
import pm4py
import pandas as pd

# pd.options.display.max_rows = 4000

In [None]:
# Load the raw event log from CSV into a DataFrame, then display it.
log = pd.read_csv("ircc_uOttawa.csv")
log

In [3]:
# Split every event label on " - " and lay the pieces out as columns
# (column 0 = first part, column 1 = second part, ...).
event_parts = pd.DataFrame([event.split(" - ") for event in log['event']])

In [None]:
# Event labels with more than two " - "-separated parts are the difficult ones:
# collect the distinct first parts of every label that has a third part.
troublemakers = event_parts.loc[event_parts[2].notna(), 0].unique()
troublemakers

In [5]:
# have a closer look at the troublemakers
# for troublemaker in troublemakers:
#     print(event_parts[event_parts[0]==troublemaker].drop_duplicates())

In [6]:
# Easy ones first: labels whose first part is not a troublemaker.
# Their first part is the "activity" and their second part the "lifecycle";
# the troublemakers are appended to this frame in the cells below.
activ_lifec = (
    event_parts
    .loc[~event_parts[0].isin(troublemakers), [0, 1]]
    .rename(columns={0: 'activity', 1: 'lifecycle'})
)
# activ_lifec

In [7]:
# add activity_cols values to "activity"; add lifecycle_cols values to "lifecycle"
def create_activity_lifecycle(activity_label, activity_cols, lifecycle_cols, event_parts):
    """Build the "activity" and "lifecycle" labels for one kind of event.

    Rows of ``event_parts`` whose first part (column 0) equals
    ``activity_label`` are selected. For each selected row, the parts found in
    ``activity_cols`` are joined with " - " to form the activity name, and the
    parts found in ``lifecycle_cols`` are joined the same way to form the
    lifecycle name. Missing parts are skipped.

    Returns a DataFrame with the columns 'activity' and 'lifecycle', indexed
    like the selected rows.
    """
    def join_labels(row, cols):
        # concatenate the non-missing parts of this row, separated by " - "
        joined = ""
        for col in cols:
            part = row[col]
            if pd.isna(part):
                continue
            if joined != "":
                joined += " - "
            joined += part
        return joined

    matching = event_parts.loc[event_parts[0] == activity_label]
    return pd.DataFrame({
        'activity': matching.apply(join_labels, axis=1, args=(activity_cols,)),
        'lifecycle': matching.apply(join_labels, axis=1, args=(lifecycle_cols,)),
    })

In [8]:
# for Biometrics,
# the first two parts form the "activity"; the next two parts form the "lifecycle"
biometrics_new = create_activity_lifecycle(
    activity_label='Biometrics',
    activity_cols=[0, 1],
    lifecycle_cols=[2, 3],
    event_parts=event_parts,
)
activ_lifec = pd.concat([activ_lifec, biometrics_new])

# biometrics_new

In [9]:
# for Biographic,
# the first part is the "activity"; the next three parts form the "lifecycle"
biographic_new = create_activity_lifecycle(
    activity_label='Biographic',
    activity_cols=[0],
    lifecycle_cols=[1, 2, 3],
    event_parts=event_parts,
)
activ_lifec = pd.concat([activ_lifec, biographic_new])

# biographic_new

In [10]:
# for Biometric,
# the first three parts form the "activity"; the next two parts form the "lifecycle"
biometric_new = create_activity_lifecycle(
    activity_label='Biometric',
    activity_cols=[0, 1, 2],
    lifecycle_cols=[3, 4],
    event_parts=event_parts,
)
activ_lifec = pd.concat([activ_lifec, biometric_new])

# biometric_new

In [11]:
# for the others: first part -> "activity"; parts 1-3 -> "lifecycle".
#
# Select them by name rather than by position. `troublemakers` comes from
# Series.unique(), whose order follows first appearance in the data, so
# skipping "the first three" entries only works when the three specially
# handled labels happen to appear first in the log.
already_handled = {'Biometrics', 'Biographic', 'Biometric'}
remaining_frames = [
    create_activity_lifecycle(troublemaker, [0], [1, 2, 3], event_parts)
    for troublemaker in troublemakers
    if troublemaker not in already_handled
]

# A single concat instead of growing the frame inside a loop.
# NOTE: this appends to `activ_lifec`, so re-running the cell without
# re-running the cells that build `activ_lifec` adds the same rows again.
activ_lifec = pd.concat([activ_lifec, *remaining_frames])

In [None]:
# Have a quick look at the combined activity/lifecycle table.
activ_lifec

In [None]:
# Number of distinct activity names (missing values, if any, are counted too).
activ_lifec['activity'].nunique(dropna=False)

In [None]:
# Attach the derived "activity" / "lifecycle" columns to the original log
# (DataFrame.join aligns the two frames on their index).
ext_log = log.join(activ_lifec)

# Sanity checks: each of the three prints below should show True.
# 1) the derived table has as many rows as the original log
print(activ_lifec.shape[0] == log.shape[0])

# 2) every event label starts with its "activity"
check1 = pd.Series(
    [event.startswith(activity) for event, activity in zip(ext_log['event'], ext_log['activity'])],
    index=ext_log.index,
)
print(bool(check1.all()))

# 3) every event label contains its "lifecycle"
check2 = pd.Series(
    [lifecycle in event for event, lifecycle in zip(ext_log['event'], ext_log['lifecycle'])],
    index=ext_log.index,
)
print(bool(check2.all()))

# Prepare for process mining: keep the four relevant columns and rename them.
# The lifecycle becomes 'concept:name' because each activity is later mined as
# its own log. Case ids are stored as strings and timestamps as datetimes.
ext_log = (
    ext_log[['case_id', 'timestamp', 'activity', 'lifecycle']]
    .rename(columns={ 'case_id': 'case:concept:name', 'lifecycle': 'concept:name', 'timestamp': 'time:timestamp' })
    .assign(**{
        'case:concept:name': lambda frame: frame['case:concept:name'].astype(str),
        'time:timestamp': lambda frame: pd.to_datetime(frame['time:timestamp']),
    })
)

In [15]:
# Split the log into one sub-log per activity, kept as (activity name, sub-log) pairs.
labeled_logs = list(ext_log.groupby('activity'))

In [27]:
from mine_utils import mine_dfg, mine_alpha, mine_heur, mine_induct

# For every activity: report its size, save its sub-log, and run each miner on it.
for label, sublog in labeled_logs:
    print(f"{label} (# events: {sublog.shape[0]})")

    # a '/' in the label would be read as a directory separator in the output
    # paths, so replace it once and reuse the result for every file
    file_stem = label.replace('/', '_')

    # store log
    sublog.to_csv(f"lifecycles/logs/{file_stem}")

    # mine process models, one output location per algorithm
    mine_dfg(sublog, f"lifecycles/dfg/{file_stem}")
    mine_alpha(sublog, f"lifecycles/alpha/{file_stem}")
    mine_heur(sublog, f"lifecycles/heur/{file_stem}")
    mine_induct(sublog, f"lifecycles/induct/{file_stem}")

Application Status (# events: 23336)
