In [None]:
import pm4py
import pandas as pd
import numpy as np

# Display configuration: show up to 200 columns and up to 200 characters per cell.
# pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_colwidth", 200)

import pandas as pd
# Turn off pandas' chained-assignment warning for the whole notebook (default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [None]:
# Load the event log from CSV; later cells read its 'event', 'case_id' and
# 'timestamp' columns.
log = pd.read_csv("ircc_uOttawa-filter_evt_10p.csv")
log

In [None]:
# Split every event label on " - " into its positional parts (columns 0, 1, 2, ...).
# `Series.str.split(..., expand=True)` is vectorised, pads shorter labels with
# missing values and keeps `log`'s own index, so the parts stay row-aligned with
# `log` for the index-based join performed later in this notebook.
event_parts = log['event'].str.split(" - ", expand=True)

In [None]:
# Event names that split into more than two parts (the difficult ones):
# distinct first parts (column 0) of all rows that have a third part (column 2).
troublemakers = event_parts.loc[event_parts[2].notna(), 0].unique()
troublemakers

In [None]:
# have a closer look at the troublemakers
# for troublemaker in troublemakers:
#     print(event_parts[event_parts[0]==troublemaker].drop_duplicates())

In [None]:
# The easy ones: labels whose first part is not a troublemaker.
# Their first part is the activity, their second part the lifecycle;
# the multi-part labels are appended to this frame in the following cells.
activ_lifec = (
    event_parts.loc[~event_parts[0].isin(troublemakers), [0, 1]]
    .rename(columns={0: 'activity', 1: 'lifecycle'})
)
# activ_lifec

In [None]:
# add activity_cols values to activity; add lifecycle_cols values to lifecycle
def create_activity_lifecycle(activity_label, activity_cols, lifecycle_cols, event_parts):
    """Build 'activity' and 'lifecycle' labels for one multi-part activity.

    Selects the rows of ``event_parts`` whose first part (column 0) equals
    ``activity_label`` and re-joins the requested parts with " - ".

    Parameters
    ----------
    activity_label : str
        Value of column 0 identifying the rows to process.
    activity_cols : list
        Column labels of ``event_parts`` whose values form the activity name.
    lifecycle_cols : list
        Column labels of ``event_parts`` whose values form the lifecycle name.
    event_parts : pandas.DataFrame
        One row per event, one column per " - "-separated part of its label.

    Returns
    -------
    pandas.DataFrame
        Columns 'activity' and 'lifecycle', indexed like the selected rows.
        Missing parts, and column labels that do not exist in ``event_parts``,
        are skipped; a row with no usable part yields an empty string.
    """
    def join_labels(row, cols):
        # Concatenate the non-missing parts found under `cols`, separated by " - ".
        label = ""
        for col in cols:
            # .get() returns None for a column label this frame does not have,
            # so asking for more parts than any label contains is not an error
            part = row.get(col)
            if pd.isna(part):
                continue
            if label != "":
                label += " - "
            label += part
        return label

    subset = event_parts[event_parts[0] == activity_label]
    rows = [row for _, row in subset.iterrows()]

    # keep the original row labels so the result can be joined back on the index
    return pd.DataFrame(
        {
            'activity': [join_labels(row, activity_cols) for row in rows],
            'lifecycle': [join_labels(row, lifecycle_cols) for row in rows],
        },
        index=subset.index,
    )

In [None]:
# for the 'Biometrics' events,
# parts 0-1 make up the "activity"; parts 2-3 make up the "lifecycle"
biometrics_new = create_activity_lifecycle(
    activity_label='Biometrics',
    activity_cols=[0, 1],
    lifecycle_cols=[2, 3],
    event_parts=event_parts,
)
activ_lifec = pd.concat([activ_lifec, biometrics_new], axis=0)

# biometrics_new

In [None]:
# for the 'Biographic' events,
# part 0 is the "activity"; parts 1-3 make up the "lifecycle"
biographic_new = create_activity_lifecycle(
    activity_label='Biographic',
    activity_cols=[0],
    lifecycle_cols=[1, 2, 3],
    event_parts=event_parts,
)
activ_lifec = pd.concat([activ_lifec, biographic_new], axis=0)

# biographic_new

In [None]:
# for the 'Biometric' events,
# parts 0-2 make up the "activity"; parts 3-4 make up the "lifecycle"
biometric_new = create_activity_lifecycle(
    activity_label='Biometric',
    activity_cols=[0, 1, 2],
    lifecycle_cols=[3, 4],
    event_parts=event_parts,
)
activ_lifec = pd.concat([activ_lifec, biometric_new], axis=0)

# biometric_new

In [None]:
# for the others (every multi-part activity without a dedicated rule above),
# add first part to "activity"; add last three parts to "lifecycle".
# They are selected by name, not by position: `troublemakers` comes from
# `unique()`, which orders values by first appearance in the log, so the three
# specially handled labels are not guaranteed to occupy positions 0-2.
handled_separately = {'Biometrics', 'Biographic', 'Biometric'}

remaining_frames = []
for troublemaker in troublemakers:
    if troublemaker in handled_separately:
        continue
    remaining_frames.append(
        create_activity_lifecycle(troublemaker, [0], [1, 2, 3], event_parts)
    )

# one concat at the end instead of re-copying the growing frame per iteration
activ_lifec = pd.concat([activ_lifec] + remaining_frames)

In [None]:
# keep only the rows whose lifecycle is something other than 'NIL'
activ_lifec = activ_lifec.loc[activ_lifec['lifecycle'].ne('NIL')]

In [None]:
# have a quick look at the combined activity/lifecycle table built in the cells above
activ_lifec

In [None]:
# how many distinct activities there are (a missing value would count as one)
activ_lifec['activity'].nunique(dropna=False)

In [None]:
# attach "activity" & "lifecycle" to the original log; rows are matched on the shared index
ext_log = log.join(activ_lifec, how='inner')

# sanity checks (each prints True when it holds)
# activ_lifec would equal the original log in size only if nothing had been filtered out:
# print(len(activ_lifec) == len(log))
# the inner join must keep every row of activ_lifec
print(len(activ_lifec) == len(ext_log))
# every event label must begin with its "activity"
check1 = ext_log.apply(lambda row: row['event'].startswith(row['activity']), axis=1)
print(len(ext_log[~check1]) == 0)
# every event label must contain its "lifecycle"
check2 = ext_log.apply(lambda row: row['lifecycle'] in row['event'], axis=1)
print(len(ext_log[~check2]) == 0)

# prepare for pm: keep the needed columns, rename them, and fix the dtypes
# (case id as string, timestamp as datetime)
ext_log = (
    ext_log[['case_id', 'timestamp', 'activity', 'lifecycle']]
    .rename(columns={
        'case_id': 'case:concept:name',
        'lifecycle': 'concept:name',
        'timestamp': 'time:timestamp',
    })
    .astype({'case:concept:name': str})
    .assign(**{'time:timestamp': lambda frame: pd.to_datetime(frame['time:timestamp'])})
)

In [None]:
# just to have a wee look
# pd.options.display.max_rows = 100
# ext_log[ext_log['case:concept:name']=="1"].sort_values(by='time:timestamp')

In [None]:
# separate each activity & its lifecycle into a separate log

In [None]:
# but, only do this for activities with >= 3 lifecycle events:
# count the distinct lifecycle values per activity, smallest count first
activ_lifec_counts = (
    activ_lifec
    .groupby('activity')['lifecycle']
    .nunique()
    .reset_index()
    .sort_values(by='lifecycle')
)
activ_lifec_counts

In [None]:
# (continued) names of the activities that have at least 3 distinct lifecycle events
separ_activ = activ_lifec_counts.loc[activ_lifec_counts['lifecycle'] >= 3, 'activity']
separ_activ

In [None]:
# events belonging to the activities selected for separation
to_separ = ext_log.loc[ext_log['activity'].isin(separ_activ)]
to_separ

In [None]:
# one (activity, sub-log) pair per separated activity
labeled_logs = list(to_separ.groupby('activity'))

In [None]:
# from mine_utils import mine_dfg, mine_alpha, mine_heur, mine_induct
# import os, shutil

# subdir="filter_evt_10p/"

# shutil.rmtree(f"lifecycles/{subdir}")
# os.mkdir(f"lifecycles/{subdir}")
# os.mkdir(f"lifecycles/{subdir}logs/")
# os.mkdir(f"lifecycles/{subdir}dfg/")
# os.mkdir(f"lifecycles/{subdir}alpha/")
# os.mkdir(f"lifecycles/{subdir}heur/")
# os.mkdir(f"lifecycles/{subdir}induct/")

# # per activity,
# for label, sublog in labeled_logs:
#     print(f"{label} (# events: {sublog.shape[0]})")

#     # store log
#     sublog.to_csv(f"lifecycles/{subdir}/logs/{label.replace('/', '_')}")
    
#     # mine process model
#     mine_dfg(sublog, f"lifecycles/{subdir}/dfg/{label.replace('/', '_')}")
#     mine_alpha(sublog, f"lifecycles/{subdir}/alpha/{label.replace('/', '_')}")
#     mine_heur(sublog, f"lifecycles/{subdir}/heur/{label.replace('/', '_')}")
#     mine_induct(sublog, f"lifecycles/{subdir}/induct/{label.replace('/', '_')}")

In [None]:
# per case, for the subprocesses, replace all activity lifecycle events by single start & end event

# order events by time inside every (case, activity) group
sorted_grouped = (
    to_separ
    .sort_values(['case:concept:name', 'activity', 'time:timestamp'])
    .groupby(['case:concept:name', 'activity'])
)
# first entry per group becomes "<activity>:start"
start_evts = sorted_grouped.first().reset_index().assign(
    **{'concept:name': lambda evts: evts['activity'] + ':start'}
)
# last entry per group becomes "<activity>:end"
end_evts = sorted_grouped.last().reset_index().assign(
    **{'concept:name': lambda evts: evts['activity'] + ':end'}
)
abstract_log = pd.concat([start_evts, end_evts])
# abstract_log = end_evts 

In [None]:
# re-add the non-subprocess activities

# `.assign` returns a new frame, so the relabelled column is never written into a
# filtered slice of `ext_log` (which would be a chained assignment).
# Event names become "<activity>:<lifecycle>".
not_separ = ext_log[~ext_log['activity'].isin(separ_activ)].assign(
    **{'concept:name': lambda evts: evts['activity'] + ":" + evts['concept:name']}
)
# NOTE: this extends `abstract_log` with itself as input; re-running this cell without
# re-running the previous one appends the non-subprocess events a second time.
abstract_log = (
    pd.concat([abstract_log, not_separ], ignore_index=True)
    .sort_values(by=['case:concept:name', 'time:timestamp'])
)

In [None]:
# delete 'duplicate' events, i.e., same events in a trace occurring within x seconds of each other

# NOTE assumes that timestamps are sorted within each group
# Time elapsed since the previous event with the same case and the same event name,
# stored as an integer (nanoseconds when the timestamps are datetime64[ns]).
# The first event of every (case, event name) group has no predecessor, so diff()
# yields NaT there; it is stored as 0 because NaT has no well-defined integer value.
# The `> 0` filters applied to this column skip those rows.
# 'int64' is spelled out because plain `int` can be 32 bits wide on some platforms.
diff = (
    abstract_log
    .groupby(['case:concept:name', 'concept:name'])['time:timestamp']
    .diff()
    .fillna(pd.Timedelta(0))
    .astype('int64')
)
abstract_log['diff'] = diff
abstract_log[abstract_log['diff']>0]

In [None]:
# index labels of repeated events that follow their predecessor by less than max_diff seconds
max_diff = 10
billion = 10 ** 9  # 'diff' holds nanoseconds; used to express max_diff in the same unit
to_drop = abstract_log.index[
    (abstract_log['diff'] > 0) & (abstract_log['diff'] < max_diff * billion)
]
to_drop

In [None]:
# remove the rows flagged as near-duplicates
abstract_log2 = abstract_log.drop(index=to_drop)
abstract_log2

In [None]:
# keep only the three columns needed downstream (case id, event name, timestamp)
abstract_log2 = abstract_log2.loc[:, ['case:concept:name', 'concept:name', 'time:timestamp']]

# abstract_log2[abstract_log2['case:concept:name']=="1"].sort_values(by='time:timestamp')
abstract_log2

In [145]:
# store the abstracted log as CSV; the DataFrame index is written too
# (pandas' `to_csv` writes the index by default)
abstract_log2.to_csv("abstract_log-starts_ends.csv")

In [None]:
# TODO 

# we do not differentiate between small subprocesses & non-subprocess events
# we simply leave "small" subprocesses (2 evts or less) in there
# some groups of starts / ends always occur together
# seems second/... starts are subprocesses of first/... starts

# √ aggregate events where order does not matter
# *Fee:outstanding ; *Fee:paid/exempt
# replace with singular event when all or subset of group events have occurred

# to automatically detect these two cases; 
# detect tandem pairs (but, what if events interleave; allow noise?)
# to detect tandem pairs; use dcr/decl?
# WASTE OF TIME
# these events have the exact same timestamp
# per case, find events with exact same timestamp

# check time durations

In [None]:
# (cope with timestamps that are a few seconds from each other)

# diff = abstract_log2.groupby(['case:concept:name'])['time:timestamp'].diff().astype(int)
# abstract_log2['diff'] = diff
# abstract_log2[abstract_log2['diff']>0]

# max_diff = 10
# billion=pow(10,9) # get diffs in seconds
# to_drop = abstract_log[(abstract_log['diff']>0) & (abstract_log['diff']<max_diff*billion)].index
# to_drop

# abstract_log2 = abstract_log[~abstract_log.index.isin(to_drop)]
# abstract_log2

In [None]:
from mine_utils import aggregate_events_replace_last

# work on an independent (deep) copy so abstract_log2 itself stays untouched
abstract_log3 = abstract_log2.copy(deep=True) #abstract_log2[abstract_log2['case:concept:name'].isin(["1","2","3"])]

In [None]:
# all distinct event names in the abstracted log
activs = abstract_log3.loc[:, 'concept:name'].unique()

In [None]:
# event names that mention "Outstanding"
fees_out = list(filter(lambda name: "Outstanding" in name, activs))
print(fees_out)
abstract_log3 = aggregate_events_replace_last(abstract_log3, fees_out, "Outstanding:last", False)

In [None]:
# event names that mention "Paid" or "Exempt"
fees_paid_exm = [
    name for name in activs
    if any(keyword in name for keyword in ("Paid", "Exempt"))
]
print(fees_paid_exm)
abstract_log3 = aggregate_events_replace_last(abstract_log3, fees_paid_exm, "Paid_Exm:last", False)

In [None]:
# have a wee look at the events of case "3"
abstract_log3.loc[abstract_log3['case:concept:name'] == "3"]

In [None]:
# Sanity check: compare the row count after the two aggregation steps above
# with a hand-computed expectation.
# NOTE(review): the literals below are hard-coded and not derived from the data in
# this notebook. They look like (mean events per case, number of cases) for the
# "Outstanding" and the "Paid"/"Exempt" groups from an earlier run - confirm, and
# recompute them whenever the input log changes.
size0 = abstract_log2.shape[0]
size1 = size0 - ((2.352875040154192 - 1) * 12452) - ((2.3639868744872845 - 1) * 12190)

# size1 is a float while the actual size is an int; the two are only printed, not compared
print("expected:", size1)
print("actual:", abstract_log3.shape[0])

In [None]:
from log_stats import get_trace_lengths
# summary statistics (`.describe()`) of what `get_trace_lengths` returns for the aggregated log
# NOTE(review): presumably one trace length per case - confirm in log_stats
get_trace_lengths('concept:name', 'case:concept:name', abstract_log3, plot=False).describe()

In [None]:
# number of distinct case ids in the aggregated log
abstract_log3['case:concept:name'].nunique(dropna=False)

In [None]:
from variant_stats import get_variants_stats, get_variant_coverage, get_covering_variants, filter_traces_on_variants

In [None]:
# per-variant statistics of the aggregated log; later cells read its 'cov_perc' column
# NOTE(review): the exact columns are defined in variant_stats - confirm there
var_stats = get_variants_stats(abstract_log3)
var_stats

In [None]:
# NOTE: the name `vars` shadows Python's builtin `vars()` for the rest of the session.
# NOTE(review): 75 is presumably the target coverage percentage (the CSV written further
# down is named "...cov_var_75perc") - confirm against `get_covering_variants`.
vars = get_covering_variants(75, var_stats)
vars
# vars = var_stats[var_stats['cov_perc']>=1]
# vars

In [None]:
# filter on 0.1% cov_perc: 14% remaining
# filter on 1% cov perc: 48% remaining

# sum of the 'cov_perc' column over the selected variants
print("remaining coverage:", vars['cov_perc'].sum())

In [None]:
# NOTE(review): presumably keeps only the traces whose variant is listed in `vars` -
# confirm against `filter_traces_on_variants`
abstract_log4 = filter_traces_on_variants(abstract_log3, vars)
abstract_log4

In [None]:
# store the variant-filtered log as CSV; the DataFrame index is written too
# (pandas' `to_csv` writes the index by default)
abstract_log4.to_csv("abstract_log-starts_ends-cov_var_75perc.csv")

In [None]:
import pm4py.objects.log.exporter.xes.exporter as xes_export
# xes_export.apply(abstract_log4, "abstract_log-starts_ends-cov_var_75perc.xes")

In [None]:
from mine_utils import mine_heur

# NOTE(review): presumably mines a heuristics-miner model from the filtered log; what it
# returns or writes is not visible here - see mine_utils
mine_heur(abstract_log4)