In [None]:
# notebook for new ircc dataset (combined_event_log_anonymous.csv)

# TODO
# √ mark subprocesses that are fully automated; incorporate in viewer
# in general, nesting subprocesses in dcr (paper)

In [None]:
# data/combined_event_log_anonymous.csv
# -> data/combined_event_log-filt_evt1p.csv
#   -> data/combined_event_log-filt_evt1p-time1m.csv

# subprocesses:
#   data/combined_event_log-filt_evt1p-time1m.csv
#   -> data/combined_event_log-abstracted.csv + sublogs in level2 (based on status) 
#       -> data/combined_event_log-abstracted2.csv + sublogs in level1 (based on nesting)

In [None]:
import pm4py
import pandas as pd
import numpy as np
from mine_utils import get_log

# pd.options.display.max_rows = 1000
pd.options.display.max_columns = 200
pd.options.display.max_colwidth = 200

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Preprocess logs

## Variant analysis

In [None]:
# huge amount of variability

# total # traces = 7734 traces

# (non-abstr) new log: # vars = 7665, ratio = 99.10783553141971
#   (a variant covers at most 4 traces ...)
#   filter evts in less than X of cases:
#       1%: 7664 variants: minus 1 variant
#       10%: 7657 variants: minus 8 variants
#   same time for events within 1min:
#       ** 6030 variants: minus 1635 variants
#       + above (1%): 6014 variants: minus 1651 variants
#       + above (10%): 5841 variants: minus 1824 variants

# abstr new log: # vars = 5145, ratio = 66.5244375484872
#   same time for events within 1min: # vars = 3654, ratio = 47.25
#   + filter evts in less than 1% of cases: # vars = 3564, ratio = 46.08

# (abstr or log: # traces = 7734, # vars = 2642, ratio = 34.16084820274114)


In [None]:
from variant_stats import get_variants_stats, get_variant_ratio, get_variant_coverage, get_covering_variants, filter_traces_on_variants

In [None]:
log = pd.read_csv("data/combined_event_log-filt_evt1p.csv")
vars_log = get_variants_stats(log)
print(get_variant_ratio(log, vars_log))
vars_log

In [None]:
# much bigger improvement, clearly

log = pd.read_csv("data/combined_event_log-time1m.csv")
vars_log = get_variants_stats(log)
print(get_variant_ratio(log, vars_log))
vars_log

In [None]:
log = pd.read_csv("data/combined_event_log-filt_evt1p-time1m.csv")
vars_log = get_variants_stats(log)
print(get_variant_ratio(log, vars_log))
vars_log

In [None]:
# Load the raw (anonymised) event log and map its columns onto the
# case / activity / timestamp names used by the rest of this notebook.
log = pd.read_csv("data/combined_event_log_anonymous.csv")
# parse the timestamp column before it is renamed below
log['act_upd_date'] = pd.to_datetime(log['act_upd_date'])
log = log.rename({ 'app_num': 'case:concept:name', 'activity_full_value': 'concept:name', 'act_upd_date': 'time:timestamp' }, axis=1)

# Two abstracted logs to compare against the raw one
# ("or" presumably stands for the original/older abstraction -- TODO confirm).
abstr_log_or = pd.read_csv("data/or/abstract_log-starts_ends-v2.csv")
abstr_log_new = pd.read_csv("data/combined_event_log-abstracted2.csv")

In [None]:
# Compare the variant ratio of the raw log with those of the two abstracted logs.
print("total # traces:", len(log['case:concept:name'].unique()))

vars_log = get_variants_stats(log)
vars_abstr_log_or = get_variants_stats(abstr_log_or)
vars_abstr_log_new = get_variants_stats(abstr_log_new)

# Each ratio is computed against the log its variants were extracted from;
# previously all three used the raw `log`, which only gives the right answer
# when every abstracted log happens to contain exactly the same set of traces.
print("log:", get_variant_ratio(log, vars_log))
print("abstr log or:", get_variant_ratio(abstr_log_or, vars_abstr_log_or))
print("abstr log new:", get_variant_ratio(abstr_log_new, vars_abstr_log_new))

## Infrequent events

In [None]:
log = pd.read_csv("data/combined_event_log_anonymous.csv")
log = log.rename({ 'app_num': 'case:concept:name', 'activity_full_value': 'concept:name', 'act_upd_date': 'time:timestamp' }, axis=1)

In [None]:
log[log['activity']=='Other Reqs Assessment']['activity_status'].drop_duplicates()

In [None]:
from log_stats import count_cases_per_event

activ_cases_counts = count_cases_per_event('concept:name', 'case:concept:name', log).reset_index()

In [None]:
activ_cases_counts[activ_cases_counts['perc']<1] #.to_csv("data/dropped_events.csv")

In [None]:
activ_cases_counts[activ_cases_counts['perc']<10]

In [None]:
# let's filter out activities that occur in fewer than 1% of cases
to_drop = activ_cases_counts.loc[activ_cases_counts['perc']<1, 'concept:name']
log_filter = log[~ log['concept:name'].isin(to_drop)]

In [None]:
log_filter.to_csv("data/combined_event_log-filt_evt1p.csv")

## Timestamp differences

In [None]:
# log = pd.read_csv("data/combined_event_log_anonymous.csv")

# Reload the event-filtered log written in the "Infrequent events" section.
log = pd.read_csv("data/combined_event_log-filt_evt1p.csv")
# 'Unnamed: 0' is the old DataFrame index that to_csv() stored as a nameless column
log = log.drop('Unnamed: 0', axis=1)

# NOTE(review): the filtered file was saved from an already-renamed frame, so this
# rename is presumably a no-op here; it matters only when loading the raw log instead.
log = log.rename({ 'app_num': 'case:concept:name', 'activity_full_value': 'concept:name', 'act_upd_date': 'time:timestamp' }, axis=1)
# timestamps come back from CSV as strings; convert them to datetimes
log['time:timestamp'] = pd.to_datetime(log['time:timestamp'])

In [None]:
from mine_utils import get_time_diff, equal_timestamps_interval

In [None]:
log = get_time_diff(log)

In [None]:
log[log['activity']=='Associate Medicals'].groupby('case:concept:name')['time_diff'].mean().mean()

In [None]:
# (check all sequential events with exact same timestamp)
print("num simult (0 sec):", len(log[log['time_diff']==0]), " <> total num:", log.shape[0])

In [None]:
# (check all sequential events with less than 1min difference in timestamps)
print("num simult (1 min):", len(log[log['time_diff'] < 60]), " <> total num:", log.shape[0])

In [None]:
# (takes ca. 2-4 sec)

log = equal_timestamps_interval(log, 60) # 60 sec

In [None]:
# log.to_csv("data/combined_event_log-time1m.csv")
log.to_csv("data/combined_event_log-filt_evt1p-time1m.csv")

# checkout variant analysis for seeing whether it improved matters

# Subprocesses

In [None]:
src_file = "data/combined_event_log-filt_evt1p.csv"
# src_file = "data/combined_event_log-filt_evt1p-time1m.csv"

tgt_folder = "lifecycles/filt_evt1p/data"

## Subprocesses based on status

In [None]:
# log = pd.read_csv("data/combined_event_log_anonymous.csv")
# log = log.rename({ 'app_num': 'case:concept:name', 'act_upd_date': 'time:timestamp' }, axis=1)

log = pd.read_csv(src_file)
log = log.drop('Unnamed: 0', axis=1)
log

In [None]:
# get all unique activities
log['activity'].drop_duplicates().sort_values().to_excel("data/all_activities.xlsx")

In [None]:
# get activity subprocesses
activ_lifecycles = log[['activity', 'activity_status']].drop_duplicates().sort_values(by=['activity'])
activ_lifecycles.to_excel("data/activity_lifecycles.xlsx")

In [None]:
# count number of sub-activities in each subprocess
# distinct (activity, status) pairs -> number of statuses per activity, largest first
status_pairs = log[['activity', 'activity_status']].drop_duplicates()
counts = (
    status_pairs
    .groupby('activity')['activity_status']
    .count()
    .sort_values(ascending=False)
)
counts

In [None]:
# only separate subprocesses with >= 3 sub-activities
parent_activ = counts[counts >= 3].reset_index()
parent_activ

In [None]:
subproc_evts = log[log['activity'].isin(parent_activ['activity'])]
subproc_evts

In [None]:
non_subproc_evts = log[~ log['activity'].isin(parent_activ['activity'])]
non_subproc_evts

### Create separate logs per subprocess

In [None]:
from separ_subproc import separ_subproc

separ_subproc(subproc_evts, non_subproc_evts, 'activity', 'activity_status', 'concept:name', f"{tgt_folder}/level2/", f"{tgt_folder}/combined_event_log-abstracted_status.csv")

## Subprocesses based on nesting

### Check original nesting file

In [None]:
nestings = pd.read_excel("data/nested_activities-original.xlsx")
log = pd.read_csv("data/combined_event_log_anonymous.csv")

In [None]:
nestings[~ (nestings['Activity'].isin(log['activity']))]

In [None]:
cost_recov = nestings[nestings['Activity']=='Cost Recovery']['Parent Item']
cost_recov[~ (cost_recov.isin(log['activity']))]

In [None]:
print(len(log[log['activity']=='Criminality Assessment']))
print(len(log[log['activity']=='Medical Assessment']))
print(len(log[log['activity']=='Misrep Assessment']))
print(len(log[log['activity']=='Security Assessment']))

In [None]:
print(log[log['activity'].str.startswith('Biographic')]['activity'].drop_duplicates())
print()
print(log[log['activity'].str.startswith('Biometric')]['activity'].drop_duplicates())
print()
print(log[log['activity'].str.startswith('Criminal')]['activity'].drop_duplicates())
print()
print(log[log['activity'].str.startswith('Medical')]['activity'].drop_duplicates())
print()
print(log[log['activity'].str.startswith('Security')]['activity'].drop_duplicates())

### Find subprocesses

In [None]:
nestings = pd.read_excel("data/nested_activities-fixed.xlsx")
nestings

In [None]:
# make sure there's no non-existent activities in the nesting file
or_log = pd.read_csv("data/combined_event_log_anonymous.csv")
nestings[~ nestings['Activity'].isin(or_log['activity'])].sort_values(by='Activity')

In [None]:
abstr_log = pd.read_csv(f"{tgt_folder}/combined_event_log-abstracted_status.csv")
abstr_log

In [None]:
# connect parent items to events
# left merge; also keep events that are not being nested
abstr_log_parent = abstr_log.merge(nestings, left_on='activity', right_on='Activity', how='left')
abstr_log_parent

In [None]:
subproc_evts = abstr_log_parent[abstr_log_parent['Parent Item'].notna()]
# (non-nested events; those not merged with parent)
non_subproc_evts = abstr_log_parent[abstr_log_parent['Parent Item'].isna()]

### Create separate logs per nested activity

In [None]:
from separ_subproc import separ_subproc
from shutil import copy

import os

# Split nested events into one sublog per parent item (level 1) and write the
# abstracted main log. The sublog folder is passed with a trailing slash, exactly
# like the level-2 call earlier in this notebook, so both calls hand the helper
# the same kind of path regardless of how it joins folder and file name.
separ_subproc(subproc_evts, non_subproc_evts, 'Parent Item', 'concept:name', 'concept:name', f"{tgt_folder}/level1/", f"{tgt_folder}/combined_event_log-abstracted_nesting.csv")

# The abstracted log doubles as the level-0 ("main") log; make sure the target
# folder exists first, since shutil.copy does not create missing directories.
os.makedirs(f"{tgt_folder}/level0", exist_ok=True)
copy(f"{tgt_folder}/combined_event_log-abstracted_nesting.csv", f"{tgt_folder}/level0/main.csv")

## Sanity check

In [None]:
import os

# Folder holding one CSV per status-based (level-2) subprocess.
# NOTE(review): the subprocess cells above write their sublogs under
# f"{tgt_folder}/level2/", which is a different folder -- confirm this path is current.
# (`dir` also shadows the Python builtin of the same name for the rest of the session.)
dir = "lifecycles/level2/logs"
# list of (file name, sublog DataFrame) pairs; sub-directories are skipped
sublogs_lvl2 = [ (f, pd.read_csv(os.path.join(dir, f))) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) ]

### Check original events

In [None]:
# same input as subprocesses part
# (NOTE - original log will have to be extended with index for sanity code to work)
log = pd.read_csv("data/combined_event_log-filt_evt1p-time1m.csv")

In [None]:
from collections import defaultdict

found_indexes = set()

# > subprocesses based on status

# - find original events in the sublogs

for f, sublog in sublogs_lvl2:
    # file name up to the first dot identifies the activity
    f_name = f[0:f.index(".")]

    # join original log with the sublog on the shared event index
    log_merged = log.merge(sublog, left_on='index', right_on='index')

    # (apply same string operation as on the file name)
    activ_file = (log_merged['activity_x'].str.replace("/", "_") == f_name)
    # for all matches, activity names should correspond to file name;
    # on failure, report the distinct offending activity names
    assert activ_file.all(), f"{f_name} <> {log_merged.loc[~activ_file, 'activity_x'].unique().tolist()}"
    # for all matches, activity statuses should correspond to sublog activity name
    # (message is now a real f-string, so the file name actually shows up)
    assert (log_merged['activity_status'] == log_merged['concept:name_y']).all(), f"file: {f_name}"

    # remember which original events were accounted for
    found_indexes.update(log_merged['index'])

In [None]:
# - find original events in the abstracted log

# only interested in activities without parent (non-nested)
# only interested in activities without parent (non-nested)
# (`parent_activ` comes from the status-based subprocess section above;
#  NOTE(review): it was derived from `src_file`, not from the log loaded for this
#  sanity check -- confirm both describe the same set of activities)
log_filter = log[~ log['activity'].isin(parent_activ['activity'])]
# join original log with the abstracted log on the shared event index;
# overlapping column names get the _x (original) / _y (abstracted) suffixes
log_merged = log_filter.merge(abstr_log, left_on='index', right_on='index')
# print(log_merged)

activ_name = (log_merged['concept:name_x'] == log_merged['concept:name_y'])
# activity names should correspond; on failure the mismatching pairs are shown
assert (activ_name).all(), log_merged[~ activ_name][['concept:name_x', 'concept:name_y']]

# add these events to the set already collected from the sublogs
found_indexes.update(log_merged['index'])

In [None]:
# all events found?
assert len(found_indexes) == log.shape[0], f"{len(found_indexes)} <> {log.shape[0]}"

### Check subprocesses

In [None]:
# > subprocesses based on nesting

# per case, check if activity's start/end markers correspond to first & last sorted events of the activity

def check_subproc(abstr_log, marker_label, is_start, sublog_dir="lifecycles/level2/logs"):
    """Check that subprocess start/end markers point at the right original events.

    For every activity that has marker events in ``abstr_log`` (events whose
    ``concept:name`` ends with ``marker_label``), the activity's sublog is read
    from ``sublog_dir`` and, per case, its first (``is_start=True``) or last
    (``is_start=False``) event is looked up. The ``index`` values of those
    events must equal the ``index`` values of the markers, case by case.

    Parameters:
        abstr_log: DataFrame with at least the columns ``concept:name``,
            ``activity``, ``case:concept:name`` and ``index``.
        marker_label: suffix identifying the markers, e.g. ``" [begin]"``.
        is_start: compare against the first (True) or last (False) event of
            each case in the sublog.
        sublog_dir: folder containing ``<activity>.csv`` sublogs, where ``/``
            in the activity name is replaced by ``_``.

    Raises:
        AssertionError: (carrying the activity name) when markers and sublog
            delimiters disagree, including when their counts differ.

    Activities without a sublog file are skipped. Previously such an activity
    was silently checked against the sublog of the preceding activity, or
    raised a NameError when it came first.
    """
    # group all start/end markers by activity
    markers = abstr_log[abstr_log['concept:name'].str.endswith(marker_label)]

    # for each activity & their markers for all cases
    for activ, g in markers.groupby('activity'):
        # find sublog corresponding to activity
        path = f"{sublog_dir}/{activ.replace('/', '_')}.csv"
        if not os.path.exists(path):
            # nothing to compare against for this activity
            continue
        sublog = pd.read_csv(path)

        # sort markers on case
        g = g.sort_values(by='case:concept:name').reset_index()

        # in the activity's sublog, find the firsts/lasts for each case
        gb = sublog.groupby('case:concept:name')
        delims = (gb.first() if is_start else gb.last()).reset_index()
        # also sort on case, so both sides line up positionally
        delims = delims.sort_values(by='case:concept:name').reset_index()

        # firsts/lasts should be the same as the start/end markers; comparing
        # plain lists also turns a length mismatch into a labelled assertion
        # failure instead of a pandas ValueError
        assert g['index'].tolist() == delims['index'].tolist(), activ
        
from pathlib import Path

# (marker suffix, compare-against-first-event?) for begin and end markers
marker_checks = ((" [begin]", True), (" [end]", False))

# markers in the abstracted main log
for marker_label, is_start in marker_checks:
    check_subproc(abstr_log, marker_label, is_start)

# markers inside every nesting-based (level-1) sublog
path = "lifecycles/level1/logs/"
paths = Path(path).rglob("*.csv")
for path in paths:
    abstr_sublog = pd.read_csv(path)
    for marker_label, is_start in marker_checks:
        check_subproc(abstr_sublog, marker_label, is_start)
    

## Variant analysis (bis)

### Main variant (no filtering)

In [None]:
from variant_stats import get_variants_stats, get_variant_ratio, get_variant_coverage, get_covering_variants, filter_traces_on_variants

In [None]:
log = get_log("data/combined_event_log-abstracted2.csv")
log = log[['case:concept:name', 'concept:name', 'time:timestamp']]

In [None]:
var_stats = get_variants_stats(log)
print(get_variant_ratio(log, var_stats))
var_stats

In [None]:
singleton_vars = var_stats[var_stats['cov_amt']==1]

# ooph, a lot of traces here ...
singleton_vars['cov_amt'].sum() / var_stats['cov_amt'].sum()

### Subprocess variants (filtering!)

In [None]:
# some ad-hoc analysis

# log = pd.read_csv("/Users/wvw/git/pm/ircc/lifecycles/level1/logs/Other Req Activity.csv")
# var_stats = get_variants_stats(log, plot=False)
# print(var_stats)
# rare_vars = var_stats[var_stats['cov_perc'] < 1]
# print()
# print(rare_vars['cov_perc'].sum().round(2))
# print((rare_vars['sequence'].count() / var_stats.shape[0] * 100).round(2))

In [None]:
# filter the variants
# (ca. 3-5 s)

# Root of the sublog folders, relative to the notebook's working directory like
# every other data path in this notebook (was a machine-specific absolute path).
dir = "lifecycles"
for subdir in [ "level1", "level2" ]:
    print(">", subdir)
    for path in Path(os.path.join(dir, subdir, "logs")).rglob("*.csv"):
        name = os.path.basename(path)
        log = pd.read_csv(path)
        var_stats = get_variants_stats(log, plot=False)

        # variants that each cover less than 1% of the traces
        rare_vars = var_stats[var_stats['cov_perc'] < 1]
        # report: share of traces and share of variants that will be dropped
        print(name, ":", rare_vars['cov_perc'].sum().round(2), "% traces", "(", (rare_vars['sequence'].count() / var_stats.shape[0] * 100).round(2), "% vars)")

        # keep only traces belonging to variants that cover at least 1%
        flog = filter_traces_on_variants(log, var_stats[var_stats['cov_perc'] >= 1])
        # NOTE(review): this overwrites the sublog in place and stores the
        # DataFrame index as an extra column -- re-running the cell filters an
        # already-filtered file; confirm that is intended.
        flog.to_csv(path)

    print()

## (Timestamp differences (bis))

In [None]:
log = get_log("data/combined_event_log-abstracted2.csv")
log = log[['case:concept:name', 'concept:name', 'time:timestamp']]
log

In [None]:
from mine_utils import get_time_diff

logd = get_time_diff(log)

In [None]:
# (check all sequential events with less than 1min difference in timestamps)
print("num simult (1 min):", len(logd[ (logd['time_diff'] > 0) & (logd['time_diff'] < 60)]), " <> total num:", logd.shape[0])

# none to be found

## (Infrequent events (bis))

In [None]:
log = pd.read_csv("data/combined_event_log-abstracted2.csv")

from log_stats import count_cases_per_event
activ_cases_counts = count_cases_per_event('concept:name', 'case:concept:name', log).reset_index()

In [None]:
activ_cases_counts[activ_cases_counts['perc']<1]

# Mine process models

## Ad-hoc mining

In [None]:
from variant_stats import get_variants_stats, get_variant_ratio, get_variant_coverage, get_covering_variants, filter_traces_on_variants
from mine_utils import get_log, ProcAnn, mine_heur, mine_induct, mine_alpha, mine_dfg

### Main process

In [None]:
# xes_export is otherwise only imported in the "Systematic mining" section further
# down, so on a fresh kernel this cell failed with a NameError.
import pm4py.objects.log.exporter.xes.exporter as xes_export

log = get_log("data/combined_event_log-abstracted2.csv")

# export only the Cost Recovery begin/end markers (all columns) for separate inspection
cost_recov = log[log['concept:name'].isin(['Cost Recovery [begin]', 'Cost Recovery [end]'])]
cost_recov.to_csv("data/cost_recov.csv")
xes_export.apply(cost_recov, "data/cost_recov.xes")

# mining and the XES export only need case id, timestamp and activity name
# (this projection used to be repeated a second time further down)
log = log[['case:concept:name', 'time:timestamp', 'concept:name']]

mine_heur(log, ProcAnn.FREQ, "graphs/combined_event_log-abstracted2-time1m")
mine_induct(log, convert_to='petri_net', output_path="graphs/combined_event_log-abstracted2-time1m-pn")

xes_export.apply(log, "data/combined_event_log-abstracted2.xes")

### Subprocess

In [None]:
# Verification sublog (status-based, level 2); path is relative to the notebook's
# working directory like the other data paths, instead of a user-specific absolute path.
verif = get_log("lifecycles/level2/logs/Verification.csv")
verif

In [None]:
vars = get_variants_stats(verif)
vars

In [None]:
mine_dfg(verif)

## Systematic mining

In [None]:
tgt_folder = "lifecycles/filt_evt1p"

In [None]:
from mine_utils import ProcAnn, mine_heur, mine_induct, mine_alpha, mine_dfg
import pm4py.objects.log.exporter.xes.exporter as xes_export
from pathlib import Path

In [None]:
import shutil, os

def init_subdir(subdir, subsubdirs=()):
    """Create ``subdir`` as an empty directory, optionally with child directories.

    An existing ``subdir`` is removed together with everything in it before it
    is recreated. Missing parent directories are created as well, so the
    function also works when the enclosing level folder does not exist yet.

    Parameters:
        subdir: path of the directory to (re)create.
        subsubdirs: names of directories to create directly inside ``subdir``.
    """
    if os.path.exists(subdir):
        # start from a clean slate: drop output of earlier runs
        shutil.rmtree(subdir)
    # makedirs (rather than mkdir) so a missing parent is not an error
    os.makedirs(subdir)
    for subsubdir in subsubdirs:
        os.makedirs(os.path.join(subdir, subsubdir))

In [None]:
induct_formats = ['bpmn', 'petri_net']
formats_with_ann = ['dfg']

In [None]:
def save_entries_json(names, default_format, default_ann, path):
    """Write ``graphs.json`` into ``path``, listing the mined models and their defaults.

    The file has the shape
    ``{"all": [<name>, ...], "prefs": {<name>: {"format": ..., "ann": ...}, ...}}``.
    The ``"ann"`` entry is only present when ``default_format`` is one of
    ``formats_with_ann``.

    Parameters:
        names: names of the logs/models to list.
        default_format: format entered as each model's ``"format"`` preference.
        default_ann: annotation whose ``.value`` is entered as ``"ann"``;
            only read for formats that carry annotations.
        path: directory in which ``graphs.json`` is written.
    """
    import json

    def entry_pref():
        # preference record shared by all entries (a fresh dict per entry)
        pref = {"format": default_format}
        if default_format in formats_with_ann:
            pref["ann"] = default_ann.value
        return pref

    # names were always emitted as JSON strings; keep that for non-string input
    entry_names = [str(n) for n in names]
    obj = {
        "all": entry_names,
        "prefs": {n: entry_pref() for n in entry_names},
    }
    # json.dump escapes quotes/backslashes in names (the hand-built string did
    # not), and the context manager closes the file deterministically
    with open(os.path.join(path, "graphs.json"), "w") as out:
        json.dump(obj, out)

In [None]:
# mine process models for sublogs
# (ca. 1 min)

# For each subprocess level: reset the output folders, then for every sublog CSV
# export it to XES and mine DFG, heuristics-net and inductive-miner models.
for lvl in [ 1, 2 ]:
    # subprocess level (levels 1-2)
    subdir = os.path.join(tgt_folder, f"level{lvl}")

    # default model & annotation to be shown
    default_format = "bpmn"
    default_ann = ProcAnn.FREQ

    # (re)create the output folders; init_subdir wipes whatever was there before
    init_subdir(os.path.join(subdir, "xes"))
    init_subdir(os.path.join(subdir, "dfg"), [ ProcAnn.FREQ.value, ProcAnn.PERF.value ])
    init_subdir(os.path.join(subdir, "heur"), [ ProcAnn.FREQ.value, ProcAnn.PERF.value ])
    # NOTE: `format` shadows the Python builtin for the rest of the session
    for format in induct_formats:
        init_subdir(os.path.join(subdir, format))

    # names of all sublogs processed at this level; written to graphs.json below
    names = [ ]
    for path in Path(os.path.join(tgt_folder, "data", f"level{lvl}")).rglob("*.csv"):
        file = os.path.basename(path)
        # sublog name = file name up to the first ".csv"
        name = file[0: file.index(".csv")]
        print(name)
        names.append(name)
        
        log = pd.read_csv(path)
        # keep only the three columns used for export and mining
        log = log[['case:concept:name', 'time:timestamp', 'concept:name']]
        
        # normalise dtypes after the CSV round-trip
        # (assumes case ids are numeric -- astype raises otherwise)
        log['case:concept:name'] = log['case:concept:name'].astype('int64')
        log['time:timestamp'] = pd.to_datetime(log['time:timestamp'])
        
        xes_export.apply(log, os.path.join(subdir, "xes", name + ".xes"))
        
        # one DFG and one heuristics net per annotation kind defined in ProcAnn
        for ann in ProcAnn:
            mine_dfg(log, ann, output_path=os.path.join(subdir, "dfg", ann.value, name), save_gviz=True)
            mine_heur(log, ann, output_path=os.path.join(subdir, "heur", ann.value, name), save_gviz=True)
        
        # inductive miner result, converted to each of the listed formats
        for format in induct_formats:
            mine_induct(log, convert_to=format, output_path=os.path.join(subdir, format, name), save_gviz=True)

    # index file for this level: all names plus their default format/annotation
    save_entries_json(names, default_format, default_ann, subdir)

In [None]:
# mine models for main process
# (ca. 30-40 sec)

# Load the abstracted main log (level 0) and keep only the columns needed for mining.
log = pd.read_csv(os.path.join(tgt_folder, "data", "level0", "main.csv"))
log = log[['case:concept:name', 'time:timestamp', 'concept:name']]
# timestamps come back from CSV as strings
log['time:timestamp'] = pd.to_datetime(log['time:timestamp'])

subdir = os.path.join(tgt_folder, "level0")

# NOTE(review): no "dcr" output is produced in this cell; presumably the DCR model
# comes from the separate DCR notebook -- confirm it ends up where the default expects it.
default_format = "dcr"
default_ann = ProcAnn.FREQ

# (re)create the output folders; init_subdir wipes whatever was there before
init_subdir(os.path.join(subdir, "logs"))
init_subdir(os.path.join(subdir, "dfg"), [ ProcAnn.FREQ.value, ProcAnn.PERF.value ])
init_subdir(os.path.join(subdir, "heur"), [ ProcAnn.FREQ.value, ProcAnn.PERF.value ])
init_subdir(os.path.join(subdir, "bpmn"))
init_subdir(os.path.join(subdir, "petri_net"))
init_subdir(os.path.join(subdir, "xes"))

name = "main"
# keep a copy of the exact log the models were mined from
# (to_csv also stores the DataFrame index as a nameless first column)
log.to_csv(os.path.join(subdir, "logs", name + ".csv"))

# one DFG and one heuristics net per annotation kind defined in ProcAnn
for ann in ProcAnn:
    mine_dfg(log, ann, output_path=os.path.join(subdir, "dfg", ann.value, name), save_gviz=True)
    mine_heur(log, ann, output_path=os.path.join(subdir, "heur", ann.value, name), save_gviz=True)

# inductive miner result, converted to each format in induct_formats
# (NOTE: `format` shadows the Python builtin for the rest of the session)
for format in induct_formats:
    mine_induct(log, convert_to=format, output_path=os.path.join(subdir, format, name), save_gviz=True)

xes_export.apply(log, os.path.join(subdir, "xes", name + ".xes"))

# index file for level 0: the single "main" entry with its default format/annotation
save_entries_json([name], default_format, default_ann, subdir)

In [None]:
# mine DCR: see
# /Users/wvw/git/pm/declarative/dcr4py/pm4py-dcr/ircc_dcr.ipynb