In [1]:
import sys
import os
import json

sys.path.append("../../..")

from dataloaders.base import make_feature_fn, BaseEventLogDataset
from dataloaders.csv import CSVEventLogDataset
from dataloaders.xes import XESEventLogDataset

DATA_DIR = "../../../data"
datasets = os.listdir(DATA_DIR)
datasets

['c2c3b154-ab26-4b31-a0e8-8f2350ddac11',
 'fb84cf2d-166f-4de2-87be-62ee317077e5',
 'c3f3ba2d-e81e-4274-87c7-882fa1dbab0d',
 '33632f3c-5c48-40cf-8d8f-2db57f5a6ce7',
 'e30ba0c8-0039-4835-a493-6e3aa2301d3f',
 '6af6d5f0-f44c-49be-aac8-8eaa5fe4f6fd',
 '3d5ae0ce-198c-4b5c-b0f9-60d3035d07bf',
 '679b11cf-47cd-459e-a6de-9ca614e25985',
 'd9769f3d-0ab0-4fb8-803b-0d1120ffcf54',
 '500573e6-accc-4b0c-9576-aa5468b10cee',
 '01345ac4-7d1d-426e-92b8-24933a079412',
 '3537c19d-6c64-4b1d-815d-915ab0e479da',
 '5f3067df-f10b-45da-b98b-86ae4c7a310b',
 'a0addfda-2044-4541-a450-fdcc9fe16d17',
 '3926db30-f712-4394-aebc-75976070e91f',
 '63a8435a-077d-4ece-97cd-2c76d394d99c',
 'db35afac-2133-40f3-a565-2dc77a9329a3',
 '3301445f-95e8-4ff0-98a4-901f1f204972',
 '91fd1fa8-4df4-4b1a-9a3f-0116c412378f',
 '9b99a146-51b5-48df-aa70-288a76c82ec4',
 '3cfa2260-f5c5-44be-afe1-b70d35288d6d',
 'a6f651a7-5ce0-4bc6-8be1-a7747effa1cc',
 'ed445cdd-27d5-4d77-a1f7-59fe7360cfbe',
 '12683249',
 '6a0a26d2-82d0-4018-b1cd-89afb0e8627f',
 '2

In [2]:
def filter_files(fs: list[str]) -> list[str]:
    return list(filter(lambda f: f.endswith('.csv') or f.endswith('.xes'), fs))

files = {d: filter_files(os.listdir(f"{DATA_DIR}/{d}")) for d in datasets}
files

{'dx.doi.org_10.4121_uuid_3537c19d-6c64-4b1d-815d-915ab0e479da': ['BPI_Challenge_2013_open_problems.xes'],
 'dx.doi.org_10.4121_uuid_679b11cf-47cd-459e-a6de-9ca614e25985': ['BPIC15_4.xes'],
 'dx.doi.org_10.4121_uuid_c2c3b154-ab26-4b31-a0e8-8f2350ddac11': ['BPI_Challenge_2013_closed_problems.xes'],
 'dx.doi.org_10.4121_uuid_3d5ae0ce-198c-4b5c-b0f9-60d3035d07bf': ['Detail_Interaction.csv'],
 'doi.org_10.4121_uuid_d06aff4b-79f0-45e6-8ec8-e19730c248f1': ['BPI_Challenge_2019.xes'],
 'dx.doi.org_10.4121_uuid_86977bac-f874-49cf-8337-80f26bf5d2ef': ['Detail_Incident_Activity.csv'],
 'dx.doi.org_10.4121_uuid_3cfa2260-f5c5-44be-afe1-b70d35288d6d': ['Detail_Incident.csv'],
 'dx.doi.org_10.4121_uuid_e30ba0c8-0039-4835-a493-6e3aa2301d3f': ['BPI2016_Complaints.csv'],
 'dx.doi.org_10.4121_uuid_9b99a146-51b5-48df-aa70-288a76c82ec4': ['BPI2016_Clicks_NOT_Logged_In.csv'],
 'dx.doi.org_10.4121_uuid_2b02709f-9a84-4538-a76a-eb002eacf8d1': ['BPI2016_Questions.csv'],
 'dx.doi.org_10.4121_uuid_d9769f3d-0ab0-4

In [None]:
import numpy as np
from collections import Counter

def summarize(ds: BaseEventLogDataset):
    return {
        "n_traces": len(ds),
        "avg_trace_len": np.mean([len(t) for t in ds.log]),
        "n_unique_activities": len(ds.vocab[getattr(ds, "activity_col", "concept:name")]),
        "activity_freq": Counter(e["concept:name"] for t in ds.log for e in t),
    }

from dataloaders.util import CONSTRUCTION_PARAMS, DEFAULT_PARAMS_CSV, DEFAULT_PARAMS_XES

def get_ds(path: str):
    print(path)
    dataset_ftype = path.split(".")[-1].strip()
    match dataset_ftype:
        case "csv":
            doi = path.split('/')[-2]
            params = CONSTRUCTION_PARAMS.get(doi, DEFAULT_PARAMS_CSV)
            ensure_type = params.get("rtype")
            if ensure_type is not None:
                if ensure_type != "csv":
                    raise ValueError(f"Something went wrong... params says this Dataset is a different type than csv: {ensure_type}")
            return CSVEventLogDataset(
                source_path=path,
                feature_fn=make_feature_fn,
                **params
            )
        case "xes":
            doi = path.split('/')[-2]
            params = CONSTRUCTION_PARAMS.get(doi, DEFAULT_PARAMS_XES)
            ensure_type = params.get("rtype")
            if ensure_type is not None:
                if ensure_type != "xes":
                    raise ValueError(f"Something went wrong... params says this Dataset is a different type than xes: {ensure_type}")
            return XESEventLogDataset(
                source_path=path,
                feature_fn=make_feature_fn,
                **params
            )
        case _:
            raise ValueError(f"unknown dataset source type: '{dataset_ftype}'")

def make_path(uuid: str):
    for f in files[uuid]:
        yield f"{DATA_DIR}/{uuid}/{f}"

uuid = "c3f3ba2d-e81e-4274-87c7-882fa1dbab0d"
pth = list(make_path(uuid))[0]
test_ds = get_ds(pth)
s = summarize(test_ds)

summary_path = '/'.join(pth.split('/')[:-1] + ["summary.json"])
with open(summary_path, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(s, indent=4))

In [None]:
summaries = {}

for doi in datasets:
    for path in make_path(doi):
        ds = get_ds(path)
        s = summarize(ds)
        print(s)
        summaries[path] = s

  from .autonotebook import tqdm as notebook_tqdm


../../../data/dx.doi.org_10.4121_uuid_3537c19d-6c64-4b1d-815d-915ab0e479da/BPI_Challenge_2013_open_problems.xes


parsing log, completed traces :: 100%|██████████| 819/819 [00:00<00:00, 17233.93it/s]

{'n_traces': 819, 'avg_trace_len': np.float64(2.8705738705738706), 'n_unique_activities': 3, 'activity_freq': Counter({'Accepted': 1581, 'Completed': 387, 'Queued': 383})}
../../../data/dx.doi.org_10.4121_uuid_679b11cf-47cd-459e-a6de-9ca614e25985/BPIC15_4.xes



parsing log, completed traces :: 100%|██████████| 1053/1053 [00:00<00:00, 1104.64it/s]


{'n_traces': 1053, 'avg_trace_len': np.float64(44.91263057929724), 'n_unique_activities': 356, 'activity_freq': Counter({'01_HOOFD_010': 1052, '01_HOOFD_015': 1052, '01_HOOFD_020': 1051, '01_HOOFD_180': 1023, '01_HOOFD_065_1': 1005, '01_HOOFD_065_2': 944, '01_HOOFD_200': 912, '09_AH_I_010': 877, '01_HOOFD_380': 877, '01_HOOFD_375': 870, '01_HOOFD_490_1': 869, '01_HOOFD_510_1': 857, '01_HOOFD_510_2': 857, '01_HOOFD_490_2': 842, '01_HOOFD_480': 797, '01_HOOFD_430': 791, '01_HOOFD_330': 788, '01_HOOFD_370': 776, '01_HOOFD_195': 750, '04_BPT_005': 722, '01_HOOFD_011': 722, '02_DRZ_010': 721, '03_GBH_005': 705, '01_HOOFD_495': 704, '01_BB_540': 635, '11_AH_II_010': 627, '08_AWB45_005': 624, '01_HOOFD_491': 571, '01_HOOFD_065_0': 541, '01_HOOFD_500': 536, '01_HOOFD_110_0': 525, '01_HOOFD_061': 520, '01_HOOFD_490_4': 495, '01_HOOFD_110': 465, '01_HOOFD_510_3': 461, '01_HOOFD_515': 461, '01_HOOFD_510_4': 458, '01_HOOFD_490_5': 453, '01_HOOFD_120': 451, '01_BB_770': 450, '01_HOOFD_050': 440, '1

parsing log, completed traces :: 100%|██████████| 1487/1487 [00:00<00:00, 10905.95it/s]
  df = pd.read_csv(f, sep=sep, index_col=False)


{'n_traces': 1487, 'avg_trace_len': np.float64(4.478816408876933), 'n_unique_activities': 4, 'activity_freq': Counter({'Accepted': 4207, 'Completed': 1568, 'Queued': 875, 'Unmatched': 10})}
../../../data/dx.doi.org_10.4121_uuid_3d5ae0ce-198c-4b5c-b0f9-60d3035d07bf/Detail_Interaction.csv
{'n_traces': 147004, 'avg_trace_len': np.float64(1.0), 'n_unique_activities': 6, 'activity_freq': Counter({'incident': 115704, 'request for information': 31183, 'complaint': 63, 'service request': 48, 'problem': 5, 'request for change': 1})}
../../../data/doi.org_10.4121_uuid_d06aff4b-79f0-45e6-8ec8-e19730c248f1/BPI_Challenge_2019.xes


parsing log, completed traces :: 100%|██████████| 251734/251734 [00:18<00:00, 13264.37it/s]


{'n_traces': 251734, 'avg_trace_len': np.float64(6.33971970413214), 'n_unique_activities': 42, 'activity_freq': Counter({'Record Goods Receipt': 314097, 'Create Purchase Order Item': 251734, 'Record Invoice Receipt': 228760, 'Vendor creates invoice': 219919, 'Clear Invoice': 194393, 'Record Service Entry Sheet': 164975, 'Remove Payment Block': 57136, 'Create Purchase Requisition Item': 46592, 'Receive Order Confirmation': 32065, 'Change Quantity': 21449, 'Change Price': 12423, 'Delete Purchase Order Item': 8875, 'Change Approval for Purchase Order': 7541, 'Cancel Invoice Receipt': 7096, 'Vendor creates debit memo': 6255, 'Change Delivery Indicator': 3289, 'Cancel Goods Receipt': 3096, 'SRM: In Transfer to Execution Syst.': 1765, 'SRM: Created': 1628, 'SRM: Complete': 1628, 'SRM: Awaiting Approval': 1628, 'SRM: Document Completed': 1628, 'SRM: Ordered': 1628, 'Release Purchase Order': 1610, 'SRM: Change was Transmitted': 1440, 'Reactivate Purchase Order Item': 543, 'Block Purchase Order

  df = pd.read_csv(f, sep=sep, index_col=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[


{'n_traces': 46606, 'avg_trace_len': np.float64(1.0), 'n_unique_activities': 3019, 'activity_freq': Counter({'SUB000456': 3050, 'SBA000263': 2176, 'SBA000607': 1743, 'SBA000462': 1698, 'WBA000058': 1614, 'WBA000133': 1453, 'SUB000113': 1100, 'WBA000011': 945, 'WBA000144': 909, 'DTA000616': 851, 'SUB000424': 744, 'SAP000004': 737, 'DTA000057': 684, 'SBA000017': 669, 'WBA000124': 615, 'SBA000759': 572, 'SBA000689': 558, 'SBA000609': 539, 'WBA000103': 506, 'APP000005': 468, 'WBA000018': 463, 'SBA000805': 457, 'SBA000439': 455, 'CBA000014': 414, 'SAN000182': 398, 'SBA000659': 391, 'SBA000427': 374, 'SBA000834': 369, 'SBA000458': 351, 'SBA000459': 336, 'DTA000025': 330, 'WBA000148': 312, 'SUB000508': 309, 'SAP000005': 295, 'SBA000131': 294, 'SBA000464': 291, 'DTA000016': 276, 'SBA000054': 276, 'SUB000523': 259, 'DTA000056': 235, 'SBA000063': 232, 'WBA000145': 230, 'SUB000443': 228, 'SBA000172': 218, 'SBA000782': 218, 'WBA000094': 212, 'WBA000022': 207, 'DTA000031': 206, 'SUB000479': 194, 'S

  df = pd.read_csv(f, sep=sep, index_col=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[


{'n_traces': 186, 'avg_trace_len': np.float64(1.2258064516129032), 'n_unique_activities': 44, 'activity_freq': Counter({'login general': 45, 'activate vacancy': 23, 'Add a message': 18, 'Step 1 job vacancy': 10, 'Open my Certificate of Registration pdf': 9, 'Reporting Add application action 11': 9, 'Change step 7 Other Jobs': 8, 'Request password': 7, 'step 7 remaining vacancy': 6, 'list vacancy': 6, 'Remove vacancy survey': 5, 'download pDF': 5, 'customize business company': 5, 'Show tasks': 4, 'step6 wish vacancy': 4, 'activation code aanvragen1': 4, 'step3 employment vacancy': 4, 'Step2 characteristics vacancy': 4, 'UC132 log vacancy in office': 3, 'tone inbox': 3, 'Reporting Add application Action 1': 3, 'Reporting add application action 6': 3, 'Reporting Add application action 9': 3, 'workplan read': 3, 'Step 5 Add a work experience job sites': 3, 'deactivate job': 3, 'tone documents': 3, 'Step5 survey work experience vacancy': 2, 'step3 change conditions vacancy': 2, 'download rt

parsing log, completed traces :: 100%|██████████| 1143/1143 [00:02<00:00, 437.06it/s]


{'n_traces': 1143, 'avg_trace_len': np.float64(131.48818897637796), 'n_unique_activities': 624, 'activity_freq': Counter({'aanname laboratoriumonderzoek': 15353, 'ligdagen - alle spec.beh.kinderg.-reval.': 10897, '190205 klasse 3b        a205': 9351, 'ordertarief': 9008, '190101 bovenreg.toesl.  a101': 6241, 'vervolgconsult poliklinisch': 5239, 'kalium potentiometrisch': 4328, 'natrium vlamfotometrisch': 4304, 'hemoglobine foto-elektrisch': 4275, 'creatinine': 3955, 'leukocyten tellen elektronisch': 2968, 'trombocyten tellen - elektronisch': 2724, 'differentiele telling automatisch': 2370, 'administratief tarief       - eerste pol': 2171, '190021 klinische opname a002': 2118, 'calcium': 2042, 'glucose': 1950, 'kruisproef volledig -drie methoden-': 1705, 'haemoglobine foto-electrisch - spoed': 1676, 'ureum': 1643, 'bloedgroep abo en rhesusfactor': 1564, 'rhesusfactor d - centrifugeermethode - e': 1564, 'screening antistoffen erytrocyten': 1535, 'sgpt - alat kinetisch': 1469, 'sgot - asa

  df = pd.read_csv(f, sep=sep, index_col=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[


{'n_traces': 3991, 'avg_trace_len': np.float64(2.285642696066149), 'n_unique_activities': 32, 'activity_freq': Counter({'Show tasks': 2282, 'Open my Certificate of Registration pdf': 1401, 'show closed tasks': 1322, 'Show reports': 1157, 'show unfinished tasks': 888, 'show documents': 715, 'download rtf': 383, 'download pDF': 375, 'show inbox': 192, 'Add a message': 132, 'Reporting Add application action 11': 83, 'Add document servlet': 65, 'Reporting Add application action 8': 22, 'Reporting Add application Action 4': 14, 'Reporting Add application Action 3': 11, 'Reporting Add application Action 10': 9, 'Add report': 9, 'workplan read': 9, 'show week calendar': 8, 'Open my Certificate of Registration html': 8, 'Reporting Add application action 12': 8, 'Reporting Add application action 6': 7, 'Reporting Add application action 9': 5, 'show calendar': 4, 'sent messages': 3, 'Reporting Add application Action 1': 3, 'report changes page': 2, 'Reporting Add application action 13': 1, 'CV W

parsing log, completed traces :: 100%|██████████| 7554/7554 [00:01<00:00, 5826.64it/s]


{'n_traces': 7554, 'avg_trace_len': np.float64(8.675271379401641), 'n_unique_activities': 4, 'activity_freq': Counter({'Accepted': 40117, 'Completed': 13867, 'Queued': 11544, 'Unmatched': 5})}


In [None]:
import json

for k, v in summaries.items():
    summary_path = '/'.join(k.split('/')[:-1] + ["summary.json"])
    with open(summary_path, mode="w", encoding="utf-8") as f:
        f.write(json.dumps(v, indent=4))