# Log conversion and preprocessing

In this notebook, we preprocess the event logs used in the paper on cases and variants in object-centric process mining by Adams et al. so that they can afterwards be converted into the JSON-OCEL format.

In [1]:
import warnings
# Suppress every warning for the remainder of the session so that the cell
# outputs below stay readable. No category is given, so this filter applies
# to all warnings, not just those raised by a particular library.
warnings.filterwarnings('ignore')

In [23]:
import pandas as pd
from ocpa.objects.log.importer.csv import factory as ocel_import_factory_csv
from ocpa.objects.log.exporter.ocel import factory as ocel_export_factory

In [5]:
def _split_object_ids(cell):
    """
    Split one raw csv cell into a list of stripped, non-empty object identifiers.
    :param cell: raw cell value as read by pandas, type: any
    :return: list of object identifiers, type: list
    """
    if not isinstance(cell, str):
        # missing cells are read as NaN (a float) and reference no objects
        return []
    stripped = (part.strip() for part in cell.split(','))
    # stray or trailing commas and whitespace-only cells would otherwise yield
    # the empty string as an object identifier
    return [object_id for object_id in stripped if object_id]


def preprocess_csv(path,ots):
    """
    Function to preprocess the csv file of the object-centric event log.

    Every object type column is converted from a comma-separated string into a
    list of object identifiers (surrounding whitespace removed, empty
    identifiers dropped); cells that are not strings become empty lists.
    Afterwards the events are numbered consecutively starting at 0, both in
    the index of the dataframe and in the column "event_id".

    :param path: path to the csv file, type: string
    :param ots: list of object types (names of the object columns), type: list
    :return: preprocessed event log, type: pandas dataframe
    :raises KeyError: if an object type in ots is not a column of the csv file
    """
    #Load the file into a pandas dataframe
    event_df = pd.read_csv(path)
    #generate the lists of objects for each object type
    for t in ots:
        event_df[t] = event_df[t].map(_split_object_ids)
    #generate the index and the event_id column in the same way (0, 1, 2, ...);
    #taking the ids from the integer index keeps the column integer-typed
    #even if the log contains no events
    event_df.index = pd.RangeIndex(len(event_df))
    event_df["event_id"] = event_df.index
    return event_df

## DS3 log

In [8]:
# DS3: location of the csv log and the columns that hold its object types
dataset = "../src/data/csv/DS3.csv"
types = ["incident", "customer"]

# Preprocess the log and expose 'event_closed_at' under the column name
# 'event_timestamp', which is the name passed as "time_name" to the importer.
event_df = preprocess_csv(dataset, types).rename(
    columns={'event_closed_at': 'event_timestamp'}
)
# NOTE: the result is written back to the path it was read from, so the
# original csv file is replaced by the preprocessed one.
event_df.to_csv(dataset, index=False)

In [9]:
# Load the preprocessed DS3 csv file with the OCEL csv importer.
parameters = dict(
    obj_names=["incident", "customer"],
    val_names=[],
    act_name="event_activity",
    time_name="event_timestamp",
    sep=",",
)
ocel_new = ocel_import_factory_csv.apply(
    file_path="../src/data/csv/DS3.csv",
    parameters=parameters,
)
print(f"Number of process executions: {len(ocel_new.process_executions)}")
print(f"Number of variants: {len(ocel_new.variants)}")
# Write the log as JSON-OCEL for better usability. This call stays the last
# expression of the cell, so the notebook also echoes its return value.
ocel_export_factory.apply(ocel_new, '../src/data/jsonocel/DS3.jsonocel')

Number of process executions: 4825
Number of variants: 3380


{'ocel:global-log': {'ocel:attribute-names': ['event_category',
   'event_location',
   'event_cmdb_ci',
   'event_resolved_at',
   'event_sys_created_by',
   'event_problem_id',
   'event_subcategory',
   'event_assigned_to',
   'event_sys_updated_at',
   'event_sys_created_at',
   'event_resource',
   'event_sys_mod_count',
   'event_knowledge',
   'event_made_sla',
   'event_u_symptom',
   'event_rfc',
   'event_reopen_count',
   'event_vendor',
   'event_u_priority_confirmation',
   'event_reassignment_count',
   'event_resolved_by',
   'event_active',
   'event_index',
   'event_priority',
   'event_sys_updated_by',
   'event_contact_type',
   'event_caused_by',
   'event_assignment_group',
   'event_notify',
   'event_urgency',
   'event_closed_code',
   'event_id',
   'event_opened_at',
   'event_impact'],
  'ocel:object-types': ['incident', 'customer'],
  'ocel:version': '1.0',
  'ocel:ordering': 'timestamp'},
 'ocel:global-event': {'ocel:activity': '__INVALID__'},
 'ocel:globa

## DS4 log

In [12]:
# DS4: location of the csv log and the columns that hold its object types
dataset = "../src/data/csv/DS4.csv"
types = [
    "Payment application",
    "Control summary",
    "Entitlement application",
    "Geo parcel document",
    "Inspection",
    "Reference alignment",
]

event_df = preprocess_csv(dataset, types)
# NOTE: the result is written back to the path it was read from, so the
# original csv file is replaced by the preprocessed one.
event_df.to_csv(dataset, index=False)

In [15]:
# Load the preprocessed DS4 csv file with the OCEL csv importer.
parameters = dict(
    obj_names=[
        "Payment application",
        "Control summary",
        "Entitlement application",
        "Geo parcel document",
        "Inspection",
        "Reference alignment",
    ],
    val_names=[],
    act_name="event_activity",
    time_name="event_timestamp",
    sep=",",
)
ocel_new = ocel_import_factory_csv.apply(
    file_path="../src/data/csv/DS4.csv",
    parameters=parameters,
)
print(f"Number of process executions: {len(ocel_new.process_executions)}")
print(f"Number of variants: {len(ocel_new.variants)}")
# Write the log as JSON-OCEL for better usability. This call stays the last
# expression of the cell, so the notebook also echoes its return value.
ocel_export_factory.apply(ocel_new, '../src/data/jsonocel/DS4.jsonocel')

Number of process executions: 14507
Number of variants: 7259


{'ocel:global-log': {'ocel:attribute-names': ['event_Unnamed: 0',
   'event_starttime',
   'event_resource',
   'event_subprocess',
   'event_object',
   'event_docid',
   'event_identity:id',
   'event_id',
   'event_index',
   'event_success',
   'event_year',
   'event_new_activity'],
  'ocel:object-types': ['Inspection',
   'Reference alignment',
   'Geo parcel document',
   'Entitlement application',
   'Control summary',
   'Payment application'],
  'ocel:version': '1.0',
  'ocel:ordering': 'timestamp'},
 'ocel:global-event': {'ocel:activity': '__INVALID__'},
 'ocel:global-object': {'ocel:type': '__INVALID__'},
 'ocel:events': {'0': {'ocel:activity': 'Payment application mail income',
   'ocel:timestamp': '2017-04-07T00:00:00',
   'ocel:omap': ['adcbf6cccf7e6c89'],
   'ocel:vmap': {'event_Unnamed: 0': 1661656,
    'event_starttime': '2017/04/07 00:00:00.000',
    'event_resource': '0;n/a',
    'event_subprocess': 'Application',
    'event_object': 'F6BAE319-09AB-4C7D-9098-BCDF71C