In [1]:
import os
# Move the working directory two levels up from where the notebook runs
# (presumably the project root, so the relative "data/..." paths below resolve).
# os.path.dirname uses the platform's own path rules; splitting the cwd on '\\'
# only works on Windows and hands an empty string to os.chdir everywhere else.
# NOTE: re-running this cell climbs another two levels - run it once per kernel.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
import pandas as pd
import json
from copy import copy
from typing import Any, Callable
import numpy as np

In [2]:
# Locations of the source log and of the transformed log written by the last cell.
ocel_in_file = "data/OTC/source/running-example.jsonocel"
ocel_out_file = "data/OTC/source/OTC.jsonocel"

# JSON text is UTF-8; state it explicitly so the file is not decoded with the
# platform's locale encoding (e.g. cp1252 on Windows), which would garble or
# reject any non-ASCII character.
with open(ocel_in_file, 'r', encoding='utf-8') as fp:
    ocel = json.load(fp)

In [3]:
# Turn the two id-keyed sections of the log into DataFrames:
# one row per id, one column per field of the nested record.
events_df, objects_df = (
    pd.DataFrame.from_dict(ocel[section_key], orient='index')
    for section_key in ('ocel:events', 'ocel:objects')
)

In [4]:
# Spread every key of the per-object 'ocel:ovmap' dict into its own column
# and append those columns to the right of the existing ones.
objects_df = pd.concat(
    [
        objects_df,
        objects_df['ocel:ovmap'].apply(pd.Series),
    ],
    axis=1,
)

In [5]:
# Lookup table of the objects that carry both 'age' and 'bankaccount':
# the object id becomes the 'oid' column, rows missing either attribute are
# dropped, and the id is duplicated into 'customer' so that it survives the
# join below as an attribute of its own.
objects_df_ready_to_join = (
    objects_df
    .reset_index()
    .rename(columns={'index': 'oid'})
    .loc[:, ['oid', 'age', 'bankaccount']]
    .dropna()
    .assign(customer=lambda lookup: lookup['oid'])
)

# Keep only the non-customer objects, reduced to the two original OCEL columns
# (the columns expanded from 'ocel:ovmap' are discarded again).
objects_df = objects_df.loc[
    objects_df['ocel:type'].ne('customers'),
    ['ocel:type', 'ocel:ovmap'],
]

# One row per (event, related object): explode the 'ocel:omap' list, expose the
# event id as the 'index' column, call the object id 'oid', and left-join the
# lookup table so rows that reference such an object gain 'age', 'bankaccount'
# and 'customer'.
eexploded = (
    events_df
    .explode("ocel:omap")
    .reset_index()
    .rename(columns={"ocel:omap": 'oid'})
    .merge(objects_df_ready_to_join, on="oid", how="left")
)

# First non-null value of every column per event (grouped by the event id in 'index').
valid_values = eexploded.groupby('index').first()

# Copy each event's customer attributes onto all rows of that event,
# filling only the cells that are still NaN.
eexploded = eexploded.assign(
    **{
        attribute: eexploded[attribute].fillna(
            eexploded['index'].map(valid_values[attribute])
        )
        for attribute in ('age', 'bankaccount', 'customer')
    }
)

# Drop the rows whose object is the customer itself, so that customers
# no longer appear among the objects related to an event.
eexploded = eexploded.loc[eexploded['oid'].ne(eexploded['customer'])]

In [6]:
# Collapse the exploded rows back to one row per event: the object ids are
# gathered into a list again, every other column takes the first value of the
# group. The column order of the result follows the order listed here.
# groupby sorts by the event id, and the ids are then replaced by a fresh
# 0..n-1 index; finally 'oid' gets its original name 'ocel:omap' back.
events_df = (
    eexploded
    .groupby("index")
    .agg(
        {
            column: (list if column == "oid" else "first")
            for column in (
                "ocel:activity",
                "ocel:timestamp",
                "oid",
                "ocel:vmap",
                "age",
                "bankaccount",
                "customer",
            )
        }
    )
    .reset_index(drop=True)
    .rename(columns={'oid': 'ocel:omap'})
)
events_df

Unnamed: 0,ocel:activity,ocel:timestamp,ocel:omap,ocel:vmap,age,bankaccount,customer
0,place order,2019-05-20T09:07:47,"[880001, Echo Studio, 880004, 880003, 880002, ...","{'weight': 3.52, 'price': 524.96}",50.0,91248.0,Marco Pegoraro
1,item out of stock,2019-05-20T15:19:49,"[990003, iPad Air, 880009]","{'weight': 0.44, 'price': 476.0}",46.0,74370.0,Majid Rafiei
2,pick item,2019-05-22T17:13:56,"[990008, 880023, iPad Air]","{'weight': 0.44, 'price': 476.0}",46.0,74370.0,Majid Rafiei
3,pick item,2019-06-07T11:07:36,"[880450, iPad, 990111]","{'weight': 0.483, 'price': 495.0}",40.0,69940.0,Seran Uysal
4,pick item,2019-10-28T11:52:08,"[iPhone 11 Pro, 990896, 883602]","{'weight': 0.188, 'price': 1149.0}",43.0,59702.0,Christine Dobbert
...,...,...,...,...,...,...,...
22362,pick item,2019-10-28T11:26:20,"[iPhone X, 990931, 883737]","{'weight': 0.172, 'price': 699.0}",38.0,76896.0,Kefang Ding
22363,pay order,2019-10-28T11:27:29,"[883623, 883621, 883626, 990902, Echo Show 5, ...","{'weight': 2.636, 'price': 2692.99}",54.0,29653.0,Claudia Graf
22364,pick item,2019-10-28T11:29:18,"[Kindle, 990929, 883728]","{'weight': 0.483, 'price': 79.99}",58.0,42513.0,Anahita Farhang Ghahfarokhi
22365,pay order,2019-10-28T11:30:29,"[iPhone X, Echo Dot, 883462, 990861, MacBook A...","{'weight': 1.802, 'price': 2933.99}",51.0,44449.0,Lisa Mannel


In [7]:
def _vmap_with_customer_attributes(event_row):
    """Return a copy of the event's 'ocel:vmap' extended with 'age', 'bankaccount' and 'customer'."""
    enriched_vmap = dict(event_row['ocel:vmap'])
    enriched_vmap.update(
        age=event_row['age'],
        bankaccount=event_row['bankaccount'],
        customer=event_row['customer'],
    )
    return enriched_vmap


# Store the customer information as event attributes inside 'ocel:vmap';
# the three helper columns are no longer needed afterwards.
events_df['ocel:vmap'] = events_df.apply(_vmap_with_customer_attributes, axis=1)
events_df = events_df.drop(columns=['age', 'bankaccount', 'customer'])

In [8]:
# Objects whose 'ocel:ovmap' is empty have no attributes at all; give them one
# by storing their own id, converted to float, under the key 'encoded_oid'.
# (Per the original note these are the 'orders', 'items' and 'packages' objects,
# whose ids are numbers contained in strings; float() raises ValueError on an
# id that is not numeric.)
objects_df = objects_df.reset_index()

# NaN marks the objects that already have attributes and must stay untouched.
objects_df['encoded_oid'] = objects_df.apply(
    lambda row: float(row['index']) if not row['ocel:ovmap'] else np.nan,
    axis=1,
)
object_type_filter = objects_df['encoded_oid'].notna()

# The mask already guarantees a real number in 'encoded_oid', so no further
# truthiness test is applied: `if row['encoded_oid']` would wrongly skip an id
# that encodes to 0.0. The guard avoids assigning from an empty selection.
if object_type_filter.any():
    objects_df.loc[object_type_filter, 'ocel:ovmap'] = objects_df.loc[object_type_filter, :].apply(
        lambda row: {**row['ocel:ovmap'], 'encoded_oid': row['encoded_oid']},
        axis=1,
    )

# Remove the helper column and put the object id back as the index.
objects_df = objects_df.drop(columns=['encoded_oid'])
objects_df = objects_df.set_index('index')

In [9]:
# Write the transformed tables back into the log, keyed by their index.
ocel["ocel:objects"] = objects_df.to_dict(orient="index")
ocel["ocel:events"] = events_df.to_dict(orient="index")

# Register the new 'customer' attribute. list.append() mutates the list and
# returns None, so its result must not be assigned back (that replaced the whole
# attribute list with None). The membership test keeps the cell safe to re-run.
# NOTE(review): 'encoded_oid' is added to objects in an earlier cell but is not
# registered here - confirm whether it should be listed as well.
attribute_names = ocel["ocel:global-log"]['ocel:attribute-names']
if 'customer' not in attribute_names:
    attribute_names.append('customer')

# 'customers' is no longer an object type: those objects were filtered out of
# objects_df and their data now lives in the event attributes.
ocel["ocel:global-log"]["ocel:object-types"] = [
    "items",
    "orders",
    "packages",
    "products",
]

In [10]:
# Serialise the modified log to the output path, pretty-printed with a 2-space indent.
with open(ocel_out_file, mode='w') as out_handle:
    json.dump(ocel, out_handle, indent=2)