Load the "MainProcess.xes" file

In [8]:
import os
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer
import pandas as pd

# Directory with your .xes files
xes_directory = os.path.join(os.getcwd(), "20130794", "Cleaned Event Log")

# Output directory for models or visuals
output_directory = os.path.join(os.getcwd(), "output")
os.makedirs(output_directory, exist_ok=True)

# Process MainProcess.xes file
filename = "MainProcess.xes"
file_path = os.path.join(xes_directory, filename)
print(f"Processing {filename}")

try:
    log = xes_importer.apply(file_path)
    print("\n\n")
    print(f"Imported {filename} with {len(log)} traces.")
    
    # Print important information about the log
    print(f"Number of events: {sum(len(trace) for trace in log)}")
    activities = set(event["concept:name"] for trace in log for event in trace if "concept:name" in event)
    print(f"Number of unique activities: {len(activities)}")
    print(f"Unique activities: {activities}")
    case_ids = [trace.attributes["concept:name"] for trace in log if "concept:name" in trace.attributes]
    print(f"Number of cases: {len(case_ids)}")
    print(f"First 5 case IDs: {case_ids[:5]}")
    print("\n\n")

except Exception as e:
    print(f"Error processing {filename}: {e}")


Processing MainProcess.xes


parsing log, completed traces :: 100%|██████████| 301/301 [00:00<00:00, 632.12it/s]




Imported MainProcess.xes with 301 traces.
Number of events: 9471
Number of unique activities: 21
Unique activities: {'/mm/mill', '/pm/punch_recesses', '/vgr/pick_up_and_transport', '/hbw/unload', '/mm/transport_from_to', '/hbw/store_empty_bucket', '/mm/deburr', '/mm/drill', '/sm/transport', '/ov/temper', '/pm/punch_gill', '/pm/punch_ribbing', '/hbw/get_empty_bucket', '/hw/human_review', '/sm/sort', '/dm/lower', '/wt/pick_up_and_transport', '/hbw/store', '/dm/drill', '/dm/cylindrical_drill', '/ov/burn'}
Number of cases: 301
First 5 case IDs: ['WF_101_0', 'WF_102_0', 'WF_103_0', 'WF_104_0', 'WF_105_0']








Get all the attributes included in this file ("MainProcess.xes")

In [9]:
try:
    # List all attributes of the traces in a table
    print("\n\nAttributes of traces:\n")
    
    trace_attributes = set()
    for trace in log:
        trace_attributes.update(trace.attributes.keys())
        
    trace_attr_info = []
    for attr in trace_attributes:
        attr_type = "unknown"
        for trace in log:
            if attr in trace.attributes:
                attr_type = type(trace.attributes[attr]).__name__
                break
        trace_attr_info.append({"Attribute": attr, "Type": attr_type})

    trace_attr_df = pd.DataFrame(trace_attr_info)
    display(trace_attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    trace_attr_df.to_excel(os.path.join(tables_dir, "main_trace_attribute_info.xlsx"), index=False)
    
    # List all attributes of the events in a table
    print("\n\nAttributes of events:\n")
    
    event_attributes = set()
    for trace in log:
        for event in trace:
            event_attributes.update(event.keys())

    event_attr_info = []
    for attr in event_attributes:
        attr_type = "unknown"
        for trace in log:
            for event in trace:
                if attr in event:
                    attr_type = type(event[attr]).__name__
                    break
            if attr_type != "unknown":
                break
        event_attr_info.append({"Attribute": attr, "Type": attr_type})

    event_attr_df = pd.DataFrame(event_attr_info)
    display(event_attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    event_attr_df.to_excel(os.path.join(tables_dir, "main_event_attribute_info.xlsx"), index=False)
except Exception as e:
    print(f"Error processing the log: {e}")




Attributes of traces:



Unnamed: 0,Attribute,Type
0,concept:name,str




Attributes of events:



Unnamed: 0,Attribute,Type
0,concept:name,str
1,operation_end_time,datetime
2,lifecycle:transition,str
3,current_task,str
4,complete_service_time,str
5,time:timestamp,datetime
6,process_model_id,str
7,planned_operation_time,str
8,SubProcessID,str
9,case:concept:name,str


List all the resources of the log in a table

In [10]:
try:
    # Set pandas display options for full width
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.width', 0)
    pd.set_option('display.max_columns', None)

    resources = set()
    for trace in log:
        for event in trace:
            if "org:resource" in event:
                resources.add(event["org:resource"])

    resource_info = []
    for resource in resources:
        # Find the first SubProcessID for this resource by searching events until found
        first_subprocess_id = "N/A"
        parameters_dict = None
        found = False
        for trace in log:
                for event in trace:
                    if event.get("org:resource") == resource:
                        if "SubProcessID" in event:
                            first_subprocess_id = event["SubProcessID"]
                            found = True
                            break
                if found:
                    break
                
        found = False
        for trace in log:
                for event in trace:
                    if event.get("org:resource") == resource:
                        if "parameters" in event:
                            parameters_dict = event["parameters"]
                            found = True
                            break
                if found:
                    break
        event_count = sum(1 for trace in log for event in trace if event.get("org:resource") == resource)
        activities_performed = set(event["concept:name"] for trace in log for event in trace if event.get("org:resource") == resource)
        parameter_keys = list(parameters_dict['children'][i][0] for i in range(len(parameters_dict['children']))) if parameters_dict and 'children' in parameters_dict else []
        resource_info.append({
            "Resource": resource,
            "Event Count": event_count,
            "Unique Activities": len(activities_performed),
            "Activities": ", ".join(sorted(activities_performed)),
            "First Subprocess ID": first_subprocess_id,
            "Parameter Keys": parameter_keys
        })

    resource_df = pd.DataFrame(resource_info)
    display(resource_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    resource_df.to_excel(os.path.join(tables_dir, "resources_info.xlsx"), index=False)

except Exception as e:
    print(f"Error processing the log: {e}")

Unnamed: 0,Resource,Event Count,Unique Activities,Activities,First Subprocess ID,Parameter Keys
0,hbw_1,873,4,"/hbw/get_empty_bucket, /hbw/store, /hbw/store_empty_bucket, /hbw/unload",b179f074-238d-4666-b50f-9a8959d0a48e,"[parameter_hbw_slot, parameter_use_nfc]"
1,mm_1,576,4,"/mm/deburr, /mm/drill, /mm/mill, /mm/transport_from_to",167db95e-ae8b-4ae8-ac11-055401e11894,"[parameter_start_position, parameter_end_position]"
2,ov_1,612,2,"/ov/burn, /ov/temper",633d065f-96c0-4c4b-8112-302990575763,"[parameter_burn_workpiece_size, parameter_burn_workpiece_thickness]"
3,hw_1,522,1,/hw/human_review,a8d0fcdd-46c6-44f8-8b19-cecd803d356f,[]
4,vgr_2,885,1,/vgr/pick_up_and_transport,4d198444-6633-4218-b1f7-ca67ec666360,"[parameter_start_position, parameter_end_position]"
5,wt_2,330,1,/wt/pick_up_and_transport,7316381c-127f-43cb-956b-ca72e60bc6ab,"[parameter_start_position, parameter_end_position]"
6,vgr_1,1866,1,/vgr/pick_up_and_transport,0e7b5a4c-4c03-47b2-96fd-e401ed7fbca9,"[parameter_start_position, parameter_end_position]"
7,sm_2,309,2,"/sm/sort, /sm/transport",722f5091-ed89-45a3-89c7-4962901b6c14,"[parameter_start_position, parameter_end_position]"
8,pm_1,204,3,"/pm/punch_gill, /pm/punch_recesses, /pm/punch_ribbing",21559c95-22a5-4c8b-9424-dbbc14a9f63b,"[parameter_start_position, parameter_end_position, parameter_quantity]"
9,hbw_2,1581,2,"/hbw/store_empty_bucket, /hbw/unload",27958fc0-4484-41ff-9260-e76f8a83a7cd,"[parameter_hbw_slot, parameter_use_nfc]"


Load a single subevent log file

In [11]:
# Process 0a0a7c16-85d9-48be-a7d5-32931240c337.xes file
filename = "0a0a7c16-85d9-48be-a7d5-32931240c337.xes"
file_path = os.path.join(xes_directory, filename)
print(f"Processing {filename}")

try:
    subevent_log = xes_importer.apply(file_path)
    print("\n\n")
    print(f"Imported {filename} with {len(subevent_log)} traces.")

    # Print important information about the subevent_log
    print(f"Number of events: {sum(len(trace) for trace in subevent_log)}")
    activities = set(event["concept:name"] for trace in subevent_log for event in trace if "concept:name" in event)
    print(f"Number of unique activities: {len(activities)}")
    print(f"Unique activities: {activities}")
    case_ids = [trace.attributes["concept:name"] for trace in subevent_log if "concept:name" in trace.attributes]
    print(f"Number of cases: {len(case_ids)}")
    print(f"First 5 case IDs: {case_ids[:5]}")
    print("\n\n")

except Exception as e:
    print(f"Error processing {filename}: {e}")

Processing 0a0a7c16-85d9-48be-a7d5-32931240c337.xes


parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 142.48it/s]




Imported 0a0a7c16-85d9-48be-a7d5-32931240c337.xes with 1 traces.
Number of events: 5
Number of unique activities: 5
Unique activities: {'transporting the workpiece to the mill', 'ejecting the workpiece to the conveyor belt', 'milling the workpiece', 'transporting the workpiece to the sorting machine', 'transporting the workpiece to the ejection position'}
Number of cases: 0
First 5 case IDs: []








List all the attributes in this file

In [12]:
try:
    # List all attributes in the subevent_log in a table and display nicely in Jupyter Notebook
    all_attributes = set()
    for trace in subevent_log:
        all_attributes.update(trace.attributes.keys())
        for event in trace:
            all_attributes.update(event.keys())

    # Prepare attribute type information
    attr_info = []
    for attr in all_attributes:
        if attr in subevent_log[0].attributes:
            attr_type = type(subevent_log[0].attributes[attr]).__name__
        elif len(subevent_log[0]) > 0 and attr in subevent_log[0][0]:
            attr_type = type(subevent_log[0][0][attr]).__name__
        else:
            attr_type = "unknown"
        attr_info.append({"Attribute": attr, "Type": attr_type})

    # Display as a pandas DataFrame
    attr_df = pd.DataFrame(attr_info)
    display(attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    attr_df.to_excel(os.path.join(tables_dir, "sub_attribute_info.xlsx"), index=False)

except Exception as e:
    print(f"Error processing the subevent_log: {e}")

Unnamed: 0,Attribute,Type
0,org:resource,str
1,concept:name,str
2,SubProcessID,str
3,stream:datastream,dict
4,operation_end_time,datetime
5,time:timestamp,datetime


Get all the sensor data from the database using duckdb

In [None]:
import duckdb
import os
from pathlib import Path
import pandas as pd

xes_directory = Path.cwd() / "20130794" / "Data Quality Issues Event Log"

# Make sure xes_directory is defined before this line
parquet_dir = os.path.join(xes_directory, "parquet")

# Connect to an in-memory DuckDB instance
con = duckdb.connect(database=':memory:')

# Collect all .parquet files in the directory
parquet_file = os.path.join(parquet_dir, "all_combined.parquet")

# Ensure there are files to process
if not os.path.exists(parquet_file):
    raise FileNotFoundError(f"No Parquet file found at {parquet_file}")

# Register all Parquet files as a single virtual table (view)
# IMPORTANT: DuckDB's parquet_scan expects a *list of strings* to be passed as a DuckDB list literal
# Use array syntax ['file1.parquet', 'file2.parquet', ...]
parquet_list_str = f"'{parquet_file}'"
query = f"CREATE VIEW sensor_data AS SELECT * FROM parquet_scan([{parquet_list_str}])"
con.execute(query)



# Optionally preview first few rows
# Query: group by stream:system and stream:observation, filter for one resource (e.g., 'sm_1')
df_sensor_grouped = con.execute("""
    SELECT 
        *
    FROM sensor_data
    WHERE "stream:observation" = 'http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure'
    AND "stream:value" > '0.5'
    LIMIT 50
""").df()
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
display(df_sensor_grouped)


Unnamed: 0,concept:name,org:resource,stream:system,stream:system_type,stream:observation,stream:procedure_type,stream:interaction_type,stream:timestamp,stream:value,sensor_key
0,opening the oven door,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:09:54.503000,0.426,stream:point
1,transporting the workpiece to the inside of the oven,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:09:57.243000,0.426,stream:point
2,transporting the workpiece to the inside of the oven,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:09:57.848000,0.426,stream:point
3,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:09:58.556000,0.426,stream:point
4,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:10:00.277000,0.426,stream:point
5,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:10:01.487000,0.426,stream:point
6,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:10:06.737000,0.426,stream:point
7,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:10:07.948000,0.426,stream:point
8,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:10:08.854000,0.426,stream:point
9,temper the workpiece for 40 seconds,ov_1,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Compressor_8,sosa:Sensor,http://iot.uni-trier.de/FTOnto#OV_1_WT_1_Pneumatic_System_Pressure,stream:continuous,sosa:Observation,2021-07-01T19:10:11.071000,0.426,stream:point


Use the sensor dict