Load the "MainProcess.xes" file

In [1]:
import os
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer
import pandas as pd

# Directory with your .xes files
xes_directory = os.path.join(os.getcwd(), "20130794", "Cleaned Event Log")

# Output directory for models or visuals
output_directory = os.path.join(os.getcwd(), "output")
os.makedirs(output_directory, exist_ok=True)

# Process MainProcess.xes file
filename = "MainProcess.xes"
file_path = os.path.join(xes_directory, filename)
print(f"Processing {filename}")

try:
    log = xes_importer.apply(file_path)
    print("\n\n")
    print(f"Imported {filename} with {len(log)} traces.")
    
    # Print important information about the log
    print(f"Number of events: {sum(len(trace) for trace in log)}")
    activities = set(event["concept:name"] for trace in log for event in trace if "concept:name" in event)
    print(f"Number of unique activities: {len(activities)}")
    print(f"Unique activities: {activities}")
    case_ids = [trace.attributes["concept:name"] for trace in log if "concept:name" in trace.attributes]
    print(f"Number of cases: {len(case_ids)}")
    print(f"First 5 case IDs: {case_ids[:5]}")
    print("\n\n")

except Exception as e:
    print(f"Error processing {filename}: {e}")


Processing MainProcess.xes


  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 301/301 [00:00<00:00, 688.11it/s]




Imported MainProcess.xes with 301 traces.
Number of events: 9471
Number of unique activities: 21
Unique activities: {'/ov/temper', '/pm/punch_recesses', '/hbw/unload', '/hbw/store_empty_bucket', '/dm/lower', '/sm/transport', '/vgr/pick_up_and_transport', '/mm/deburr', '/mm/transport_from_to', '/hbw/get_empty_bucket', '/pm/punch_ribbing', '/mm/drill', '/hw/human_review', '/dm/cylindrical_drill', '/sm/sort', '/ov/burn', '/dm/drill', '/wt/pick_up_and_transport', '/pm/punch_gill', '/hbw/store', '/mm/mill'}
Number of cases: 301
First 5 case IDs: ['WF_101_0', 'WF_102_0', 'WF_103_0', 'WF_104_0', 'WF_105_0']








Get all the attributes included in this file ("MainProcess.xes")

In [2]:
try:
    # List all attributes of the traces in a table
    print("\n\nAttributes of traces:\n")
    
    trace_attributes = set()
    for trace in log:
        trace_attributes.update(trace.attributes.keys())
        
    trace_attr_info = []
    for attr in trace_attributes:
        attr_type = "unknown"
        for trace in log:
            if attr in trace.attributes:
                attr_type = type(trace.attributes[attr]).__name__
                break
        trace_attr_info.append({"Attribute": attr, "Type": attr_type})

    trace_attr_df = pd.DataFrame(trace_attr_info)
    display(trace_attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    trace_attr_df.to_excel(os.path.join(tables_dir, "main_trace_attribute_info.xlsx"), index=False)
    
    # List all attributes of the events in a table
    print("\n\nAttributes of events:\n")
    
    event_attributes = set()
    for trace in log:
        for event in trace:
            event_attributes.update(event.keys())

    event_attr_info = []
    for attr in event_attributes:
        attr_type = "unknown"
        for trace in log:
            for event in trace:
                if attr in event:
                    attr_type = type(event[attr]).__name__
                    break
            if attr_type != "unknown":
                break
        event_attr_info.append({"Attribute": attr, "Type": attr_type})

    event_attr_df = pd.DataFrame(event_attr_info)
    display(event_attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    event_attr_df.to_excel(os.path.join(tables_dir, "main_event_attribute_info.xlsx"), index=False)
except Exception as e:
    print(f"Error processing the log: {e}")




Attributes of traces:



Unnamed: 0,Attribute,Type
0,concept:name,str




Attributes of events:



Unnamed: 0,Attribute,Type
0,lifecycle:transition,str
1,case,str
2,response_status_code,float
3,concept:name,str
4,time:timestamp,datetime
5,planned_operation_time,str
6,case:concept:name,str
7,unsatisfied_condition_description,str
8,human_workstation_green_button_pressed,float
9,SubProcessID,str


List all the resources of the log in a table

In [3]:
try:
    # Set pandas display options for full width
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.width', 0)
    pd.set_option('display.max_columns', None)

    resources = set()
    for trace in log:
        for event in trace:
            if "org:resource" in event:
                resources.add(event["org:resource"])

    resource_info = []
    for resource in resources:
        # Find the first SubProcessID for this resource by searching events until found
        first_subprocess_id = "N/A"
        parameters_dict = None
        found = False
        for trace in log:
                for event in trace:
                    if event.get("org:resource") == resource:
                        if "SubProcessID" in event:
                            first_subprocess_id = event["SubProcessID"]
                            found = True
                            break
                if found:
                    break
                
        found = False
        for trace in log:
                for event in trace:
                    if event.get("org:resource") == resource:
                        if "parameters" in event:
                            parameters_dict = event["parameters"]
                            found = True
                            break
                if found:
                    break
        event_count = sum(1 for trace in log for event in trace if event.get("org:resource") == resource)
        activities_performed = set(event["concept:name"] for trace in log for event in trace if event.get("org:resource") == resource)
        parameter_keys = list(parameters_dict['children'][i][0] for i in range(len(parameters_dict['children']))) if parameters_dict and 'children' in parameters_dict else []
        resource_info.append({
            "Resource": resource,
            "Event Count": event_count,
            "Unique Activities": len(activities_performed),
            "Activities": ", ".join(sorted(activities_performed)),
            "First Subprocess ID": first_subprocess_id,
            "Parameter Keys": parameter_keys
        })

    resource_df = pd.DataFrame(resource_info)
    display(resource_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    resource_df.to_excel(os.path.join(tables_dir, "resources_info.xlsx"), index=False)

except Exception as e:
    print(f"Error processing the log: {e}")

Unnamed: 0,Resource,Event Count,Unique Activities,Activities,First Subprocess ID,Parameter Keys
0,ov_1,612,2,"/ov/burn, /ov/temper",633d065f-96c0-4c4b-8112-302990575763,"[parameter_burn_workpiece_size, parameter_burn_workpiece_thickness]"
1,hw_1,522,1,/hw/human_review,a8d0fcdd-46c6-44f8-8b19-cecd803d356f,[]
2,dm_2,177,3,"/dm/cylindrical_drill, /dm/drill, /dm/lower",ad6c9c0b-f3ba-45e7-b887-b96bf0260887,"[parameter_start_position, parameter_end_position]"
3,sm_1,378,2,"/sm/sort, /sm/transport",16d2bd16-3be9-4daa-a4ad-edb7f5818fcb,"[parameter_use_nfc, parameter_start_position, parameter_end_position, parameter_sorting_machine_ejection_position]"
4,mm_2,381,4,"/mm/deburr, /mm/drill, /mm/mill, /mm/transport_from_to",570d0814-988a-4856-bc82-249db6050f5e,"[parameter_start_position, parameter_end_position, parameter_burn_workpiece_size, parameter_quantity]"
5,hbw_1,873,4,"/hbw/get_empty_bucket, /hbw/store, /hbw/store_empty_bucket, /hbw/unload",b179f074-238d-4666-b50f-9a8959d0a48e,"[parameter_hbw_slot, parameter_use_nfc]"
6,pm_1,204,3,"/pm/punch_gill, /pm/punch_recesses, /pm/punch_ribbing",21559c95-22a5-4c8b-9424-dbbc14a9f63b,"[parameter_start_position, parameter_end_position, parameter_quantity]"
7,wt_1,447,1,/wt/pick_up_and_transport,8febb390-19ce-4d63-a018-d9617a8bb1b7,"[parameter_start_position, parameter_end_position]"
8,ov_2,330,1,/ov/burn,1ab1350e-cba4-42ea-8efd-a0b01e88380e,"[parameter_burn_workpiece_size, parameter_burn_workpiece_thickness]"
9,vgr_2,885,1,/vgr/pick_up_and_transport,4d198444-6633-4218-b1f7-ca67ec666360,"[parameter_start_position, parameter_end_position]"


Load a single subevent log file

In [4]:
# Process 0a0a7c16-85d9-48be-a7d5-32931240c337.xes file
filename = "0a0a7c16-85d9-48be-a7d5-32931240c337.xes"
file_path = os.path.join(xes_directory, filename)
print(f"Processing {filename}")

try:
    subevent_log = xes_importer.apply(file_path)
    print("\n\n")
    print(f"Imported {filename} with {len(subevent_log)} traces.")

    # Print important information about the subevent_log
    print(f"Number of events: {sum(len(trace) for trace in subevent_log)}")
    activities = set(event["concept:name"] for trace in subevent_log for event in trace if "concept:name" in event)
    print(f"Number of unique activities: {len(activities)}")
    print(f"Unique activities: {activities}")
    case_ids = [trace.attributes["concept:name"] for trace in subevent_log if "concept:name" in trace.attributes]
    print(f"Number of cases: {len(case_ids)}")
    print(f"First 5 case IDs: {case_ids[:5]}")
    print("\n\n")

except Exception as e:
    print(f"Error processing {filename}: {e}")

Processing 0a0a7c16-85d9-48be-a7d5-32931240c337.xes


parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 133.46it/s]




Imported 0a0a7c16-85d9-48be-a7d5-32931240c337.xes with 1 traces.
Number of events: 5
Number of unique activities: 5
Unique activities: {'transporting the workpiece to the ejection position', 'ejecting the workpiece to the conveyor belt', 'transporting the workpiece to the mill', 'milling the workpiece', 'transporting the workpiece to the sorting machine'}
Number of cases: 0
First 5 case IDs: []








List all the attributes in this file

In [5]:
try:
    # List all attributes in the subevent_log in a table and display nicely in Jupyter Notebook
    all_attributes = set()
    for trace in subevent_log:
        all_attributes.update(trace.attributes.keys())
        for event in trace:
            all_attributes.update(event.keys())

    # Prepare attribute type information
    attr_info = []
    for attr in all_attributes:
        if attr in subevent_log[0].attributes:
            attr_type = type(subevent_log[0].attributes[attr]).__name__
        elif len(subevent_log[0]) > 0 and attr in subevent_log[0][0]:
            attr_type = type(subevent_log[0][0][attr]).__name__
        else:
            attr_type = "unknown"
        attr_info.append({"Attribute": attr, "Type": attr_type})

    # Display as a pandas DataFrame
    attr_df = pd.DataFrame(attr_info)
    display(attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    attr_df.to_excel(os.path.join(tables_dir, "sub_attribute_info.xlsx"), index=False)

except Exception as e:
    print(f"Error processing the subevent_log: {e}")

Unnamed: 0,Attribute,Type
0,concept:name,str
1,time:timestamp,datetime
2,stream:datastream,dict
3,operation_end_time,datetime
4,SubProcessID,str
5,org:resource,str


Load the database file with the sensor data and query it using DuckDB

In [None]:
import duckdb
import os
from pathlib import Path
import pandas as pd


# Make sure xes_directory is defined before this line
parquet_dir = os.path.join(xes_directory, "parquet")

# Connect to an in-memory DuckDB instance
con = duckdb.connect(database=':memory:')

# Collect all .parquet files in the directory
parquet_file = os.path.join(parquet_dir, "all_combined.parquet")

# Ensure there are files to process
if not os.path.exists(parquet_file):
    raise FileNotFoundError(f"No Parquet file found at {parquet_file}")

# Register all Parquet files as a single virtual table (view)
# IMPORTANT: DuckDB's parquet_scan expects a *list of strings* to be passed as a DuckDB list literal
# Use array syntax ['file1.parquet', 'file2.parquet', ...]
parquet_list_str = f"'{parquet_file}'"
query = f"CREATE VIEW sensor_data AS SELECT * FROM parquet_scan([{parquet_list_str}])"
con.execute(query)

# Print the number of rows in the sensor_data table
row_count = con.execute("SELECT COUNT(*) FROM sensor_data").fetchone()[0]
print(f"Number of rows in sensor_data: {row_count}")

# Print column names
info_df = con.execute("PRAGMA table_info('sensor_data')").df()
print("\nColumns and types in sensor_data:")
print(info_df['name'])

# Show a sample of the data
print("\nSample rows from sensor_data:")
sample_df = con.execute("SELECT * FROM sensor_data LIMIT 5").df()
display(sample_df)


# ------------------------
# Enter query here
# ------------------------

df_sensor_grouped = con.execute("""
    SELECT 
        "stream:system", COUNT(*) AS count
    FROM sensor_data
    GROUP BY "stream:system"
    ORDER BY count DESC
""").df()

# ------------------------

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
display(df_sensor_grouped)


Number of rows in sensor_data: 121480672

Columns and types in sensor_data:
0                concept:name
1                org:resource
2               stream:system
3          stream:system_type
4          stream:observation
5       stream:procedure_type
6     stream:interaction_type
7            stream:timestamp
8                stream:value
9                  sensor_key
10                   children
Name: name, dtype: object

Sample rows from sensor_data:


Unnamed: 0,concept:name,org:resource,stream:system,stream:system_type,stream:observation,stream:procedure_type,stream:interaction_type,stream:timestamp,stream:value,sensor_key,children
0,detecting the workpiece color,sm_1,http://iot.uni-trier.de/FTOnto#ADXL345_Pi_1_AccSensor_2,sosa:Sensor,http://iot.uni-trier.de/FTOnto#SM_1_Compressor_8_Vibration,stream:continuous,sosa:Observation,2021-07-06T15:46:52.373960,"[0.0, 0.0, 9.4176]",stream:point,
1,detecting the workpiece color,sm_1,http://iot.uni-trier.de/FTOnto#ADXL345_Pi_1_AccSensor_2,sosa:Sensor,http://iot.uni-trier.de/FTOnto#SM_1_Compressor_8_Vibration,stream:continuous,sosa:Observation,2021-07-06T15:46:52.383218,"[1.0987, -1.8835, 8.2404]",stream:point,
2,detecting the workpiece color,sm_1,http://iot.uni-trier.de/FTOnto#ADXL345_Pi_1_AccSensor_2,sosa:Sensor,http://iot.uni-trier.de/FTOnto#SM_1_Compressor_8_Vibration,stream:continuous,sosa:Observation,2021-07-06T15:46:52.384988,"[0.6278, 0.3139, 9.4961]",stream:point,
3,detecting the workpiece color,sm_1,http://iot.uni-trier.de/FTOnto#ADXL345_Pi_1_AccSensor_2,sosa:Sensor,http://iot.uni-trier.de/FTOnto#SM_1_Compressor_8_Vibration,stream:continuous,sosa:Observation,2021-07-06T15:46:52.389276,"[0.0, 0.3924, 9.1037]",stream:point,
4,detecting the workpiece color,sm_1,http://iot.uni-trier.de/FTOnto#ADXL345_Pi_1_AccSensor_2,sosa:Sensor,http://iot.uni-trier.de/FTOnto#SM_1_Compressor_8_Vibration,stream:continuous,sosa:Observation,2021-07-06T15:46:52.389999,"[0.3924, 0.6278, 9.4176]",stream:point,


Unnamed: 0,stream:observation,count
0,http://iot.uni-trier.de/FTOnto#HBW_1_Crane_Jib_Rotation,17896630
1,http://iot.uni-trier.de/FTOnto#HBW_1_Crane_Jib_Acceleration,17826752
2,http://iot.uni-trier.de/FTOnto#HBW_1_Crane_Jib_Magnetic_Field_Strength,17441995
3,http://iot.uni-trier.de/FTOnto#VGR_1_Crane_Jib_Acceleration,16083844
4,http://iot.uni-trier.de/FTOnto#VGR_1_Crane_Jib_Magnetic_Field_Strength,15777030
...,...,...
73,http://iot.uni-trier.de/StreamDataAnnotationOnto#MM_2_Property_Current_State,8211
74,http://iot.uni-trier.de/StreamDataAnnotationOnto#SM_2_Property_Current_State,8192
75,http://iot.uni-trier.de/StreamDataAnnotationOnto#SM_2_Property_Current_Task_Elapsed_Seconds_Since_Start,8192
76,http://iot.uni-trier.de/StreamDataAnnotationOnto#DM_2_Property_Current_State,5768


Use the sensor dict