Load the "MainProcess.xes" file

In [2]:
import os
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer
import pandas as pd

# Directory with your .xes files
xes_directory = os.path.join(os.getcwd(), "20130794", "Cleaned Event Log")

# Output directory for models or visuals
output_directory = os.path.join(os.getcwd(), "output")
os.makedirs(output_directory, exist_ok=True)

# Process MainProcess.xes file
filename = "MainProcess.xes"
file_path = os.path.join(xes_directory, filename)
print(f"Processing {filename}")

try:
    log = xes_importer.apply(file_path)
    print("\n\n")
    print(f"Imported {filename} with {len(log)} traces.")
    
    # Print important information about the log
    print(f"Number of events: {sum(len(trace) for trace in log)}")
    activities = set(event["concept:name"] for trace in log for event in trace if "concept:name" in event)
    print(f"Number of unique activities: {len(activities)}")
    print(f"Unique activities: {activities}")
    case_ids = [trace.attributes["concept:name"] for trace in log if "concept:name" in trace.attributes]
    print(f"Number of cases: {len(case_ids)}")
    print(f"First 5 case IDs: {case_ids[:5]}")
    print("\n\n")

except Exception as e:
    print(f"Error processing {filename}: {e}")


Processing MainProcess.xes


  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 301/301 [00:00<00:00, 646.00it/s]




Imported MainProcess.xes with 301 traces.
Number of events: 9471
Number of unique activities: 21
Unique activities: {'/mm/transport_from_to', '/dm/drill', '/hbw/store_empty_bucket', '/sm/transport', '/ov/temper', '/pm/punch_gill', '/ov/burn', '/hbw/get_empty_bucket', '/dm/lower', '/sm/sort', '/dm/cylindrical_drill', '/mm/drill', '/pm/punch_ribbing', '/vgr/pick_up_and_transport', '/mm/mill', '/hbw/store', '/hbw/unload', '/mm/deburr', '/wt/pick_up_and_transport', '/pm/punch_recesses', '/hw/human_review'}
Number of cases: 301
First 5 case IDs: ['WF_101_0', 'WF_102_0', 'WF_103_0', 'WF_104_0', 'WF_105_0']








Get all the attributes included in this file ("MainProcess.xes")

In [None]:
try:
    # List all attributes of the traces in a table
    print("\n\nAttributes of traces:\n")
    
    trace_attributes = set()
    for trace in log:
        trace_attributes.update(trace.attributes.keys())
        
    trace_attr_info = []
    for attr in trace_attributes:
        attr_type = "unknown"
        for trace in log:
            if attr in trace.attributes:
                attr_type = type(trace.attributes[attr]).__name__
                break
        trace_attr_info.append({"Attribute": attr, "Type": attr_type})

    trace_attr_df = pd.DataFrame(trace_attr_info)
    display(trace_attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    trace_attr_df.to_excel(os.path.join(tables_dir, "main_trace_attribute_info.xlsx"), index=False)
    
    # List all attributes of the events in a table
    print("\n\nAttributes of events:\n")
    
    event_attributes = set()
    for trace in log:
        for event in trace:
            event_attributes.update(event.keys())

    event_attr_info = []
    for attr in event_attributes:
        attr_type = "unknown"
        for trace in log:
            for event in trace:
                if attr in event:
                    attr_type = type(event[attr]).__name__
                    break
            if attr_type != "unknown":
                break
        event_attr_info.append({"Attribute": attr, "Type": attr_type})

    event_attr_df = pd.DataFrame(event_attr_info)
    display(event_attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    event_attr_df.to_excel(os.path.join(tables_dir, "main_event_attribute_info.xlsx"), index=False)
except Exception as e:
    print(f"Error processing the log: {e}")




Attributes of traces:



Unnamed: 0,Attribute,Type
0,concept:name,str




Attributes of events:



Unnamed: 0,Attribute,Type
0,identifier:id,str
1,process_model_id,str
2,operation_end_time,datetime
3,planned_operation_time,str
4,time:timestamp,datetime
5,complete_service_time,str
6,requested_service_url,str
7,current_task,str
8,human_workstation_green_button_pressed,float
9,event_id,str


List all the resources of the log in a table

In [None]:
try:
    # Set pandas display options for full width
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.width', 0)
    pd.set_option('display.max_columns', None)

    resources = set()
    for trace in log:
        for event in trace:
            if "org:resource" in event:
                resources.add(event["org:resource"])

    resource_info = []
    for resource in resources:
        # Find the first SubProcessID for this resource by searching events until found
        first_subprocess_id = "N/A"
        parameters_dict = None
        found = False
        for trace in log:
                for event in trace:
                    if event.get("org:resource") == resource:
                        if "SubProcessID" in event:
                            first_subprocess_id = event["SubProcessID"]
                            found = True
                            break
                if found:
                    break
                
        found = False
        for trace in log:
                for event in trace:
                    if event.get("org:resource") == resource:
                        if "parameters" in event:
                            parameters_dict = event["parameters"]
                            found = True
                            break
                if found:
                    break
        event_count = sum(1 for trace in log for event in trace if event.get("org:resource") == resource)
        activities_performed = set(event["concept:name"] for trace in log for event in trace if event.get("org:resource") == resource)
        parameter_keys = list(parameters_dict['children'][i][0] for i in range(len(parameters_dict['children']))) if parameters_dict and 'children' in parameters_dict else []
        resource_info.append({
            "Resource": resource,
            "Event Count": event_count,
            "Unique Activities": len(activities_performed),
            "Activities": ", ".join(sorted(activities_performed)),
            "First Subprocess ID": first_subprocess_id,
            "Parameter Keys": parameter_keys
        })

    resource_df = pd.DataFrame(resource_info)
    display(resource_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    resource_df.to_excel(os.path.join(tables_dir, "resources_info.xlsx"), index=False)

except Exception as e:
    print(f"Error processing the log: {e}")

Unnamed: 0,Resource,Event Count,Unique Activities,Activities,First Subprocess ID,Parameter Keys
0,vgr_1,1866,1,/vgr/pick_up_and_transport,0e7b5a4c-4c03-47b2-96fd-e401ed7fbca9,"[parameter_start_position, parameter_end_position]"
1,sm_2,309,2,"/sm/sort, /sm/transport",722f5091-ed89-45a3-89c7-4962901b6c14,"[parameter_start_position, parameter_end_position]"
2,wt_2,330,1,/wt/pick_up_and_transport,7316381c-127f-43cb-956b-ca72e60bc6ab,"[parameter_start_position, parameter_end_position]"
3,ov_2,330,1,/ov/burn,1ab1350e-cba4-42ea-8efd-a0b01e88380e,"[parameter_burn_workpiece_size, parameter_burn_workpiece_thickness]"
4,vgr_2,885,1,/vgr/pick_up_and_transport,4d198444-6633-4218-b1f7-ca67ec666360,"[parameter_start_position, parameter_end_position]"
5,wt_1,447,1,/wt/pick_up_and_transport,8febb390-19ce-4d63-a018-d9617a8bb1b7,"[parameter_start_position, parameter_end_position]"
6,ov_1,612,2,"/ov/burn, /ov/temper",633d065f-96c0-4c4b-8112-302990575763,"[parameter_burn_workpiece_size, parameter_burn_workpiece_thickness]"
7,dm_2,177,3,"/dm/cylindrical_drill, /dm/drill, /dm/lower",ad6c9c0b-f3ba-45e7-b887-b96bf0260887,"[parameter_start_position, parameter_end_position]"
8,pm_1,204,3,"/pm/punch_gill, /pm/punch_recesses, /pm/punch_ribbing",21559c95-22a5-4c8b-9424-dbbc14a9f63b,"[parameter_start_position, parameter_end_position, parameter_quantity]"
9,mm_2,381,4,"/mm/deburr, /mm/drill, /mm/mill, /mm/transport_from_to",570d0814-988a-4856-bc82-249db6050f5e,"[parameter_start_position, parameter_end_position, parameter_burn_workpiece_size, parameter_quantity]"


Load a single subevent log file

In [5]:
# Process 0a0a7c16-85d9-48be-a7d5-32931240c337.xes file
filename = "0a0a7c16-85d9-48be-a7d5-32931240c337.xes"
file_path = os.path.join(xes_directory, filename)
print(f"Processing {filename}")

try:
    subevent_log = xes_importer.apply(file_path)
    print("\n\n")
    print(f"Imported {filename} with {len(subevent_log)} traces.")

    # Print important information about the subevent_log
    print(f"Number of events: {sum(len(trace) for trace in subevent_log)}")
    activities = set(event["concept:name"] for trace in subevent_log for event in trace if "concept:name" in event)
    print(f"Number of unique activities: {len(activities)}")
    print(f"Unique activities: {activities}")
    case_ids = [trace.attributes["concept:name"] for trace in subevent_log if "concept:name" in trace.attributes]
    print(f"Number of cases: {len(case_ids)}")
    print(f"First 5 case IDs: {case_ids[:5]}")
    print("\n\n")

except Exception as e:
    print(f"Error processing {filename}: {e}")

Processing 0a0a7c16-85d9-48be-a7d5-32931240c337.xes


parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 145.04it/s]




Imported 0a0a7c16-85d9-48be-a7d5-32931240c337.xes with 1 traces.
Number of events: 5
Number of unique activities: 5
Unique activities: {'transporting the workpiece to the mill', 'ejecting the workpiece to the conveyor belt', 'milling the workpiece', 'transporting the workpiece to the ejection position', 'transporting the workpiece to the sorting machine'}
Number of cases: 0
First 5 case IDs: []








List all the attributes in this file

In [None]:
try:
    # List all attributes in the subevent_log in a table and display nicely in Jupyter Notebook
    all_attributes = set()
    for trace in subevent_log:
        all_attributes.update(trace.attributes.keys())
        for event in trace:
            all_attributes.update(event.keys())

    # Prepare attribute type information
    attr_info = []
    for attr in all_attributes:
        if attr in subevent_log[0].attributes:
            attr_type = type(subevent_log[0].attributes[attr]).__name__
        elif len(subevent_log[0]) > 0 and attr in subevent_log[0][0]:
            attr_type = type(subevent_log[0][0][attr]).__name__
        else:
            attr_type = "unknown"
        attr_info.append({"Attribute": attr, "Type": attr_type})

    # Display as a pandas DataFrame
    attr_df = pd.DataFrame(attr_info)
    display(attr_df)
    # Save the attribute DataFrame to an Excel file in a "tables" subfolder
    tables_dir = os.path.join(os.getcwd(), "tables")
    os.makedirs(tables_dir, exist_ok=True)
    attr_df.to_excel(os.path.join(tables_dir, "sub_attribute_info.xlsx"), index=False)

except Exception as e:
    print(f"Error processing the subevent_log: {e}")

Unnamed: 0,Attribute,Type
0,concept:name,str
1,org:resource,str
2,SubProcessID,str
3,operation_end_time,datetime
4,time:timestamp,datetime
5,stream:datastream,dict


Get all the sensor data included per resource

In [7]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from pm4py.objects.log.importer.xes import importer as xes_importer
from IPython.display import display

# Helper: parse datastream list element into a dict structure
def parse_datastream_from_event_xml(event_xml):
    # Namespace for XES
    ns = {"xes": "http://code.deckfour.org/xes"}
    # datastream list(s) under this event
    datastreams = event_xml.findall("xes:list[@key='stream:datastream']", ns)
    if not datastreams:
        return None

    # We'll collect multiple datastream lists if present (usually one). For simplicity,
    # if more than one datastream list exists we'll merge their children.
    merged = {"children": {}}
    for datastream in datastreams:
        # Each 'point' is a nested list inside the datastream
        for idx, point in enumerate(datastream.findall("xes:list", ns)):
            # create sensor_point with attributes from the point element (attributes are in the 'stream' namespace)
            # the stream namespace is: https://cpee.org/datastream/datastream.xesext
            stream_ns = "https://cpee.org/datastream/datastream.xesext"
            sensor_point = {
                "stream:system": point.attrib.get(f"{{{stream_ns}}}system"),
                "stream:system_type": point.attrib.get(f"{{{stream_ns}}}system_type"),
                "stream:observation": point.attrib.get(f"{{{stream_ns}}}observation"),
                "stream:procedure_type": point.attrib.get(f"{{{stream_ns}}}procedure_type"),
                "stream:interaction_type": point.attrib.get(f"{{{stream_ns}}}interaction_type"),
                "children": {}
            }

            # child elements inside the point (like <date key="stream:timestamp" value="..."/>)
            for child in point:
                # child.attrib typically has 'key' and 'value'
                key = child.attrib.get("key")
                val = child.attrib.get("value")
                if key:
                    sensor_point["children"][key] = val

            # name the sensor node: use the 'key' attribute of the point element if available,
            # otherwise create an indexed name to keep them unique.
            sensor_key = point.attrib.get("key", f"stream:point_{idx}")
            # If same key already exists, create an indexed unique name
            if sensor_key in merged["children"]:
                # find a unique postfix
                i = 1
                new_key = f"{sensor_key}_{i}"
                while new_key in merged["children"]:
                    i += 1
                    new_key = f"{sensor_key}_{i}"
                sensor_key = new_key

            merged["children"][sensor_key] = sensor_point

    return merged


# Helper: attach parsed sensor dicts to pm4py events (in-memory)
def attach_sensor_data_to_pm4py_log(pm4py_log, xes_file_path):
    ns = {"xes": "http://code.deckfour.org/xes"}
    tree = ET.parse(xes_file_path)
    root = tree.getroot()

    xml_traces = root.findall(".//xes:trace", ns)

    if len(xml_traces) != len(pm4py_log):
        print(f"Warning: number of XML traces ({len(xml_traces)}) != number of pm4py traces ({len(pm4py_log)}). "
              "We'll attach for the minimum of both and skip extras.")

    n_traces = min(len(xml_traces), len(pm4py_log))

    for t_idx in range(n_traces):
        xml_trace = xml_traces[t_idx]
        pm_trace = pm4py_log[t_idx]

        xml_events = xml_trace.findall("xes:event", ns)
        pm_events = list(pm_trace)

        if len(xml_events) != len(pm_events):
            # warn but still attach for min length (common case: may still match)
            print(f"Warning: trace {t_idx} has {len(xml_events)} XML events vs {len(pm_events)} pm4py events. "
                  "Attaching up to the minimum matched events in order.")

        n_events = min(len(xml_events), len(pm_events))
        for e_idx in range(n_events):
            xml_event = xml_events[e_idx]
            pm_event = pm_events[e_idx]

            datastream_dict = parse_datastream_from_event_xml(xml_event)
            if datastream_dict:
                # attach the Python dict directly into the pm4py event
                pm_event["stream:datastream"] = datastream_dict

    # returns pm4py_log mutated in place
    return pm4py_log


# -------------------------
# Main loop (your original structure, adapted)
# -------------------------
# resource_df is assumed to be defined elsewhere (DataFrame with Resource and First Subprocess ID columns)
# xes_directory likewise defined

# sensor_dict is used as a mapping from resource -> DataFrame, so initialize as a dict
sensor_dict = {}

for resource_info in resource_df.itertuples():
    resource = resource_info.Resource
    # keep your original way to access the First Subprocess ID column (you used _5)
    first_subprocess_id = resource_info._5

    if first_subprocess_id == "N/A":
        print(f"\nResource: {resource} has no associated SubProcessID.\n")
        continue

    subprocess_filename = f"{first_subprocess_id}.xes"
    subprocess_file_path = os.path.join(xes_directory, subprocess_filename)

    if not os.path.exists(subprocess_file_path):
        print(f"\nSubprocess file {subprocess_filename} for resource {resource} not found.\n")
        continue

    try:
        # load with pm4py (this gives us the pm4py event objects)
        subprocess_log = xes_importer.apply(subprocess_file_path)

        # Parse XES in-memory and attach sensor dicts to pm4py events (no save/reload)
        attach_sensor_data_to_pm4py_log(subprocess_log, subprocess_file_path)

        print(f"\nSensor data for Resource: {resource} from SubprocessID: {first_subprocess_id}\n")

        # Find the events performed by the current resource and give the sensor data as a table
        for trace in subprocess_log:
            for event in trace:
                # Check if the event is performed by the current resource
                if event.get("org:resource") == resource:
                    sensor_data = event.get("stream:datastream", {})
                    # Check if sensor data exists
                    if sensor_data:
                        sensor_info = []
                        # sensor_data is a dict with 'children' key containing sensor points
                        if "children" in sensor_data:
                            for sensor_name, sensor_point in sensor_data["children"].items():
                                # sensor_name is usually 'stream:point' or similar
                                if isinstance(sensor_point, dict) and "children" in sensor_point:
                                    sensor_details = sensor_point["children"]
                                    timestamp = sensor_details.get("stream:timestamp")
                                    value = sensor_details.get("stream:value")
                                    system = sensor_point.get("stream:system")
                                    system_type = sensor_point.get("stream:system_type")
                                    observation = sensor_point.get("stream:observation")
                                    procedure_type = sensor_point.get("stream:procedure_type")
                                    interaction_type = sensor_point.get("stream:interaction_type")
                                    sensor_info.append({
                                        "Sensor Name": sensor_name,
                                        "System": system,
                                        "System Type": system_type,
                                        "Observation": observation,
                                        "Procedure Type": procedure_type,
                                        "Interaction Type": interaction_type,
                                        "Timestamp": timestamp,
                                        "Value": value
                                    })

                        # display as a pandas DataFrame (empty DataFrame shown if no rows)
                        sensor_df = pd.DataFrame(sensor_info)
                        if not sensor_df.empty:
                            #display(sensor_df)
                            sensor_dict[resource] = sensor_df
                        else:
                            print(f"Event: {event.get('concept:name')} at {event.get('time:timestamp')} "
                                  f"has no parsed sensor points.\n")
                    else:
                        print(f"Event: {event.get('concept:name')} at {event.get('time:timestamp')} "
                              "has no additional sensor data.\n")
    except Exception as e:
        print(f"Error processing subprocess log {subprocess_filename} for resource {resource}: {e}")


parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]




Sensor data for Resource: vgr_1 from SubprocessID: 0e7b5a4c-4c03-47b2-96fd-e401ed7fbca9



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 97.22it/s]


Sensor data for Resource: sm_2 from SubprocessID: 722f5091-ed89-45a3-89c7-4962901b6c14




parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 48.44it/s]




Sensor data for Resource: wt_2 from SubprocessID: 7316381c-127f-43cb-956b-ca72e60bc6ab



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 98.37it/s]




Sensor data for Resource: ov_2 from SubprocessID: 1ab1350e-cba4-42ea-8efd-a0b01e88380e



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 35.52it/s]




Sensor data for Resource: vgr_2 from SubprocessID: 4d198444-6633-4218-b1f7-ca67ec666360



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 46.21it/s]


Sensor data for Resource: wt_1 from SubprocessID: 8febb390-19ce-4d63-a018-d9617a8bb1b7




parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 90.41it/s]




Sensor data for Resource: ov_1 from SubprocessID: 633d065f-96c0-4c4b-8112-302990575763



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 102.95it/s]


Sensor data for Resource: dm_2 from SubprocessID: ad6c9c0b-f3ba-45e7-b887-b96bf0260887




parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 52.58it/s]
parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 52.58it/s]



Sensor data for Resource: pm_1 from SubprocessID: 21559c95-22a5-4c8b-9424-dbbc14a9f63b



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 120.98it/s]


Sensor data for Resource: mm_2 from SubprocessID: 570d0814-988a-4856-bc82-249db6050f5e








parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 12.60it/s]
parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 12.60it/s]



Sensor data for Resource: mm_1 from SubprocessID: 167db95e-ae8b-4ae8-ac11-055401e11894



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]




Sensor data for Resource: hbw_1 from SubprocessID: b179f074-238d-4666-b50f-9a8959d0a48e



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 1531.33it/s]


Sensor data for Resource: hw_1 from SubprocessID: a8d0fcdd-46c6-44f8-8b19-cecd803d356f




parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00, 24.82it/s]




Sensor data for Resource: hbw_2 from SubprocessID: 27958fc0-4484-41ff-9260-e76f8a83a7cd



parsing log, completed traces :: 100%|██████████| 1/1 [00:00<00:00,  8.47it/s]




Sensor data for Resource: sm_1 from SubprocessID: 16d2bd16-3be9-4daa-a4ad-edb7f5818fcb



Use the sensor dict

In [None]:
print(len(sensor_dict))

for resource, sensor_df in sensor_dict.items():
    print(f"\nSensor data for Resource: {resource}\n")
    display(sensor_df)

15
