In [3]:
import os
import json
import csv

# Directory that holds the model JSON files and their ".meta.json" companions.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\Gateway journal\BPMAI-29-10-2019\bpmai\models"

# Output CSV file; the processing loop below opens it in "w" mode, so an existing file is overwritten.
output_file = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data.csv"

# CSV header row; the extraction function below emits one data row per task found inside a lane.
csv_headers = ["Image_Id", "Lane", "Lane_Position", "Task", "Task_Position"]

# Function to extract lane and task details from JSON
def extract_data_from_json(json_file):
    """Collect one row per task that sits directly inside a lane of a pool.

    The file is read as UTF-8 JSON. Only shapes whose stencil id is "Pool" are
    inspected at the top level, only "Lane" shapes inside a pool, and only
    "Task" shapes directly inside a lane.

    Args:
        json_file: Path of the model JSON file.

    Returns:
        A list of rows ``[image_id, lane_name, lane_bounds, task_name,
        task_bounds]``. ``image_id`` is the file's base name up to its first
        dot; a missing "name" property is reported as "Unknown".

    Raises:
        KeyError: If an inspected shape has no "stencil"/"id", or a lane or
            task has no "properties" or "bounds" entry.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        model = json.load(handle)

    rows = []

    for pool in model.get("childShapes", []):
        if pool["stencil"]["id"] != "Pool":
            continue

        for lane in pool.get("childShapes", []):
            if lane["stencil"]["id"] != "Lane":
                continue

            lane_label = lane["properties"].get("name", "Unknown")
            lane_bounds = lane["bounds"]

            for shape in lane.get("childShapes", []):
                if shape["stencil"]["id"] != "Task":
                    continue

                task_label = shape["properties"].get("name", "Unknown")
                task_bounds = shape["bounds"]

                # The image id is the file name without anything after the first dot.
                image_id = os.path.basename(json_file).split(".")[0]
                rows.append([image_id, lane_label, lane_bounds, task_label, task_bounds])

    return rows

# Process all files in the directory: for every "*.meta.json" whose modeling
# language is "bpmn11" or "bpmn20", extract rows from the matching model file
# and append them to the CSV.
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_headers)

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the CSV reproducible from run to run.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".meta.json"):
            continue

        meta_file_path = os.path.join(directory, file)
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta_data = json.load(f)

        # Check if modeling language is 'bpmn11' or 'bpmn20' (case-insensitive)
        modeling_language = meta_data["model"].get("modelingLanguage", "").lower()
        if modeling_language not in ["bpmn11", "bpmn20"]:
            continue

        # The model is looked up next to its metadata as "<modelId>.json".
        model_id = meta_data["model"]["modelId"]
        json_file_path = os.path.join(directory, f"{model_id}.json")

        # Metadata without a matching model file is skipped silently.
        if os.path.exists(json_file_path):
            extracted_data = extract_data_from_json(json_file_path)
            writer.writerows(extracted_data)

print(f"Extraction completed. Data saved to {output_file}")


Extraction completed. Data saved to C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data.csv


In [4]:
import os
import json
import csv

# Directory that holds the model JSON files and their ".meta.json" companions.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\Gateway journal\BPMAI-29-10-2019\bpmai\models"

# Output CSV file; the processing loop below opens it in "w" mode, so an existing file is overwritten.
output_file = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data_V2.csv"

# CSV header row; the extraction function below emits one row per lane, with
# the Task and Task_Position columns holding lists.
# NOTE(review): the last two headers are lower-case plural here but are spelled
# "Question"/"Answer" in the later cells of this notebook.
csv_headers = [
    "Image_Id", "Model_Name", "Lane", "Lane_Position", "Task", "Task_Position",
    "Modeling_Language", "Model_Path", "questions", "answers"
]

# Function to extract data from JSON file
def extract_data_from_json(json_file, model_name, model_id, modeling_language):
    """Build one question/answer row per lane found directly inside a pool.

    Args:
        json_file: Path of the model JSON file (read as UTF-8); also copied
            into every row as the model path.
        model_name: Copied unchanged into every row.
        model_id: Copied unchanged into every row (first column).
        modeling_language: Copied unchanged into every row.

    Returns:
        A list of rows ``[model_id, model_name, lane_name, lane_bounds,
        task_names, task_bounds, modeling_language, json_file, question,
        answer]``. ``task_names`` and ``task_bounds`` are lists covering the
        "Task" shapes directly inside the lane; ``answer`` is the task names
        joined with ", " or "No tasks assigned" when the lane has none.

    Raises:
        KeyError: If an inspected shape has no "stencil"/"id", or a lane or
            task has no "properties" or "bounds" entry.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        model = json.load(handle)

    rows = []

    for pool in model.get("childShapes", []):
        if pool["stencil"]["id"] != "Pool":
            continue

        for lane in pool.get("childShapes", []):
            if lane["stencil"]["id"] != "Lane":
                continue

            lane_label = lane["properties"].get("name", "Unknown")
            lane_bounds = lane["bounds"]

            # (name, bounds) for each task that is a direct child of the lane.
            named_bounds = [
                (shape["properties"].get("name", "Unknown"), shape["bounds"])
                for shape in lane.get("childShapes", [])
                if shape["stencil"]["id"] == "Task"
            ]
            task_labels = [label for label, _ in named_bounds]
            task_bounds = [bounds for _, bounds in named_bounds]

            question = f'What is the role of "{lane_label}"?'
            if task_labels:
                answer = ", ".join(task_labels)
            else:
                answer = "No tasks assigned"

            rows.append([
                model_id,
                model_name,
                lane_label,
                lane_bounds,
                task_labels,
                task_bounds,
                modeling_language,
                json_file,
                question,
                answer,
            ])

    return rows

# Process all files in the directory: for every "*.meta.json" whose modeling
# language is "bpmn11" or "bpmn20", extract rows from the matching model file
# and append them to the CSV.
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_headers)

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the CSV reproducible from run to run.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".meta.json"):
            continue

        meta_file_path = os.path.join(directory, file)
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta_data = json.load(f)

        # Extract metadata; only "bpmn11" / "bpmn20" models are kept (case-insensitive).
        modeling_language = meta_data["model"].get("modelingLanguage", "").lower()
        if modeling_language not in ["bpmn11", "bpmn20"]:
            continue

        # The model is looked up next to its metadata as "<modelId>.json".
        model_id = meta_data["model"]["modelId"]
        model_name = meta_data["model"]["modelName"]
        json_file_path = os.path.join(directory, f"{model_id}.json")

        # Metadata without a matching model file is skipped silently.
        if os.path.exists(json_file_path):
            extracted_data = extract_data_from_json(json_file_path, model_name, model_id, modeling_language)
            writer.writerows(extracted_data)

print(f"Extraction completed. Data saved to {output_file}")


Extraction completed. Data saved to C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data_V2.csv


In [5]:
import os
import json
import csv

# Directory that holds the model JSON files and their ".meta.json" companions.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\Gateway journal\BPMAI-29-10-2019\bpmai\models"

# Output CSV file; the processing loop below opens it in "w" mode, so an existing file is overwritten.
output_file = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data_V3.csv"

# CSV header row; the extraction function below emits one row per
# (lane, question) pair, with Task and Task_Position holding lists.
csv_headers = [
    "Image_Id", "Model_Name", "Lane", "Lane_Position", "Task", "Task_Position",
    "Modeling_Language", "Model_Path", "Question", "Answer"
]

# Function to extract data from JSON file
def extract_data_from_json(json_file, model_name, model_id, modeling_language):
    """Build four question/answer rows for every lane found directly inside a pool.

    Args:
        json_file: Path of the model JSON file (read as UTF-8); also copied
            into every row as the model path.
        model_name: Copied unchanged into every row.
        model_id: Copied unchanged into every row (first column).
        modeling_language: Copied unchanged into every row.

    Returns:
        A list of rows ``[model_id, model_name, lane_name, lane_bounds,
        task_names, task_bounds, modeling_language, json_file, question,
        answer]``, four per lane (role, main responsibility, gateway
        presence, location). Only "Task" and "Exclusive_Databased_Gateway"
        shapes that are direct children of the lane are considered.

    Raises:
        KeyError: If an inspected shape has no "stencil"/"id", or a lane or
            task has no "properties" or "bounds" entry.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    extracted_data = []

    # Iterate through BPMN elements to find pools and lanes
    for pool in data.get("childShapes", []):
        if pool["stencil"]["id"] != "Pool":
            continue

        lanes = [shape for shape in pool.get("childShapes", []) if shape["stencil"]["id"] == "Lane"]
        lanes_in_pool = [lane["properties"].get("name", "Unknown") for lane in lanes]

        for lane in lanes:
            lane_name = lane["properties"].get("name", "Unknown")
            lane_position = lane["bounds"]

            tasks = []
            task_positions = []
            has_gateway = False

            # Extract tasks and check for gateways inside the lane
            for element in lane.get("childShapes", []):
                stencil_id = element["stencil"]["id"]
                if stencil_id == "Task":
                    tasks.append(element["properties"].get("name", "Unknown"))
                    task_positions.append(element["bounds"])
                elif stencil_id == "Exclusive_Databased_Gateway":
                    has_gateway = True

            task_summary = ", ".join(tasks)

            # The fallback must test the *filtered* list: lanes_in_pool always
            # contains the current lane, so testing it directly made
            # "No other lanes." unreachable and produced an empty answer for
            # pools with a single lane.
            other_lanes = [name for name in lanes_in_pool if name != lane_name]
            location_answer = ", ".join(other_lanes) if other_lanes else "No other lanes."

            # Define questions and answers
            questions_answers = [
                (f'What is the role of "{lane_name}"?',
                 task_summary if tasks else "No tasks assigned"),
                (f'What is the main responsibility of "{lane_name}" in this BPMN model?',
                 f'This lane is responsible for {task_summary}' if tasks else "No specific tasks assigned."),
                (f'Does "{lane_name}" have a decision-making step (gateway)?',
                 "Yes" if has_gateway else "No"),
                (f'Where is "{lane_name}" located in the process diagram?',
                 location_answer),
            ]

            # Append extracted data with each question-answer pair
            for question, answer in questions_answers:
                extracted_data.append([
                    model_id, model_name, lane_name, lane_position, tasks, task_positions,
                    modeling_language, json_file, question, answer
                ])

    return extracted_data

# Process all files in the directory: for every "*.meta.json" whose modeling
# language is "bpmn11" or "bpmn20", extract rows from the matching model file
# and append them to the CSV.
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_headers)

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the CSV reproducible from run to run.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".meta.json"):
            continue

        meta_file_path = os.path.join(directory, file)
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta_data = json.load(f)

        # Extract metadata; only "bpmn11" / "bpmn20" models are kept (case-insensitive).
        modeling_language = meta_data["model"].get("modelingLanguage", "").lower()
        if modeling_language not in ["bpmn11", "bpmn20"]:
            continue

        # The model is looked up next to its metadata as "<modelId>.json".
        model_id = meta_data["model"]["modelId"]
        model_name = meta_data["model"]["modelName"]
        json_file_path = os.path.join(directory, f"{model_id}.json")

        # Metadata without a matching model file is skipped silently.
        if os.path.exists(json_file_path):
            extracted_data = extract_data_from_json(json_file_path, model_name, model_id, modeling_language)
            writer.writerows(extracted_data)

print(f"Extraction completed. Data saved to {output_file}")


Extraction completed. Data saved to C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data_V3.csv


In [6]:
import os
import json
import csv

# Directory that holds the model JSON files and their ".meta.json" companions.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\Gateway journal\BPMAI-29-10-2019\bpmai\models"

# Output CSV file; the processing loop below opens it in "w" mode, so an existing file is overwritten.
output_file = r"C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data_V4.csv"

# CSV header row; the extraction function below emits one row per
# (lane, question) pair, with Task and Task_Position holding lists.
csv_headers = [
    "Image_Id", "Model_Name", "Lane", "Lane_Position", "Task", "Task_Position",
    "Modeling_Language", "Model_Path", "Question", "Answer"
]

# Collect tasks from a lane and from any lanes nested inside it
def extract_tasks_from_lane(lane):
    """Gather the tasks of a lane, descending into nested lanes.

    Shapes are visited depth-first in document order, so the tasks of a
    nested lane appear at the position where that lane occurs among its
    siblings.

    Args:
        lane: Shape dictionary; its "childShapes" list (if any) is walked.

    Returns:
        A tuple ``(task_names, task_bounds, has_gateway)``. The two lists are
        parallel; a task without a "name" property is reported as "Unknown".
        ``has_gateway`` is True when an "Exclusive_Databased_Gateway" shape
        was found in the lane or in any nested lane.

    Raises:
        KeyError: If a visited shape has no "stencil"/"id", or a task has no
            "properties" or "bounds" entry.
    """
    names = []
    bounds = []
    gateway_found = False

    # Explicit stack; children are pushed reversed so pop() yields them in
    # their original order.
    pending = list(lane.get("childShapes", []))[::-1]

    while pending:
        shape = pending.pop()
        kind = shape["stencil"]["id"]

        if kind == "Task":
            label = shape["properties"].get("name", "Unknown")
            box = shape["bounds"]
            names.append(label)
            bounds.append(box)
        elif kind == "Exclusive_Databased_Gateway":
            gateway_found = True
        elif kind == "Lane":
            # Nested lane: its children are handled before the remaining siblings.
            pending.extend(list(shape.get("childShapes", []))[::-1])

    return names, bounds, gateway_found

# Function to extract data from JSON file
def extract_data_from_json(json_file, model_name, model_id, modeling_language):
    """Build four question/answer rows for every lane found directly inside a pool.

    Tasks and gateway presence for each lane come from
    ``extract_tasks_from_lane``, which also descends into nested lanes.

    Args:
        json_file: Path of the model JSON file (read as UTF-8); also copied
            into every row as the model path.
        model_name: Copied unchanged into every row.
        model_id: Copied unchanged into every row (first column).
        modeling_language: Copied unchanged into every row.

    Returns:
        A list of rows ``[model_id, model_name, lane_name, lane_bounds,
        task_names, task_bounds, modeling_language, json_file, question,
        answer]``, four per lane (role, main responsibility, gateway
        presence, location).

    Raises:
        KeyError: If an inspected shape has no "stencil"/"id", or a lane has
            no "properties" or "bounds" entry.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    extracted_data = []

    # Iterate through BPMN elements to find pools and lanes
    for pool in data.get("childShapes", []):
        if pool["stencil"]["id"] != "Pool":
            continue

        lanes = [shape for shape in pool.get("childShapes", []) if shape["stencil"]["id"] == "Lane"]
        lanes_in_pool = [lane["properties"].get("name", "Unknown") for lane in lanes]

        for lane in lanes:
            lane_name = lane["properties"].get("name", "Unknown")
            lane_position = lane["bounds"]

            # Extract tasks (including nested ones)
            tasks, task_positions, has_gateway = extract_tasks_from_lane(lane)

            task_summary = ", ".join(tasks)

            # The fallback must test the *filtered* list: lanes_in_pool always
            # contains the current lane, so testing it directly made
            # "No other lanes." unreachable and produced an empty answer for
            # pools with a single lane.
            other_lanes = [name for name in lanes_in_pool if name != lane_name]
            location_answer = ", ".join(other_lanes) if other_lanes else "No other lanes."

            # Define questions and answers
            questions_answers = [
                (f'What is the role of "{lane_name}"?',
                 task_summary if tasks else "No tasks assigned"),
                (f'What is the main responsibility of "{lane_name}" in this BPMN model?',
                 f'This lane is responsible for {task_summary}' if tasks else "No specific tasks assigned."),
                (f'Does "{lane_name}" have a decision-making step (gateway)?',
                 "Yes" if has_gateway else "No"),
                (f'Where is "{lane_name}" located in the process diagram?',
                 location_answer),
            ]

            # Append extracted data with each question-answer pair
            for question, answer in questions_answers:
                extracted_data.append([
                    model_id, model_name, lane_name, lane_position, tasks, task_positions,
                    modeling_language, json_file, question, answer
                ])

    return extracted_data

# Process all files in the directory: for every "*.meta.json" whose modeling
# language is "bpmn11" or "bpmn20", extract rows from the matching model file
# and append them to the CSV.
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_headers)

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the CSV reproducible from run to run.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".meta.json"):
            continue

        meta_file_path = os.path.join(directory, file)
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta_data = json.load(f)

        # Extract metadata; only "bpmn11" / "bpmn20" models are kept (case-insensitive).
        modeling_language = meta_data["model"].get("modelingLanguage", "").lower()
        if modeling_language not in ["bpmn11", "bpmn20"]:
            continue

        # The model is looked up next to its metadata as "<modelId>.json".
        model_id = meta_data["model"]["modelId"]
        model_name = meta_data["model"]["modelName"]
        json_file_path = os.path.join(directory, f"{model_id}.json")

        # Metadata without a matching model file is skipped silently.
        if os.path.exists(json_file_path):
            extracted_data = extract_data_from_json(json_file_path, model_name, model_id, modeling_language)
            writer.writerows(extracted_data)

print(f"Extraction completed. Data saved to {output_file}")


Extraction completed. Data saved to C:\Users\ayads\OneDrive\Documents\Sarah\Research\OCR\bpmn_OCR_extracted_data_V4.csv
