In [71]:
import mlcroissant as mlc
from mlcroissant._src.structure_graph.nodes.source import FileProperty
import hashlib
import os

# Distribution

In [None]:
# Resources that make up the dataset: the GitHub repository itself, plus one
# FileSet per family of files selected from inside that repository.
#
# The `includes` patterns are written as explicit lists rather than with shell
# brace expansion (`{a,b}`): Python-style glob/fnmatch matching treats braces
# (and any space inside them) as literal characters, so a brace pattern would
# match no files.
distribution = [
    mlc.FileObject(
        id="github-repository",
        name="github-repository",
        description="BO benchmark on GitHub",
        content_url="https://github.com/TrafficSimBenchmark/dataset",
        encoding_formats=["git+https"],
        # NOTE(review): "main" is a branch name, not a SHA-256 digest;
        # presumably it is used as the git ref for git+https sources — confirm.
        sha256="main",
    ),

    mlc.FileSet(
        id="xml-files",
        name="xml-files",
        description="Each subfolder under 'network/' contains SUMO XML files for one specific traffic network.",
        contained_in=["github-repository"],
        encoding_formats=["application/xml"],
        includes=[
            "network/network_*/additional.xml",
            "network/network_*/net.xml",
            "network/network_*/od.xml",
            "network/network_*/taz.xml",
        ],
    ),

    # NOTE(review): this FileSet covers both routes_single.csv and
    # routes_multiple.csv, and both routes record sets read from it; consider
    # one FileSet per file so each record set only sees its own columns.
    mlc.FileSet(
        id="csv-routes-files",
        name="csv-routes-files",
        description="Each subfolder under 'network/' contains routes CSV files for one specific traffic network. Single route files has only one pair from TAZ to TAZ, while multiple route files has multiple pairs from TAZ to TAZ.",
        contained_in=["github-repository"],
        encoding_formats=["text/csv"],
        includes=[
            "network/network_*/routes_single.csv",
            "network/network_*/routes_multiple.csv",
        ],
    ),

    mlc.FileSet(
        id="csv-sensor-files",
        name="csv-sensor-files",
        description="Sensor ground-truth vehicle link count and average speed data for each network, stored as edge-wise CSV files.",
        contained_in=["github-repository"],
        encoding_formats=["text/csv"],
        includes="sensor_data/*/gt_link_data_*.csv",
    ),

]





# Record Sets

In [73]:
# Record sets describing how records are read out of the FileSets declared in
# `distribution`: one for the XML files, two for the routes CSVs, and one for
# the sensor CSVs.
#
# Fields that derive the network name from the *directory* use
# FileProperty.fullpath; FileProperty.filename only carries the base name, so a
# regex on the folder part could never match it. Those regexes start with an
# optional "(?:.*/)?" prefix so they work whether the pattern is matched from
# the start of the string or searched for anywhere in it.
record_sets = [

    ########################
    # (1) xml-files
    ########################

    mlc.RecordSet(
        id="xml",
        name="xml",
        description="Metadata about SUMO network-related XML files (additional, net, od, taz).",
        fields=[
            mlc.Field(
                id="xml/filename",
                name="filename",
                description="Name of the XML file (e.g., net.xml, taz.xml).",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="xml-files",
                    extract=mlc.Extract(file_property=FileProperty.filename),
                ),
            ),
            mlc.Field(
                id="xml/network_name",
                name="network_name",
                description="Traffic network name inferred from folder structure.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="xml-files",
                    # The folder is only visible in the full path.
                    extract=mlc.Extract(file_property=FileProperty.fullpath),
                    transforms=[mlc.Transform(regex="(?:.*/)?network/(network_.*?)/.*")]
                ),
            ),
            mlc.Field(
                id="xml/type",
                name="xml_type",
                description="Type of XML file (additional, net, od, taz) inferred from filename.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="xml-files",
                    extract=mlc.Extract(file_property=FileProperty.filename),
                    transforms=[mlc.Transform(regex="(additional|net|od|taz)\\.xml")]
                ),
            ),
        ]
    ),


    ########################
    # (2-1) routes_single.csv
    ########################

    # NOTE(review): "csv-routes-files" includes both routes_single.csv and
    # routes_multiple.csv, so this record set presumably also reads rows from
    # the multiple-route files — confirm and consider a dedicated FileSet.
    mlc.RecordSet(
        id="csv_routes_single",
        name="csv_routes_single",
        description="Vehicle route definitions per traffic network.",
        fields=[
            mlc.Field(
                id="csv_routes_single/fromTaz",
                name="fromTaz",
                description="Origin TAZ ID.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="fromTaz"),
                ),
            ),
            mlc.Field(
                id="csv_routes_single/toTaz",
                name="toTaz",
                description="Destination TAZ ID.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="toTaz"),
                ),
            ),
            mlc.Field(
                id="csv_routes_single/route_edges",
                name="route_edges",
                description="Route edges assigned to the vehicle.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="route_edges"),
                ),
            ),
            mlc.Field(
                id="csv_routes_single/start_edge",
                name="start_edge",
                description="Starting edge of the route.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="start_edge"),
                ),
            ),
            mlc.Field(
                id="csv_routes_single/last_edge",
                name="last_edge",
                description="Ending edge of the route.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="last_edge"),
                ),
            ),
            mlc.Field(
                id="csv_routes_single/network_name",
                name="network_name",
                description="Network name inferred from folder path.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    # The folder is only visible in the full path.
                    extract=mlc.Extract(
                        file_property=FileProperty.fullpath
                    ),
                    transforms=[mlc.Transform(regex="(?:.*/)?network/network_(.*?)/.*")]
                ),
            ),
        ]
    ),

    ########################
    # (2-2) routes_multiple.csv
    ########################

    # NOTE(review): same FileSet as (2-1); routes_single.csv presumably lacks
    # the route_idx / ratio columns read below — confirm against the data.
    mlc.RecordSet(
        id="csv_routes_multiple",
        name="csv_routes_multiple",
        description="Multiple route definitions with ratios for each TAZ pair in each network.",
        fields=[
            mlc.Field(
                id="csv_routes_multiple/fromTaz",
                name="fromTaz",
                description="Origin TAZ ID.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="fromTaz"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/toTaz",
                name="toTaz",
                description="Destination TAZ ID.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="toTaz"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/route_idx",
                name="route_idx",
                description="Index of the route within same TAZ pair.",
                data_types=mlc.DataType.INTEGER,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="route_idx"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/ratio",
                name="ratio",
                description="Ratio of the route within same TAZ pair.",
                data_types=mlc.DataType.FLOAT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="ratio"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/route_edges",
                name="route_edges",
                description="Route edges assigned to the vehicle.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="route_edges"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/start_edge",
                name="start_edge",
                description="Starting edge of the route.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="start_edge"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/last_edge",
                name="last_edge",
                description="Ending edge of the route.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    extract=mlc.Extract(column="last_edge"),
                ),
            ),
            mlc.Field(
                id="csv_routes_multiple/network_name",
                name="network_name",
                description="Network name inferred from folder path.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-routes-files",
                    # The folder is only visible in the full path.
                    extract=mlc.Extract(
                        file_property=FileProperty.fullpath
                    ),
                    transforms=[mlc.Transform(regex="(?:.*/)?network/network_(.*?)/.*")]
                ),
            ),
        ]
    ),



    ########################
    # (3) sensor_data
    ########################

    mlc.RecordSet(
        id="csv_sensor",
        name="csv_sensor",
        description="Sensor ground-truth vehicle link counts and average speeds per network link.",
        fields=[
            mlc.Field(
                id="csv_sensor/link_id",
                name="link_id",
                description="Edge ID where the sensor is located in the SUMO network.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-sensor-files",
                    extract=mlc.Extract(column="link_id"),
                ),
            ),
            mlc.Field(
                id="csv_sensor/interval_meanSpeed",
                name="interval_meanSpeed",
                description="Average speed of vehicles that passed the sensor during the interval.",
                # An average is generally fractional, so it is typed as a
                # float rather than an integer.
                data_types=mlc.DataType.FLOAT,
                source=mlc.Source(
                    file_set="csv-sensor-files",
                    extract=mlc.Extract(column="interval_meanSpeed"),
                ),
            ),
            mlc.Field(
                id="csv_sensor/interval_nVehContrib",
                name="interval_nVehContrib",
                description="Number of vehicles that passed the sensor during the interval.",
                data_types=mlc.DataType.INTEGER,
                source=mlc.Source(
                    file_set="csv-sensor-files",
                    extract=mlc.Extract(column="interval_nVehContrib"),
                ),
            ),
            mlc.Field(
                id="csv_sensor/network_name",
                name="network_name",
                description="Network name inferred from the filename.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="csv-sensor-files",
                    # Here the name is part of the base file name itself, so
                    # `filename` is the right property.
                    extract=mlc.Extract(
                        file_property=FileProperty.filename
                    ),
                    transforms=[mlc.Transform(regex="gt_link_data_(.*?)_\\d{6}_\\d{2}-\\d{2}\\.csv")]
                ),
            ),
        ]
    ),


]


# Metadata

In [None]:
# Top-level Croissant metadata tying together the distribution and record sets
# defined in the cells above.
# NOTE(review): the recorded validation output reports datePublished, license
# and version as recommended-but-missing; supply them once the values are known.
metadata = mlc.Metadata(
    name="BO4Mob_dataset",
    description=(
        "This dataset contains five traffic networks modeled in SUMO, each accompanied "
        "by vehicle link count and average speed measurements derived from PeMS sensor data. "
        "Every network includes its own configuration files (network, OD, TAZ, and additional files), "
        "along with a corresponding routes.csv file that defines vehicle paths between TAZs. "
        "Two versions of the routes are provided: one containing a single route per TAZ pair (single) and "
        "another with multiple routes per TAZ pair (multiple). "
        "Sensor data indicates the number of vehicles passing through detectors located on specific network links. "
        "For each network, sensor data is available for a particular date and time period."
    ),
    # BibTeX entry; the outer "@article{" brace is closed after the last field
    # so the citation parses as a complete entry.
    cite_as=(
        "@article{ryu2025bo4mob, title={BO4Mob: Bayesian Optimization Benchmarks for High-Dimensional Urban Mobility Problem}, "
        "author={Seunghee Ryu and Donghoon Kwon and Seongjin Choi and Aryan Deshwal and Seungmo Kang and Carolina Osorio}, "
        "year={2025}}"
    ),

    url="https://github.com/TrafficSimBenchmark/dataset",
    distribution=distribution,
    record_sets=record_sets,
)

# Check and Fix

In [75]:
# Print the validation report collected on the metadata object; print() is
# used so the multi-line report renders with real line breaks.
print(metadata.issues.report())

  -  [Metadata(BO4Mob_dataset)] Property "https://schema.org/datePublished" is recommended, but does not exist.
  -  [Metadata(BO4Mob_dataset)] Property "https://schema.org/license" is recommended, but does not exist.
  -  [Metadata(BO4Mob_dataset)] Property "https://schema.org/version" is recommended, but does not exist.


In [76]:
import json

# Serialize before opening the output file, so a failure in to_json() or
# json.dumps() cannot leave an empty or truncated file behind.
content = json.dumps(metadata.to_json(), indent=2)
print(content)

# Write with an explicit encoding, matching the utf-8 read of this file in the
# later fix-up cell.
with open("croissant_bo4mob.json", "w", encoding="utf-8") as f:
    f.write(content)
    f.write("\n")  # Terminate file with newline

{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "dct": "http://purl.org/dc/terms/",
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:re

In [77]:
# Load the JSON-LD file written by the previous cell back through mlcroissant;
# the recorded output lists the same three recommended-property notes.
mlc.Dataset(jsonld="croissant_bo4mob.json")

  -  [Metadata(BO4Mob_dataset)] Property "https://schema.org/datePublished" is recommended, but does not exist.
  -  [Metadata(BO4Mob_dataset)] Property "https://schema.org/license" is recommended, but does not exist.
  -  [Metadata(BO4Mob_dataset)] Property "https://schema.org/version" is recommended, but does not exist.




---

In [36]:
import json
from pathlib import Path

# Load original JSON
# Read the exported Croissant JSON-LD back into a plain dict.
with open("./croissant_bo4mob.json", "r", encoding="utf-8") as src:
    croissant_data = json.loads(src.read())

# 1. Move the value stored under "recordSet" to a "recordSets" key, if present.
# NOTE(review): the exported @context maps the singular "recordSet" key —
# confirm that the plural key is really what the consumer expects.
try:
    croissant_data["recordSets"] = croissant_data.pop("recordSet")
except KeyError:
    # No "recordSet" key: nothing to rename.
    pass

# 2. Recursively fix all "transform" to "transforms" as list
def fix_transforms(obj):
    """Rename every "transform" key to "transforms" (as a list), in place.

    Walks nested dicts and lists recursively. Wherever a dict has a
    "transform" key, the key is replaced by "transforms": a single dict value
    is wrapped in a one-element list, and a value that is already a list is
    moved over unchanged, so the whole document ends up using one key. A
    "transform" value of any other type is left untouched.

    Args:
        obj: Parsed JSON value (dict, list, or scalar). Scalars are ignored.

    Returns:
        None. `obj` is modified in place.
    """
    if isinstance(obj, dict):
        transform = obj.get("transform")
        if isinstance(transform, dict):
            # Single transform: wrap it into a list under the new key.
            obj["transforms"] = [obj.pop("transform")]
        elif isinstance(transform, list):
            # Already a list of transforms: only the key needs renaming.
            obj["transforms"] = obj.pop("transform")
        # Keys are only changed above, before iteration starts; the recursion
        # below mutates nested containers, never `obj` itself.
        for value in obj.values():
            fix_transforms(value)
    elif isinstance(obj, list):
        for item in obj:
            fix_transforms(item)

# Rewrite the "transform" keys throughout the loaded document, in place.
fix_transforms(croissant_data)

# Persist the patched document next to the original export.
fixed_path = "./croissant_test.json"
with open(fixed_path, "w", encoding="utf-8") as out:
    out.write(json.dumps(croissant_data, indent=2, ensure_ascii=False))

# Show the output path as the cell result.
fixed_path


'./croissant_test.json'

In [None]:
# Load the patched file (same name the fix-up cell writes to) as a dataset.
# NOTE(review): the recorded error below refers to croissant.json, not this
# file name — the output looks stale; re-run to confirm.
dataset = mlc.Dataset(jsonld="croissant_test.json")

ValueError: File c:\Users\TLSYSLAB_3_ADMIN\Documents\Git\BO4Mob_dataset\croissant.json does not exist.

In [None]:
# records = dataset.records(record_set="csv_sensor")

# for i, record in enumerate(records):
#   print(record)
#   if i > 10:
#     break