In [16]:
import hashlib
import json
import os

import mlcroissant as mlc

In [17]:
# Identifiers of the four benchmark networks; each one appears in the file name
# of its sensor CSV (gt_edge_data_<network>-221014-from8to9.csv).
network_list = ['1ramp', '2corridor', '3junction', '4smallRegion']

In [18]:


# Check the current working directory. Backslashes are replaced with forward
# slashes so the value can be embedded directly in the path f-strings used below.
current_path = os.getcwd().replace("\\", "/")
print(f"Updated current_path: {current_path}")

Updated current_path: c:/Users/TLSYSLAB_3_ADMIN/Documents/Git/dataset


In [19]:


def sha256sum(file_path, chunk_size=1 << 20):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is read in binary mode and fed to the hash in chunks, so the
    whole file never has to be held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops once read() returns b"" at end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# SHA-256 digest of each network's sensor CSV, in the same order as network_list.
# NOTE(review): the digests are computed from the local working copy under
# current_path; if those files differ byte-for-byte from the hosted copies
# (e.g. line-ending conversion on checkout), the checksums will not match the
# downloaded files -- confirm.
sha_list = [
    sha256sum(f'{current_path}/sensor_data/gt_edge_data_{net}-221014-from8to9.csv')
    for net in network_list
]

# Distribution

In [20]:


# Croissant distribution: the GitHub repository, two FileSets describing groups
# of files inside it, and one FileObject per network's sensor CSV.
distribution = [
    # The repository itself; the FileSets below are declared as contained in it.
    mlc.FileObject(
        id="bo-benchmark-repo",
        name="BO Benchmark Repository",
        description="GitHub repository containing SUMO network files and sensor measurements for Bayesian Optimization benchmarks.",
        content_url="https://github.com/TrafficSimBenchmark/dataset",
        encoding_formats=["git+https"],
        sha256="main",
    ),

    mlc.FileSet(
        id="network-files",
        name="Network Files",
        description="Each subfolder under 'network/' contains SUMO XML/CSV files for one specific traffic network.",
        contained_in=["bo-benchmark-repo"],
        encoding_formats=["application/xml", "text/csv"],
        includes="network/network1_SF_*/{additional.xml,net.xml,od.xml,routes.csv,taz.xml}",
    ),

    mlc.FileSet(
        id="sensor-csv-files",
        name="Sensor Data Files",
        description="Sensor ground-truth vehicle flow data for each network, stored as edge-wise CSV files.",
        contained_in=["bo-benchmark-repo"],
        encoding_formats=["text/csv"],
        includes="sensor_data/gt_edge_data_*.csv",
    ),

    # Individual sensor link-flow CSV files: one FileObject per network.
    # Generated from network_list so that each id, file name, URL and checksum
    # stays paired with its network instead of relying on positional indices
    # into sha_list.
    *[
        mlc.FileObject(
            id=f"sensor-{net}",
            name=f"gt_edge_data_{net}-221014-from8to9.csv",
            description=f"Sensor flow data for network {net}.",
            content_url=(
                "https://github.com/TrafficSimBenchmark/dataset/raw/main/"
                f"sensor_data/gt_edge_data_{net}-221014-from8to9.csv"
            ),
            encoding_formats=["text/csv"],
            sha256=digest,
        )
        for net, digest in zip(network_list, sha_list)
    ],
]





# Record Sets

In [21]:
record_sets = []

# Build one RecordSet per network. Each RecordSet exposes two columns of that
# network's sensor CSV, read from the FileObject whose id is "sensor-<network>".
for name in network_list:
    file_id = f"sensor-{name}"

    record_sets.append(
        mlc.RecordSet(
            id=f"sensor-records-{name}",
            name=f"Sensor Records - {name}",
            description=f"Sensor vehicle flow data for network {name}.",
            fields=[
                mlc.Field(
                    id=f"{name}-edge-id",
                    name="edge_id",
                    description="Edge ID where the sensor is located in the SUMO network.",
                    data_types=mlc.DataType.TEXT,
                    source=mlc.Source(
                        file_object=file_id,
                        extract=mlc.Extract(column="edge_id"),
                    ),
                ),
                mlc.Field(
                    id=f"{name}-flow",
                    name="interval_nVehContrib",
                    description="Number of vehicles that passed the sensor on this edge during the interval.",
                    data_types=mlc.DataType.INTEGER,
                    source=mlc.Source(
                        file_object=file_id,
                        extract=mlc.Extract(column="interval_nVehContrib"),
                    ),
                ),
            ]
        )
    )


# Metadata

In [22]:
# Top-level Croissant metadata tying together the distribution and record sets.
metadata = mlc.Metadata(
    name="BO Benchmark Dataset",
    url="https://github.com/TrafficSimBenchmark/dataset",  # actual GitHub URL
    description=(
        "This dataset contains four synthetic traffic networks modeled in SUMO along with "
        "corresponding vehicle flow measurements from virtual sensors. Each network has its "
        "own configuration files (network, OD, TAZ, etc.), and each has a matching CSV file "
        "listing vehicle flows on sensor-equipped edges."
    ),
    # TODO: cite_as is left empty; the validation report printed below lists
    # citeAs, datePublished, license and version as recommended but missing.
    cite_as=(),
    distribution=distribution,
    record_sets=record_sets,
)

# Check and Fix

In [23]:
# Print the validation findings collected while the metadata was built.
issues_report = metadata.issues.report()
print(issues_report)

  -  [Metadata(BO Benchmark Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.
  -  [Metadata(BO Benchmark Dataset)] Property "https://schema.org/datePublished" is recommended, but does not exist.
  -  [Metadata(BO Benchmark Dataset)] Property "https://schema.org/license" is recommended, but does not exist.
  -  [Metadata(BO Benchmark Dataset)] Property "https://schema.org/version" is recommended, but does not exist.


In [24]:
# Serialise before opening the output file, so that a failure in to_json() or
# json.dumps() cannot leave a truncated croissant.json behind.
content = json.dumps(metadata.to_json(), indent=2)
print(content)

# Explicit encoding keeps the written file independent of the platform default.
with open("croissant.json", "w", encoding="utf-8") as f:
    f.write(content)
    f.write("\n")  # Terminate file with newline

{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "dct": "http://purl.org/dc/terms/",
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:re

In [25]:
# Load the croissant.json file written above back through mlcroissant.
dataset = mlc.Dataset(jsonld="croissant.json")

  -  [Metadata(BO Benchmark Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.
  -  [Metadata(BO Benchmark Dataset)] Property "https://schema.org/datePublished" is recommended, but does not exist.
  -  [Metadata(BO Benchmark Dataset)] Property "https://schema.org/license" is recommended, but does not exist.
  -  [Metadata(BO Benchmark Dataset)] Property "https://schema.org/version" is recommended, but does not exist.


In [15]:
records = dataset.records(record_set="sensor-records-1ramp")

# Preview the head of the record set: print each record and stop after the
# one at index 11 (i.e. at most 12 records are shown).
for position, row in enumerate(records):
    print(row)
    if position >= 11:
        break

Downloading https://github.com/TrafficSimBenchmark/dataset/raw/main/sensor_data/gt_edge_data_1ramp-221014-from8to9.csv...:  88%|████████▊ | 75.0/85.0 [00:00<?, ?iB/s]


GenerationError: An error occured during the streaming generation of the dataset, more specifically during the operation Download(sensor-1ramp)