## Generate Croissant metadata with the `mlcroissant` package

- The test CSV file is located at: https://web.s3.wisc.edu/pelican-data-loader/data/bird_migration_data.csv

In [1]:
import os

# Step up one directory so the relative "data/..." paths used below resolve
# (presumably to the project root — confirm against the repository layout).
# NOTE(review): not idempotent — re-running this cell climbs one more level.
os.chdir(os.pardir)

In [2]:
from pathlib import Path
from datetime import datetime
import json

from dotenv import load_dotenv
import pandas as pd
import mlcroissant as mlc
from pelican_data_loader.utils import get_sha256, parse_col

# Load variables from a .env file into the process environment; the
# S3_ENDPOINT_URL and S3_BUCKET_NAME values read later via os.getenv
# presumably come from there.
load_dotenv()

True

In [3]:
# Local copy of the dataset, used below for the per-column fields and the SHA-256 checksum.
csv_file = Path("data/bird_migration_data.csv")
df = pd.read_csv(csv_file)

# Fail fast when the S3 configuration is missing: os.getenv returns None for an
# unset variable, which would silently produce a bogus "https://None/None" URL
# that ends up in the exported metadata.
_missing_env = [var for var in ("S3_ENDPOINT_URL", "S3_BUCKET_NAME") if not os.getenv(var)]
if _missing_env:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(_missing_env)}")

# Base URL of the bucket that hosts the dataset.
DEV_S3_URL = f"https://{os.getenv('S3_ENDPOINT_URL')}/{os.getenv('S3_BUCKET_NAME')}"

In [4]:
# Croissant references a file hosted on S3 through two linked FileObjects: one for
# the bucket itself and one for the CSV file inside that bucket. It is not the most
# intuitive layout, but it follows the upstream example:
# https://github.com/mlcommons/croissant/blob/main/datasets/1.0/fashion-mnist/metadata.json

# FileObject describing the S3 bucket (the container).
mlc_s3 = mlc.FileObject(
    id="dsi_s3_bucket",
    name="DSI Development S3 Bucket",
    description="DSI development S3 bucket",
    sha256="main",  # placeholder: no sha256 is available for a bucket
    encoding_formats=["https"],  # the bucket is accessed over https
    content_url=DEV_S3_URL,
)

# FileObject describing the CSV file, linked to the bucket via `contained_in`.
# No description is set for this entry for now.
mlc_file_object = mlc.FileObject(
    id="csv",
    name="Bird migration data",
    content_url=f"{DEV_S3_URL}/{csv_file.name}",
    encoding_formats=[mlc.EncodingFormat.CSV],
    sha256=get_sha256(csv_file),
    contained_in=[mlc_s3.id],
)

# Bucket first, then the file it contains.
mlc_distribution = [mlc_s3, mlc_file_object]

In [5]:
# Build one Croissant Field per DataFrame column (column metadata such as dtype).
# `parse_col` performs the conversion; every field is tied to the CSV FileObject
# through `parent_id`.
bird_fields = [parse_col(df[column], parent_id=mlc_file_object.id) for column in df.columns]

mlc_record_sets = [
    mlc.RecordSet(
        id="bird_migration_data_record_set",
        name="Bird migration data",
        fields=bird_fields,
    )
]

In [8]:
# Author information, passed to the metadata as `creators`.
creators = [
    mlc.Person(
        name="Jason Testing Lo",
        email="jason.lo@wisc.edu",
    ),
]

In [9]:
# Assemble the dataset-level Croissant metadata from the distribution,
# record sets and creators defined in the previous cells.
metadata = mlc.Metadata(
    # Descriptive fields
    name="Bird Migration Data",
    description="Bird migration data",
    keywords=["bird", "testing"],
    version="0.0.1",
    # Attribution and licensing
    creators=creators,  # type: ignore
    license=["https://choosealicense.com/licenses/mit/"],
    cite_as="PLACE_HOLDER_CITE_AS",
    date_published=datetime.now(),
    # Structure: hosted files and the records they contain
    distribution=mlc_distribution,  # type: ignore
    record_sets=mlc_record_sets,
)

In [10]:
# Serialise the metadata to a JSON-LD dict and show the raw publication date.
jsonld = metadata.to_json()
print(jsonld["datePublished"])

# Workaround for a datetime datatype bug in mlcroissant: overwrite
# datePublished with a plain YYYY-MM-DD string before building the Dataset.
jsonld["datePublished"] = f"{datetime.now():%Y-%m-%d}"
dataset = mlc.Dataset(jsonld=jsonld)


2025-07-01 13:10:10.578923


In [11]:
# Write the JSON-LD document to disk, pretty-printed with a 2-space indent.
jsonld_file = Path("data/bird_migration_metadata.json")
jsonld_text = json.dumps(dataset.jsonld, indent=2)
jsonld_file.write_text(jsonld_text)

16699

In [4]:
# Read the exported JSON-LD back from disk and rebuild the Dataset from it.
jsonld_file = Path("data/bird_migration_metadata.json")
loaded_jsonld = json.loads(jsonld_file.read_text())
dataset = mlc.Dataset(jsonld=loaded_jsonld)

# Re-validate: report any issues recorded for the reloaded metadata.
dataset.metadata.issues.report()

''

In [None]:
def get_primary_url(jsonld: dict, extension_priority: list[str] | None = None) -> str:
    """Guess the primary source URL from the JSON-LD document by getting all contentUrls and returning the one with supported file extensions."""

    if extension_priority is None:
        extension_priority = [".csv", ".parquet"]

    distributions = jsonld.get("distribution", [])
    if not distributions:
        return ""

    urls = [dist.get("contentUrl", "") for dist in distributions if isinstance(dist, dict)]
    if not urls:
        return ""

    # Return by priority of file extensions
    urls = [url for url in urls if url.endswith(tuple(extension_priority))]
    if urls:
        return urls[0]
    return ""

In [10]:
# Pick the primary data URL out of the reloaded JSON-LD document.
get_primary_url(dataset.jsonld)

'https://web.s3.wisc.edu/pelican-data-loader/bird_migration_data.csv'

In [None]:
def sort_distribution_extension(distributions: dict, extension_priority: list[str] | None = None) -> list[dict]:
    """Sort the distribution list in a JSON-LD document by file extension priority."""

    if not extension_priority:
        extension_priority = [".csv", ".parquet"]

    priority = {ext: rank for rank, ext in enumerate(extension_priority)}

    def get_priority(item):
        url = item.get("contentUrl", "")
        for ext, rank in priority.items():
            if url.endswith(ext):
                return rank
        return max(priority.values()) + 1  # Default to a high rank if no extension matches

    return sorted(distributions, key=get_priority)

In [19]:
# Demo with a custom priority: "loader" matches the end of the bucket URL
# (".../pelican-data-loader"), so the bucket entry sorts first; the CSV URL
# matches neither suffix and sorts last.
sort_distribution_extension(dataset.jsonld.get("distribution", []), extension_priority=["loader", ".parquet"])

[{'@type': 'cr:FileObject',
  '@id': 'dsi_s3_bucket',
  'name': 'DSI Development S3 Bucket',
  'description': 'DSI development S3 bucket',
  'contentUrl': 'https://web.s3.wisc.edu/pelican-data-loader',
  'encodingFormat': 'https',
  'sha256': 'main'},
 {'@type': 'cr:FileObject',
  '@id': 'csv',
  'name': 'Bird migration data',
  'containedIn': {'@id': 'dsi_s3_bucket'},
  'contentUrl': 'https://web.s3.wisc.edu/pelican-data-loader/bird_migration_data.csv',
  'encodingFormat': 'text/csv',
  'sha256': '85da618b044d8220b5a8c3c7030ff3f35f791e875736ed43115415750a824fbf'}]

In [None]:
# Show the distribution entries in their original document order.
dataset.jsonld["distribution"]

[{'@type': 'cr:FileObject',
  '@id': 'dsi_s3_bucket',
  'name': 'DSI Development S3 Bucket',
  'description': 'DSI development S3 bucket',
  'contentUrl': 'https://web.s3.wisc.edu/pelican-data-loader',
  'encodingFormat': 'https',
  'sha256': 'main'},
 {'@type': 'cr:FileObject',
  '@id': 'csv',
  'name': 'Bird migration data',
  'containedIn': {'@id': 'dsi_s3_bucket'},
  'contentUrl': 'https://web.s3.wisc.edu/pelican-data-loader/bird_migration_data.csv',
  'encodingFormat': 'text/csv',
  'sha256': '85da618b044d8220b5a8c3c7030ff3f35f791e875736ed43115415750a824fbf'}]

- `mlcroissant` is not yet production-ready: it has many typing issues and is missing proper date types, so be extra cautious.