## Generating Croissant metadata with the `mlcroissant` package

- Test CSV file located at: https://web.s3.wisc.edu/pelican-data-loader/data/bird_migration_data.csv

In [1]:
import os

# Later cells use paths relative to the parent of the notebook's start directory
# (e.g. data/...), so the working directory is moved up one level.
# A bare os.chdir("..") is not idempotent: every re-run of this cell would climb one
# directory higher. Remember the directory the kernel started in and always change to
# *its* parent, so running the cell once or many times ends in the same place.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
from pathlib import Path
from datetime import datetime
import json

from dotenv import load_dotenv
import pandas as pd
import mlcroissant as mlc
from pelican_data_loader.utils import get_sha256, parse_col

# NOTE(review): presumably loads variables from a local .env file into the process
# environment; S3_ENDPOINT_URL and S3_BUCKET_NAME are read via os.getenv in a later
# cell — confirm the .env file defines both.
load_dotenv()

True

In [3]:
# Local copy of the test CSV; loaded so column metadata can be derived from it later.
csv_file = Path("data/bird_migration_data.csv")
df = pd.read_csv(csv_file)

# Base URL of the S3 bucket, assembled from environment variables.
# os.getenv() returns None for unset variables, which would silently yield
# "https://None/None" and end up in the exported metadata — fail fast instead.
_missing_env = [name for name in ("S3_ENDPOINT_URL", "S3_BUCKET_NAME") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}. "
        "Define them in the environment or in the .env file."
    )

DEV_S3_URL = f"https://{os.environ['S3_ENDPOINT_URL']}/{os.environ['S3_BUCKET_NAME']}"

In [4]:
# Referencing a dataset hosted on S3 takes two linked FileObjects in mlcroissant:
# one for the S3 bucket and one for the CSV file inside that bucket (linked through
# `contained_in`). Not the most intuitive layout, but it follows the upstream example:
# https://github.com/mlcommons/croissant/blob/main/datasets/1.0/fashion-mnist/metadata.json


# FileObject 1/2: the S3 bucket

mlc_s3 = mlc.FileObject(
    content_url=DEV_S3_URL,
    encoding_formats=["https"],  # S3 is accessed over the https protocol
    sha256="main",  # no sha256 is available for S3 itself
    id="dsi_s3_bucket",
    name="DSI Development S3 Bucket",
    description="DSI development S3 bucket",
)

# FileObject 2/2: the CSV file stored in that bucket

mlc_file_object = mlc.FileObject(
    content_url=f"{DEV_S3_URL}/{csv_file.name}",
    encoding_formats=[mlc.EncodingFormat.CSV],
    sha256=get_sha256(csv_file),
    contained_in=[mlc_s3.id],
    id="csv",
    name="Bird migration data",
    # description="Bird migration data",
)

# Distribution list: bucket first, then the file it contains
mlc_distribution = [
    mlc_s3,
    mlc_file_object,
]

In [5]:
# Build the per-column metadata (such as dtype) for the DataFrame.
# Each column is converted into a Field object by `parse_col`, which is given the
# id of the CSV FileObject as `parent_id`.

bird_record_set = mlc.RecordSet(
    fields=[
        parse_col(df[column_name], parent_id=mlc_file_object.id)
        for column_name in df.columns
    ],
    id="bird_migration_data_record_set",
    name="Bird migration data",
)

mlc_record_sets = [bird_record_set]

In [8]:
# Author information for the dataset

creators = [
    mlc.Person(
        email="jason.lo@wisc.edu",
        name="Jason Testing Lo",
    ),
]

In [9]:
# Assemble the top-level Croissant Metadata from the pieces built in the cells above

metadata = mlc.Metadata(
    # Descriptive fields
    name="Bird Migration Data",
    description="Bird migration data",
    keywords=["bird", "testing"],
    version="0.0.1",
    # Provenance and licensing
    creators=creators,  # type: ignore
    date_published=datetime.now(),
    license=["https://choosealicense.com/licenses/mit/"],
    cite_as="PLACE_HOLDER_CITE_AS",
    # Data layout: where the files live and how their records are structured
    distribution=mlc_distribution,  # type: ignore
    record_sets=mlc_record_sets,
)

In [10]:
jsonld = metadata.to_json()
print(jsonld["datePublished"])
# mlcroissant has a datetime datatype bug, so the datePublished field is patched manually
# into a plain YYYY-MM-DD string.
# The patched value is derived from the date already present in the metadata rather than
# from a second datetime.now() call; otherwise the exported date could differ from the one
# the metadata was composed with (around midnight, or when this cell is re-run later).
date_published_raw = jsonld["datePublished"]
if isinstance(date_published_raw, datetime):
    jsonld["datePublished"] = date_published_raw.strftime("%Y-%m-%d")
else:
    # Not a datetime object (e.g. already a string): keep only the leading YYYY-MM-DD part.
    jsonld["datePublished"] = str(date_published_raw)[:10]
dataset = mlc.Dataset(jsonld=jsonld)


2025-07-01 13:10:10.578923


In [11]:
# Write the dataset's JSON-LD to disk as indented JSON
jsonld_file = Path("data") / "bird_migration_metadata.json"
jsonld_text = json.dumps(dataset.jsonld, indent=2)
jsonld_file.write_text(jsonld_text)

16699

In [12]:
# Round trip: read the JSON-LD file back from disk and rebuild the Dataset from it
reloaded_jsonld = json.loads(jsonld_file.read_text())
dataset = mlc.Dataset(jsonld=reloaded_jsonld)

# Validate the reloaded metadata again and show the issues report
dataset.metadata.issues.report()

''

- `mlcroissant` is not yet production-ready: it has many typing issues and is missing date types, so be extra cautious when using it.