## Try to use mlcroissant package to generate metadata

- Already uploaded csv file to https://minio.services.dsi.wisc.edu/pelican-dev/bird_migration_data.csv

In [1]:
import os

os.chdir("..")

In [2]:
from pathlib import Path
import json

import pandas as pd
import mlcroissant as mlc

from pelican_data_loader.utils import get_sha256, parse_col

In [3]:
csv_file = Path("data/bird_migration_data.csv")
df = pd.read_csv(csv_file)

DEV_S3_ID = "dsi-dev-s3"
DEV_S3_URL = "https://minio.services.dsi.wisc.edu/pelican-dev"

In [4]:
# S3 bucket

mlc_s3 = mlc.FileObject(
    id=DEV_S3_ID,
    name=DEV_S3_ID,
    description="DSI development S3 bucket",
    content_url=DEV_S3_URL,
    encoding_formats=["https"],  # https is the protocol used for S3
    sha256="main",  # sha256 not available for S3
)

# CSV file within the S3 bucket

mlc_file_object = mlc.FileObject(
    id="csv",
    name="csv",
    description="Bird migration data",
    contained_in=[DEV_S3_ID],
    sha256=get_sha256(csv_file),
    encoding_formats=[mlc.EncodingFormat.CSV],
    content_url=f"{DEV_S3_URL}/{csv_file.name}",
)

mlc_distribution = [mlc_s3, mlc_file_object]

In [5]:
mlc_record_sets = [
    mlc.RecordSet(
        id="bird_migration_data",
        name="Bird migration data",
        fields=[parse_col(df[col]) for col in df.columns],
    )
]

In [6]:
# Compose Croissant Metadata
metadata = mlc.Metadata(
    name="Bird Migration Data",
    description="Bird migration data",
    version="0.0.1",
    distribution=mlc_distribution,  # type: ignore
    record_sets=mlc_record_sets,
    cite_as="PLACE_HOLDER_CITE_AS",
    license="https://choosealicense.com/licenses/mit/",  # type: ignore
    # date_published="2025-05-13",  # mlcroissant has a bug with date_published
)

In [7]:
# Validate metadata
dataset = mlc.Dataset(jsonld=metadata.to_json())
dataset.jsonld["datePublished"] = "2025-05-13"  # type: ignore


  -  [Metadata(Bird Migration Data)] Property "https://schema.org/datePublished" is recommended, but does not exist.


In [8]:
# Export to JSON-LD
jsonld_file = Path("data/bird_migration_metadata.json")
jsonld_file.write_text(json.dumps(dataset.jsonld, indent=2))

16545

In [9]:
# Load JSON-LD from file
dataset = mlc.Dataset(jsonld=json.loads(jsonld_file.read_text()))

# Re-validate metadata
dataset.metadata.issues.report()

''

- `mlcroissant` is still not production ready, many typing issues and missing date types... be extra cautious