### Metadata local store

- We use SQLite for now...

In [None]:
import os

# Step up one directory so the relative "data/..." paths used below resolve
# (presumably the notebook lives one level below the project root — confirm).
# NOTE: re-running this cell moves up one more level each time.
os.chdir(os.pardir)

In [None]:
import json
from pathlib import Path

from pelican_data_loader.db import initialize_database, Dataset
from sqlmodel import Session, create_engine


In [None]:
# Set up the SQLite file that backs the metadata store.
# wipe=True presumably discards any existing contents so the notebook starts
# from a clean database — see `initialize_database` for the exact semantics.
initialize_database(
    wipe=True,
    path=Path("data/datasets.db"),
)

In [None]:
# We need a function to flatten the most useful metadata from the Croissant JSON-LD into the Dataset model.
# This will be used to populate the Dataset table in the SQLite database.

# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on
# the platform's locale encoding, which can mis-decode non-ASCII metadata.
metadata = json.loads(Path("data/bird_migration_metadata.json").read_text(encoding="utf-8"))

# The Dataset object includes the most useful flattened metadata and the raw JSON-LD metadata.
test_dataset = Dataset.from_jsonld(metadata)


In [None]:
# The primary source URL is populated with a best guess based on the URL's extension;
# see `pelican_data_loader.db.guess_primary_url` for details.
# The bare expression below is shown as this cell's output.
test_dataset.primary_source_url

In [None]:
# These are the most useful fields for end-user queries; we can extend them later.
# The `croissant_jsonld` field (presumably the raw JSON-LD document) is left out of
# the dump so the displayed output stays short.
test_dataset.model_dump(exclude={"croissant_jsonld"})

In [None]:
# Persist the parsed metadata: add the Dataset to a session and commit it.
engine = create_engine("sqlite:///data/datasets.db")
with Session(engine) as session:
    session.add(test_dataset)
    session.commit()


In [None]:
# Look up datasets whose primary creator email matches exactly.
from sqlmodel import select

# The statement does not depend on the session, so build it up front.
statement = select(Dataset).where(Dataset.primary_creator_email == "jason.lo@wisc.edu")

with Session(create_engine("sqlite:///data/datasets.db")) as session:
    for dataset in session.exec(statement):
        print(dataset)


In [None]:
# Look up datasets whose keywords contain `testing`.

# The statement does not depend on the session, so build it up front.
statement = select(Dataset).where(Dataset.keywords.contains("testing"))  # type: ignore

with Session(create_engine("sqlite:///data/datasets.db")) as session:
    for dataset in session.exec(statement):
        print(dataset)

In [None]:
# Look up datasets whose description contains `bird`.

# The statement does not depend on the session, so build it up front.
statement = select(Dataset).where(Dataset.description.contains("bird"))  # type: ignore

with Session(create_engine("sqlite:///data/datasets.db")) as session:
    for dataset in session.exec(statement):
        print(dataset)