In [1]:
import os

# Switch the working directory to the parent of the directory the kernel
# started in. The starting directory is recorded the first time this cell runs,
# so re-running the cell lands in the same place instead of climbing one level
# higher on every execution (which a bare relative `os.chdir("..")` would do).
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [None]:
import fsspec

from pelican_data_loader.config import SystemConfig
from pelican_data_loader.db import DataRepoEngine, HFDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the system configuration and construct the dataset repository engine
# from it. Both `config` and `db` are reused by later cells.
config = SystemConfig()
db = DataRepoEngine(config)

In [4]:
# Dataset DB
# TODO: Consider deduplicating datasets and storing them immutably.

# Result of `db.list_datasets()`; the next cell indexes it with `[-1]`.
all_datasets = db.list_datasets()

In [None]:
# Load with huggingface datasets library via s3fs
# Take the last entry of the listing, pull it, and show one record of "train".
last_listed = all_datasets[-1]
list(last_listed.pull()["train"].take(1))

[{'Bird_ID': 'B1000',
  'Species': 'Warbler',
  'Region': 'South America',
  'Habitat': 'Grassland',
  'Weather_Condition': 'Stormy',
  'Migration_Reason': 'Feeding',
  'Start_Latitude': 11.906566441337574,
  'Start_Longitude': -169.37825068830264,
  'End_Latitude': 30.37764666430312,
  'End_Longitude': -21.36687925839209,
  'Flight_Distance_km': 1753.79,
  'Flight_Duration_hours': 49.5,
  'Average_Speed_kmph': 47.82,
  'Max_Altitude_m': 5280,
  'Min_Altitude_m': 285,
  'Temperature_C': -2.2,
  'Wind_Speed_kmph': 9.1,
  'Humidity_pc': 43,
  'Pressure_hPa': 1030.3,
  'Visibility_km': 1.5,
  'Nesting_Success': 'No',
  'Tag_Battery_Level_pc': 45,
  'Signal_Strength_dB': -64.9,
  'Migration_Start_Month': 'Jan',
  'Migration_End_Month': 'Apr',
  'Rest_Stops': 3,
  'Predator_Sightings': 6,
  'Tag_Type': 'Radio',
  'Migrated_in_Flock': 'Yes',
  'Flock_Size': 264,
  'Food_Supply_Level': 'Low',
  'Tracking_Quality': 'Excellent',
  'Migration_Interrupted': 'Yes',
  'Interrupted_Reason': 'Storm',

In [8]:
# This is the raw `fsspec` interface to S3, just in case we need it...
# The filesystem is built from the storage options held on `config`.

fs = fsspec.filesystem("s3", **config.storage_options)
print(f"Listing contents of {config.s3_bucket_name}:")
bucket_entries = list(fs.ls(config.s3_bucket_name))
print(bucket_entries)


Listing contents of pelican-data-loader:
['pelican-data-loader/README.md', 'pelican-data-loader/bird_migration_data.csv', 'pelican-data-loader/metadata']


## UX

Make an even simpler wrapper that loads a dataset by its `netid/data` index key.

In [14]:
# Stand-in index: maps a "netid/data"-style key to the Croissant JSON-LD URL
# that `load_uw_data` passes on to the database lookup.
FAKE_INDEX: dict[str, str] = {
    "clo36/bird": "https://web.s3.wisc.edu/pelican-data-loader/metadata/bird_migration_data.json",
}


In [15]:
def load_uw_data(key: str | None = None, croissant_jsonld_url: str | None = None) -> HFDataset:
    """Resolve a dataset by index key or by Croissant JSON-LD URL and pull it.

    Exactly one of ``key`` and ``croissant_jsonld_url`` must be supplied. A
    ``key`` is translated to a URL through ``FAKE_INDEX``; the URL is then
    looked up with ``db.get_dataset`` and the ``pull()`` result of the
    matching record is returned.

    Args:
        key: Index key to look up in ``FAKE_INDEX``.
        croissant_jsonld_url: URL handed directly to ``db.get_dataset``.

    Returns:
        Whatever ``record.pull()`` returns for the matching database record.

    Raises:
        ValueError: If neither or both arguments are given, if no URL can be
            resolved, or if the database has no record for the URL.
    """
    has_key = key is not None
    has_url = croissant_jsonld_url is not None

    # The two arguments are mutually exclusive, and one of them is required.
    if not (has_key or has_url):
        raise ValueError("Either 'key' or 'croissant_jsonld_url' must be provided.")
    if has_key and has_url:
        raise ValueError("Only one of 'key' or 'croissant_jsonld_url' should be provided.")

    # A non-empty key goes through the index; otherwise the URL is used as given.
    resolved_url = FAKE_INDEX.get(key) if key else croissant_jsonld_url
    if not resolved_url:
        raise ValueError("No valid URL found for the provided key.")

    record = db.get_dataset(croissant_jsonld_url=resolved_url)
    if not record:
        raise ValueError(f"No dataset found for URL: {resolved_url}")
    return record.pull()


In [None]:
# Example usage with dataset key
# Resolve the dataset through the index, then show one record of "train".

d = load_uw_data(key="clo36/bird")
first_train_rows = d["train"].take(1)
list(first_train_rows)


[{'Bird_ID': 'B1000',
  'Species': 'Warbler',
  'Region': 'South America',
  'Habitat': 'Grassland',
  'Weather_Condition': 'Stormy',
  'Migration_Reason': 'Feeding',
  'Start_Latitude': 11.906566441337574,
  'Start_Longitude': -169.37825068830264,
  'End_Latitude': 30.37764666430312,
  'End_Longitude': -21.36687925839209,
  'Flight_Distance_km': 1753.79,
  'Flight_Duration_hours': 49.5,
  'Average_Speed_kmph': 47.82,
  'Max_Altitude_m': 5280,
  'Min_Altitude_m': 285,
  'Temperature_C': -2.2,
  'Wind_Speed_kmph': 9.1,
  'Humidity_pc': 43,
  'Pressure_hPa': 1030.3,
  'Visibility_km': 1.5,
  'Nesting_Success': 'No',
  'Tag_Battery_Level_pc': 45,
  'Signal_Strength_dB': -64.9,
  'Migration_Start_Month': 'Jan',
  'Migration_End_Month': 'Apr',
  'Rest_Stops': 3,
  'Predator_Sightings': 6,
  'Tag_Type': 'Radio',
  'Migrated_in_Flock': 'Yes',
  'Flock_Size': 264,
  'Food_Supply_Level': 'Low',
  'Tracking_Quality': 'Excellent',
  'Migration_Interrupted': 'Yes',
  'Interrupted_Reason': 'Storm',

In [None]:
# Or using the URL directly
# Skip the index and hand the Croissant JSON-LD URL straight to the loader.

bird_metadata_url = "https://web.s3.wisc.edu/pelican-data-loader/metadata/bird_migration_data.json"
d = load_uw_data(croissant_jsonld_url=bird_metadata_url)
list(d["train"].take(1))


[{'Bird_ID': 'B1000',
  'Species': 'Warbler',
  'Region': 'South America',
  'Habitat': 'Grassland',
  'Weather_Condition': 'Stormy',
  'Migration_Reason': 'Feeding',
  'Start_Latitude': 11.906566441337574,
  'Start_Longitude': -169.37825068830264,
  'End_Latitude': 30.37764666430312,
  'End_Longitude': -21.36687925839209,
  'Flight_Distance_km': 1753.79,
  'Flight_Duration_hours': 49.5,
  'Average_Speed_kmph': 47.82,
  'Max_Altitude_m': 5280,
  'Min_Altitude_m': 285,
  'Temperature_C': -2.2,
  'Wind_Speed_kmph': 9.1,
  'Humidity_pc': 43,
  'Pressure_hPa': 1030.3,
  'Visibility_km': 1.5,
  'Nesting_Success': 'No',
  'Tag_Battery_Level_pc': 45,
  'Signal_Strength_dB': -64.9,
  'Migration_Start_Month': 'Jan',
  'Migration_End_Month': 'Apr',
  'Rest_Stops': 3,
  'Predator_Sightings': 6,
  'Tag_Type': 'Radio',
  'Migrated_in_Flock': 'Yes',
  'Flock_Size': 264,
  'Food_Supply_Level': 'Low',
  'Tracking_Quality': 'Excellent',
  'Migration_Interrupted': 'Yes',
  'Interrupted_Reason': 'Storm',

[{'Bird_ID': 'B1000',
  'Species': 'Warbler',
  'Region': 'South America',
  'Habitat': 'Grassland',
  'Weather_Condition': 'Stormy',
  'Migration_Reason': 'Feeding',
  'Start_Latitude': 11.906566441337574,
  'Start_Longitude': -169.37825068830264,
  'End_Latitude': 30.37764666430312,
  'End_Longitude': -21.36687925839209,
  'Flight_Distance_km': 1753.79,
  'Flight_Duration_hours': 49.5,
  'Average_Speed_kmph': 47.82,
  'Max_Altitude_m': 5280,
  'Min_Altitude_m': 285,
  'Temperature_C': -2.2,
  'Wind_Speed_kmph': 9.1,
  'Humidity_pc': 43,
  'Pressure_hPa': 1030.3,
  'Visibility_km': 1.5,
  'Nesting_Success': 'No',
  'Tag_Battery_Level_pc': 45,
  'Signal_Strength_dB': -64.9,
  'Migration_Start_Month': 'Jan',
  'Migration_End_Month': 'Apr',
  'Rest_Stops': 3,
  'Predator_Sightings': 6,
  'Tag_Type': 'Radio',
  'Migrated_in_Flock': 'Yes',
  'Flock_Size': 264,
  'Food_Supply_Level': 'Low',
  'Tracking_Quality': 'Excellent',
  'Migration_Interrupted': 'Yes',
  'Interrupted_Reason': 'Storm',