Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# BuildStockTools Agents Guide

This repository contains a Python package and CLI for interacting with BuildStock datasets. When extending the project, follow these guidelines:

- Prefer DuckDB SQL and pandas vectorized operations over Python loops.
- Configure logging with `buildstocktools.logging.configure_logging` and the `loguru` logger.
- Keep canonical release metadata centralized in `buildstocktools/config.py`.
- Document schema quirks or aliases in the modules under `buildstocktools/sources/`.
- Update the README with new user-facing features or workflows.

Testing expectations:

- Add synthetic parquet fixtures in tests to validate new behavior.
- Use pytest and avoid network calls in unit tests.

For CLI contributions:

- Commands live in `buildstocktools/cli.py` using Typer.
- Provide examples in the README.

67 changes: 66 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,67 @@
# BuildStockTools
A set of tools for working with NREL BuildStock (ResStock and ComStock) datasets.

BuildStockTools provides reproducible access to NREL's ResStock and ComStock datasets published on the [OEDI data lake](https://data.openei.org/submissions/4870). The package combines DuckDB-powered Parquet readers, canonical release discovery, and weight-aware stratified sampling utilities. A Typer-powered CLI offers quick inspection of datasets.

## Installation

```bash
pip install .
```

For development, install the optional test dependencies:

```bash
pip install .[test]
```

## Usage

### CLI

List canonical releases:

```bash
buildstocktools canonical
```

Inspect a parquet file:

```bash
buildstocktools head data/sample.parquet --column state --column weight
```

Draw a weighted stratified sample:

```bash
buildstocktools sample data/sample.parquet \
--sample-size 100 \
--weight-column weight \
--strata state --strata iecc_climate_zone \
--filter state=[CO,CA]
```

### Python API

```python
from buildstocktools.access import weighted_stratified_sample

df = weighted_stratified_sample(
"data/sample.parquet",
weight_column="weight",
strata=["state"],
sample_size=100,
filters={"state": ["CO", "CA"]},
random_state=42,
)
```

## Testing

```bash
pytest
```

## License

GPL-3.0

15 changes: 15 additions & 0 deletions buildstocktools/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""BuildStockTools package."""
from __future__ import annotations

from importlib.metadata import version

from buildstocktools.logging import configure_logging, logger

__all__ = ["configure_logging", "logger"]


def __getattr__(name: str):
if name == "__version__":
return version("buildstocktools")
raise AttributeError(name)

113 changes: 113 additions & 0 deletions buildstocktools/access/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""High-level accessors for BuildStock releases."""
from __future__ import annotations

from pathlib import Path
from typing import Dict, Iterable, List, Optional

import pandas as pd

from buildstocktools import config
from buildstocktools.io.duck import read_parquet
from buildstocktools.logging import logger


def discover_releases(dataset: Optional[str] = None, canonical_only: bool = True) -> pd.DataFrame:
    """Return a DataFrame of known releases.

    Args:
        dataset: Optional dataset name to filter by (e.g. ``"resstock"``).
        canonical_only: When True (default), include only canonical releases.

    Returns:
        DataFrame with ``key``, ``dataset``, ``version``, ``canonical``,
        ``s3_path`` and ``description`` columns; empty if nothing matches.
    """
    releases = list(config.iter_releases(dataset=dataset, canonical_only=canonical_only))
    # loguru interpolates str.format-style "{}" placeholders (as used in
    # config.py); the previous "%d" placeholder was emitted literally.
    logger.debug("Discovered {} releases", len(releases))
    return pd.DataFrame(
        [
            {
                "key": key,
                "dataset": release.dataset,
                "version": release.version,
                "canonical": release.canonical,
                "s3_path": release.s3_path,
                "description": release.description,
            }
            for key, release in releases
        ]
    )


def _format_filter_value(value) -> str:
if isinstance(value, (list, tuple, set)):
members = ",".join(f"'{v}'" if isinstance(v, str) else str(v) for v in value)
return f"IN ({members})"
if isinstance(value, str):
return f"= '{value}'"
return f"= {value}"


def build_filters(filters: Optional[Dict[str, Iterable[str]]]) -> Optional[str]:
    """Combine a column->value mapping into a SQL WHERE expression.

    Returns ``None`` when no filters are supplied, so callers can pass the
    result straight through to ``read_parquet``.
    """
    if not filters:
        return None
    clauses = []
    for column, value in filters.items():
        clauses.append(f"{column} {_format_filter_value(value)}")
    return " AND ".join(clauses)


def head(parquet_path: str | Path, n: int = 5, *, columns: Optional[Iterable[str]] = None, filters: Optional[Dict[str, Iterable[str]]] = None) -> pd.DataFrame:
    """Return the first n rows matching filters."""
    where_clause = build_filters(filters)
    frame = read_parquet(parquet_path, columns=columns, filters=where_clause)
    return frame.head(n)


def weighted_stratified_sample(
    parquet_path: str | Path,
    *,
    weight_column: str,
    strata: Optional[List[str]] = None,
    sample_size: int,
    filters: Optional[Dict[str, Iterable[str]]] = None,
    random_state: Optional[int] = None,
) -> pd.DataFrame:
    """Return a weighted stratified sample from parquet.

    Args:
        parquet_path: Path to the parquet file to sample from.
        weight_column: Column whose values weight each row's selection odds.
        strata: Optional columns to stratify by. Samples are allocated to
            strata proportionally to each stratum's total weight
            (largest-remainder rounding), then drawn without replacement
            within each stratum.
        sample_size: Total number of rows requested. A stratum smaller than
            its allocation contributes all of its rows, so fewer than
            ``sample_size`` rows may be returned.
        filters: Optional column -> value(s) equality filters applied before
            sampling.
        random_state: Seed forwarded to pandas for reproducibility.

    Raises:
        ValueError: If no rows match the filters, or the weight column does
            not sum to a positive value.
    """
    filter_expr = build_filters(filters)
    df = read_parquet(parquet_path, filters=filter_expr)
    if df.empty:
        raise ValueError("No records match the provided filters.")
    # An all-zero (or negative) weight column previously surfaced as NaN
    # allocations or an opaque pandas error deep inside .sample().
    if df[weight_column].sum() <= 0:
        raise ValueError(f"Weight column {weight_column!r} must have a positive total.")

    if not strata:
        return df.sample(
            n=sample_size,
            weights=df[weight_column],
            replace=False,
            random_state=random_state,
        )

    grouped = df.groupby(strata, dropna=False)
    weight_sums = grouped[weight_column].sum()
    allocations_float = weight_sums / weight_sums.sum() * sample_size
    # Largest-remainder rounding: floor every allocation, then hand the
    # leftover samples to the strata with the biggest fractional parts.
    allocations = allocations_float.astype(int)
    shortfall = sample_size - allocations.sum()
    if shortfall > 0:
        fractional = allocations_float - allocations
        winners = fractional.sort_values(ascending=False).head(shortfall).index
        allocations.loc[winners] += 1

    # Iterate the groups explicitly instead of GroupBy.apply: apply on the
    # grouping columns is deprecated in pandas >= 2.2 and required fragile
    # group.name tuple handling.
    pieces = []
    for key, group in grouped:
        # Single-column strata iterate as 1-tuples; the allocation index is flat.
        label = key[0] if isinstance(key, tuple) and len(key) == 1 else key
        n_samples = min(int(allocations.loc[label]), len(group))
        if n_samples > 0:
            pieces.append(
                group.sample(
                    n=n_samples,
                    weights=group[weight_column],
                    replace=False,
                    random_state=random_state,
                )
            )
    if not pieces:  # e.g. sample_size == 0
        return df.iloc[0:0]
    return pd.concat(pieces, ignore_index=True)


# Public names re-exported by ``from buildstocktools.access import *``.
__all__ = [
    "discover_releases",
    "build_filters",
    "head",
    "weighted_stratified_sample",
]

85 changes: 85 additions & 0 deletions buildstocktools/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Typer-powered CLI for BuildStockTools."""
from __future__ import annotations

from pathlib import Path
from typing import List, Optional

import typer

from buildstocktools.access import discover_releases, head as head_df, weighted_stratified_sample
from buildstocktools.logging import configure_logging, logger

app = typer.Typer(help="Tools for working with BuildStock datasets")


def _parse_filters(filters: List[str]) -> dict:
parsed = {}
for item in filters:
if "=" not in item:
raise typer.BadParameter("Filters must be in column=value format")
column, value = item.split("=", 1)
if value.startswith("[") and value.endswith("]"):
entries = [v.strip() for v in value.strip("[]").split(",") if v.strip()]
parsed[column] = entries
else:
parsed[column] = value
return parsed


@app.callback()
def main(verbose: bool = typer.Option(False, "--verbose", help="Enable debug logging")) -> None:
    """Set logging verbosity before any subcommand runs."""
    level = "DEBUG" if verbose else "INFO"
    configure_logging(level)


@app.command()
def canonical(dataset: Optional[str] = typer.Option(None, help="Filter by dataset")) -> None:
    """List canonical releases."""
    releases = discover_releases(dataset=dataset)
    message = "No releases found" if releases.empty else releases.to_string(index=False)
    typer.echo(message)


@app.command()
def head(
    parquet: Path = typer.Argument(..., exists=True, dir_okay=False),
    n: int = typer.Option(5, help="Number of rows to display"),
    column: List[str] = typer.Option(None, "--column", help="Columns to select"),
    filter: List[str] = typer.Option(None, "--filter", help="Filters in column=value or column=[a,b] format"),
) -> None:
    """Preview the first rows of a parquet file."""
    parsed_filters = _parse_filters(filter) if filter else None
    preview = head_df(parquet, n=n, columns=column or None, filters=parsed_filters)
    typer.echo(preview.to_string(index=False))


@app.command()
def sample(
    parquet: Path = typer.Argument(..., exists=True, dir_okay=False),
    sample_size: int = typer.Option(..., help="Number of samples to draw"),
    weight_column: str = typer.Option(..., help="Weight column name"),
    strata: List[str] = typer.Option(None, "--strata", help="Columns for stratification"),
    filter: List[str] = typer.Option(None, "--filter", help="Filters in column=value or column=[a,b] format"),
    random_state: Optional[int] = typer.Option(None, help="Random seed"),
) -> None:
    """Draw a weighted stratified sample from a parquet file and print it."""
    filters = _parse_filters(filter) if filter else None
    # loguru interpolates str.format-style "{}" placeholders; the previous
    # "%d"/"%s" placeholders were emitted literally in the log message.
    logger.info(
        "Sampling {} rows stratified by {} with filters {}",
        sample_size,
        strata or "none",
        filters or "none",
    )
    df = weighted_stratified_sample(
        parquet,
        weight_column=weight_column,
        strata=strata or None,
        sample_size=sample_size,
        filters=filters,
        random_state=random_state,
    )
    typer.echo(df.to_string(index=False))


if __name__ == "__main__": # pragma: no cover
app()

90 changes: 90 additions & 0 deletions buildstocktools/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""BuildStock release registry and configuration utilities."""
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Optional

from loguru import logger


@dataclass(frozen=True)
class Release:
    """Description of an OEDI BuildStock release."""

    dataset: str      # e.g. "resstock" or "comstock"
    version: str      # release version string, e.g. "2024.2"
    canonical: bool   # True for officially blessed releases
    s3_path: str      # OEDI data-lake location of the release
    description: str  # short human-readable summary

    def local_path(self, base_dir: Optional[Path] = None) -> Path:
        """Return the expected local storage path for the release."""
        root = base_dir if base_dir is not None else Path.home() / ".cache" / "buildstocktools"
        target = root / self.dataset / self.version
        logger.debug("Derived local path for {} {}: {}", self.dataset, self.version, target)
        return target


# Registry of known OEDI BuildStock releases, keyed by "<dataset>-<version>".
# Single source of truth consumed by get_release/list_releases/iter_releases.
_RELEASES: Dict[str, Release] = {
    "resstock-2024.2": Release(
        dataset="resstock",
        version="2024.2",
        canonical=True,
        s3_path="s3://oedi-data-lake/buildstock/resstock/2024.2",
        description="ResStock v2024.2 official release",
    ),
    "resstock-2025.1": Release(
        dataset="resstock",
        version="2025.1",
        canonical=True,
        s3_path="s3://oedi-data-lake/buildstock/resstock/2025.1",
        description="ResStock v2025.1 official release",
    ),
    "comstock-2024.2": Release(
        dataset="comstock",
        version="2024.2",
        canonical=True,
        s3_path="s3://oedi-data-lake/buildstock/comstock/2024.2",
        description="ComStock v2024.2 official release",
    ),
}


def get_release(key: str) -> Release:
    """Return the :class:`Release` registered under ``key``.

    Raises:
        KeyError: If ``key`` is not a known release.
    """
    logger.debug("Fetching release {}", key)
    release = _RELEASES[key]
    return release


def list_releases(dataset: Optional[str] = None, canonical_only: bool = False) -> Iterable[Release]:
    """Return releases filtered by dataset name and/or canonical flag.

    Args:
        dataset: Case-insensitive dataset name to match (e.g. "resstock").
        canonical_only: When True, keep only canonical releases.

    Returns:
        A list of matching releases. (Previously an unfiltered call leaked
        the registry's dict_values view while filtered calls returned a
        list, and the debug log printed "unknown" in the unfiltered case.)
    """
    releases = list(_RELEASES.values())
    if dataset:
        wanted = dataset.lower()
        releases = [r for r in releases if r.dataset == wanted]
    if canonical_only:
        releases = [r for r in releases if r.canonical]
    logger.debug(
        "Listing releases for dataset={} canonical_only={} -> {} entries",
        dataset,
        canonical_only,
        len(releases),
    )
    return releases


def iter_releases(dataset: Optional[str] = None, canonical_only: bool = False):
    """Yield (key, release) pairs respecting filters."""
    wanted = dataset.lower() if dataset else None
    for key, release in _RELEASES.items():
        if wanted is not None and release.dataset != wanted:
            continue
        if canonical_only and not release.canonical:
            continue
        yield key, release


def describe_release(key: str) -> str:
    """Return a one-line human-readable summary of the release ``key``."""
    r = get_release(key)
    return f"{r.dataset.title()} {r.version}: {r.description} ({r.s3_path})"

Empty file added buildstocktools/io/__init__.py
Empty file.
Loading