Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# BuildStockTools Agents Guide

This repository contains a Python package and CLI for interacting with BuildStock datasets. When extending the project, follow these guidelines:

- Prefer DuckDB SQL and pandas vectorized operations over Python loops.
- Configure logging with `buildstocktools.logging.configure_logging` and the `loguru` logger.
- Keep canonical release metadata centralized in `buildstocktools/config.py`.
- Document schema quirks or aliases in the modules under `buildstocktools/sources/`.
- Update the README with new user-facing features or workflows.

Testing expectations:

- Add synthetic parquet fixtures in tests to validate new behavior.
- Use pytest and avoid network calls in unit tests.

For CLI contributions:

- Commands live in `buildstocktools/cli.py` using Typer.
- Provide examples in the README.

67 changes: 66 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,67 @@
# BuildStockTools
A set of tools for working with NREL BuildStock (ResStock and ComStock) datasets.

BuildStockTools provides reproducible access to NREL's ResStock and ComStock datasets published on the [OEDI data lake](https://data.openei.org/submissions/4870). The package combines DuckDB-powered Parquet readers, canonical release discovery, and weight-aware stratified sampling utilities. A Typer-powered CLI offers quick inspection of datasets.

## Installation

```bash
pip install .
```

For development, install the optional test dependencies:

```bash
pip install .[test]
```

## Usage

### CLI

List canonical releases:

```bash
buildstocktools canonical
```

Inspect a parquet file:

```bash
buildstocktools head data/sample.parquet --column state --column weight
```

Draw a weighted stratified sample:

```bash
buildstocktools sample data/sample.parquet \
--sample-size 100 \
--weight-column weight \
--strata state --strata iecc_climate_zone \
--filter state=[CO,CA]
```

### Python API

```python
from buildstocktools.access import weighted_stratified_sample

df = weighted_stratified_sample(
"data/sample.parquet",
weight_column="weight",
strata=["state"],
sample_size=100,
filters={"state": ["CO", "CA"]},
random_state=42,
)
```

## Testing

```bash
pytest
```

## License

GPL-3.0

15 changes: 15 additions & 0 deletions buildstocktools/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""BuildStockTools package."""
from __future__ import annotations

from importlib.metadata import version

from buildstocktools.logging import configure_logging, logger

__all__ = ["configure_logging", "logger"]


def __getattr__(name: str):
if name == "__version__":
return version("buildstocktools")
raise AttributeError(name)

113 changes: 113 additions & 0 deletions buildstocktools/access/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""High-level accessors for BuildStock releases."""
from __future__ import annotations

from pathlib import Path
from typing import Dict, Iterable, List, Optional

import pandas as pd

from buildstocktools import config
from buildstocktools.io.duck import read_parquet
from buildstocktools.logging import logger


def discover_releases(dataset: Optional[str] = None, canonical_only: bool = True) -> pd.DataFrame:
    """Return a DataFrame of known releases.

    Args:
        dataset: Optional dataset name to filter by (e.g. ``"resstock"``).
        canonical_only: When True (default), include only canonical releases.

    Returns:
        DataFrame with ``key``, ``dataset``, ``version``, ``canonical``,
        ``s3_path`` and ``description`` columns; empty if nothing matches.
    """
    releases = list(config.iter_releases(dataset=dataset, canonical_only=canonical_only))
    # loguru interpolates str.format-style "{}" placeholders (as used in
    # config.py); the previous "%d" placeholder was emitted literally.
    logger.debug("Discovered {} releases", len(releases))
    return pd.DataFrame(
        [
            {
                "key": key,
                "dataset": release.dataset,
                "version": release.version,
                "canonical": release.canonical,
                "s3_path": release.s3_path,
                "description": release.description,
            }
            for key, release in releases
        ]
    )


def _format_filter_value(value) -> str:
if isinstance(value, (list, tuple, set)):
members = ",".join(f"'{v}'" if isinstance(v, str) else str(v) for v in value)
return f"IN ({members})"
if isinstance(value, str):
return f"= '{value}'"
return f"= {value}"


def build_filters(filters: Optional[Dict[str, Iterable[str]]]) -> Optional[str]:
    """Combine a column->value mapping into a SQL WHERE expression.

    Returns ``None`` when no filters are supplied, so callers can pass the
    result straight through to ``read_parquet``.
    """
    if not filters:
        return None
    clauses = []
    for column, value in filters.items():
        clauses.append(f"{column} {_format_filter_value(value)}")
    return " AND ".join(clauses)


def head(parquet_path: str | Path, n: int = 5, *, columns: Optional[Iterable[str]] = None, filters: Optional[Dict[str, Iterable[str]]] = None) -> pd.DataFrame:
    """Return the first n rows matching filters."""
    where_clause = build_filters(filters)
    frame = read_parquet(parquet_path, columns=columns, filters=where_clause)
    return frame.head(n)


def weighted_stratified_sample(
    parquet_path: str | Path,
    *,
    weight_column: str,
    strata: Optional[List[str]] = None,
    sample_size: int,
    filters: Optional[Dict[str, Iterable[str]]] = None,
    random_state: Optional[int] = None,
) -> pd.DataFrame:
    """Return a weighted stratified sample from parquet.

    Args:
        parquet_path: Path to the parquet file to sample from.
        weight_column: Column whose values weight each row's selection odds.
        strata: Optional columns to stratify by. Samples are allocated to
            strata proportionally to each stratum's total weight
            (largest-remainder rounding), then drawn without replacement
            within each stratum.
        sample_size: Total number of rows requested. A stratum smaller than
            its allocation contributes all of its rows, so fewer than
            ``sample_size`` rows may be returned.
        filters: Optional column -> value(s) equality filters applied before
            sampling.
        random_state: Seed forwarded to pandas for reproducibility.

    Raises:
        ValueError: If no rows match the filters, or the weight column does
            not sum to a positive value.
    """
    filter_expr = build_filters(filters)
    df = read_parquet(parquet_path, filters=filter_expr)
    if df.empty:
        raise ValueError("No records match the provided filters.")
    # An all-zero (or negative) weight column previously surfaced as NaN
    # allocations or an opaque pandas error deep inside .sample().
    if df[weight_column].sum() <= 0:
        raise ValueError(f"Weight column {weight_column!r} must have a positive total.")

    if not strata:
        return df.sample(
            n=sample_size,
            weights=df[weight_column],
            replace=False,
            random_state=random_state,
        )

    grouped = df.groupby(strata, dropna=False)
    weight_sums = grouped[weight_column].sum()
    allocations_float = weight_sums / weight_sums.sum() * sample_size
    # Largest-remainder rounding: floor every allocation, then hand the
    # leftover samples to the strata with the biggest fractional parts.
    allocations = allocations_float.astype(int)
    shortfall = sample_size - allocations.sum()
    if shortfall > 0:
        fractional = allocations_float - allocations
        winners = fractional.sort_values(ascending=False).head(shortfall).index
        allocations.loc[winners] += 1

    # Iterate the groups explicitly instead of GroupBy.apply: apply on the
    # grouping columns is deprecated in pandas >= 2.2 and required fragile
    # group.name tuple handling.
    pieces = []
    for key, group in grouped:
        # Single-column strata iterate as 1-tuples; the allocation index is flat.
        label = key[0] if isinstance(key, tuple) and len(key) == 1 else key
        n_samples = min(int(allocations.loc[label]), len(group))
        if n_samples > 0:
            pieces.append(
                group.sample(
                    n=n_samples,
                    weights=group[weight_column],
                    replace=False,
                    random_state=random_state,
                )
            )
    if not pieces:  # e.g. sample_size == 0
        return df.iloc[0:0]
    return pd.concat(pieces, ignore_index=True)


# Public names re-exported by ``from buildstocktools.access import *``.
__all__ = [
    "discover_releases",
    "build_filters",
    "head",
    "weighted_stratified_sample",
]

85 changes: 85 additions & 0 deletions buildstocktools/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Typer-powered CLI for BuildStockTools."""
from __future__ import annotations

from pathlib import Path
from typing import List, Optional

import typer

from buildstocktools.access import discover_releases, head as head_df, weighted_stratified_sample
from buildstocktools.logging import configure_logging, logger

app = typer.Typer(help="Tools for working with BuildStock datasets")


def _parse_filters(filters: List[str]) -> dict:
parsed = {}
for item in filters:
if "=" not in item:
raise typer.BadParameter("Filters must be in column=value format")
column, value = item.split("=", 1)
if value.startswith("[") and value.endswith("]"):
entries = [v.strip() for v in value.strip("[]").split(",") if v.strip()]
parsed[column] = entries
else:
parsed[column] = value
return parsed


@app.callback()
def main(verbose: bool = typer.Option(False, "--verbose", help="Enable debug logging")) -> None:
    """Set logging verbosity before any subcommand runs."""
    level = "DEBUG" if verbose else "INFO"
    configure_logging(level)


@app.command()
def canonical(dataset: Optional[str] = typer.Option(None, help="Filter by dataset")) -> None:
    """List canonical releases."""
    releases = discover_releases(dataset=dataset)
    message = "No releases found" if releases.empty else releases.to_string(index=False)
    typer.echo(message)


@app.command()
def head(
    parquet: Path = typer.Argument(..., exists=True, dir_okay=False),
    n: int = typer.Option(5, help="Number of rows to display"),
    column: List[str] = typer.Option(None, "--column", help="Columns to select"),
    filter: List[str] = typer.Option(None, "--filter", help="Filters in column=value or column=[a,b] format"),
) -> None:
    """Preview the first rows of a parquet file."""
    parsed_filters = _parse_filters(filter) if filter else None
    preview = head_df(parquet, n=n, columns=column or None, filters=parsed_filters)
    typer.echo(preview.to_string(index=False))


@app.command()
def sample(
    parquet: Path = typer.Argument(..., exists=True, dir_okay=False),
    sample_size: int = typer.Option(..., help="Number of samples to draw"),
    weight_column: str = typer.Option(..., help="Weight column name"),
    strata: List[str] = typer.Option(None, "--strata", help="Columns for stratification"),
    filter: List[str] = typer.Option(None, "--filter", help="Filters in column=value or column=[a,b] format"),
    random_state: Optional[int] = typer.Option(None, help="Random seed"),
) -> None:
    """Draw a weighted stratified sample from a parquet file and print it."""
    filters = _parse_filters(filter) if filter else None
    # loguru interpolates str.format-style "{}" placeholders; the previous
    # "%d"/"%s" placeholders were emitted literally in the log message.
    logger.info(
        "Sampling {} rows stratified by {} with filters {}",
        sample_size,
        strata or "none",
        filters or "none",
    )
    df = weighted_stratified_sample(
        parquet,
        weight_column=weight_column,
        strata=strata or None,
        sample_size=sample_size,
        filters=filters,
        random_state=random_state,
    )
    typer.echo(df.to_string(index=False))


if __name__ == "__main__": # pragma: no cover
app()

90 changes: 90 additions & 0 deletions buildstocktools/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""BuildStock release registry and configuration utilities."""
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Optional

from loguru import logger


@dataclass(frozen=True)
class Release:
    """Description of an OEDI BuildStock release."""

    dataset: str      # e.g. "resstock" or "comstock"
    version: str      # release version string, e.g. "2024.2"
    canonical: bool   # True for officially blessed releases
    s3_path: str      # OEDI data-lake location of the release
    description: str  # short human-readable summary

    def local_path(self, base_dir: Optional[Path] = None) -> Path:
        """Return the expected local storage path for the release."""
        root = base_dir if base_dir is not None else Path.home() / ".cache" / "buildstocktools"
        target = root / self.dataset / self.version
        logger.debug("Derived local path for {} {}: {}", self.dataset, self.version, target)
        return target


# Registry of known OEDI BuildStock releases, keyed by "<dataset>-<version>".
# Single source of truth consumed by get_release/list_releases/iter_releases.
_RELEASES: Dict[str, Release] = {
    "resstock-2024.2": Release(
        dataset="resstock",
        version="2024.2",
        canonical=True,
        s3_path="s3://oedi-data-lake/buildstock/resstock/2024.2",
        description="ResStock v2024.2 official release",
    ),
    "resstock-2025.1": Release(
        dataset="resstock",
        version="2025.1",
        canonical=True,
        s3_path="s3://oedi-data-lake/buildstock/resstock/2025.1",
        description="ResStock v2025.1 official release",
    ),
    "comstock-2024.2": Release(
        dataset="comstock",
        version="2024.2",
        canonical=True,
        s3_path="s3://oedi-data-lake/buildstock/comstock/2024.2",
        description="ComStock v2024.2 official release",
    ),
}


def get_release(key: str) -> Release:
    """Return the :class:`Release` registered under ``key``.

    Raises:
        KeyError: If ``key`` is not a known release.
    """
    logger.debug("Fetching release {}", key)
    release = _RELEASES[key]
    return release


def list_releases(dataset: Optional[str] = None, canonical_only: bool = False) -> Iterable[Release]:
    """Return releases filtered by dataset name and/or canonical flag.

    Args:
        dataset: Case-insensitive dataset name to match (e.g. "resstock").
        canonical_only: When True, keep only canonical releases.

    Returns:
        A list of matching releases. (Previously an unfiltered call leaked
        the registry's dict_values view while filtered calls returned a
        list, and the debug log printed "unknown" in the unfiltered case.)
    """
    releases = list(_RELEASES.values())
    if dataset:
        wanted = dataset.lower()
        releases = [r for r in releases if r.dataset == wanted]
    if canonical_only:
        releases = [r for r in releases if r.canonical]
    logger.debug(
        "Listing releases for dataset={} canonical_only={} -> {} entries",
        dataset,
        canonical_only,
        len(releases),
    )
    return releases


def iter_releases(dataset: Optional[str] = None, canonical_only: bool = False):
    """Yield (key, release) pairs respecting filters."""
    wanted = dataset.lower() if dataset else None
    for key, release in _RELEASES.items():
        if wanted is not None and release.dataset != wanted:
            continue
        if canonical_only and not release.canonical:
            continue
        yield key, release


def describe_release(key: str) -> str:
    """Return a one-line human-readable summary of the release ``key``."""
    r = get_release(key)
    return f"{r.dataset.title()} {r.version}: {r.description} ({r.s3_path})"

Empty file added buildstocktools/io/__init__.py
Empty file.
Loading