# Demography and Domestic Water Use

本笔记本展示人口变化和饮食结构如何影响生活用水需求。

## 前端交互设计

模拟前端界面的两个主要控件：
- **Fertility Slider**: 调节生育率参数 (1.6 → 1.8)
- **Diet Dropdown**: 选择饮食情景 (1=传统, 2=过渡, 3=现代)

## 可视化特点

- 使用**置信区间**展示参数不确定性
- 只聚焦于**人口**和**生活用水**两个核心指标
- 所有图表使用英文标注（便于前端国际化）


## Setup Environment


In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import polars as pl
import os
from hydra import compose, initialize

# Import ScenarioQuery API
import sys

sys.path.insert(0, str(Path.cwd()))
from scripts.query_scenarios import ScenarioQuery

# Compose the Hydra configuration named "config.yaml" from the "../config"
# directory; `cfg` stays available after the context manager exits.
with initialize(version_base=None, config_path="../config"):
    cfg = compose(config_name="config.yaml")
# Move the process working directory to the dataset root taken from the config.
# NOTE(review): after this chdir, "../config" and Path.cwd() refer to different
# locations, so this cell is presumably meant to run once per kernel — confirm.
os.chdir(cfg.ds.root)

# Initialize query engine
# "data_parquet" is presumably resolved relative to the new working directory
# set above — verify against ScenarioQuery.
query = ScenarioQuery("data_parquet")
print("✓ Query engine initialized")

## Parameter Configuration

Fixed parameters (baseline scenario)

In [None]:
# Baseline settings for the frontend controls that stay fixed in this notebook
FIXED_PARAMS = {
    "water saving irrigation efficiency ratio": 0.9,
    "fire generation share province target": 0.2,
    "Ecological water flow variable": 0.25,
    "Climate change scenario switch for water yield": 2,
}

# Settings the frontend exposes interactively
FERTILITY_VALUES = [1.6, 1.7, 1.8]  # slider stops
DIET_VALUES = [1, 2, 3]  # dropdown choices

# English display labels used for visualization
DIET_LABELS = {
    1: "Traditional Diet",
    2: "Transitional Diet",
    3: "Modern Diet",
}
FERTILITY_LABELS = {
    1.6: "Low Fertility (1.6)",
    1.7: "Medium Fertility (1.7)",
    1.8: "High Fertility (1.8)",
}

# Echo the configuration so the cell output records what was used
print(
    "Fixed parameters:",
    *(f"  {name}: {setting}" for name, setting in FIXED_PARAMS.items()),
    sep="\n",
)
print(
    "\nInteractive parameters:",
    f"  Fertility (slider): {FERTILITY_VALUES}",
    f"  Diet (dropdown): {DIET_VALUES}",
    sep="\n",
)

对生育率参数（Fertility Variation）进行敏感性分析，受影响最大的变量为：

- production water demand
- OA water demand province sum
- domestic water demand
- Total population

In [None]:
from scripts.analysis_helpers import sensitivity_test

# ========== Sensitivity analysis ==========
# Vary "Fertility Variation" while holding FIXED_PARAMS constant over
# 2020-2100. metric="cv" is presumably the coefficient of variation —
# confirm in scripts.analysis_helpers.
results = sensitivity_test(
    query,
    vary_param="Fertility Variation",
    fixed_params=FIXED_PARAMS,
    time_range=(2020, 2100),
    metric="cv",
)
# Bare expression so the notebook displays the result as the cell output
results

### 查看对人口和生活用水的影响

In [None]:
from scripts.viz_helpers import quick_plot

# Plot "Total population" for 2020-2100 with no scenario filter applied
quick_plot(
    query,
    variable="Total population",
    filters=None,
    time_range=(2020, 2100),
)

In [None]:
# Plot "domestic water demand province sum" for 2020-2100 with no scenario
# filter applied
quick_plot(
    query,
    variable="domestic water demand province sum",
    filters=None,
    time_range=(2020, 2100),
)

## Scenario 1: Find Peak Year

In [None]:
def find_peak_year(
    data: pl.DataFrame,
    value_col: str = None,
    time_col: str = None,
    scenario_col: str = None,
    method: str = "mean",
) -> dict:
    """
    Find the time step at which the (aggregated) series reaches its maximum.

    When a scenario column is present, values are first reduced across
    scenarios for every time step using ``method``; the peak is then taken
    over that reduced series. Without a scenario column the raw series is
    used as-is and ``method`` is only echoed back in the result.

    Parameters
    ----------
    data : pl.DataFrame
        Time series data
    value_col : str, optional
        Value column name. Auto-detects 'value', falling back to 'Value'
    time_col : str, optional
        Time column name. Auto-detects 'time', then 'Year', then 'step'
    scenario_col : str, optional
        Scenario column name. Auto-detects 'scenario_name', then
        'scenario_id'; if neither exists the data is treated as one scenario
    method : str, default="mean"
        Method to aggregate across scenarios: "mean", "median", "max", "min"

    Returns
    -------
    dict with keys: peak_year, peak_value, method_used, total_scenarios.
    ``peak_year`` and ``peak_value`` are None when there is no non-null
    value to rank. On ties the earliest time step is reported.

    Raises
    ------
    ValueError
        If a scenario column is present and ``method`` is not one of the
        supported aggregation names.
    """
    # Auto-detect column names
    cols = data.columns
    if value_col is None:
        value_col = "value" if "value" in cols else "Value"
    if time_col is None:
        time_col = "time" if "time" in cols else ("Year" if "Year" in cols else "step")
    if scenario_col is None:
        scenario_col = (
            "scenario_name"
            if "scenario_name" in cols
            else ("scenario_id" if "scenario_id" in cols else None)
        )

    has_scenarios = bool(scenario_col) and scenario_col in cols

    if has_scenarios:
        # Multiple scenarios - reduce across scenarios for each time step
        if method not in ("mean", "median", "max", "min"):
            raise ValueError(
                f"Unknown method: {method}. Use 'mean', 'median', 'max', or 'min'"
            )
        # The method names coincide with the polars expression methods.
        reducer = getattr(pl.col(value_col), method)
        aggregated = data.group_by(time_col).agg(reducer().alias("aggregated_value"))
        total_scenarios = data[scenario_col].n_unique()
    else:
        # Single scenario - nothing to aggregate
        aggregated = data.select([time_col, value_col]).rename(
            {value_col: "aggregated_value"}
        )
        total_scenarios = 1

    # Nulls must not take part in the ranking: sort orders them by its null
    # placement rule, not by magnitude. The secondary sort key makes ties
    # deterministic, since group_by does not preserve row order.
    ranked = aggregated.drop_nulls("aggregated_value").sort(
        ["aggregated_value", time_col], descending=[True, False]
    )

    if ranked.height == 0:
        # Same key set as the regular result so callers can index uniformly
        return {
            "peak_year": None,
            "peak_value": None,
            "method_used": method,
            "total_scenarios": total_scenarios,
        }

    return {
        "peak_year": ranked[time_col][0],
        "peak_value": ranked["aggregated_value"][0],
        "method_used": method,
        "total_scenarios": total_scenarios,
    }


def find_peak_years_by_scenario(
    data: pl.DataFrame,
    value_col: str = None,
    time_col: str = None,
    scenario_col: str = None,
) -> pl.DataFrame:
    """
    Find the peak time step separately for each scenario.

    Parameters
    ----------
    data : pl.DataFrame
        Time series data
    value_col : str, optional
        Value column name. Auto-detects 'value', falling back to 'Value'
    time_col : str, optional
        Time column name. Auto-detects 'time', then 'Year', then 'step'
    scenario_col : str, optional
        Scenario column name. Auto-detects 'scenario_name', then
        'scenario_id'; if neither exists the data is treated as one
        scenario labelled "single"

    Returns
    -------
    pl.DataFrame with columns: scenario, peak_year, peak_value, one row per
    scenario, ordered by scenario. Rows with a null value (or a null
    scenario key) are ignored, so a scenario without any non-null value
    produces no row; the frame is empty (but keeps the three columns) when
    nothing can be ranked. On ties the earliest time step is reported.
    """
    # Auto-detect column names
    cols = data.columns
    if value_col is None:
        value_col = "value" if "value" in cols else "Value"
    if time_col is None:
        time_col = "time" if "time" in cols else ("Year" if "Year" in cols else "step")
    if scenario_col is None:
        scenario_col = (
            "scenario_name"
            if "scenario_name" in cols
            else ("scenario_id" if "scenario_id" in cols else None)
        )

    if not scenario_col or scenario_col not in cols:
        # Single scenario: rank the non-null values, earliest time first on ties
        ranked = data.drop_nulls(value_col).sort(
            [value_col, time_col], descending=[True, False]
        )
        if ranked.height == 0:
            # Nothing to rank; indexing row 0 would raise IndexError
            return pl.DataFrame(schema=["scenario", "peak_year", "peak_value"])
        return pl.DataFrame(
            {
                "scenario": ["single"],
                "peak_year": [ranked[time_col][0]],
                "peak_value": [ranked[value_col][0]],
            }
        )

    # Multiple scenarios: one sort puts each scenario's best row first within
    # its group, then the first row per group is the peak. This replaces a
    # per-scenario filter loop that rescanned the frame once per scenario.
    return (
        data.drop_nulls([scenario_col, value_col])
        .sort(
            [scenario_col, value_col, time_col],
            descending=[False, True, False],
        )
        .group_by(scenario_col, maintain_order=True)
        .agg(
            [
                pl.col(time_col).first().alias("peak_year"),
                pl.col(value_col).first().alias("peak_value"),
            ]
        )
        .rename({scenario_col: "scenario"})
    )

In [None]:
# Fetch the data and locate the peak year
# Pull "domestic water demand province sum" for 2020-2100 across all
# scenarios (no filter); include_params=True presumably attaches the scenario
# parameter columns — confirm in ScenarioQuery.get_series.
data = query.get_series(
    variables="domestic water demand province sum",
    filters=None,
    time_range=(2020, 2100),
    include_params=True,
)

# Find the peak year (based on the mean across scenarios)
peak_info = find_peak_year(data, method="mean")
print(f"峰值年份: {peak_info['peak_year']}")
print(f"峰值数值: {peak_info['peak_value']:.2f}")
print(f"聚合方法: {peak_info['method_used']}")
print(f"情景数量: {peak_info['total_scenarios']}")

In [None]:
# Compare the peak year obtained with each aggregation method.
# Reuses `data` from the previous cell and rebinds `peak_info` on every pass.
methods = ["mean", "median", "max", "min"]
for method in methods:
    peak_info = find_peak_year(data, method=method)
    print(
        f"{method.capitalize():>6}: 峰值年份 {peak_info['peak_year']}, 数值 {peak_info['peak_value']:.2f}"
    )

## Other Agricultural Water Demand

In [None]:
from scripts.analysis_helpers import sensitivity_test

# ========== Sensitivity analysis ==========
# Vary "Diet change scenario switch" while holding FIXED_PARAMS constant over
# 2020-2100. metric="cv" is presumably the coefficient of variation —
# confirm in scripts.analysis_helpers. Rebinds `results` from the earlier
# fertility analysis.
results = sensitivity_test(
    query,
    vary_param="Diet change scenario switch",
    fixed_params=FIXED_PARAMS,
    time_range=(2020, 2100),
    metric="cv",
)
# Bare expression so the notebook displays the result as the cell output
results

In [None]:
# Plot "OA water demand province sum" for 2020-2100 with no scenario filter
# applied
quick_plot(
    query,
    variable="OA water demand province sum",
    filters=None,
    time_range=(2020, 2100),
)

In [None]:
# Fetch the data and locate the peak year
# Pull "OA water demand province sum" for 2020-2100 across all scenarios
# (no filter); rebinds `data` and `peak_info` from the domestic-water cells.
data = query.get_series(
    variables="OA water demand province sum",
    filters=None,
    time_range=(2020, 2100),
    include_params=True,
)

# Find the peak year (based on the mean across scenarios)
peak_info = find_peak_year(data, method="mean")
print(f"峰值年份: {peak_info['peak_year']}")
print(f"峰值数值: {peak_info['peak_value']:.2f}")
print(f"聚合方法: {peak_info['method_used']}")
print(f"情景数量: {peak_info['total_scenarios']}")