# 03 - Bayesian Hierarchical Modeling

Fit a hierarchical model to 911 prices, check MCMC diagnostics, visualize the estimated effects, and predict prices.

**Inputs:**
- `data/processed/cleaned_listings.parquet`

**Model:**
```
log(price) ~ age + mileage_scaled + sale_year +
             (1 + age | generation) +
             (1 | trim) +
             (1 | transmission)
```

In [None]:
import logging
from pathlib import Path

import arviz as az
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from price_analysis.data.cleaning import prepare_model_data
from price_analysis.models import build_model, fit_model, predict_price
from price_analysis.models.hierarchical import (
    check_diagnostics,
    extract_effects,
    format_prediction_summary,
)

# Notebook-wide setup: INFO-level logging and shared plotting defaults.
logging.basicConfig(level=logging.INFO)
az.style.use("arviz-darkgrid")
# Set the figure size after the style sheet so the style cannot override it.
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Input locations, given as relative paths.
DATA_DIR = Path("..", "data")
PROCESSED_PATH = DATA_DIR.joinpath("processed", "cleaned_listings.parquet")

## Load and Prepare Data

In [None]:
# Read the cleaned listings, then derive the modelling frame from them.
df_cleaned = pd.read_parquet(PROCESSED_PATH)
df = prepare_model_data(df_cleaned)

n_listings = len(df)
print(f"Model data: {n_listings} listings")
display(df.head())

In [None]:
# Mileage mean/std, kept so they can be passed to predict_price later on.
# NOTE(review): these are computed from df_cleaned, not the prepared df —
# confirm prepare_model_data derives mileage_scaled from the same rows.
raw_mileage = df_cleaned["mileage"]
MILEAGE_MEAN = raw_mileage.mean()
MILEAGE_STD = raw_mileage.std()
print(f"Mileage scaling: mean={MILEAGE_MEAN:.0f}, std={MILEAGE_STD:.0f}")

In [None]:
# List the levels of each categorical grouping column used by the model.
for label, column in (
    ("Generation", "generation"),
    ("Trim", "trim"),
    ("Transmission", "transmission"),
):
    print(f"{label} levels:", df[column].cat.categories.tolist())

## Build and Fit Model

In [None]:
# Build the model object from the prepared listings and print it for inspection.
model = build_model(df)
print(model)

In [None]:
# Run the sampler (this may take a few minutes).
sampler_settings = {"draws": 2000, "tune": 1000, "chains": 4}
idata = fit_model(model, **sampler_settings)

## MCMC Diagnostics

Check for convergence issues before interpreting results.

In [None]:
# Collect the convergence diagnostics and print them as a short report.
diagnostics = check_diagnostics(idata)

report_lines = [
    f"Converged: {diagnostics['converged']}",
    f"Divergences: {diagnostics['n_divergences']}",
    f"Max R-hat: {diagnostics['rhat_max']:.3f}",
    f"Min ESS (bulk): {diagnostics['ess_bulk_min']:.0f}",
]
# Only mention issues when the diagnostics reported any.
if diagnostics["issues"]:
    report_lines.append(f"Issues: {diagnostics['issues']}")
print("\n".join(report_lines))

In [None]:
# Trace plots for the fixed-effect parameters.
fixed_effect_names = ["Intercept", "age", "mileage_scaled", "sale_year"]
az.plot_trace(idata, var_names=fixed_effect_names)
plt.tight_layout()
plt.show()

In [None]:
# Forest plot of the fixed effects with R-hat shown alongside.
fixed_effect_names = ["Intercept", "age", "mileage_scaled", "sale_year"]
az.plot_forest(idata, var_names=fixed_effect_names, r_hat=True)
plt.tight_layout()
plt.show()

## Model Summary

Interpret the fixed effects (population-level estimates).

In [None]:
# Summary table for the fixed effects, using 90% HDIs.
fixed_effect_names = ["Intercept", "age", "mileage_scaled", "sale_year"]
summary = az.summary(idata, var_names=fixed_effect_names, hdi_prob=0.9)
display(summary)

**Interpretation (coefficients are on log-price scale):**

- `age`: Expected change in log(price) per year of age (negative = depreciation)
- `mileage_scaled`: Expected change in log(price) per 1 SD increase in mileage
- `sale_year`: Market trend (positive = prices rising over time)

## Random Effects

Visualize generation, trim, and transmission effects.

In [None]:
# Pull the effect estimates and print each fixed effect as "mean [low, high]".
effects = extract_effects(idata)

print("Fixed effects:")
for effect_name, stats in effects["fixed"].items():
    hdi_low, hdi_high = stats["hdi_90"][0], stats["hdi_90"][1]
    print(f"  {effect_name}: {stats['mean']:.3f} [{hdi_low:.3f}, {hdi_high:.3f}]")

In [None]:
# Generation-level terms: group intercepts and group-specific age slopes.
generation_terms = ["1|generation", "age|generation"]
az.plot_forest(idata, var_names=generation_terms, combined=True)
plt.suptitle("Generation Effects (Intercept and Age Slope)")
plt.tight_layout()
plt.show()

In [None]:
# Trim-level intercepts, chains combined into one interval per level.
trim_terms = ["1|trim"]
az.plot_forest(idata, var_names=trim_terms, combined=True)
plt.title("Trim Effects (Random Intercepts)")
plt.tight_layout()
plt.show()

In [None]:
# Transmission-level intercepts, chains combined into one interval per level.
transmission_terms = ["1|transmission"]
az.plot_forest(idata, var_names=transmission_terms, combined=True)
plt.title("Transmission Effects (Random Intercepts)")
plt.tight_layout()
plt.show()

## Posterior Predictive Checks

How well does the model reproduce the observed data?

In [None]:
# Generate posterior predictive samples. The return value is not used here;
# with inplace=True the samples are presumably attached to `idata`, which the
# plot_ppc cell that follows reads — TODO confirm.
# NOTE(review): if `model` is a Bambi model, newer releases appear to rename
# kind="pps" to kind="response" — check against the installed version.
model.predict(idata, kind="pps", inplace=True)

In [None]:
# Posterior predictive check drawn from 100 posterior predictive samples.
n_ppc_samples = 100
az.plot_ppc(idata, num_pp_samples=n_ppc_samples)
plt.title("Posterior Predictive Check")
plt.xlabel("log(price)")
plt.tight_layout()
plt.show()

## Price Predictions

Predict prices for specific configurations.

In [None]:
# Example: 2022 992.1 Carrera 4S PDK with 15k miles, sold in 2025
example_config = {
    "generation": "992.1",
    "trim": "Carrera 4S",
    "transmission": "PDK",
    "model_year": 2022,
    "mileage": 15000,
    "sale_year": 2025,
}
pred = predict_price(
    model=model,
    idata=idata,
    mileage_mean=MILEAGE_MEAN,
    mileage_std=MILEAGE_STD,
    **example_config,
)

print(format_prediction_summary(pred))

In [None]:
# Histogram of the predicted price samples (in $k), with the median and the
# two ends of the 80% interval marked.
price = pred["price"]
config = pred["config"]
ci_low, ci_high = price["ci_80"][0], price["ci_80"][1]

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(pred["samples"] / 1000, bins=50, alpha=0.7, density=True)
ax.axvline(price["median"] / 1000, color="red", linestyle="--", label="Median")
# Only the first interval line carries a label so the legend lists it once.
ax.axvline(ci_low / 1000, color="orange", linestyle=":", label="80% CI")
ax.axvline(ci_high / 1000, color="orange", linestyle=":")
ax.set_xlabel("Predicted Price ($k)")
ax.set_ylabel("Density")
ax.set_title(
    f"Price Prediction: {config['model_year']} {config['generation']} {config['trim']}"
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Compare: same configuration as the PDK example, but with Manual transmission.
manual_config = {
    "generation": "992.1",
    "trim": "Carrera 4S",
    "transmission": "Manual",
    "model_year": 2022,
    "mileage": 15000,
    "sale_year": 2025,
}
pred_manual = predict_price(
    model=model,
    idata=idata,
    mileage_mean=MILEAGE_MEAN,
    mileage_std=MILEAGE_STD,
    **manual_config,
)

print("\nManual vs PDK comparison:")
pdk_median = pred["price"]["median"]
manual_median = pred_manual["price"]["median"]
print(f"PDK median:    ${pdk_median:,.0f}")
print(f"Manual median: ${manual_median:,.0f}")
# Premium is the difference of the two medians (manual minus PDK).
print(f"Manual premium: ${manual_median - pdk_median:,.0f}")

## Depreciation Curves by Generation

Project price trajectories over time.

In [None]:
# Project median price and 80% interval for each generation at ages 1-9 years,
# assuming 10k miles per year and a sale `age` years after `base_year`.
generations = ["991.2", "992.1"]
ages = range(1, 10)
base_year = 2020  # Approximate mid-point for 992.1
# NOTE(review): base_year is also used as the model year for 991.2 — confirm
# that is the intended comparison.

age_axis = list(ages)  # materialised once; reused as the x-axis for every curve

fig, ax = plt.subplots(figsize=(12, 7))

for gen in generations:
    # Keep these predictions under their own name: rebinding `pred` here would
    # overwrite the single-car prediction that the histogram and Manual-vs-PDK
    # cells read, breaking them if they are re-run after this cell.
    curve_preds = [
        predict_price(
            model=model,
            idata=idata,
            generation=gen,
            trim="Carrera 4S",
            transmission="PDK",
            model_year=base_year,
            mileage=10000 * age,  # Assume 10k miles/year
            sale_year=base_year + age,
            mileage_mean=MILEAGE_MEAN,
            mileage_std=MILEAGE_STD,
        )
        for age in ages
    ]
    medians = [p["price"]["median"] for p in curve_preds]
    ci_lows = [p["price"]["ci_80"][0] for p in curve_preds]
    ci_highs = [p["price"]["ci_80"][1] for p in curve_preds]

    # Plot in thousands of dollars to match the axis label.
    ax.plot(age_axis, [median / 1000 for median in medians], label=gen, marker="o")
    ax.fill_between(
        age_axis,
        [low / 1000 for low in ci_lows],
        [high / 1000 for high in ci_highs],
        alpha=0.2,
    )

ax.set_xlabel("Age (years)")
ax.set_ylabel("Predicted Price ($k)")
ax.set_title("Depreciation Curves: Carrera 4S PDK (with 80% CI)")
ax.legend()
plt.tight_layout()
plt.show()

## Save Model Artifacts

In [None]:
# Save inference data for later use
# idata.to_netcdf(DATA_DIR / "processed" / "model_idata.nc")
# print("Saved inference data")