# 3-6-ダミー変数と分散分析モデル

In [None]:
# -*- coding: utf-8 -*-
# 3-6 Dummy variables & ANOVA model — Python/NumPyro version

# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az

import jax
import jax.numpy as jnp
import numpyro
import numpyro.distributions as dist
from numpyro.infer import NUTS, MCMC, Predictive

# Plot styling: seaborn's white-grid theme for the figures drawn below
sns.set(style="whitegrid")

# --- Load the beer-sales CSV and print a quick overview ---
df = pd.read_csv("3-6-1-beer-sales-3.csv")
print("Head (3 rows):", df.head(3), sep="\n")
print("\nSummary (describe):", df.describe(include="all"), sep="\n")

# --- Sales per weather category: violins with the raw points drawn on top ---
fig_sales, ax_sales = plt.subplots(figsize=(7, 5))
sns.violinplot(data=df, x="weather", y="sales", inner=None, ax=ax_sales)
sns.stripplot(data=df, x="weather", y="sales", alpha=0.7, ax=ax_sales)
# Replace seaborn's column-name labels with English display labels
ax_sales.set_title("Beer Sales by Weather")
ax_sales.set_xlabel("Weather")
ax_sales.set_ylabel("Sales")
fig_sales.tight_layout()
plt.show()

# --- Design matrix, mirroring R's model.matrix(sales ~ weather) ---
# weather is treated as categorical. drop_first=True removes the indicator of
# the first category, which is then represented by the intercept column alone,
# so K = 1 (intercept) + (L-1) indicator columns.
weather_cat = pd.Categorical(df["weather"])
levels = list(weather_cat.categories)  # category order; levels[0] is the dropped one
dummies = pd.get_dummies(weather_cat, drop_first=True)
intercept_col = np.ones((len(df), 1))
X = np.hstack([intercept_col, dummies.to_numpy(dtype=float)])  # [N, K]
y = df["sales"].to_numpy()
N, K = X.shape

# Print a preview of what a Stan "data_list" would contain
data_list_like = {"N": N, "K": K}
data_list_like["Y(head)"] = y[:5].tolist()
data_list_like["X(head)"] = X[:5].tolist()
data_list_like["levels"] = levels
print("\nData list-like info for Stan-equivalent:", data_list_like, sep="\n")

# --- NumPyro model: Gaussian ANOVA via design matrix ---
def anova_model(X, y=None):
    """Gaussian linear model on a design matrix (one-way ANOVA form).

    Sample sites:
        beta:  one coefficient per column of ``X``, each with a Normal(0, 10) prior.
        sigma: residual standard deviation with a HalfNormal(10) prior.
        y:     Normal(dot(X, beta), sigma), conditioned on ``y`` when it is given.

    Args:
        X: design matrix; ``X.shape[1]`` sets the number of coefficients.
        y: observed responses, or None to leave the "y" site unobserved.

    NOTE(review): the Normal(0, 10) prior also applies to the intercept; it is
    only weakly informative if the response is on a scale of tens -- confirm
    against the data.
    """
    n_coef = X.shape[1]
    coef_prior = dist.Normal(0.0, 10.0).expand([n_coef]).to_event(1)
    noise_prior = dist.HalfNormal(10.0)

    # Sites are sampled in the order beta, sigma, y
    beta = numpyro.sample("beta", coef_prior)
    sigma = numpyro.sample("sigma", noise_prior)

    # Linear predictor: one expected value per row of X
    expected = jnp.dot(X, beta)
    numpyro.sample("y", dist.Normal(expected, sigma), obs=y)

# --- Posterior sampling with NUTS ---
# NOTE(review): with a single visible JAX device NumPyro is expected to warn
# and run the 4 chains one after another; numpyro.set_host_device_count(4) at
# program start enables parallel CPU chains -- confirm for your environment.
rng_key = jax.random.PRNGKey(1)
kernel = NUTS(anova_model)
mcmc_settings = dict(num_warmup=1000, num_samples=2000, num_chains=4, progress_bar=True)
mcmc = MCMC(kernel, **mcmc_settings)
mcmc.run(rng_key, X=X, y=y)

# Wrap the fit as ArviZ InferenceData (observed_data is deliberately not passed)
idata = az.from_numpyro(mcmc)

# Tabular posterior summary with 95% HDIs
summary_df = az.summary(idata, hdi_prob=0.95)
print("\nMCMC summary (hdi_prob=0.95):", summary_df, sep="\n")

# --- Draw the model graph with NumPyro's built-in renderer ---
# Best effort: rendering relies on graphviz, so any failure is reported and
# the script carries on instead of stopping.
try:
    g = numpyro.render_model(
        anova_model,
        model_args=(X, y),
        render_distributions=True,
        render_params=True,
    )
    # Writes anova_model_graph.png into the current directory
    g.render("anova_model_graph", format="png", cleanup=True)
except Exception as e:
    print(f"\nModel rendering skipped: {e}")
else:
    print('\nSaved model graph to "anova_model_graph.png"')

# --- Posterior predictive draws (for the PPC) ---
# The model is called without y, so the "y" site is sampled once per posterior draw.
predictive = Predictive(anova_model, posterior_samples=mcmc.get_samples())
ppc = predictive(jax.random.PRNGKey(2), X=X)

# Rebuild idata so it also carries the posterior predictive draws
# (observed_data is still not passed explicitly)
idata = az.from_numpyro(mcmc, posterior_predictive={"y": ppc["y"]})

# --- Posterior predictive check ---
# ArviZ opens its own figure when no `ax` is given, so the size is passed to
# plot_ppc directly. A preceding plt.figure(figsize=...) would only leave an
# empty extra figure behind and would not size this plot.
ppc_axes = az.plot_ppc(idata, group="posterior", num_pp_samples=200, figsize=(7, 5))
# plot_ppc may hand back a single Axes or an array of them; label the first panel
ppc_ax = np.atleast_1d(ppc_axes).ravel()[0]
ppc_ax.set_title("Posterior Predictive Check")
ppc_ax.set_xlabel("Sales")
ppc_ax.set_ylabel("Density")
ppc_ax.figure.tight_layout()
plt.show()

# --- "Marginal effects" analogue: posterior of the expected sales per weather level ---
# With intercept + (L-1) dummies:
#   mean(level_0) = beta[0]
#   mean(level_j) = beta[0] + beta[j]  for j = 1..L-1 (same order as the dummy columns)
posterior = mcmc.get_samples()
beta_samples = np.asarray(posterior["beta"])  # [S, K]
means_by_level = {levels[0]: beta_samples[:, 0]}
for j, lev in enumerate(levels[1:], start=1):
    means_by_level[lev] = beta_samples[:, 0] + beta_samples[:, j]

# Print the posterior mean of each level's expected sales
print("\nPosterior means of expected sales by weather:")
for lev, arr in means_by_level.items():
    print(f"  {lev}: {arr.mean():.3f}")

# Plot the posterior of each level's mean with a 95% HDI.
# ArviZ creates the figure itself, so no plt.figure() call precedes it (that
# would only add an empty figure); no figsize is forced on the panel grid.
axes = az.plot_posterior(
    means_by_level,
    var_names=list(means_by_level),
    hdi_prob=0.95,
)
# Flatten: the result can be a single Axes, a 1-D array, or a 2-D grid when
# there are many levels; zip() below must see one Axes per level.
axes = np.atleast_1d(axes).ravel()
fig = axes[0].figure
for ax, lev in zip(axes, means_by_level):
    ax.set_title(f"Posterior of Mean Sales: {lev}")
    ax.set_xlabel("Sales")
fig.tight_layout()
plt.show()

# --- Optional: posterior panels for the raw coefficients and sigma ---
# ArviZ creates the figure itself, so no plt.figure() call precedes it (that
# would only add an empty figure); the title is set on the figure that
# actually holds the panels.
coef_axes = np.atleast_1d(
    az.plot_posterior(idata, var_names=["beta", "sigma"], hdi_prob=0.95)
).ravel()
coef_fig = coef_axes[0].figure
coef_fig.suptitle("Coefficients & Sigma (Posterior)", y=1.02)
coef_fig.tight_layout()
plt.show()
