# 2-2-データの要約

In [None]:
# -*- coding: utf-8 -*-
# R→Python rewrite (requirements met: print() for outputs, pandas for CSV, matplotlib/seaborn/arviz for plots,
# English plot labels, NumPyro for Bayesian estimation, NumPyro's built-in model rendering,
# ArviZ for posterior with hdi_prob (no credible_interval))

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf as sm_acf, acovf
from statsmodels.graphics.tsaplots import plot_acf

import jax
import jax.numpy as jnp
from jax import random
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS

# Optional (for model visualization)
try:
    from numpyro.contrib.render import render_model
    _HAS_RENDER = True
except Exception:
    _HAS_RENDER = False


# ---------------------------------------------------------------------
# Frequency / Histogram / Relative frequency
# ---------------------------------------------------------------------

# Load the fish measurements; everything below uses the 'length' column.
fish = pd.read_csv("2-2-1-fish.csv")
print("First 3 rows of fish:", fish.head(3), sep="\n")

# Frequency histogram of the lengths, drawn on an explicitly created Axes.
sns.set(style="whitegrid")
fig, ax = plt.subplots()
sns.histplot(
    fish["length"],
    bins="auto",
    stat="count",
    edgecolor="white",
    ax=ax,
)
ax.set_xlabel("Length")
ax.set_ylabel("Count")
ax.set_title("Histogram of Fish Length")
fig.tight_layout()
plt.show()


# ---------------------------------------------------------------------
# Kernel density estimation
# ---------------------------------------------------------------------

# Each entry: (bandwidth multiplier, legend label, extra line styling).
# The default-bandwidth curve is drawn thicker so it stands out.
bandwidth_variants = [
    (1.0, "Default", {"linewidth": 2}),
    (0.25, "Bandwidth × 1/4", {}),
    (4.0, "Bandwidth × 4", {}),
]

fig, ax = plt.subplots()
for bw_factor, curve_label, line_style in bandwidth_variants:
    sns.kdeplot(
        fish["length"],
        bw_adjust=bw_factor,
        label=curve_label,
        ax=ax,
        **line_style,
    )
ax.set_ylim(0, 0.26)
ax.set_xlabel("Length")
ax.set_ylabel("Density")
ax.set_title("Changing Bandwidth")
ax.legend(loc="upper left", frameon=False)
fig.tight_layout()
plt.show()


# ---------------------------------------------------------------------
# Arithmetic mean
# ---------------------------------------------------------------------

# Average of the 'length' column, reported to six decimal places.
length_column = fish["length"]
mean_length = length_column.mean()
print(f"Arithmetic mean of fish length: {mean_length:.6f}")


# ---------------------------------------------------------------------
# Median, quartiles, percentiles
# ---------------------------------------------------------------------

# Toy sequence of the integers 0 through 1000 (1001 values) used to
# illustrate the median, quartiles and a central 95% interval.
suuretu = np.arange(1001)
print("Sequence 0..1000:", suuretu, sep="\n")
print(f"Length of sequence: {len(suuretu)}")

# Middle value of the sequence.
median_val = np.median(suuretu)
print(f"Median (0.5 quantile): {median_val}")

# Lower and upper quartiles.
quartile_probs = [0.25, 0.75]
q25, q75 = np.quantile(suuretu, quartile_probs)
print(f"Quartiles (0.25, 0.75): {q25}, {q75}")

# Bounds that leave 2.5% of the values in each tail.
interval_probs = [0.025, 0.975]
ci025, ci975 = np.quantile(suuretu, interval_probs)
print(f"95% interval (2.5%, 97.5%): {ci025}, {ci975}")


# ---------------------------------------------------------------------
# Covariance and Pearson correlation coefficient
# ---------------------------------------------------------------------

# Bird measurements; the columns 'body_length' and 'feather_length' are used.
birds = pd.read_csv("2-1-1-birds.csv")

# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix.
corr_matrix = birds[["body_length", "feather_length"]].corr()
pearson_r = corr_matrix.loc["body_length", "feather_length"]

# Covariance with the n-1 denominator (ddof=1): off-diagonal entry again.
cov_matrix = np.cov(birds["body_length"], birds["feather_length"], ddof=1)
cov_bl_fl = cov_matrix[0, 1]

print(f"Pearson correlation (body_length, feather_length): {pearson_r:.6f}")
print(f"Sample covariance (body_length, feather_length): {cov_bl_fl:.6f}")


# ---------------------------------------------------------------------
# Autocovariance, autocorrelation, correlogram (Nile River flow)
# ---------------------------------------------------------------------

# Nile dataset bundled with statsmodels; the 'volume' column is the series.
nile_df = sm.datasets.nile.load_pandas().data
nile = nile_df["volume"].to_numpy()
print("Nile River flow (first 5):", nile[:5], sep="\n")

# Autocovariance of the demeaned series for lags 0 through 5,
# computed directly (no FFT).
acov_vals = acovf(nile, demean=True, fft=False, nlag=5)
print("Sample autocovariance (lags 0..5):", acov_vals, sep="\n")

# Autocorrelation for the same lags, also without FFT.
acf_vals = sm_acf(nile, nlags=5, fft=False)
print("Sample autocorrelation (lags 0..5):", acf_vals, sep="\n")

# Correlogram
# plot_acf opens a figure of its own unless it is handed an Axes, so a
# preceding bare plt.figure() would be left empty and shown as a stray blank
# figure. Create the Axes explicitly and pass it in so only one figure exists.
fig, ax = plt.subplots()
# zero=False leaves out lag 0.
plot_acf(nile, lags=40, zero=False, ax=ax)
ax.set_xlabel("Lag")
ax.set_ylabel("Autocorrelation")
ax.set_title("Correlogram (ACF) of Nile River Flow")
fig.tight_layout()
plt.show()


# ---------------------------------------------------------------------
# Bayesian estimation with NumPyro (Normal model for fish length)
# - Model visualization: NumPyro built-in (render_model)
# - Posterior visualization: ArviZ with hdi_prob (no credible_interval)
# ---------------------------------------------------------------------

# Observed fish lengths as a JAX array for the sampler.
length_values = fish["length"].to_numpy()
y = jnp.array(length_values)


def model(y_obs):
    """Normal model with unknown mean and scale for the observed values.

    Sample sites:
        mu: mean, with a Normal(0, 10) prior.
        sigma: scale, with a HalfCauchy(5) prior.
        obs: Normal(mu, sigma) likelihood conditioned on ``y_obs``.

    Args:
        y_obs: Observations passed as ``obs`` to the "obs" sample site.
    """
    mean_prior = dist.Normal(0.0, 10.0)
    scale_prior = dist.HalfCauchy(5.0)

    mu = numpyro.sample("mu", mean_prior)
    sigma = numpyro.sample("sigma", scale_prior)

    likelihood = dist.Normal(mu, sigma)
    numpyro.sample("obs", likelihood, obs=y_obs)

# (Optional) Draw the model graph with NumPyro's renderer, if it was imported.
if _HAS_RENDER:
    try:
        graph = render_model(
            model,
            model_args=(y,),
            render_distributions=True,
        )
        # Write the graph to a PNG, then read it back to show via matplotlib.
        outpath = graph.render(filename="fish_model", format="png", cleanup=True)
        img = plt.imread(outpath)
        fig, ax = plt.subplots()
        ax.imshow(img)
        ax.axis("off")
        ax.set_title("NumPyro Model Graph (Fish Length ~ Normal)")
        fig.tight_layout()
        plt.show()
        print(f"Model graph saved to: {outpath}")
    except Exception as e:
        # Best effort only: report the failure and carry on with the analysis.
        print(f"Model rendering skipped (graphviz not available?): {e}")

# Sample the posterior with NUTS: 2 chains, each with 1000 warm-up
# iterations followed by 2000 retained draws, with the progress bar off.
sampler_settings = {
    "num_warmup": 1000,
    "num_samples": 2000,
    "num_chains": 2,
    "progress_bar": False,
}
kernel = NUTS(model)
mcmc = MCMC(kernel, **sampler_settings)

# Fixed seed so repeated runs start from the same PRNG key.
rng_key = jax.random.PRNGKey(0)
mcmc.run(rng_key, y_obs=y)

# Convert the NumPyro run to ArviZ InferenceData and print a summary table
# for mu and sigma with a 95% HDI.
idata = az.from_numpyro(mcmc)
posterior_vars = ["mu", "sigma"]
hdi_level = 0.95
summary_df = az.summary(idata, var_names=posterior_vars, hdi_prob=hdi_level)
print("Posterior summary (HDI 95%):", summary_df, sep="\n")

# Posterior plots for the same variables (hdi_prob, not credible_interval).
az.plot_posterior(idata, var_names=posterior_vars, hdi_prob=hdi_level)
plt.suptitle("Posterior Distributions (Fish Length Model)", y=1.02)
plt.tight_layout()
plt.show()