From 827047b3c4f9acdf7eab23a2fca8c612a5749b73 Mon Sep 17 00:00:00 2001
From: "brandon s. tober" <tobers.brandon@gmail.com>
Date: Tue, 9 Sep 2025 10:10:51 -0400
Subject: [PATCH 1/2] Bug fix in exporting extra variables, added MCMC stats

---
 pygem/bin/run/run_calibration.py |   4 +
 pygem/bin/run/run_simulation.py  |   4 +-
 pygem/output.py                  |   7 +-
 pygem/utils/stats.py             | 136 +++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+), 4 deletions(-)
 create mode 100644 pygem/utils/stats.py

diff --git a/pygem/bin/run/run_calibration.py b/pygem/bin/run/run_calibration.py
index a0446a7c..7f65a323 100755
--- a/pygem/bin/run/run_calibration.py
+++ b/pygem/bin/run/run_calibration.py
@@ -46,6 +46,7 @@
     single_flowline_glacier_directory,
     single_flowline_glacier_directory_with_calving,
 )
+from pygem.utils.stats import mcmc_stats
 
 # from oggm.core import climate
 # from oggm.core.flowline import FluxBasedModel
@@ -2135,6 +2136,9 @@ def must_melt(kp, tbias, ddfsnow, **kwargs):
                     modelprms_export['mb_obs_mwea_err'] = [float(mb_obs_mwea_err)]
                     modelprms_export['priors'] = priors
 
+                    # compute stats on mcmc parameters
+                    modelprms_export = mcmc_stats(modelprms_export)
+
                     modelprms_fn = glacier_str + '-modelprms_dict.json'
                     modelprms_fp = [
                         (
diff --git a/pygem/bin/run/run_simulation.py b/pygem/bin/run/run_simulation.py
index 1225bd50..40e07c43 100755
--- a/pygem/bin/run/run_simulation.py
+++ b/pygem/bin/run/run_simulation.py
@@ -606,7 +606,7 @@ def run(list_packed_vars):
         rgiid = main_glac_rgi.loc[main_glac_rgi.index.values[glac], 'RGIId']
 
         try:
-            # for batman in [0]:
+        # for batman in [0]:
 
             # ===== Load glacier data: area (km2), ice thickness (m), width (km) =====
             if (
@@ -1779,6 +1779,7 @@ def run(list_packed_vars):
                             sim_endyear=args.sim_endyear,
                             option_calibration=args.option_calibration,
                             option_bias_adjustment=args.option_bias_adjustment,
+                            extra_vars=args.export_extra_vars,
                         )
                         for n_iter in range(nsims):
                             # pass model params for iteration and update output dataset model params
@@ -1878,6 +1879,7 @@ def run(list_packed_vars):
                         sim_endyear=args.sim_endyear,
                         option_calibration=args.option_calibration,
                         option_bias_adjustment=args.option_bias_adjustment,
+                        extra_vars=args.export_extra_vars,
                     )
                     # create and return xarray dataset
                     output_stats.create_xr_ds()
diff --git a/pygem/output.py b/pygem/output.py
index eb151808..5a906606 100644
--- a/pygem/output.py
+++ b/pygem/output.py
@@ -90,6 +90,7 @@ class single_glacier:
     sim_endyear: int
     option_calibration: str
     option_bias_adjustment: str
+    extra_vars: bool = False
 
     def __post_init__(self):
         """
@@ -492,7 +493,7 @@ def _update_dicts(self):
             }
 
         # optionally store extra variables
-        if pygem_prms['sim']['out']['export_extra_vars']:
+        if self.extra_vars:
             self.output_coords_dict['glac_prec_monthly'] = collections.OrderedDict(
                 [('glac', self.glac_values), ('time', self.time_values)]
             )
@@ -791,8 +792,8 @@ class binned_stats(single_glacier):
         Flag indicating whether additional binned components are included in the dataset.
     """
 
-    nbins: int
-    binned_components: bool
+    nbins: int = 0
+    binned_components: bool = False
 
     def __post_init__(self):
         """
diff --git a/pygem/utils/stats.py b/pygem/utils/stats.py
new file mode 100644
index 00000000..bb4cecfe
--- /dev/null
+++ b/pygem/utils/stats.py
@@ -0,0 +1,136 @@
+"""
+Python Glacier Evolution Model (PyGEM)
+
+copyright © 2018 David Rounce <drounce@cmu.edu>
+
+Distrubted under the MIT lisence
+
+Model statistics module
+"""
+
+import numpy as np
+import arviz as az
+
+def effective_n(x):
+    """
+    Compute the effective sample size of a trace.
+
+    Takes the trace and computes the effective sample size
+    according to its detrended autocorrelation.
+
+    Parameters
+    ----------
+    x : list or array of chain samples
+
+    Returns
+    -------
+    effective_n : int
+        effective sample size
+    """
+    if len(set(x)) == 1:
+        return 1
+    try:
+        # detrend trace using mean to be consistent with statistics
+        # definition of autocorrelation
+        x = np.asarray(x)
+        x = (x - x.mean())
+        # compute autocorrelation (note: only need second half since
+        # they are symmetric)
+        rho = np.correlate(x, x, mode='full')
+        rho = rho[len(rho)//2:]
+        # normalize the autocorrelation values
+        #  note: rho[0] is the variance * n_samples, so this is consistent
+        #  with the statistics definition of autocorrelation on wikipedia
+        # (dividing by n_samples gives you the expected value).
+        rho_norm = rho / rho[0]
+        # Iterate until sum of consecutive estimates of autocorrelation is
+        # negative to avoid issues with the sum being -0.5, which returns an
+        # effective_n of infinity
+        negative_autocorr = False
+        t = 1
+        n = len(x)
+        while not negative_autocorr and (t < n):
+            if not t % 2:
+                negative_autocorr = sum(rho_norm[t-1:t+1]) < 0
+            t += 1
+        return int(n / (1 + 2*rho_norm[1:t].sum()))
+    except:
+        return None
+    
+
+def mcmc_stats(chains_dict, 
+               params=['tbias','kp','ddfsnow','ddfice','rhoabl', 'rhoacc','mb_mwea']):
+    """
+    Compute per-chain and overall summary stats for MCMC samples.
+
+    Parameters
+    ----------
+    chains_dict : dict
+        Dictionary with structure:
+        {
+            "param1": {
+                "chain1": [...],
+                "chain2": [...],
+                ...
+            },
+            ...
+        }
+
+    Returns
+    -------
+    summary_stats : dict
+        Dictionary with structure:
+        {
+            "param1": {
+                "mean": [...],         # per chain
+                "std": [...],
+                "median": [...],
+                "q025": [...],
+                "q975": [...],
+                "ess": ...,            # overall
+                "r_hat": ...           # overall
+            },
+            ...
+        }
+    """
+    summary_stats = {}
+
+    for param, chains in chains_dict.items():
+        if param not in params:
+            continue
+
+        # Stack chains into array: shape (n_chains, n_samples)
+        chain_names = sorted(chains)  # ensure consistent order
+        samples = np.array([chains[c] for c in chain_names])
+
+        # Per-chain stats
+        means = np.mean(samples, axis=1).tolist()
+        stds = np.std(samples, axis=1, ddof=1).tolist()
+        medians = np.median(samples, axis=1).tolist()
+        q25 = np.quantile(samples, 0.25, axis=1).tolist()
+        q75 = np.quantile(samples, 0.75, axis=1).tolist()
+        ess = [effective_n(x) for x in samples]
+        # Overall stats (R-hat)
+        if samples.shape[0] > 1:
+            # calculate the gelman-rubin stat for each variable across all chains
+            # pass chains as 2d array to arviz using the from_dict() method
+            # convert the chains into an InferenceData object
+            idata = az.from_dict(posterior={param: samples})
+            # calculate the Gelman-Rubin statistic (rhat)
+            r_hat = float(az.rhat(idata).to_array().values[0])
+        else :
+            r_hat = None
+
+        summary_stats[param] = {
+            "mean": means,
+            "std": stds,
+            "median": medians,
+            "q25": q25,
+            "q75": q75,
+            "ess": ess,
+            "r_hat": r_hat
+        }
+
+    chains_dict['_summary_stats_'] = summary_stats
+
+    return chains_dict
\ No newline at end of file

From a64a921fd1234f69bcccbb0f1449331981530681 Mon Sep 17 00:00:00 2001
From: "brandon s. tober" <tobers.brandon@gmail.com>
Date: Tue, 9 Sep 2025 10:19:55 -0400
Subject: [PATCH 2/2] Applied ruff formating/linting

---
 pygem/bin/run/run_simulation.py |  2 +-
 pygem/utils/stats.py            | 37 ++++++++++++++++++---------------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/pygem/bin/run/run_simulation.py b/pygem/bin/run/run_simulation.py
index 40e07c43..3066befe 100755
--- a/pygem/bin/run/run_simulation.py
+++ b/pygem/bin/run/run_simulation.py
@@ -606,7 +606,7 @@ def run(list_packed_vars):
         rgiid = main_glac_rgi.loc[main_glac_rgi.index.values[glac], 'RGIId']
 
         try:
-        # for batman in [0]:
+            # for batman in [0]:
 
             # ===== Load glacier data: area (km2), ice thickness (m), width (km) =====
             if (
diff --git a/pygem/utils/stats.py b/pygem/utils/stats.py
index bb4cecfe..a09b15a2 100644
--- a/pygem/utils/stats.py
+++ b/pygem/utils/stats.py
@@ -8,8 +8,9 @@
 Model statistics module
 """
 
-import numpy as np
 import arviz as az
+import numpy as np
+
 
 def effective_n(x):
     """
@@ -33,11 +34,11 @@ def effective_n(x):
         # detrend trace using mean to be consistent with statistics
         # definition of autocorrelation
         x = np.asarray(x)
-        x = (x - x.mean())
+        x = x - x.mean()
         # compute autocorrelation (note: only need second half since
         # they are symmetric)
         rho = np.correlate(x, x, mode='full')
-        rho = rho[len(rho)//2:]
+        rho = rho[len(rho) // 2 :]
         # normalize the autocorrelation values
         #  note: rho[0] is the variance * n_samples, so this is consistent
         #  with the statistics definition of autocorrelation on wikipedia
@@ -51,15 +52,17 @@ def effective_n(x):
         n = len(x)
         while not negative_autocorr and (t < n):
             if not t % 2:
-                negative_autocorr = sum(rho_norm[t-1:t+1]) < 0
+                negative_autocorr = sum(rho_norm[t - 1 : t + 1]) < 0
             t += 1
-        return int(n / (1 + 2*rho_norm[1:t].sum()))
+        return int(n / (1 + 2 * rho_norm[1:t].sum()))
     except:
         return None
-    
 
-def mcmc_stats(chains_dict, 
-               params=['tbias','kp','ddfsnow','ddfice','rhoabl', 'rhoacc','mb_mwea']):
+
+def mcmc_stats(
+    chains_dict,
+    params=['tbias', 'kp', 'ddfsnow', 'ddfice', 'rhoabl', 'rhoacc', 'mb_mwea'],
+):
     """
     Compute per-chain and overall summary stats for MCMC samples.
 
@@ -118,19 +121,19 @@ def mcmc_stats(chains_dict,
             idata = az.from_dict(posterior={param: samples})
             # calculate the Gelman-Rubin statistic (rhat)
             r_hat = float(az.rhat(idata).to_array().values[0])
-        else :
+        else:
             r_hat = None
 
         summary_stats[param] = {
-            "mean": means,
-            "std": stds,
-            "median": medians,
-            "q25": q25,
-            "q75": q75,
-            "ess": ess,
-            "r_hat": r_hat
+            'mean': means,
+            'std': stds,
+            'median': medians,
+            'q25': q25,
+            'q75': q75,
+            'ess': ess,
+            'r_hat': r_hat,
         }
 
     chains_dict['_summary_stats_'] = summary_stats
 
-    return chains_dict
\ No newline at end of file
+    return chains_dict