From b16126271f17173882040a1afee944fd3fa0530e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:27:51 -0400 Subject: [PATCH] Fix QRF stochastic prediction: persistent RNG, unbiased median, monotone quantiles Three correctness bugs in _QRFModel.predict are fixed together because they interact in the stochastic-imputation code path: 1. Seed reset on every predict() (#1). np.random.default_rng(self.seed) was called inside predict(), so repeated predict() calls on the same X returned identical draws and collapsed multiple-imputation variance to zero. The RNG is now created once in __init__ and consumed progressively across calls. 2. Quantile-grid bias (#2). The grid [0.091..0.909] combined with .astype(int) (which floors) biased stochastic "median" predictions low and truncated the tails. Stochastic draws are now rounded (not floored) onto a fine symmetric grid so the empirical mean of selected quantiles matches the intended mean_quantile. 3. Per-row random quantile for explicit multi-quantile predict (#3). When users passed quantiles=[0.1, 0.5, 0.9] for prediction intervals, each quantile request sampled a separate random per-row index, producing crossed quantiles. QRFResults._predict now routes explicit quantiles through a deterministic exact_quantile path that guarantees row-level monotonicity. 
Adds regression tests: - test_qrf_repeated_predict_calls_produce_different_draws - test_qrf_stochastic_median_is_unbiased - test_qrf_multi_quantile_per_row_monotonicity --- .../qrf-randomness-and-quantiles.fixed.md | 1 + microimpute/models/qrf.py | 93 +++++++++++++---- tests/test_models/test_qrf.py | 99 +++++++++++++++++++ 3 files changed, 175 insertions(+), 18 deletions(-) create mode 100644 changelog.d/qrf-randomness-and-quantiles.fixed.md diff --git a/changelog.d/qrf-randomness-and-quantiles.fixed.md b/changelog.d/qrf-randomness-and-quantiles.fixed.md new file mode 100644 index 0000000..1b24a5c --- /dev/null +++ b/changelog.d/qrf-randomness-and-quantiles.fixed.md @@ -0,0 +1 @@ +Fixed three correctness bugs in `_QRFModel.predict`: (1) the random-number generator was re-initialised from `self.seed` on every `predict()` call, so repeated calls returned identical draws and multiple-imputation variance collapsed to zero — the RNG is now created once in `__init__` and advanced across calls; (2) the quantile grid `[0.091..0.909]` combined with `.astype(int)` truncation systematically biased stochastic median predictions low and truncated the tails — the implementation now rounds (rather than floors) a beta-distribution draw onto a fine symmetric grid so the empirical mean maps to the intended quantile; (3) when users passed explicit `quantiles=[q1, q2, q3]`, each quantile request drew its own per-row random index, producing crossed quantiles — `QRFResults._predict` now routes explicit quantiles through a deterministic `exact_quantile` path that guarantees row-level monotonicity. 
diff --git a/microimpute/models/qrf.py b/microimpute/models/qrf.py index 3f46c5f..b97eacc 100644 --- a/microimpute/models/qrf.py +++ b/microimpute/models/qrf.py @@ -147,6 +147,9 @@ def __init__(self, seed: int, logger): self.logger = logger self.qrf = None self.output_column = None + # Create the RNG once at construction so that repeated predict() + # calls consume state progressively and return different draws. + self._rng = np.random.default_rng(self.seed) def fit(self, X: pd.DataFrame, y: pd.Series, **qrf_kwargs: Any) -> None: """Fit the QRF model. @@ -172,28 +175,69 @@ def predict( X: pd.DataFrame, mean_quantile: float = 0.5, count_samples: int = 10, + exact_quantile: Optional[float] = None, ) -> pd.Series: """Predict using the fitted model with beta distribution sampling. Note: Assumes X is already preprocessed with categorical encoding handled by the base ImputerResults class. - """ - # Generate quantile grid - eps = 1.0 / (count_samples + 1) - quantile_grid = np.linspace(eps, 1.0 - eps, count_samples) - pred = self.qrf.predict(X, quantiles=list(quantile_grid)) - # Sample from beta distribution - random_generator = np.random.default_rng(self.seed) + Args: + X: Predictor matrix (already preprocessed). + mean_quantile: Mean quantile for beta-distribution sampling. Only + used when ``exact_quantile`` is None. + count_samples: Kept for backward compatibility; now only sets a + lower bound on the internal quantile-grid size, which is at + least 101 points regardless of this value. + exact_quantile: If provided, query the underlying QRF at exactly + this quantile (no beta sampling). This guarantees monotonicity + across quantiles for a given row and is used when the caller + supplies an explicit ``quantiles`` list. + """ + # Deterministic path: user asked for a specific quantile — query the + # QRF directly so that for any row i, + # prediction(q_low) <= prediction(q_mid) <= prediction(q_high). 
+ if exact_quantile is not None: + pred = self.qrf.predict(X, quantiles=[float(exact_quantile)]) + pred = np.asarray(pred).reshape(len(X), -1)[:, 0] + return pd.Series(pred, index=X.index, name=self.output_column) + + # Stochastic path: draw one continuous quantile per row from a Beta + # distribution centred at ``mean_quantile`` (Beta(a,1) with + # a = mean_quantile/(1 - mean_quantile); for mean_quantile=0.5 this + # reduces to Uniform(0,1), so Beta(1,1) gives E[q]=0.5 and an + # unbiased empirical median in the limit). The old implementation + # converted a continuous beta draw into an integer index via + # ``np.clip((beta(a,1)*count_samples).astype(int), 0, count_samples-1)``, + # which *floored* (not rounded) the index and used a grid of + # [0.091..0.909] — this systematically biased the median low and + # truncated the tails. a = mean_quantile / (1 - mean_quantile) + # Use the instance RNG so repeated predict() calls on the same X + # produce independent draws (previously each call reset the RNG + # from ``self.seed``, collapsing variance to zero). + continuous_quantiles = self._rng.beta(a, 1, size=len(X)) + + # Bucket continuous quantiles onto a fine symmetric grid covering the + # full open interval (0, 1). Using round() (not floor) keeps the + # mapping centred on the intended quantile, so the empirical mean of + # mapped quantiles ≈ ``mean_quantile``. We avoid exact 0 and 1 because + # QRF cannot extrapolate beyond observed extremes. + grid_size = max(int(count_samples), 101) + eps = 1.0 / (grid_size + 1) + quantile_grid = np.linspace(eps, 1.0 - eps, grid_size) + # Round (not floor) onto the grid to eliminate the low-side bias. 
+ grid_indices = np.clip( + np.rint(continuous_quantiles * (grid_size - 1)).astype(int), + 0, + grid_size - 1, + ) - # Extract predictions - if len(pred.shape) == 2: - predictions = pred[np.arange(len(pred)), input_quantiles] + pred = self.qrf.predict(X, quantiles=list(quantile_grid)) + pred = np.asarray(pred) + if pred.ndim == 2: + predictions = pred[np.arange(len(X)), grid_indices] else: - predictions = pred[np.arange(len(pred)), :, input_quantiles] + predictions = pred[np.arange(len(X)), :, grid_indices] return pd.Series(predictions, index=X.index, name=self.output_column) @@ -413,11 +457,24 @@ def _predict( return_probs=False, ) else: - # Regression for numeric targets - imputed_values = model.predict( - X_test_augmented[encoded_predictors], - mean_quantile=q, - ) + # Regression for numeric targets. + # If the caller passed an explicit ``quantiles`` list + # (the user wants to inspect specific quantiles, e.g. + # for prediction intervals), we query the QRF at + # exactly ``q`` per row — NO beta sampling. This + # guarantees row-level monotonicity across quantiles. + # Otherwise, sample stochastically around ``q`` (the + # beta-mean default for imputation variance). 
+ if quantiles: + imputed_values = model.predict( + X_test_augmented[encoded_predictors], + exact_quantile=q, + ) + else: + imputed_values = model.predict( + X_test_augmented[encoded_predictors], + mean_quantile=q, + ) imputed_df[variable] = imputed_values diff --git a/tests/test_models/test_qrf.py b/tests/test_models/test_qrf.py index c97868a..799e1c1 100644 --- a/tests/test_models/test_qrf.py +++ b/tests/test_models/test_qrf.py @@ -179,6 +179,105 @@ def test_qrf_beta_distribution_sampling(): assert median[0.5]["y"].iloc[i] <= extreme_high[0.99]["y"].iloc[i] +def test_qrf_repeated_predict_calls_produce_different_draws( + simple_data: pd.DataFrame, +) -> None: + """Regression test for the seed-reset bug (#1): two sequential predict() + calls on the same X must draw different stochastic quantiles.""" + train_data = simple_data[:80] + test_data = simple_data[80:][["x1", "x2"]] + + model = QRF() + fitted = model.fit( + train_data, + predictors=["x1", "x2"], + imputed_variables=["y"], + n_estimators=50, + ) + + preds1 = fitted.predict(test_data) + preds2 = fitted.predict(test_data) + + # Predictions must differ across calls (stochastic imputation), not be + # bit-identical as they were when the RNG was re-seeded every call. + assert not np.allclose(preds1["y"].values, preds2["y"].values), ( + "Repeated predict() calls must produce different draws" + ) + + +def test_qrf_stochastic_median_is_unbiased() -> None: + """Regression test for the quantile-grid bias bug (#2): the mean of many + stochastic median predictions must approximate the true median (q=0.5) + query, not a systematically lower value.""" + rng = np.random.default_rng(0) + n = 400 + x = rng.normal(size=n) + # Symmetric noise -> true median ~ 3*x. 
+ y = 3.0 * x + rng.normal(size=n) + data = pd.DataFrame({"x": x, "y": y}) + + model = QRF() + fitted = model.fit( + data, + predictors=["x"], + imputed_variables=["y"], + n_estimators=200, + min_samples_leaf=5, + ) + + x_grid = pd.DataFrame({"x": np.linspace(-1.5, 1.5, 50)}) + + # Deterministic median via exact-quantile path (should not be biased). + exact_median = fitted.predict(x_grid, quantiles=[0.5])[0.5]["y"].values + + # Average stochastic predictions across many calls; each call uses a + # fresh draw from Beta(1,1) = Uniform(0,1), so the mean of their + # selected quantiles is ≈ 0.5 and the average prediction should match + # the exact median. + n_draws = 60 + stochastic_means = np.zeros(len(x_grid)) + for _ in range(n_draws): + stochastic_means += fitted.predict(x_grid)["y"].values + stochastic_means /= n_draws + + # With n_draws averaging, the gap should be small relative to y's scale. + scale = np.std(y) + assert np.abs(stochastic_means - exact_median).mean() < 0.25 * scale, ( + "Stochastic median prediction is biased vs deterministic median" + ) + + +def test_qrf_multi_quantile_per_row_monotonicity() -> None: + """Regression test for #3: when the user passes explicit + ``quantiles=[q_low, q_mid, q_high]``, each row must satisfy + q_low <= q_mid <= q_high (no per-row random quantile draw).""" + rng = np.random.default_rng(1) + n = 200 + x = rng.normal(size=n) + y = 2.0 * x + rng.normal(size=n) + data = pd.DataFrame({"x": x, "y": y}) + + model = QRF() + fitted = model.fit( + data, + predictors=["x"], + imputed_variables=["y"], + n_estimators=100, + min_samples_leaf=5, + ) + + x_test = pd.DataFrame({"x": rng.normal(size=50)}) + preds = fitted.predict(x_test, quantiles=[0.1, 0.5, 0.9]) + + q_low = preds[0.1]["y"].values + q_mid = preds[0.5]["y"].values + q_high = preds[0.9]["y"].values + + # All rows must have non-crossing quantiles. 
+ assert np.all(q_low <= q_mid + 1e-9), "q=0.1 prediction must be <= q=0.5" + assert np.all(q_mid <= q_high + 1e-9), "q=0.5 prediction must be <= q=0.9" + + # === Categorical Variable Tests ===