Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/qrf-randomness-and-quantiles.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed three correctness bugs in `_QRFModel.predict`: (1) the random-number generator was re-initialised from `self.seed` on every `predict()` call, so repeated calls returned identical draws and multiple-imputation variance collapsed to zero — the RNG is now created once in `__init__` and advanced across calls; (2) the quantile grid `[0.091..0.909]` combined with `.astype(int)` truncation systematically biased stochastic median predictions low and truncated the tails — the implementation now rounds (rather than floors) a beta-distribution draw onto a fine symmetric grid so the empirical mean maps to the intended quantile; (3) when users passed explicit `quantiles=[q1, q2, q3]`, each quantile request drew its own per-row random index, producing crossed quantiles — `QRFResults._predict` now routes explicit quantiles through a deterministic `exact_quantile` path that guarantees row-level monotonicity.
93 changes: 75 additions & 18 deletions microimpute/models/qrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ def __init__(self, seed: int, logger):
self.logger = logger
self.qrf = None
self.output_column = None
# Create the RNG once at construction so that repeated predict()
# calls consume state progressively and return different draws.
self._rng = np.random.default_rng(self.seed)

def fit(self, X: pd.DataFrame, y: pd.Series, **qrf_kwargs: Any) -> None:
"""Fit the QRF model.
Expand All @@ -172,28 +175,69 @@ def predict(
X: pd.DataFrame,
mean_quantile: float = 0.5,
count_samples: int = 10,
exact_quantile: Optional[float] = None,
) -> pd.Series:
"""Predict using the fitted model with beta distribution sampling.

Note: Assumes X is already preprocessed with categorical encoding
handled by the base ImputerResults class.
"""
# Generate quantile grid
eps = 1.0 / (count_samples + 1)
quantile_grid = np.linspace(eps, 1.0 - eps, count_samples)
pred = self.qrf.predict(X, quantiles=list(quantile_grid))

# Sample from beta distribution
random_generator = np.random.default_rng(self.seed)
Args:
X: Predictor matrix (already preprocessed).
mean_quantile: Mean quantile for beta-distribution sampling. Only
used when ``exact_quantile`` is None.
count_samples: Number of samples for the legacy quantile-grid code
path (kept for backward compatibility but no longer used by the
unbiased implementation).
exact_quantile: If provided, query the underlying QRF at exactly
this quantile (no beta sampling). This guarantees monotonicity
across quantiles for a given row and is used when the caller
supplies an explicit ``quantiles`` list.
"""
# Deterministic path: user asked for a specific quantile — query the
# QRF directly so that for any row i,
# prediction(q_low) <= prediction(q_mid) <= prediction(q_high).
if exact_quantile is not None:
pred = self.qrf.predict(X, quantiles=[float(exact_quantile)])
pred = np.asarray(pred).reshape(len(X), -1)[:, 0]
return pd.Series(pred, index=X.index, name=self.output_column)

# Stochastic path: draw one continuous quantile per row from a Beta
# distribution centred at ``mean_quantile`` (Beta(a,1) with
# a = mean_quantile/(1 - mean_quantile); for mean_quantile=0.5 this
# reduces to Uniform(0,1), so Beta(1,1) gives E[q]=0.5 and an
# unbiased empirical median in the limit). The old implementation
# converted a continuous beta draw into an integer index via
# ``np.clip(beta(a,1)*10).astype(int)``, which *floored* (not
# rounded) the index and used a grid of [0.091..0.909] — this
# systematically biased the median low and truncated the tails.
a = mean_quantile / (1 - mean_quantile)
input_quantiles = random_generator.beta(a, 1, size=len(X)) * count_samples
input_quantiles = np.clip(input_quantiles.astype(int), 0, count_samples - 1)
# Use the instance RNG so repeated predict() calls on the same X
# produce independent draws (previously each call reset the RNG
# from ``self.seed``, collapsing variance to zero).
continuous_quantiles = self._rng.beta(a, 1, size=len(X))

# Bucket continuous quantiles onto a fine symmetric grid covering the
# full open interval (0, 1). Using round() (not floor) keeps the
# mapping centred on the intended quantile, so the empirical mean of
# mapped quantiles ≈ ``mean_quantile``. We avoid exact 0 and 1 because
# QRF cannot extrapolate beyond observed extremes.
grid_size = max(int(count_samples), 101)
eps = 1.0 / (grid_size + 1)
quantile_grid = np.linspace(eps, 1.0 - eps, grid_size)
# Round (not floor) onto the grid to eliminate the low-side bias.
grid_indices = np.clip(
np.rint(continuous_quantiles * (grid_size - 1)).astype(int),
0,
grid_size - 1,
)

# Extract predictions
if len(pred.shape) == 2:
predictions = pred[np.arange(len(pred)), input_quantiles]
pred = self.qrf.predict(X, quantiles=list(quantile_grid))
pred = np.asarray(pred)
if pred.ndim == 2:
predictions = pred[np.arange(len(X)), grid_indices]
else:
predictions = pred[np.arange(len(pred)), :, input_quantiles]
predictions = pred[np.arange(len(X)), :, grid_indices]

return pd.Series(predictions, index=X.index, name=self.output_column)

Expand Down Expand Up @@ -413,11 +457,24 @@ def _predict(
return_probs=False,
)
else:
# Regression for numeric targets
imputed_values = model.predict(
X_test_augmented[encoded_predictors],
mean_quantile=q,
)
# Regression for numeric targets.
# If the caller passed an explicit ``quantiles`` list
# (the user wants to inspect specific quantiles, e.g.
# for prediction intervals), we query the QRF at
# exactly ``q`` per row — NO beta sampling. This
# guarantees row-level monotonicity across quantiles.
# Otherwise, sample stochastically around ``q`` (the
# beta-mean default for imputation variance).
if quantiles:
imputed_values = model.predict(
X_test_augmented[encoded_predictors],
exact_quantile=q,
)
else:
imputed_values = model.predict(
X_test_augmented[encoded_predictors],
mean_quantile=q,
)

imputed_df[variable] = imputed_values

Expand Down
99 changes: 99 additions & 0 deletions tests/test_models/test_qrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,105 @@ def test_qrf_beta_distribution_sampling():
assert median[0.5]["y"].iloc[i] <= extreme_high[0.99]["y"].iloc[i]


def test_qrf_repeated_predict_calls_produce_different_draws(
    simple_data: pd.DataFrame,
) -> None:
    """Regression test for the seed-reset bug (#1): two sequential predict()
    calls on the same X must draw different stochastic quantiles."""
    training_rows = simple_data.iloc[:80]
    prediction_rows = simple_data.iloc[80:].loc[:, ["x1", "x2"]]

    fitted_model = QRF().fit(
        training_rows,
        predictors=["x1", "x2"],
        imputed_variables=["y"],
        n_estimators=50,
    )

    first_draw = fitted_model.predict(prediction_rows)
    second_draw = fitted_model.predict(prediction_rows)

    # A re-seeded RNG made the two calls bit-identical before the fix; the
    # instance RNG now advances across calls, so the draws must differ.
    assert not np.allclose(first_draw["y"].values, second_draw["y"].values), (
        "Repeated predict() calls must produce different draws"
    )


def test_qrf_stochastic_median_is_unbiased() -> None:
    """Regression test for the quantile-grid bias bug (#2): the mean of many
    stochastic median predictions must approximate the true median (q=0.5)
    query, not a systematically lower value."""
    generator = np.random.default_rng(0)
    sample_size = 400
    feature = generator.normal(size=sample_size)
    # Symmetric noise -> true median ~ 3*x.
    target = 3.0 * feature + generator.normal(size=sample_size)
    training_frame = pd.DataFrame({"x": feature, "y": target})

    fitted = QRF().fit(
        training_frame,
        predictors=["x"],
        imputed_variables=["y"],
        n_estimators=200,
        min_samples_leaf=5,
    )

    x_grid = pd.DataFrame({"x": np.linspace(-1.5, 1.5, 50)})

    # Deterministic median via exact-quantile path (should not be biased).
    exact_median = fitted.predict(x_grid, quantiles=[0.5])[0.5]["y"].values

    # Average stochastic predictions across many calls; each call uses a
    # fresh draw from Beta(1,1) = Uniform(0,1), so the mean of their
    # selected quantiles is ≈ 0.5 and the average prediction should match
    # the exact median.
    n_draws = 60
    per_call_predictions = [
        fitted.predict(x_grid)["y"].values for _ in range(n_draws)
    ]
    stochastic_means = np.mean(per_call_predictions, axis=0)

    # With n_draws averaging, the gap should be small relative to y's scale.
    scale = np.std(target)
    assert np.abs(stochastic_means - exact_median).mean() < 0.25 * scale, (
        "Stochastic median prediction is biased vs deterministic median"
    )


def test_qrf_multi_quantile_per_row_monotonicity() -> None:
    """Regression test for #3: when the user passes explicit
    ``quantiles=[q_low, q_mid, q_high]``, each row must satisfy
    q_low <= q_mid <= q_high (no per-row random quantile draw)."""
    generator = np.random.default_rng(1)
    sample_size = 200
    feature = generator.normal(size=sample_size)
    target = 2.0 * feature + generator.normal(size=sample_size)
    training_frame = pd.DataFrame({"x": feature, "y": target})

    fitted = QRF().fit(
        training_frame,
        predictors=["x"],
        imputed_variables=["y"],
        n_estimators=100,
        min_samples_leaf=5,
    )

    evaluation_frame = pd.DataFrame({"x": generator.normal(size=50)})
    quantile_preds = fitted.predict(evaluation_frame, quantiles=[0.1, 0.5, 0.9])

    lower = quantile_preds[0.1]["y"].values
    middle = quantile_preds[0.5]["y"].values
    upper = quantile_preds[0.9]["y"].values

    # All rows must have non-crossing quantiles (small tolerance for float
    # noise in the underlying tree predictions).
    assert np.all(lower <= middle + 1e-9), "q=0.1 prediction must be <= q=0.5"
    assert np.all(middle <= upper + 1e-9), "q=0.5 prediction must be <= q=0.9"


# === Categorical Variable Tests ===


Expand Down
Loading