Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,24 @@ STUDIES_DEFAULT_TIMEOUT_S=60
# values to a whitelist of cron-expressible cadences.
# RELYLOOP_PR_POLL_MINUTES=15

# --- feat_judgments_periodic_resume_sweep ------------------------------
# Cron cadence (in minutes) for the resume_stuck_judgment_lists worker,
# which re-enqueues `judgment_lists.status='generating'` rows whose initial
# Arq enqueue failed (e.g., transient Redis outage). Default 15 matches
# RELYLOOP_PR_POLL_MINUTES so operators have one mental model for cron
# settings. Same whitelist of cron-expressible cadences applies (see
# backend.workers.pr_reconcile.SUPPORTED_POLL_MINUTES).
# RELYLOOP_JUDGMENTS_RESUME_SWEEP_MINUTES=15
#
# Maximum re-enqueue attempts per (judgment_list_id, UTC day). When this
# cap trips, the cron skips the row and emits a `judgment_resume_capped`
# WARN log — the signal that a stuck row is structurally broken (bad
# rubric, missing query template, etc.) and needs operator inspection.
# Raise to ~96 if operators have legitimately long-running judgment
# generation jobs that would exhaust the cap before they complete; see
# spec §10 Threat 5 for the recovery model.
# RELYLOOP_JUDGMENTS_RESUME_MAX_PER_DAY=24

# --- Build-time only --------------------------------------------------
# RELYLOOP_GIT_SHA is injected at `docker buildx build` via --build-arg.
# `make up` propagates the current short-SHA when invoking compose.
51 changes: 51 additions & 0 deletions backend/app/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,57 @@ def _validate_pr_poll_minutes(cls, value: int) -> int:
)
return value

# Cron cadence, in minutes, for the resume_stuck_judgment_lists worker.
# The field itself accepts any integer in 1..1440 (default 15); the
# field validator registered for this attribute
# (_validate_judgments_resume_sweep_minutes) further narrows accepted values
# to the SUPPORTED_POLL_MINUTES whitelist.
# NOTE(review): the description below hard-codes a line range in
# backend/workers/all.py, which will drift as that file changes — consider
# referencing a symbol name instead.
relyloop_judgments_resume_sweep_minutes: int = Field(
default=15,
ge=1,
le=1440,
description=(
"Cron cadence for the resume_stuck_judgment_lists worker "
"(feat_judgments_periodic_resume_sweep FR-3). MVP1 default 15. "
"Restricted to the same whitelist as RELYLOOP_PR_POLL_MINUTES: see "
"backend.workers.pr_reconcile.SUPPORTED_POLL_MINUTES. Lives "
"alongside the boot-time resume sweep at "
"backend/workers/all.py:148-161; the cron picks up rows the API "
"couldn't enqueue mid-run."
),
)

@field_validator("relyloop_judgments_resume_sweep_minutes")
@classmethod
def _validate_judgments_resume_sweep_minutes(cls, value: int) -> int:
    """Accept only sweep cadences present in SUPPORTED_POLL_MINUTES.

    The whitelist is shared with the relyloop_pr_poll_minutes setting so
    that every cron cadence option follows the same rules.

    Args:
        value: Candidate cadence in minutes, already bounds-checked by the
            field's ``ge``/``le`` constraints.

    Returns:
        ``value`` unchanged when it is a supported cadence.

    Raises:
        ValueError: If ``value`` is not a member of SUPPORTED_POLL_MINUTES.
    """
    # Imported at call time rather than module import time: the settings
    # module is loaded very early, before backend.workers.pr_reconcile.
    from backend.workers.pr_reconcile import SUPPORTED_POLL_MINUTES

    if value in SUPPORTED_POLL_MINUTES:
        return value

    allowed = sorted(SUPPORTED_POLL_MINUTES)
    raise ValueError(
        f"RELYLOOP_JUDGMENTS_RESUME_SWEEP_MINUTES={value} is not in the "
        f"supported set {allowed}. Pick a divisor "
        "of 60 (≤60) or a multiple of 60 that divides 1440 (>60)."
    )

# Daily cap on re-enqueue attempts per (judgment_list_id, UTC day).
# Accepts any integer in 1..10000 (default 24); only the ge/le bounds are
# declared on the field in this view.
# NOTE(review): the description glosses 24 as "≈ one per hour at 15-min
# cadence", but a 15-minute cron fires 96 times a day, so if every tick
# counts as an attempt the cap would be reached after ~6 hours — confirm the
# intended wording against the worker's counting logic.
relyloop_judgments_resume_max_per_day: int = Field(
default=24,
ge=1,
le=10000,
description=(
"Maximum re-enqueue attempts per (judgment_list_id, UTC day) "
"before the cron skips a row and emits judgment_resume_capped "
"at WARN (feat_judgments_periodic_resume_sweep FR-4). MVP1 "
"default 24 (≈ one per hour at 15-min cadence). Raise to ~96 "
"if operators have legitimately long-running judgment generation "
"jobs that exhaust the cap mid-run; spec §10 Threat 5 documents "
"the recovery model (boot-sweep on next worker restart)."
),
)

es_heap_size: str = Field(
default="512m",
description="ES_JAVA_OPTS heap sizing for the elasticsearch+opensearch containers",
Expand Down
Loading
Loading