Skip to content

Commit fa1709c

Browse files
committed
fix(node): avoid reconnecting nodes on transient xray stats failures
1 parent 5fa6008 commit fa1709c

1 file changed

Lines changed: 15 additions & 2 deletions

File tree

app/jobs/node_checker.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@
2323
# Node statuses grouped as "active": connected, connecting and error.
# NOTE(review): the consumers of this list are outside this diff hunk — confirm usage.
ACTIVE_NODE_STATUSES = [NodeStatus.connected, NodeStatus.connecting, NodeStatus.error]
2424

2525

26+
def should_reconnect_after_health_error(error_code: int | None, error_message: str | None) -> bool:
    """
    Decide whether a failed health check should trigger a node reconnect.

    Args:
        error_code: Code reported with the health-check failure, or None
            when no code is available.
        error_message: Text reported with the failure, or None.

    Returns:
        False when ``error_code`` is None or negative, and also when it is
        one of 500/502/503/504 while the message contains (case-insensitively)
        "failed to get sys stats" or "core is not started yet". True for every
        other non-negative code.
    """
    if error_code is None:
        return False

    # A missing message is treated as empty so the substring checks are safe.
    detail = (error_message or "").lower()
    is_server_side_code = error_code in {500, 502, 503, 504}
    is_transient_detail = "failed to get sys stats" in detail or "core is not started yet" in detail
    if is_server_side_code and is_transient_detail:
        # Transient stats/core failure: do not request a reconnect.
        return False

    # Negative codes (e.g. -1) never request a reconnect.
    return error_code > -1
37+
38+
2639
async def verify_node_backend_health(node: PasarGuardNode, node_name: str) -> tuple[Health, int | None, str | None]:
2740
"""
2841
Verify node health by checking backend stats.
@@ -134,8 +147,8 @@ async def process_node_health_check(db_node: Node, node: PasarGuardNode):
134147
# Record actual error in database
135148
async with GetDB() as db:
136149
await NodeOperation._update_single_node_status(db, db_node.id, NodeStatus.error, message=error_message)
137-
# Only reconnect for non-timeout errors (code > -1)
138-
if error_code is not None and error_code > -1:
150+
# Let pg-node recover transient Xray API/core failures internally.
151+
if should_reconnect_after_health_error(error_code, error_message):
139152
async with GetDB() as db:
140153
await node_operator.connect_single_node(db, db_node.id)
141154
# For timeout (code=-1 or None), just wait - don't reconnect

0 commit comments

Comments
 (0)