|
# Node statuses grouped under the "active" label: connected, connecting and error.
23 | 23 | ACTIVE_NODE_STATUSES = [NodeStatus.connected, NodeStatus.connecting, NodeStatus.error] |
24 | 24 |
|
25 | 25 |
|
def should_reconnect_after_health_error(error_code: int | None, error_message: str | None) -> bool:
    """
    Decide whether a failed health check should be followed by a reconnect.

    Returns False when:
      * no error code is available (``error_code is None``);
      * the code is 500, 502, 503 or 504 and the message contains, in any
        letter case, "failed to get sys stats" or "core is not started yet";
      * the code is not greater than -1.

    Returns True for every other code.
    """
    if error_code is None:
        return False

    # Normalise once so the substring checks below are case-insensitive.
    normalized = error_message.lower() if error_message else ""
    transient_markers = ("failed to get sys stats", "core is not started yet")

    # Server-side codes combined with one of the known messages are left alone.
    is_transient = error_code in {500, 502, 503, 504} and any(
        marker in normalized for marker in transient_markers
    )

    return not is_transient and error_code > -1
| 37 | + |
| 38 | + |
26 | 39 | async def verify_node_backend_health(node: PasarGuardNode, node_name: str) -> tuple[Health, int | None, str | None]: |
27 | 40 | """ |
28 | 41 | Verify node health by checking backend stats. |
@@ -134,8 +147,8 @@ async def process_node_health_check(db_node: Node, node: PasarGuardNode): |
134 | 147 | # Record actual error in database |
135 | 148 | async with GetDB() as db: |
136 | 149 | await NodeOperation._update_single_node_status(db, db_node.id, NodeStatus.error, message=error_message) |
137 | | - # Only reconnect for non-timeout errors (code > -1) |
138 | | - if error_code is not None and error_code > -1: |
| 150 | + # Let pg-node recover transient Xray API/core failures internally. |
| 151 | + if should_reconnect_after_health_error(error_code, error_message): |
139 | 152 | async with GetDB() as db: |
140 | 153 | await node_operator.connect_single_node(db, db_node.id) |
141 | 154 | # For timeout (code=-1 or None), just wait - don't reconnect |
|
0 commit comments