diff --git a/server/loomengine_server/api/models/tasks.py b/server/loomengine_server/api/models/tasks.py index 1cc293d6..646ac296 100644 --- a/server/loomengine_server/api/models/tasks.py +++ b/server/loomengine_server/api/models/tasks.py @@ -137,6 +137,7 @@ def _process_error(self, detail, max_retries, exponential_delay=False): if self.has_terminal_status(): return + self._kill_children(detail=detail) # Do this before attempting retry failure_count = int(getattr(self, failure_count_attribute)) + 1 if failure_count <= max_retries: self.setattrs_and_save_with_retries( @@ -166,9 +167,8 @@ def _process_error(self, detail, max_retries, 'Retries exceeded for %s' % failure_text.lower(), detail=detail, is_error=True) - self._kill_children(detail=detail) self.run.fail(detail='Task %s failed' % self.uuid) - + def system_error(self, detail=''): self._process_error( detail, @@ -325,8 +325,6 @@ def create_unsaved_task_from_input_set(cls, input_set, run, run_outputs): def create_and_activate_task_attempt(self): try: - self._kill_children( - detail="TaskAttempt errored or timed out and was restarted.") task_attempt = TaskAttempt.create_from_task(self) self.activate_task_attempt(task_attempt) return task_attempt diff --git a/server/loomengine_server/core/settings.py b/server/loomengine_server/core/settings.py index 9fbfb3df..c2a145aa 100644 --- a/server/loomengine_server/core/settings.py +++ b/server/loomengine_server/core/settings.py @@ -122,9 +122,9 @@ def _add_url_prefix(path): DISABLE_DELETE = to_boolean(os.getenv('LOOM_DISABLE_DELETE', 'False')) FORCE_RERUN = to_boolean(os.getenv('LOOM_FORCE_RERUN', 'False')) -TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS = int(os.getenv('LOOM_TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS', '60')) -TASKRUNNER_HEARTBEAT_TIMEOUT_SECONDS = int(os.getenv('LOOM_TASKRUNNER_HEARTBEAT_TIMEOUT_SECONDS', TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS*2.5)) -SYSTEM_CHECK_INTERVAL_MINUTES = int(os.getenv('LOOM_SYSTEM_CHECK_INTERVAL_MINUTES', '15')) +TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS = float(os.getenv('LOOM_TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS', '60')) +TASKRUNNER_HEARTBEAT_TIMEOUT_SECONDS = float(os.getenv('LOOM_TASKRUNNER_HEARTBEAT_TIMEOUT_SECONDS', TASKRUNNER_HEARTBEAT_INTERVAL_SECONDS*2.5)) +SYSTEM_CHECK_INTERVAL_MINUTES = float(os.getenv('LOOM_SYSTEM_CHECK_INTERVAL_MINUTES', '15')) PRESERVE_ON_FAILURE = to_boolean(os.getenv('LOOM_PRESERVE_ON_FAILURE', 'False')) PRESERVE_ALL = to_boolean(os.getenv('LOOM_PRESERVE_ALL', 'False')) TASK_TIMEOUT_HOURS = float(os.getenv(