From ae832f2320f311a58e2a718b37eab33c53d77c27 Mon Sep 17 00:00:00 2001
From: wuyi05
Date: Fri, 7 Jul 2017 20:50:14 +0800
Subject: [PATCH] terminate job if fail too much

---
 docker/k8s_tools.py | 17 +++++++++++++++++
 docker/paddle_k8s   |  6 ++++++
 2 files changed, 23 insertions(+)

diff --git a/docker/k8s_tools.py b/docker/k8s_tools.py
index 16f265e4..7d64f5ff 100644
--- a/docker/k8s_tools.py
+++ b/docker/k8s_tools.py
@@ -53,6 +53,21 @@ def fetch_trainer_id():
             return i
     return None
 
+def fetch_job_fail_count():
+    """Return the failed count reported in the current job's status.
+
+    Lists the jobs in NAMESPACE and reads status.failed from the one whose
+    name equals PADDLE_JOB_NAME. Returns 0 when that job is not listed or
+    when its status carries no (or a zero) failed count.
+    """
+    batch_client = client.BatchV1Api()
+    job_list = batch_client.list_namespaced_job(NAMESPACE)
+    for j in job_list.items:
+        if j.metadata.name == PADDLE_JOB_NAME:
+            # status.failed may be None; normalize any falsy value to 0.
+            return j.status.failed or 0
+    # No job matched: report 0 instead of reading an unassigned local.
+    return 0
 
 if __name__ == "__main__":
     command = sys.argv[1]
@@ -62,3 +77,5 @@ def fetch_trainer_id():
         print fetch_trainer_id()
     elif command == "wait_pods_running":
         wait_pods_running(sys.argv[2], sys.argv[3])
+    elif command == "fetch_job_fail_count":
+        print fetch_job_fail_count()
diff --git a/docker/paddle_k8s b/docker/paddle_k8s
index 379f3437..08b249e6 100755
--- a/docker/paddle_k8s
+++ b/docker/paddle_k8s
@@ -25,6 +25,12 @@ check_trainer_ret() {
     echo "Program Abort" > /dev/termination-log
   fi
   echo "termination log wroted..."
+  # Exit with status 0 instead of $ret once the job's failed count has
+  # reached PADDLE_INIT_NUM_GRADIENT_SERVERS; an empty count is treated as 0.
+  FAILED_COUNT=$(python /root/k8s_tools.py fetch_job_fail_count)
+  if [ "${FAILED_COUNT:-0}" -ge "$PADDLE_INIT_NUM_GRADIENT_SERVERS" ]; then
+    exit 0
+  fi
   exit $ret
 }
 