From 5948274616a7cc24ac8c825de6e41bb64ae8c963 Mon Sep 17 00:00:00 2001
From: Sam Crauwels
Date: Sun, 1 Mar 2026 14:33:28 +0100
Subject: [PATCH 1/3] Add service startup failure diagnostics to all roles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a service fails to start (bad config, missing certs, etc.), the
roles now detect the failure within seconds and surface the actual
error from journalctl instead of waiting for a blind port timeout.

Each role's restart handler delegates to a restart_and_verify task file
that wraps the restart in a block/rescue — if the service module fails
or the service dies shortly after, the rescue collects the last 50
journal lines and fails with a clear diagnostic message.

The ES and Kibana port waits are replaced with a smarter loop that
checks both service health and port availability on each iteration,
failing fast with diagnostics if the service dies mid-wait.

Includes an elasticsearch_diagnostics molecule scenario that validates
both the transparent happy path and the fast-failure path by injecting
a bogus setting and asserting that the failure message contains log
output.
---
 .github/workflows/test_role_elasticsearch.yml |  4 ++
 .../elasticsearch_diagnostics/converge.yml    | 65 +++++++++++++++++++
 molecule/elasticsearch_diagnostics/create.yml |  1 +
 .../elasticsearch_diagnostics/destroy.yml     |  1 +
 .../elasticsearch_diagnostics/molecule.yml    | 45 +++++++++++++
 .../elasticsearch_diagnostics/prepare.yml     |  8 +++
 .../requirements.yml                          |  3 +
 molecule/elasticsearch_diagnostics/verify.yml | 40 ++++++++++++
 roles/beats/handlers/main.yml                 | 18 ++---
 roles/beats/tasks/restart_and_verify_beat.yml | 31 +++++++++
 roles/elasticsearch/handlers/main.yml         |  5 +-
 .../restart_and_verify_elasticsearch.yml      | 32 +++++++++
 .../elasticsearch/tasks/wait_for_instance.yml | 49 ++++++++++++--
 roles/kibana/handlers/main.yml                |  4 +-
 roles/kibana/tasks/main.yml                   | 52 ++++++++++++---
 .../tasks/restart_and_verify_kibana.yml       | 31 +++++++++
 roles/logstash/handlers/main.yml              | 10 +--
 .../tasks/restart_and_verify_logstash.yml     | 32 +++++++++
 18 files changed, 394 insertions(+), 37 deletions(-)
 create mode 100644 molecule/elasticsearch_diagnostics/converge.yml
 create mode 120000 molecule/elasticsearch_diagnostics/create.yml
 create mode 120000 molecule/elasticsearch_diagnostics/destroy.yml
 create mode 100644 molecule/elasticsearch_diagnostics/molecule.yml
 create mode 100644 molecule/elasticsearch_diagnostics/prepare.yml
 create mode 100644 molecule/elasticsearch_diagnostics/requirements.yml
 create mode 100644 molecule/elasticsearch_diagnostics/verify.yml
 create mode 100644 roles/beats/tasks/restart_and_verify_beat.yml
 create mode 100644 roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml
 create mode 100644 roles/kibana/tasks/restart_and_verify_kibana.yml
 create mode 100644 roles/logstash/tasks/restart_and_verify_logstash.yml

diff --git a/.github/workflows/test_role_elasticsearch.yml b/.github/workflows/test_role_elasticsearch.yml
index 0a5b3125..887a3fd4 100644
--- a/.github/workflows/test_role_elasticsearch.yml
+++ b/.github/workflows/test_role_elasticsearch.yml
@@ -73,6 +73,10 @@ jobs:
         release:
           - 8
           - 9
+        include:
+          - distro: debian13
+            scenario: elasticsearch_diagnostics
+            release: 9
 
     steps:
       - name: Check out code
diff --git a/molecule/elasticsearch_diagnostics/converge.yml b/molecule/elasticsearch_diagnostics/converge.yml
new file mode 100644
index 00000000..e9e4586f
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/converge.yml
@@ -0,0 +1,65 @@
+---
+- name: Install Elasticsearch with good config
+  hosts: all
+  vars:
+    elasticstack_full_stack: false
+    elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | default('9', true) | int }}"
+    elasticsearch_heap: "1"
+    elasticstack_no_log: false
+  tasks:
+    - name: Include Elastic repos role
+      ansible.builtin.include_role:
+        name: oddly.elasticstack.repos
+
+    - name: Include Elasticsearch role
+      ansible.builtin.include_role:
+        name: oddly.elasticstack.elasticsearch
+
+    - name: CI — set lenient disk watermarks
+      ansible.builtin.include_tasks: ../shared/set_ci_watermarks.yml
+
+- name: Test startup failure diagnostics
+  hosts: all
+  tasks:
+    - name: Back up good config
+      ansible.builtin.copy:
+        src: /etc/elasticsearch/elasticsearch.yml
+        dest: /etc/elasticsearch/elasticsearch.yml.good
+        remote_src: true
+        mode: "0644"
+
+    - name: Test bad config produces fast failure with diagnostics
+      block:
+        - name: Inject bad setting into elasticsearch.yml
+          ansible.builtin.lineinfile:
+            path: /etc/elasticsearch/elasticsearch.yml
+            line: "bogus.nonexistent.setting: true"
+
+        - name: Attempt restart with bad config (should fail with diagnostics)
+          ansible.builtin.include_tasks:
+            file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml"
+
+        - name: This should not be reached
+          ansible.builtin.fail:
+            msg: "restart_and_verify_elasticsearch.yml did not fail on bad config"
+
+      rescue:
+        - name: Assert failure includes diagnostic log output
+          ansible.builtin.assert:
+            that:
+              - ansible_failed_result.msg is defined
+              - "'Recent log output' in ansible_failed_result.msg"
+            fail_msg: >-
+              Expected failure message with 'Recent log output', got:
+              {{ ansible_failed_result.msg | default('no message') }}
+
+    - name: Restore good config
+      ansible.builtin.copy:
+        src: /etc/elasticsearch/elasticsearch.yml.good
+        dest: /etc/elasticsearch/elasticsearch.yml
+        remote_src: true
+        mode: "0644"
+
+    - name: Restart Elasticsearch with restored config
+      ansible.builtin.include_tasks:
+        file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml"
diff --git a/molecule/elasticsearch_diagnostics/create.yml b/molecule/elasticsearch_diagnostics/create.yml
new file mode 120000
index 00000000..138c2c2f
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/create.yml
@@ -0,0 +1 @@
+../shared/create.yml
\ No newline at end of file
diff --git a/molecule/elasticsearch_diagnostics/destroy.yml b/molecule/elasticsearch_diagnostics/destroy.yml
new file mode 120000
index 00000000..d18e9a34
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/destroy.yml
@@ -0,0 +1 @@
+../shared/destroy.yml
\ No newline at end of file
diff --git a/molecule/elasticsearch_diagnostics/molecule.yml b/molecule/elasticsearch_diagnostics/molecule.yml
new file mode 100644
index 00000000..529eda8f
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/molecule.yml
@@ -0,0 +1,45 @@
+---
+prerun: false
+dependency:
+  name: galaxy
+  options:
+    requirements-file: requirements.yml
+driver:
+  name: default
+platforms:
+  - name: "es-diag-${MOLECULE_DISTRO:-debian11}-r${ELASTIC_RELEASE:-9}"
+    groups:
+      - elasticsearch
+    distro: "${MOLECULE_DISTRO:-debian11}"
+    memory_mb: 3328
+provisioner:
+  name: ansible
+  env:
+    ANSIBLE_LOG_PATH: /var/log/ansible.log
+  connection_options:
+    ansible_connection: ssh
+    ansible_user: root
+    ansible_ssh_retries: 3
+    ansible_ssh_common_args: >-
+      -o StrictHostKeyChecking=no
+      -o "ProxyCommand=ssh -o StrictHostKeyChecking=no -o BatchMode=yes
+      -i ${MOLECULE_SSH_KEY:-~/.ssh/molecule_id_ed25519}
+      -W %h:%p root@${INCUS_HOST:-172.30.0.172}"
+  inventory:
+    group_vars:
+      all:
+        ansible_python_interpreter: /usr/bin/python3
+scenario:
+  test_sequence:
+    - dependency
+    - cleanup
+    - destroy
+    - syntax
+    - create
+    - prepare
+    - converge
+    - verify
+    - cleanup
+    - destroy
+verifier:
+  name: ansible
diff --git a/molecule/elasticsearch_diagnostics/prepare.yml b/molecule/elasticsearch_diagnostics/prepare.yml
new file mode 100644
index 00000000..0a7703a1
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/prepare.yml
@@ -0,0 +1,8 @@
+---
+- name: Prepare
+  hosts: all
+  vars:
+    distro_cache_url: "{{ lookup('env', 'DISTRO_CACHE_URL') }}"
+  tasks:
+    - name: Common prepare tasks
+      ansible.builtin.include_tasks: ../shared/prepare_common.yml
diff --git a/molecule/elasticsearch_diagnostics/requirements.yml b/molecule/elasticsearch_diagnostics/requirements.yml
new file mode 100644
index 00000000..8dd51618
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/requirements.yml
@@ -0,0 +1,3 @@
+---
+collections:
+  - community.general
diff --git a/molecule/elasticsearch_diagnostics/verify.yml b/molecule/elasticsearch_diagnostics/verify.yml
new file mode 100644
index 00000000..9097bb20
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/verify.yml
@@ -0,0 +1,40 @@
+---
+- name: Verify Elasticsearch is healthy after diagnostics test
+  hosts: all
+  tasks:
+    - name: Check Elasticsearch service status
+      ansible.builtin.service_facts:
+
+    - name: Verify Elasticsearch is running
+      ansible.builtin.assert:
+        that:
+          - ansible_facts.services['elasticsearch.service'].state == 'running'
+        fail_msg: "Elasticsearch is not running after config restore"
+
+    - name: Fetch Elastic password
+      ansible.builtin.shell: |
+        set -o pipefail
+        grep "PASSWORD elastic " /usr/share/elasticsearch/initial_passwords |
+        awk {' print $4 '}
+      args:
+        executable: /bin/bash
+      register: elastic_pass
+      changed_when: false
+
+    - name: Verify Elasticsearch API is responsive
+      ansible.builtin.uri:
+        url: "https://localhost:9200/_cluster/health"
+        method: GET
+        validate_certs: false
+        force_basic_auth: true
+        user: elastic
+        password: "{{ elastic_pass.stdout }}"
+        status_code: 200
+        return_content: true
+      register: health
+
+    - name: Verify cluster health is green or yellow
+      ansible.builtin.assert:
+        that:
+          - health.json.status in ['green', 'yellow']
+        fail_msg: "Cluster health is {{ health.json.status }}"
diff --git a/roles/beats/handlers/main.yml b/roles/beats/handlers/main.yml
index 475d3e95..088d9048 100644
--- a/roles/beats/handlers/main.yml
+++ b/roles/beats/handlers/main.yml
@@ -2,27 +2,27 @@
 # handlers file for beats
 #
 - name: Restart Filebeat
-  ansible.builtin.service:
-    name: filebeat
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_beat.yml
+  vars:
+    _beat_service_name: filebeat
   when:
     - not ansible_check_mode
     - beats_filebeat | bool
     - beats_filebeat_enable | bool
 
 - name: Restart Auditbeat
-  ansible.builtin.service:
-    name: auditbeat
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_beat.yml
+  vars:
+    _beat_service_name: auditbeat
   when:
     - not ansible_check_mode
    - beats_auditbeat | bool
    - beats_auditbeat_enable | bool
 
 - name: Restart Metricbeat
-  ansible.builtin.service:
-    name: metricbeat
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_beat.yml
+  vars:
+    _beat_service_name: metricbeat
   when:
     - not ansible_check_mode
     - beats_metricbeat | bool
diff --git a/roles/beats/tasks/restart_and_verify_beat.yml b/roles/beats/tasks/restart_and_verify_beat.yml
new file mode 100644
index 00000000..073cb7a5
--- /dev/null
+++ b/roles/beats/tasks/restart_and_verify_beat.yml
@@ -0,0 +1,31 @@
+---
+
+- name: "Restart and verify beat — {{ _beat_service_name }}"  # noqa: name[template]
+  block:
+    - name: "Restart beat service — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.service:
+        name: "{{ _beat_service_name }}"
+        state: restarted
+
+    - name: "Verify beat service is running — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.systemd:
+        name: "{{ _beat_service_name }}"
+      register: _beat_service_state
+      until: _beat_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: "Get recent journal output — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.command:
+        cmd: "journalctl -u {{ _beat_service_name }} --no-pager -n 50"
+      register: _beat_journal
+      changed_when: false
+
+    - name: "Fail with startup diagnostics — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.fail:
+        msg: |
+          {{ _beat_service_name }} failed to start.
+
+          Recent log output:
+          {{ _beat_journal.stdout }}
diff --git a/roles/elasticsearch/handlers/main.yml b/roles/elasticsearch/handlers/main.yml
index 7f105b2d..59d35193 100644
--- a/roles/elasticsearch/handlers/main.yml
+++ b/roles/elasticsearch/handlers/main.yml
@@ -1,10 +1,7 @@
 ---
 # handlers file for elasticsearch
 - name: Restart Elasticsearch
-  ansible.builtin.service:
-    name: elasticsearch
-    state: restarted
-    daemon_reload: true
+  ansible.builtin.include_tasks: restart_and_verify_elasticsearch.yml
   when:
     - not ansible_check_mode
     - elasticsearch_enable | bool
diff --git a/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml
new file mode 100644
index 00000000..6dd99be2
--- /dev/null
+++ b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml
@@ -0,0 +1,32 @@
+---
+
+- name: Restart and verify Elasticsearch
+  block:
+    - name: Restart Elasticsearch service
+      ansible.builtin.service:
+        name: elasticsearch
+        state: restarted
+        daemon_reload: true
+
+    - name: Verify Elasticsearch is running
+      ansible.builtin.systemd:
+        name: elasticsearch
+      register: _elasticsearch_service_state
+      until: _elasticsearch_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: Get recent Elasticsearch journal output
+      ansible.builtin.command:
+        cmd: journalctl -u elasticsearch --no-pager -n 50
+      register: _elasticsearch_journal
+      changed_when: false
+
+    - name: Fail with Elasticsearch startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Elasticsearch failed to start.
+
+          Recent log output:
+          {{ _elasticsearch_journal.stdout }}
diff --git a/roles/elasticsearch/tasks/wait_for_instance.yml b/roles/elasticsearch/tasks/wait_for_instance.yml
index aeccf807..2a5e38ed 100644
--- a/roles/elasticsearch/tasks/wait_for_instance.yml
+++ b/roles/elasticsearch/tasks/wait_for_instance.yml
@@ -1,12 +1,51 @@
 ---
 
-- name: Wait for instance
-  ansible.builtin.wait_for:
-    host: "{{ elasticsearch_api_host }}"
-    port: "{{ elasticstack_elasticsearch_http_port }}"
-    timeout: 600
+- name: Wait for Elasticsearch to be ready
   when: not ansible_check_mode
   tags:
     - certificates
     - renew_ca
     - renew_es_cert
+  block:
+    - name: Wait for Elasticsearch port with service health check
+      ansible.builtin.shell:
+        cmd: |
+          if ! systemctl is-active --quiet elasticsearch; then
+            exit 2
+          fi
+          if ss -tln | grep -q ':{{ elasticstack_elasticsearch_http_port }} '; then
+            exit 0
+          fi
+          exit 1
+      register: _es_wait_result
+      until: _es_wait_result.rc == 0
+      retries: 120
+      delay: 5
+      changed_when: false
+      failed_when: _es_wait_result.rc == 2
+
+  rescue:
+    - name: Get recent Elasticsearch journal output
+      ansible.builtin.command:
+        cmd: journalctl -u elasticsearch --no-pager -n 50
+      register: _es_wait_journal
+      changed_when: false
+
+    - name: Fail with Elasticsearch diagnostics (service crashed)
+      ansible.builtin.fail:
+        msg: |
+          Elasticsearch service died while waiting for port {{ elasticstack_elasticsearch_http_port }}.
+
+          Recent log output:
+          {{ _es_wait_journal.stdout }}
+      when: _es_wait_result.rc | default(0) == 2
+
+    - name: Fail with Elasticsearch diagnostics (port timeout)
+      ansible.builtin.fail:
+        msg: |
+          Elasticsearch port {{ elasticstack_elasticsearch_http_port }} did not become available within 600s.
+          Service state: {{ _es_wait_result.stdout | default('unknown') }}
+
+          Recent log output:
+          {{ _es_wait_journal.stdout }}
+      when: _es_wait_result.rc | default(0) != 2
diff --git a/roles/kibana/handlers/main.yml b/roles/kibana/handlers/main.yml
index 0da98678..5d380ac1 100644
--- a/roles/kibana/handlers/main.yml
+++ b/roles/kibana/handlers/main.yml
@@ -5,9 +5,7 @@
   daemon_reload: true
 
 - name: Restart Kibana
-  ansible.builtin.service:
-    name: kibana
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_kibana.yml
   when:
     - not ansible_check_mode
     - kibana_enable | bool
diff --git a/roles/kibana/tasks/main.yml b/roles/kibana/tasks/main.yml
index abea87c8..7ef5aafc 100644
--- a/roles/kibana/tasks/main.yml
+++ b/roles/kibana/tasks/main.yml
@@ -126,13 +126,49 @@
     - not ansible_check_mode
   register: kibana_freshstart
 
-# the following is useful when running tests or extra tasks that need to
-# have Kibana running. Escape it on Rocky8, because it gets time out with Elastic 8
-
-- name: Wait for Kibana to start
-  ansible.builtin.wait_for:
-    host: localhost
-    port: 5601
-    timeout: 300
+- name: Wait for Kibana to be ready
   when: not ansible_check_mode
+  block:
+    - name: Wait for Kibana port with service health check
+      ansible.builtin.shell:
+        cmd: |
+          if ! systemctl is-active --quiet kibana; then
+            exit 2
+          fi
+          if ss -tln | grep -q ':5601 '; then
+            exit 0
+          fi
+          exit 1
+      register: _kibana_wait_result
+      until: _kibana_wait_result.rc == 0
+      retries: 60
+      delay: 5
+      changed_when: false
+      failed_when: _kibana_wait_result.rc == 2
+
+  rescue:
+    - name: Get recent Kibana journal output
+      ansible.builtin.command:
+        cmd: journalctl -u kibana --no-pager -n 50
+      register: _kibana_wait_journal
+      changed_when: false
+
+    - name: Fail with Kibana diagnostics (service crashed)
+      ansible.builtin.fail:
+        msg: |
+          Kibana service died while waiting for port 5601.
+
+          Recent log output:
+          {{ _kibana_wait_journal.stdout }}
+      when: _kibana_wait_result.rc | default(0) == 2
+
+    - name: Fail with Kibana diagnostics (port timeout)
+      ansible.builtin.fail:
+        msg: |
+          Kibana port 5601 did not become available within 300s.
+          Service state: {{ _kibana_wait_result.stdout | default('unknown') }}
+
+          Recent log output:
+          {{ _kibana_wait_journal.stdout }}
+      when: _kibana_wait_result.rc | default(0) != 2
diff --git a/roles/kibana/tasks/restart_and_verify_kibana.yml b/roles/kibana/tasks/restart_and_verify_kibana.yml
new file mode 100644
index 00000000..5f23510e
--- /dev/null
+++ b/roles/kibana/tasks/restart_and_verify_kibana.yml
@@ -0,0 +1,31 @@
+---
+
+- name: Restart and verify Kibana
+  block:
+    - name: Restart Kibana service
+      ansible.builtin.service:
+        name: kibana
+        state: restarted
+
+    - name: Verify Kibana is running
+      ansible.builtin.systemd:
+        name: kibana
+      register: _kibana_service_state
+      until: _kibana_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: Get recent Kibana journal output
+      ansible.builtin.command:
+        cmd: journalctl -u kibana --no-pager -n 50
+      register: _kibana_journal
+      changed_when: false
+
+    - name: Fail with Kibana startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Kibana failed to start.
+
+          Recent log output:
+          {{ _kibana_journal.stdout }}
diff --git a/roles/logstash/handlers/main.yml b/roles/logstash/handlers/main.yml
index 71087b81..20eb0b78 100644
--- a/roles/logstash/handlers/main.yml
+++ b/roles/logstash/handlers/main.yml
@@ -1,20 +1,14 @@
 ---
 # handlers file for logstash
 - name: Restart Logstash
-  ansible.builtin.service:
-    name: logstash
-    state: restarted
-    daemon_reload: true
+  ansible.builtin.include_tasks: restart_and_verify_logstash.yml
   when:
     - not ansible_check_mode
     - logstash_enable | bool
     - not logstash_freshstart.changed | bool
 
 - name: Restart Logstash noauto
-  ansible.builtin.service:
-    name: logstash
-    state: restarted
-    daemon_reload: true
+  ansible.builtin.include_tasks: restart_and_verify_logstash.yml
   when:
     - not ansible_check_mode
     - not logstash_config_autoreload
diff --git a/roles/logstash/tasks/restart_and_verify_logstash.yml b/roles/logstash/tasks/restart_and_verify_logstash.yml
new file mode 100644
index 00000000..b3af5977
--- /dev/null
+++ b/roles/logstash/tasks/restart_and_verify_logstash.yml
@@ -0,0 +1,32 @@
+---
+
+- name: Restart and verify Logstash
+  block:
+    - name: Restart Logstash service
+      ansible.builtin.service:
+        name: logstash
+        state: restarted
+        daemon_reload: true
+
+    - name: Verify Logstash is running
+      ansible.builtin.systemd:
+        name: logstash
+      register: _logstash_service_state
+      until: _logstash_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: Get recent Logstash journal output
+      ansible.builtin.command:
+        cmd: journalctl -u logstash --no-pager -n 50
+      register: _logstash_journal
+      changed_when: false
+
+    - name: Fail with Logstash startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Logstash failed to start.
+
+          Recent log output:
+          {{ _logstash_journal.stdout }}

From 2f0371150225c7bfb3bb3b19e6f45f57e3c20cfc Mon Sep 17 00:00:00 2001
From: Sam Crauwels
Date: Sun, 1 Mar 2026 15:05:48 +0100
Subject: [PATCH 2/3] Add tests/integration to .gitignore

The hand-rolled integration test suite targets specific Proxmox
infrastructure and shouldn't be tracked in the repo.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 25ff940b..68c6d93f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ __pycache__/
 *.swp
 *.tar.gz
 venv/
+tests/integration/

From ac6f33be23f3a1245658bd1c183195cb63d3318b Mon Sep 17 00:00:00 2001
From: Sam Crauwels
Date: Sun, 1 Mar 2026 17:17:23 +0100
Subject: [PATCH 3/3] Wait for Kibana HTTP readiness instead of port open

Kibana opens port 5601 during its Preboot phase (~8 seconds after
start) but takes 1-2 minutes to serve HTTP. Port-based wait_for checks
pass during Preboot, leaving Kibana unable to serve requests when
subsequent tasks or verify scripts run.

Changed all three Kibana wait locations to check for HTTP 200/401 on
/api/status instead of using ss/wait_for on the port:

- roles/kibana/tasks/main.yml (smart watchdog)
- roles/kibana/tasks/restart_and_verify_kibana.yml (handler verify)
- roles/elasticsearch/handlers/restart_kibana.yml (ES cert handler)

The ES role's restart_kibana handler also gets full diagnostics: if
Kibana fails to start after the cert restart, the handler collects
journalctl output and fails with an actionable message instead of
silently leaving Kibana down.
---
 .../elasticsearch/handlers/restart_kibana.yml | 47 +++++++++++++++++--
 roles/kibana/tasks/main.yml                   | 11 +++--
 .../tasks/restart_and_verify_kibana.yml       | 14 ++++++
 3 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/roles/elasticsearch/handlers/restart_kibana.yml b/roles/elasticsearch/handlers/restart_kibana.yml
index ffaa4548..e168027b 100644
--- a/roles/elasticsearch/handlers/restart_kibana.yml
+++ b/roles/elasticsearch/handlers/restart_kibana.yml
@@ -5,9 +5,46 @@
   delegate_to: "{{ item }}"
   changed_when: false
 
-- name: Restart Kibana
-  ansible.builtin.service:
-    name: kibana
-    state: restarted
-  delegate_to: "{{ item }}"
+- name: Restart and wait for Kibana
   when: "'kibana' in hostvars[item].ansible_facts.packages | default({})"
+  block:
+    - name: Restart Kibana service
+      ansible.builtin.service:
+        name: kibana
+        state: restarted
+      delegate_to: "{{ item }}"
+
+    - name: Wait for Kibana HTTP readiness after restart
+      ansible.builtin.shell:
+        cmd: |
+          if ! systemctl is-active --quiet kibana; then
+            exit 2
+          fi
+          HTTP_CODE=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status 2>/dev/null) || true
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
+            exit 0
+          fi
+          exit 1
+      delegate_to: "{{ item }}"
+      register: _kibana_handler_wait
+      until: _kibana_handler_wait.rc == 0
+      retries: 60
+      delay: 5
+      changed_when: false
+      failed_when: _kibana_handler_wait.rc == 2
+
+  rescue:
+    - name: Get recent Kibana journal output
+      ansible.builtin.command:
+        cmd: journalctl -u kibana --no-pager -n 50
+      register: _kibana_handler_journal
+      delegate_to: "{{ item }}"
+      changed_when: false
+
+    - name: Fail with Kibana startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Kibana failed to start after restart by elasticsearch handler.
+
+          Recent log output:
+          {{ _kibana_handler_journal.stdout }}
diff --git a/roles/kibana/tasks/main.yml b/roles/kibana/tasks/main.yml
index 7ef5aafc..86c38e0a 100644
--- a/roles/kibana/tasks/main.yml
+++ b/roles/kibana/tasks/main.yml
@@ -129,13 +129,14 @@
 - name: Wait for Kibana to be ready
   when: not ansible_check_mode
   block:
-    - name: Wait for Kibana port with service health check
+    - name: Wait for Kibana HTTP with service health check
       ansible.builtin.shell:
         cmd: |
           if ! systemctl is-active --quiet kibana; then
             exit 2
           fi
-          if ss -tln | grep -q ':5601 '; then
+          HTTP_CODE=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status 2>/dev/null) || true
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
             exit 0
           fi
           exit 1
@@ -156,16 +157,16 @@
     - name: Fail with Kibana diagnostics (service crashed)
       ansible.builtin.fail:
         msg: |
-          Kibana service died while waiting for port 5601.
+          Kibana service died while waiting for HTTP readiness.
 
           Recent log output:
           {{ _kibana_wait_journal.stdout }}
       when: _kibana_wait_result.rc | default(0) == 2
 
-    - name: Fail with Kibana diagnostics (port timeout)
+    - name: Fail with Kibana diagnostics (HTTP timeout)
       ansible.builtin.fail:
         msg: |
-          Kibana port 5601 did not become available within 300s.
+          Kibana HTTP did not become ready within 300s.
           Service state: {{ _kibana_wait_result.stdout | default('unknown') }}
 
           Recent log output:
diff --git a/roles/kibana/tasks/restart_and_verify_kibana.yml b/roles/kibana/tasks/restart_and_verify_kibana.yml
index 5f23510e..c50c1a8e 100644
--- a/roles/kibana/tasks/restart_and_verify_kibana.yml
+++ b/roles/kibana/tasks/restart_and_verify_kibana.yml
@@ -15,6 +15,20 @@
       retries: 5
       delay: 3
 
+    - name: Wait for Kibana HTTP readiness
+      ansible.builtin.shell:
+        cmd: |
+          HTTP_CODE=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status 2>/dev/null) || true
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
+            exit 0
+          fi
+          exit 1
+      register: _kibana_http_check
+      until: _kibana_http_check.rc == 0
+      retries: 60
+      delay: 5
+      changed_when: false
+
   rescue:
     - name: Get recent Kibana journal output
       ansible.builtin.command:
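
For reference, the 200/401 readiness criterion used by the curl probes in
patch 3 can be exercised by hand when debugging a host. A minimal sketch,
assuming Kibana on the default http://localhost:5601 with no
server.basePath rewrite (both assumptions, not taken from the patches):

    #!/bin/sh
    # Poll Kibana's /api/status until it answers HTTP 200 (ready,
    # anonymous access) or 401 (ready, authentication required).
    # While Kibana is still starting up, the endpoint typically answers
    # 503 or refuses the connection, so the loop keeps waiting.
    for i in $(seq 1 60); do
        code=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status) || code=000
        if [ "$code" = "200" ] || [ "$code" = "401" ]; then
            echo "Kibana ready (HTTP $code)"
            exit 0
        fi
        sleep 5
    done
    echo "Kibana still not ready after 300s (last HTTP code: $code)" >&2
    exit 1

This mirrors the handlers' acceptance logic and timeout (60 retries,
5 seconds apart).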