diff --git a/.github/workflows/test_role_elasticsearch.yml b/.github/workflows/test_role_elasticsearch.yml
index 0a5b3125..887a3fd4 100644
--- a/.github/workflows/test_role_elasticsearch.yml
+++ b/.github/workflows/test_role_elasticsearch.yml
@@ -73,6 +73,10 @@ jobs:
         release:
           - 8
           - 9
+        include:
+          - distro: debian13
+            scenario: elasticsearch_diagnostics
+            release: 9
 
     steps:
       - name: Check out code
diff --git a/.gitignore b/.gitignore
index 25ff940b..68c6d93f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ __pycache__/
 *.swp
 *.tar.gz
 venv/
+tests/integration/
diff --git a/molecule/elasticsearch_diagnostics/converge.yml b/molecule/elasticsearch_diagnostics/converge.yml
new file mode 100644
index 00000000..e9e4586f
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/converge.yml
@@ -0,0 +1,65 @@
+---
+- name: Install Elasticsearch with good config
+  hosts: all
+  vars:
+    elasticstack_full_stack: false
+    elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | default('9', true) | int }}"
+    elasticsearch_heap: "1"
+    elasticstack_no_log: false
+  tasks:
+    - name: Include Elastic repos role
+      ansible.builtin.include_role:
+        name: oddly.elasticstack.repos
+
+    - name: Include Elasticsearch role
+      ansible.builtin.include_role:
+        name: oddly.elasticstack.elasticsearch
+
+    - name: CI — set lenient disk watermarks
+      ansible.builtin.include_tasks: ../shared/set_ci_watermarks.yml
+
+- name: Test startup failure diagnostics
+  hosts: all
+  tasks:
+    - name: Back up good config
+      ansible.builtin.copy:
+        src: /etc/elasticsearch/elasticsearch.yml
+        dest: /etc/elasticsearch/elasticsearch.yml.good
+        remote_src: true
+        mode: "0644"
+
+    - name: Test bad config produces fast failure with diagnostics
+      block:
+        - name: Inject bad setting into elasticsearch.yml
+          ansible.builtin.lineinfile:
+            path: /etc/elasticsearch/elasticsearch.yml
+            line: "bogus.nonexistent.setting: true"
+
+        - name: Attempt restart with bad config (should fail with diagnostics)
+          ansible.builtin.include_tasks:
+            file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml"
+
+        - name: This should not be reached
+          ansible.builtin.fail:
+            msg: "restart_and_verify_elasticsearch.yml did not fail on bad config"
+
+      rescue:
+        - name: Assert failure includes diagnostic log output
+          ansible.builtin.assert:
+            that:
+              - ansible_failed_result.msg is defined
+              - "'Recent log output' in ansible_failed_result.msg"
+            fail_msg: >-
+              Expected failure message with 'Recent log output', got:
+              {{ ansible_failed_result.msg | default('no message') }}
+
+    - name: Restore good config
+      ansible.builtin.copy:
+        src: /etc/elasticsearch/elasticsearch.yml.good
+        dest: /etc/elasticsearch/elasticsearch.yml
+        remote_src: true
+        mode: "0644"
+
+    - name: Restart Elasticsearch with restored config
+      ansible.builtin.include_tasks:
+        file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml"
diff --git a/molecule/elasticsearch_diagnostics/create.yml b/molecule/elasticsearch_diagnostics/create.yml
new file mode 120000
index 00000000..138c2c2f
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/create.yml
@@ -0,0 +1 @@
+../shared/create.yml
\ No newline at end of file
diff --git a/molecule/elasticsearch_diagnostics/destroy.yml b/molecule/elasticsearch_diagnostics/destroy.yml
new file mode 120000
index 00000000..d18e9a34
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/destroy.yml
@@ -0,0 +1 @@
+../shared/destroy.yml
\ No newline at end of file
diff --git a/molecule/elasticsearch_diagnostics/molecule.yml b/molecule/elasticsearch_diagnostics/molecule.yml
new file mode 100644
index 00000000..529eda8f
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/molecule.yml
@@ -0,0 +1,45 @@
+---
+prerun: false
+dependency:
+  name: galaxy
+  options:
+    requirements-file: requirements.yml
+driver:
+  name: default
+platforms:
+  - name: "es-diag-${MOLECULE_DISTRO:-debian11}-r${ELASTIC_RELEASE:-9}"
+    groups:
+      - elasticsearch
+    distro: "${MOLECULE_DISTRO:-debian11}"
+    memory_mb: 3328
+provisioner:
+  name: ansible
+  env:
+    ANSIBLE_LOG_PATH: /var/log/ansible.log
+  connection_options:
+    ansible_connection: ssh
+    ansible_user: root
+    ansible_ssh_retries: 3
+    ansible_ssh_common_args: >-
+      -o StrictHostKeyChecking=no
+      -o "ProxyCommand=ssh -o StrictHostKeyChecking=no -o BatchMode=yes
+      -i ${MOLECULE_SSH_KEY:-~/.ssh/molecule_id_ed25519}
+      -W %h:%p root@${INCUS_HOST:-172.30.0.172}"
+  inventory:
+    group_vars:
+      all:
+        ansible_python_interpreter: /usr/bin/python3
+scenario:
+  test_sequence:
+    - dependency
+    - cleanup
+    - destroy
+    - syntax
+    - create
+    - prepare
+    - converge
+    - verify
+    - cleanup
+    - destroy
+verifier:
+  name: ansible
diff --git a/molecule/elasticsearch_diagnostics/prepare.yml b/molecule/elasticsearch_diagnostics/prepare.yml
new file mode 100644
index 00000000..0a7703a1
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/prepare.yml
@@ -0,0 +1,8 @@
+---
+- name: Prepare
+  hosts: all
+  vars:
+    distro_cache_url: "{{ lookup('env', 'DISTRO_CACHE_URL') }}"
+  tasks:
+    - name: Common prepare tasks
+      ansible.builtin.include_tasks: ../shared/prepare_common.yml
diff --git a/molecule/elasticsearch_diagnostics/requirements.yml b/molecule/elasticsearch_diagnostics/requirements.yml
new file mode 100644
index 00000000..8dd51618
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/requirements.yml
@@ -0,0 +1,3 @@
+---
+collections:
+  - community.general
diff --git a/molecule/elasticsearch_diagnostics/verify.yml b/molecule/elasticsearch_diagnostics/verify.yml
new file mode 100644
index 00000000..9097bb20
--- /dev/null
+++ b/molecule/elasticsearch_diagnostics/verify.yml
@@ -0,0 +1,40 @@
+---
+- name: Verify Elasticsearch is healthy after diagnostics test
+  hosts: all
+  tasks:
+    - name: Check Elasticsearch service status
+      ansible.builtin.service_facts:
+
+    - name: Verify Elasticsearch is running
+      ansible.builtin.assert:
+        that:
+          - ansible_facts.services['elasticsearch.service'].state == 'running'
+        fail_msg: "Elasticsearch is not running after config restore"
+
+    - name: Fetch Elastic password
+      ansible.builtin.shell: |
+        set -o pipefail
+        grep "PASSWORD elastic " /usr/share/elasticsearch/initial_passwords |
+        awk {' print $4 '}
+      args:
+        executable: /bin/bash
+      register: elastic_pass
+      changed_when: false
+
+    - name: Verify Elasticsearch API is responsive
+      ansible.builtin.uri:
+        url: "https://localhost:9200/_cluster/health"
+        method: GET
+        validate_certs: false
+        force_basic_auth: true
+        user: elastic
+        password: "{{ elastic_pass.stdout }}"
+        status_code: 200
+        return_content: true
+      register: health
+
+    - name: Verify cluster health is green or yellow
+      ansible.builtin.assert:
+        that:
+          - health.json.status in ['green', 'yellow']
+        fail_msg: "Cluster health is {{ health.json.status }}"
diff --git a/roles/beats/handlers/main.yml b/roles/beats/handlers/main.yml
index 475d3e95..088d9048 100644
--- a/roles/beats/handlers/main.yml
+++ b/roles/beats/handlers/main.yml
@@ -2,27 +2,27 @@
 # handlers file for beats
 #
 - name: Restart Filebeat
-  ansible.builtin.service:
-    name: filebeat
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_beat.yml
+  vars:
+    _beat_service_name: filebeat
   when:
     - not ansible_check_mode
     - beats_filebeat | bool
     - beats_filebeat_enable | bool
 
 - name: Restart Auditbeat
-  ansible.builtin.service:
-    name: auditbeat
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_beat.yml
+  vars:
+    _beat_service_name: auditbeat
   when:
     - not ansible_check_mode
     - beats_auditbeat | bool
     - beats_auditbeat_enable | bool
 
 - name: Restart Metricbeat
-  ansible.builtin.service:
-    name: metricbeat
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_beat.yml
+  vars:
+    _beat_service_name: metricbeat
   when:
     - not ansible_check_mode
     - beats_metricbeat | bool
diff --git a/roles/beats/tasks/restart_and_verify_beat.yml b/roles/beats/tasks/restart_and_verify_beat.yml
new file mode 100644
index 00000000..073cb7a5
--- /dev/null
+++ b/roles/beats/tasks/restart_and_verify_beat.yml
@@ -0,0 +1,31 @@
+---
+
+- name: "Restart and verify beat — {{ _beat_service_name }}"  # noqa: name[template]
+  block:
+    - name: "Restart beat service — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.service:
+        name: "{{ _beat_service_name }}"
+        state: restarted
+
+    - name: "Verify beat service is running — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.systemd:
+        name: "{{ _beat_service_name }}"
+      register: _beat_service_state
+      until: _beat_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: "Get recent journal output — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.command:
+        cmd: "journalctl -u {{ _beat_service_name }} --no-pager -n 50"
+      register: _beat_journal
+      changed_when: false
+
+    - name: "Fail with startup diagnostics — {{ _beat_service_name }}"  # noqa: name[template]
+      ansible.builtin.fail:
+        msg: |
+          {{ _beat_service_name }} failed to start.
+
+          Recent log output:
+          {{ _beat_journal.stdout }}
diff --git a/roles/elasticsearch/handlers/main.yml b/roles/elasticsearch/handlers/main.yml
index 7f105b2d..59d35193 100644
--- a/roles/elasticsearch/handlers/main.yml
+++ b/roles/elasticsearch/handlers/main.yml
@@ -1,10 +1,7 @@
 ---
 # handlers file for elasticsearch
 - name: Restart Elasticsearch
-  ansible.builtin.service:
-    name: elasticsearch
-    state: restarted
-    daemon_reload: true
+  ansible.builtin.include_tasks: restart_and_verify_elasticsearch.yml
   when:
     - not ansible_check_mode
     - elasticsearch_enable | bool
diff --git a/roles/elasticsearch/handlers/restart_kibana.yml b/roles/elasticsearch/handlers/restart_kibana.yml
index ffaa4548..e168027b 100644
--- a/roles/elasticsearch/handlers/restart_kibana.yml
+++ b/roles/elasticsearch/handlers/restart_kibana.yml
@@ -5,9 +5,48 @@
   delegate_to: "{{ item }}"
   changed_when: false
 
-- name: Restart Kibana
-  ansible.builtin.service:
-    name: kibana
-    state: restarted
-  delegate_to: "{{ item }}"
+- name: Restart and wait for Kibana
   when: "'kibana' in hostvars[item].ansible_facts.packages | default({})"
+  block:
+    - name: Restart Kibana service
+      ansible.builtin.service:
+        name: kibana
+        state: restarted
+      delegate_to: "{{ item }}"
+
+    - name: Wait for Kibana HTTP readiness after restart
+      ansible.builtin.shell:
+        cmd: |
+          if ! systemctl is-active --quiet kibana; then
+            exit 2
+          fi
+          HTTP_CODE=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status 2>/dev/null) || true
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
+            exit 0
+          fi
+          exit 1
+      delegate_to: "{{ item }}"
+      register: _kibana_handler_wait
+      # Stop retrying on rc 2 as well: only `until` ends the retry loop, so
+      # failed_when alone would keep polling a dead service until retries run out.
+      until: _kibana_handler_wait.rc in [0, 2]
+      retries: 60
+      delay: 5
+      changed_when: false
+      failed_when: _kibana_handler_wait.rc == 2
+
+  rescue:
+    - name: Get recent Kibana journal output
+      ansible.builtin.command:
+        cmd: journalctl -u kibana --no-pager -n 50
+      register: _kibana_handler_journal
+      delegate_to: "{{ item }}"
+      changed_when: false
+
+    - name: Fail with Kibana startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Kibana failed to start after restart by elasticsearch handler.
+
+          Recent log output:
+          {{ _kibana_handler_journal.stdout }}
diff --git a/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml
new file mode 100644
index 00000000..6dd99be2
--- /dev/null
+++ b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml
@@ -0,0 +1,32 @@
+---
+
+- name: Restart and verify Elasticsearch
+  block:
+    - name: Restart Elasticsearch service
+      ansible.builtin.service:
+        name: elasticsearch
+        state: restarted
+        daemon_reload: true
+
+    - name: Verify Elasticsearch is running
+      ansible.builtin.systemd:
+        name: elasticsearch
+      register: _elasticsearch_service_state
+      until: _elasticsearch_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: Get recent Elasticsearch journal output
+      ansible.builtin.command:
+        cmd: journalctl -u elasticsearch --no-pager -n 50
+      register: _elasticsearch_journal
+      changed_when: false
+
+    - name: Fail with Elasticsearch startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Elasticsearch failed to start.
+
+          Recent log output:
+          {{ _elasticsearch_journal.stdout }}
diff --git a/roles/elasticsearch/tasks/wait_for_instance.yml b/roles/elasticsearch/tasks/wait_for_instance.yml
index aeccf807..2a5e38ed 100644
--- a/roles/elasticsearch/tasks/wait_for_instance.yml
+++ b/roles/elasticsearch/tasks/wait_for_instance.yml
@@ -1,12 +1,54 @@
 ---
 
-- name: Wait for instance
-  ansible.builtin.wait_for:
-    host: "{{ elasticsearch_api_host }}"
-    port: "{{ elasticstack_elasticsearch_http_port }}"
-    timeout: 600
+- name: Wait for Elasticsearch to be ready
   when: not ansible_check_mode
   tags:
     - certificates
     - renew_ca
     - renew_es_cert
+  block:
+    - name: Wait for Elasticsearch port with service health check
+      ansible.builtin.shell:
+        cmd: |
+          # is-active prints the unit state; the timeout message reports it.
+          if ! systemctl is-active elasticsearch; then
+            exit 2
+          fi
+          if ss -tln | grep -q ':{{ elasticstack_elasticsearch_http_port }} '; then
+            exit 0
+          fi
+          exit 1
+      register: _es_wait_result
+      # Stop retrying on rc 2 as well: only `until` ends the retry loop, so
+      # failed_when alone would keep polling a dead service until retries run out.
+      until: _es_wait_result.rc in [0, 2]
+      retries: 120
+      delay: 5
+      changed_when: false
+      failed_when: _es_wait_result.rc == 2
+
+  rescue:
+    - name: Get recent Elasticsearch journal output
+      ansible.builtin.command:
+        cmd: journalctl -u elasticsearch --no-pager -n 50
+      register: _es_wait_journal
+      changed_when: false
+
+    - name: Fail with Elasticsearch diagnostics (service crashed)
+      ansible.builtin.fail:
+        msg: |
+          Elasticsearch service died while waiting for port {{ elasticstack_elasticsearch_http_port }}.
+
+          Recent log output:
+          {{ _es_wait_journal.stdout }}
+      when: _es_wait_result.rc | default(0) == 2
+
+    - name: Fail with Elasticsearch diagnostics (port timeout)
+      ansible.builtin.fail:
+        msg: |
+          Elasticsearch port {{ elasticstack_elasticsearch_http_port }} did not become available within 600s.
+          Service state: {{ _es_wait_result.stdout | default('unknown') }}
+
+          Recent log output:
+          {{ _es_wait_journal.stdout }}
+      when: _es_wait_result.rc | default(0) != 2
diff --git a/roles/kibana/handlers/main.yml b/roles/kibana/handlers/main.yml
index 0da98678..5d380ac1 100644
--- a/roles/kibana/handlers/main.yml
+++ b/roles/kibana/handlers/main.yml
@@ -5,9 +5,7 @@
     daemon_reload: true
 
 - name: Restart Kibana
-  ansible.builtin.service:
-    name: kibana
-    state: restarted
+  ansible.builtin.include_tasks: restart_and_verify_kibana.yml
   when:
     - not ansible_check_mode
     - kibana_enable | bool
diff --git a/roles/kibana/tasks/main.yml b/roles/kibana/tasks/main.yml
index abea87c8..86c38e0a 100644
--- a/roles/kibana/tasks/main.yml
+++ b/roles/kibana/tasks/main.yml
@@ -126,13 +126,53 @@
     - not ansible_check_mode
   register: kibana_freshstart
 
-# the following is useful when running tests or extra tasks that need to
-# have Kibana running. Escape it on Rocky8, because it gets time out with Elastic 8
-
-- name: Wait for Kibana to start
-  ansible.builtin.wait_for:
-    host: localhost
-    port: 5601
-    timeout: 300
+- name: Wait for Kibana to be ready
   when: not ansible_check_mode
+  block:
+    - name: Wait for Kibana HTTP with service health check
+      ansible.builtin.shell:
+        cmd: |
+          # is-active prints the unit state; the timeout message reports it.
+          if ! systemctl is-active kibana; then
+            exit 2
+          fi
+          HTTP_CODE=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status 2>/dev/null) || true
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
+            exit 0
+          fi
+          exit 1
+      register: _kibana_wait_result
+      # Stop retrying on rc 2 as well: only `until` ends the retry loop, so
+      # failed_when alone would keep polling a dead service until retries run out.
+      until: _kibana_wait_result.rc in [0, 2]
+      retries: 60
+      delay: 5
+      changed_when: false
+      failed_when: _kibana_wait_result.rc == 2
+
+  rescue:
+    - name: Get recent Kibana journal output
+      ansible.builtin.command:
+        cmd: journalctl -u kibana --no-pager -n 50
+      register: _kibana_wait_journal
+      changed_when: false
+
+    - name: Fail with Kibana diagnostics (service crashed)
+      ansible.builtin.fail:
+        msg: |
+          Kibana service died while waiting for HTTP readiness.
+
+          Recent log output:
+          {{ _kibana_wait_journal.stdout }}
+      when: _kibana_wait_result.rc | default(0) == 2
+
+    - name: Fail with Kibana diagnostics (HTTP timeout)
+      ansible.builtin.fail:
+        msg: |
+          Kibana HTTP did not become ready within 300s.
+          Service state: {{ _kibana_wait_result.stdout | default('unknown') }}
+
+          Recent log output:
+          {{ _kibana_wait_journal.stdout }}
+      when: _kibana_wait_result.rc | default(0) != 2
 
diff --git a/roles/kibana/tasks/restart_and_verify_kibana.yml b/roles/kibana/tasks/restart_and_verify_kibana.yml
new file mode 100644
index 00000000..c50c1a8e
--- /dev/null
+++ b/roles/kibana/tasks/restart_and_verify_kibana.yml
@@ -0,0 +1,45 @@
+---
+
+- name: Restart and verify Kibana
+  block:
+    - name: Restart Kibana service
+      ansible.builtin.service:
+        name: kibana
+        state: restarted
+
+    - name: Verify Kibana is running
+      ansible.builtin.systemd:
+        name: kibana
+      register: _kibana_service_state
+      until: _kibana_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+    - name: Wait for Kibana HTTP readiness
+      ansible.builtin.shell:
+        cmd: |
+          HTTP_CODE=$(curl -sk -o /dev/null -w '%{http_code}' http://localhost:5601/api/status 2>/dev/null) || true
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
+            exit 0
+          fi
+          exit 1
+      register: _kibana_http_check
+      until: _kibana_http_check.rc == 0
+      retries: 60
+      delay: 5
+      changed_when: false
+
+  rescue:
+    - name: Get recent Kibana journal output
+      ansible.builtin.command:
+        cmd: journalctl -u kibana --no-pager -n 50
+      register: _kibana_journal
+      changed_when: false
+
+    - name: Fail with Kibana startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Kibana failed to start.
+
+          Recent log output:
+          {{ _kibana_journal.stdout }}
diff --git a/roles/logstash/handlers/main.yml b/roles/logstash/handlers/main.yml
index 71087b81..20eb0b78 100644
--- a/roles/logstash/handlers/main.yml
+++ b/roles/logstash/handlers/main.yml
@@ -1,20 +1,14 @@
 ---
 # handlers file for logstash
 - name: Restart Logstash
-  ansible.builtin.service:
-    name: logstash
-    state: restarted
-    daemon_reload: true
+  ansible.builtin.include_tasks: restart_and_verify_logstash.yml
   when:
     - not ansible_check_mode
     - logstash_enable | bool
     - not logstash_freshstart.changed | bool
 
 - name: Restart Logstash noauto
-  ansible.builtin.service:
-    name: logstash
-    state: restarted
-    daemon_reload: true
+  ansible.builtin.include_tasks: restart_and_verify_logstash.yml
   when:
     - not ansible_check_mode
     - not logstash_config_autoreload
diff --git a/roles/logstash/tasks/restart_and_verify_logstash.yml b/roles/logstash/tasks/restart_and_verify_logstash.yml
new file mode 100644
index 00000000..b3af5977
--- /dev/null
+++ b/roles/logstash/tasks/restart_and_verify_logstash.yml
@@ -0,0 +1,32 @@
+---
+
+- name: Restart and verify Logstash
+  block:
+    - name: Restart Logstash service
+      ansible.builtin.service:
+        name: logstash
+        state: restarted
+        daemon_reload: true
+
+    - name: Verify Logstash is running
+      ansible.builtin.systemd:
+        name: logstash
+      register: _logstash_service_state
+      until: _logstash_service_state.status.ActiveState == 'active'
+      retries: 5
+      delay: 3
+
+  rescue:
+    - name: Get recent Logstash journal output
+      ansible.builtin.command:
+        cmd: journalctl -u logstash --no-pager -n 50
+      register: _logstash_journal
+      changed_when: false
+
+    - name: Fail with Logstash startup diagnostics
+      ansible.builtin.fail:
+        msg: |
+          Logstash failed to start.
+
+          Recent log output:
+          {{ _logstash_journal.stdout }}