Skip to content

Commit

Permalink
fix: Handle systemctl commands when dbus not ready
Browse files Browse the repository at this point in the history
During `cloud-init status`, we check systemctl to ensure the status
we're reporting is accurate. However, we can get an error from systemctl
if dbus isn't ready yet. This commit will either ignore the error if we
can assume that cloud-init is still running, or retry for a few seconds
before giving up and keeping cloud-init's originally detected status.

Fixes canonicalGH-4676
  • Loading branch information
TheRealFalcon committed Dec 11, 2023
1 parent 949af54 commit a55996b
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 14 deletions.
41 changes: 37 additions & 4 deletions cloudinit/cmd/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import json
import os
import sys
import time
from copy import deepcopy
from time import gmtime, sleep, strftime
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
Expand Down Expand Up @@ -272,12 +273,15 @@ def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]:
return (bootstatus_code, reason)


def _get_systemd_status() -> Optional[UXAppStatus]:
"""Get status from systemd.
def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]:
"""Get if systemd is in error or running state.
Using systemd, we can get more fine-grained status of the
individual unit. Determine if we're still
running or if there's an error we haven't otherwise detected
running or if there's an error we haven't otherwise detected.
If we don't detect error or running, return None as we don't want to
report any other particular status based on systemd.
"""
for service in [
"cloud-final.service",
Expand Down Expand Up @@ -324,6 +328,35 @@ def _get_systemd_status() -> Optional[UXAppStatus]:
return None


def _get_error_or_running_from_systemd_with_retry(
existing_status, max_wait=5
) -> Optional[UXAppStatus]:
"""Get systemd status and retry if dbus isn't ready.
If cloud-init has determined that we're still running, then we can
ignore the status from systemd. However, if cloud-init has detected error,
then we should retry on systemd status so we don't incorrectly report
error state while cloud-init is still running.
"""
start_time = time.time()
while time.time() - start_time < max_wait:
try:
return _get_error_or_running_from_systemd()
except subp.ProcessExecutionError:
if existing_status in (
UXAppStatus.DEGRADED_RUNNING,
UXAppStatus.RUNNING,
):
return None
sleep(0.25)
print(
f"Failed to get status from systemd after {max_wait} seconds. "
"Cloud-init may still be running.",
file=sys.stderr,
)
return None


def get_status_details(paths: Optional[Paths] = None) -> StatusDetails:
"""Return a dict with status, details and errors.
Expand Down Expand Up @@ -395,7 +428,7 @@ def get_status_details(paths: Optional[Paths] = None) -> StatusDetails:
UXAppStatus.NOT_RUN,
UXAppStatus.DISABLED,
):
systemd_status = _get_systemd_status()
systemd_status = _get_error_or_running_from_systemd_with_retry(status)
if systemd_status:
status = systemd_status

Expand Down
111 changes: 101 additions & 10 deletions tests/unittests/cmd/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@

import pytest

from cloudinit import subp
from cloudinit.atomic_helper import write_json
from cloudinit.cmd import status
from cloudinit.cmd.status import UXAppStatus, _get_systemd_status
from cloudinit.cmd.status import (
UXAppStatus,
_get_error_or_running_from_systemd,
_get_error_or_running_from_systemd_with_retry,
)
from cloudinit.subp import SubpResult
from cloudinit.util import ensure_file
from tests.unittests.helpers import wrap_and_call
Expand Down Expand Up @@ -59,7 +64,10 @@ class TestStatus:
"Cloud-init enabled by systemd cloud-init-generator",
),
)
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_get_status_details_ds_none(
self,
m_get_systemd_status,
Expand Down Expand Up @@ -704,7 +712,10 @@ def fakeexists(filepath):
],
)
@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_output(
self,
m_get_systemd_status,
Expand Down Expand Up @@ -745,7 +756,10 @@ def test_status_output(
assert out == expected_status

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_wait_blocks_until_done(
self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
):
Expand Down Expand Up @@ -796,7 +810,10 @@ def fake_sleep(interval):
assert out == "....\nstatus: done\n"

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_wait_blocks_until_error(
self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
):
Expand Down Expand Up @@ -849,7 +866,10 @@ def fake_sleep(interval):
assert out == "....\nstatus: error\n"

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_main(
self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
):
Expand All @@ -873,15 +893,20 @@ def test_status_main(
assert out == "status: running\n"


class TestSystemdStatusDetails:
class TestGetErrorOrRunningFromSystemd:
@pytest.fixture(autouse=True)
def common_mocks(self, mocker):
mocker.patch("cloudinit.cmd.status.sleep")
yield

@pytest.mark.parametrize(
["active_state", "unit_file_state", "sub_state", "main_pid", "status"],
[
# To cut down on the combination of states, I'm grouping
# enabled, enabled-runtime, and static into an "enabled" state
# and everything else functionally disabled.
# Additionally, SubStates are undocumented and may mean something
# different depending on the ActiveState they are mapped too.
# different depending on the ActiveState they are mapped to.
# Because of this I'm only testing SubState combinations seen
# in real-world testing (or using "any" string if we dont care).
("activating", "enabled", "start", "123", UXAppStatus.RUNNING),
Expand Down Expand Up @@ -910,7 +935,7 @@ class TestSystemdStatusDetails:
("failed", "invalid", "failed", "0", UXAppStatus.ERROR),
],
)
def test_get_systemd_status(
def test_get_error_or_running_from_systemd(
self, active_state, unit_file_state, sub_state, main_pid, status
):
with mock.patch(
Expand All @@ -923,4 +948,70 @@ def test_get_systemd_status(
stderr=None,
),
):
assert _get_systemd_status() == status
assert _get_error_or_running_from_systemd() == status

def test_exception_while_running(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
)
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.RUNNING, max_wait=1000
)
is None
)
assert 1 == m_subp.call_count
assert "Failed to get status" not in capsys.readouterr().err

def test_retry(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=[
subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
SubpResult(
"ActiveState=activating\nUnitFileState=enabled\n"
"SubState=start\nMainPID=123\n",
stderr=None,
),
],
)
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.ERROR, max_wait=1000
)
is UXAppStatus.RUNNING
)
assert 3 == m_subp.call_count
assert "Failed to get status" not in capsys.readouterr().err

def test_retry_timeout(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
)
mocker.patch("time.time", side_effect=[1, 2, 50])
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.ERROR, max_wait=5
)
is None
)
assert 1 == m_subp.call_count
assert (
"Failed to get status from systemd after 5 seconds. "
"Cloud-init may still be running."
) in capsys.readouterr().err

0 comments on commit a55996b

Please sign in to comment.