Skip to content

Commit

Permalink
fix: Handle systemctl commands when dbus not ready (canonical#4681)
Browse files Browse the repository at this point in the history
fix: Handle systemctl commands when dbus not ready

During `cloud-init status`, we check systemctl to ensure the status
we're reporting is accurate. However, we can get an error from systemctl
if dbus isn't ready yet. This commit will either ignore the error if we
can assume that cloud-init is still running, or retry until we get a
proper response from systemctl.

Fixes canonicalGH-4676
  • Loading branch information
TheRealFalcon committed Dec 13, 2023
1 parent 4006c23 commit d29b744
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 17 deletions.
54 changes: 47 additions & 7 deletions cloudinit/cmd/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def handle_status_args(name, args) -> int:
"""Handle calls to 'cloud-init status' as a subcommand."""
# Read configured paths
paths = read_cfg_paths()
details = get_status_details(paths)
details = get_status_details(paths, args.wait)
if args.wait:
while details.status in (
UXAppStatus.NOT_RUN,
Expand All @@ -147,7 +147,7 @@ def handle_status_args(name, args) -> int:
if args.format == "tabular":
sys.stdout.write(".")
sys.stdout.flush()
details = get_status_details(paths)
details = get_status_details(paths, args.wait)
sleep(0.25)
details_dict: Dict[str, Union[None, str, List[str], Dict[str, Any]]] = {
"datasource": details.datasource,
Expand Down Expand Up @@ -272,12 +272,15 @@ def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]:
return (bootstatus_code, reason)


def _get_systemd_status() -> Optional[UXAppStatus]:
"""Get status from systemd.
def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]:
"""Get if systemd is in error or running state.
Using systemd, we can get more fine-grained status of the
individual unit. Determine if we're still
running or if there's an error we haven't otherwise detected
running or if there's an error we haven't otherwise detected.
If we don't detect error or running, return None as we don't want to
report any other particular status based on systemd.
"""
for service in [
"cloud-final.service",
Expand Down Expand Up @@ -324,7 +327,42 @@ def _get_systemd_status() -> Optional[UXAppStatus]:
return None


def get_status_details(paths: Optional[Paths] = None) -> StatusDetails:
def _get_error_or_running_from_systemd_with_retry(
existing_status: UXAppStatus, *, wait: bool
) -> Optional[UXAppStatus]:
"""Get systemd status and retry if dbus isn't ready.
If cloud-init has determined that we're still running, then we can
ignore the status from systemd. However, if cloud-init has detected error,
then we should retry on systemd status so we don't incorrectly report
error state while cloud-init is still running.
"""
while True:
try:
return _get_error_or_running_from_systemd()
except subp.ProcessExecutionError as e:
last_exception = e
if existing_status in (
UXAppStatus.DEGRADED_RUNNING,
UXAppStatus.RUNNING,
):
return None
if wait:
sleep(0.25)
else:
break
print(
"Failed to get status from systemd. "
"Cloud-init status may be inaccurate. ",
f"Error from systemctl: {last_exception.stderr}",
file=sys.stderr,
)
return None


def get_status_details(
paths: Optional[Paths] = None, wait: bool = False
) -> StatusDetails:
"""Return a dict with status, details and errors.
@param paths: An initialized cloudinit.helpers.paths object.
Expand Down Expand Up @@ -395,7 +433,9 @@ def get_status_details(paths: Optional[Paths] = None) -> StatusDetails:
UXAppStatus.NOT_RUN,
UXAppStatus.DISABLED,
):
systemd_status = _get_systemd_status()
systemd_status = _get_error_or_running_from_systemd_with_retry(
status, wait=wait
)
if systemd_status:
status = systemd_status

Expand Down
111 changes: 101 additions & 10 deletions tests/unittests/cmd/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@

import pytest

from cloudinit import subp
from cloudinit.atomic_helper import write_json
from cloudinit.cmd import status
from cloudinit.cmd.status import UXAppStatus, _get_systemd_status
from cloudinit.cmd.status import (
UXAppStatus,
_get_error_or_running_from_systemd,
_get_error_or_running_from_systemd_with_retry,
)
from cloudinit.subp import SubpResult
from cloudinit.util import ensure_file
from tests.unittests.helpers import wrap_and_call
Expand Down Expand Up @@ -59,7 +64,10 @@ class TestStatus:
"Cloud-init enabled by systemd cloud-init-generator",
),
)
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_get_status_details_ds_none(
self,
m_get_systemd_status,
Expand Down Expand Up @@ -704,7 +712,10 @@ def fakeexists(filepath):
],
)
@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_output(
self,
m_get_systemd_status,
Expand Down Expand Up @@ -745,7 +756,10 @@ def test_status_output(
assert out == expected_status

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_wait_blocks_until_done(
self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
):
Expand Down Expand Up @@ -796,7 +810,10 @@ def fake_sleep(interval):
assert out == "....\nstatus: done\n"

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_wait_blocks_until_error(
self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
):
Expand Down Expand Up @@ -849,7 +866,10 @@ def fake_sleep(interval):
assert out == "....\nstatus: error\n"

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
return_value=None,
)
def test_status_main(
self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
):
Expand All @@ -873,15 +893,20 @@ def test_status_main(
assert out == "status: running\n"


class TestSystemdStatusDetails:
class TestGetErrorOrRunningFromSystemd:
@pytest.fixture(autouse=True)
def common_mocks(self, mocker):
mocker.patch("cloudinit.cmd.status.sleep")
yield

@pytest.mark.parametrize(
["active_state", "unit_file_state", "sub_state", "main_pid", "status"],
[
# To cut down on the combination of states, I'm grouping
# enabled, enabled-runtime, and static into an "enabled" state
# and everything else functionally disabled.
# Additionally, SubStates are undocumented and may mean something
# different depending on the ActiveState they are mapped too.
# different depending on the ActiveState they are mapped to.
# Because of this I'm only testing SubState combinations seen
# in real-world testing (or using "any" string if we dont care).
("activating", "enabled", "start", "123", UXAppStatus.RUNNING),
Expand Down Expand Up @@ -910,7 +935,7 @@ class TestSystemdStatusDetails:
("failed", "invalid", "failed", "0", UXAppStatus.ERROR),
],
)
def test_get_systemd_status(
def test_get_error_or_running_from_systemd(
self, active_state, unit_file_state, sub_state, main_pid, status
):
with mock.patch(
Expand All @@ -923,4 +948,70 @@ def test_get_systemd_status(
stderr=None,
),
):
assert _get_systemd_status() == status
assert _get_error_or_running_from_systemd() == status

def test_exception_while_running(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
)
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.RUNNING, wait=True
)
is None
)
assert 1 == m_subp.call_count
assert "Failed to get status" not in capsys.readouterr().err

def test_retry(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=[
subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
SubpResult(
"ActiveState=activating\nUnitFileState=enabled\n"
"SubState=start\nMainPID=123\n",
stderr=None,
),
],
)
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.ERROR, wait=True
)
is UXAppStatus.RUNNING
)
assert 3 == m_subp.call_count
assert "Failed to get status" not in capsys.readouterr().err

def test_retry_no_wait(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=subp.ProcessExecutionError(
"Message recipient disconnected from message bus without"
" replying"
),
)
mocker.patch("time.time", side_effect=[1, 2, 50])
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.ERROR, wait=False
)
is None
)
assert 1 == m_subp.call_count
assert (
"Failed to get status from systemd. "
"Cloud-init status may be inaccurate."
) in capsys.readouterr().err

0 comments on commit d29b744

Please sign in to comment.