fix: Handle systemctl commands when dbus not ready (canonical#4681)

fix: Handle systemctl commands when dbus not ready During `cloud-init status`, we check systemctl to ensure the status we're reporting is accurate. However, we can get an error from systemctl if dbus isn't ready yet. This commit will either ignore the error if we can assume that cloud-init is still running, or retry until we get a proper response from systemctl. Fixes canonicalGH-4676
TheRealFalcon · Dec 13, 2023 · d29b744 · d29b744
1 parent 4006c23
commit d29b744
Show file tree

Hide file tree

Showing 2 changed files with 148 additions and 17 deletions.
diff --git a/cloudinit/cmd/status.py b/cloudinit/cmd/status.py
@@ -137,7 +137,7 @@ def handle_status_args(name, args) -> int:
     """Handle calls to 'cloud-init status' as a subcommand."""
     # Read configured paths
     paths = read_cfg_paths()
-    details = get_status_details(paths)
+    details = get_status_details(paths, args.wait)
     if args.wait:
         while details.status in (
             UXAppStatus.NOT_RUN,
@@ -147,7 +147,7 @@ def handle_status_args(name, args) -> int:
             if args.format == "tabular":
                 sys.stdout.write(".")
                 sys.stdout.flush()
-            details = get_status_details(paths)
+            details = get_status_details(paths, args.wait)
             sleep(0.25)
     details_dict: Dict[str, Union[None, str, List[str], Dict[str, Any]]] = {
         "datasource": details.datasource,
@@ -272,12 +272,15 @@ def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]:
     return (bootstatus_code, reason)
 
 
-def _get_systemd_status() -> Optional[UXAppStatus]:
-    """Get status from systemd.
+def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]:
+    """Get if systemd is in error or running state.
 
     Using systemd, we can get more fine-grained status of the
     individual unit. Determine if we're still
-    running or if there's an error we haven't otherwise detected
+    running or if there's an error we haven't otherwise detected.
+
+    If we don't detect error or running, return None as we don't want to
+    report any other particular status based on systemd.
     """
     for service in [
         "cloud-final.service",
@@ -324,7 +327,42 @@ def _get_systemd_status() -> Optional[UXAppStatus]:
     return None
 
 
-def get_status_details(paths: Optional[Paths] = None) -> StatusDetails:
+def _get_error_or_running_from_systemd_with_retry(
+    existing_status: UXAppStatus, *, wait: bool
+) -> Optional[UXAppStatus]:
+    """Get systemd status and retry if dbus isn't ready.
+
+    If cloud-init has determined that we're still running, then we can
+    ignore the status from systemd. However, if cloud-init has detected error,
+    then we should retry on systemd status so we don't incorrectly report
+    error state while cloud-init is still running.
+    """
+    while True:
+        try:
+            return _get_error_or_running_from_systemd()
+        except subp.ProcessExecutionError as e:
+            last_exception = e
+            if existing_status in (
+                UXAppStatus.DEGRADED_RUNNING,
+                UXAppStatus.RUNNING,
+            ):
+                return None
+            if wait:
+                sleep(0.25)
+            else:
+                break
+    print(
+        "Failed to get status from systemd. "
+        "Cloud-init status may be inaccurate. ",
+        f"Error from systemctl: {last_exception.stderr}",
+        file=sys.stderr,
+    )
+    return None
+
+
+def get_status_details(
+    paths: Optional[Paths] = None, wait: bool = False
+) -> StatusDetails:
     """Return a dict with status, details and errors.
 
     @param paths: An initialized cloudinit.helpers.paths object.
@@ -395,7 +433,9 @@ def get_status_details(paths: Optional[Paths] = None) -> StatusDetails:
         UXAppStatus.NOT_RUN,
         UXAppStatus.DISABLED,
     ):
-        systemd_status = _get_systemd_status()
+        systemd_status = _get_error_or_running_from_systemd_with_retry(
+            status, wait=wait
+        )
         if systemd_status:
             status = systemd_status
 

diff --git a/tests/unittests/cmd/test_status.py b/tests/unittests/cmd/test_status.py
@@ -9,9 +9,14 @@
 
 import pytest
 
+from cloudinit import subp
 from cloudinit.atomic_helper import write_json
 from cloudinit.cmd import status
-from cloudinit.cmd.status import UXAppStatus, _get_systemd_status
+from cloudinit.cmd.status import (
+    UXAppStatus,
+    _get_error_or_running_from_systemd,
+    _get_error_or_running_from_systemd_with_retry,
+)
 from cloudinit.subp import SubpResult
 from cloudinit.util import ensure_file
 from tests.unittests.helpers import wrap_and_call
@@ -59,7 +64,10 @@ class TestStatus:
             "Cloud-init enabled by systemd cloud-init-generator",
         ),
     )
-    @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
+    @mock.patch(
+        f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
+        return_value=None,
+    )
     def test_get_status_details_ds_none(
         self,
         m_get_systemd_status,
@@ -704,7 +712,10 @@ def fakeexists(filepath):
         ],
     )
     @mock.patch(M_PATH + "read_cfg_paths")
-    @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
+    @mock.patch(
+        f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
+        return_value=None,
+    )
     def test_status_output(
         self,
         m_get_systemd_status,
@@ -745,7 +756,10 @@ def test_status_output(
             assert out == expected_status
 
     @mock.patch(M_PATH + "read_cfg_paths")
-    @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
+    @mock.patch(
+        f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
+        return_value=None,
+    )
     def test_status_wait_blocks_until_done(
         self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
     ):
@@ -796,7 +810,10 @@ def fake_sleep(interval):
         assert out == "....\nstatus: done\n"
 
     @mock.patch(M_PATH + "read_cfg_paths")
-    @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
+    @mock.patch(
+        f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
+        return_value=None,
+    )
     def test_status_wait_blocks_until_error(
         self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
     ):
@@ -849,7 +866,10 @@ def fake_sleep(interval):
         assert out == "....\nstatus: error\n"
 
     @mock.patch(M_PATH + "read_cfg_paths")
-    @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None)
+    @mock.patch(
+        f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
+        return_value=None,
+    )
     def test_status_main(
         self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys
     ):
@@ -873,15 +893,20 @@ def test_status_main(
         assert out == "status: running\n"
 
 
-class TestSystemdStatusDetails:
+class TestGetErrorOrRunningFromSystemd:
+    @pytest.fixture(autouse=True)
+    def common_mocks(self, mocker):
+        mocker.patch("cloudinit.cmd.status.sleep")
+        yield
+
     @pytest.mark.parametrize(
         ["active_state", "unit_file_state", "sub_state", "main_pid", "status"],
         [
             # To cut down on the combination of states, I'm grouping
             # enabled, enabled-runtime, and static into an "enabled" state
             # and everything else functionally disabled.
             # Additionally, SubStates are undocumented and may mean something
-            # different depending on the ActiveState they are mapped too.
+            # different depending on the ActiveState they are mapped to.
             # Because of this I'm only testing SubState combinations seen
             # in real-world testing (or using "any" string if we dont care).
             ("activating", "enabled", "start", "123", UXAppStatus.RUNNING),
@@ -910,7 +935,7 @@ class TestSystemdStatusDetails:
             ("failed", "invalid", "failed", "0", UXAppStatus.ERROR),
         ],
     )
-    def test_get_systemd_status(
+    def test_get_error_or_running_from_systemd(
         self, active_state, unit_file_state, sub_state, main_pid, status
     ):
         with mock.patch(
@@ -923,4 +948,70 @@ def test_get_systemd_status(
                 stderr=None,
             ),
         ):
-            assert _get_systemd_status() == status
+            assert _get_error_or_running_from_systemd() == status
+
+    def test_exception_while_running(self, mocker, capsys):
+        m_subp = mocker.patch(
+            f"{M_PATH}subp.subp",
+            side_effect=subp.ProcessExecutionError(
+                "Message recipient disconnected from message bus without"
+                " replying"
+            ),
+        )
+        assert (
+            _get_error_or_running_from_systemd_with_retry(
+                UXAppStatus.RUNNING, wait=True
+            )
+            is None
+        )
+        assert 1 == m_subp.call_count
+        assert "Failed to get status" not in capsys.readouterr().err
+
+    def test_retry(self, mocker, capsys):
+        m_subp = mocker.patch(
+            f"{M_PATH}subp.subp",
+            side_effect=[
+                subp.ProcessExecutionError(
+                    "Message recipient disconnected from message bus without"
+                    " replying"
+                ),
+                subp.ProcessExecutionError(
+                    "Message recipient disconnected from message bus without"
+                    " replying"
+                ),
+                SubpResult(
+                    "ActiveState=activating\nUnitFileState=enabled\n"
+                    "SubState=start\nMainPID=123\n",
+                    stderr=None,
+                ),
+            ],
+        )
+        assert (
+            _get_error_or_running_from_systemd_with_retry(
+                UXAppStatus.ERROR, wait=True
+            )
+            is UXAppStatus.RUNNING
+        )
+        assert 3 == m_subp.call_count
+        assert "Failed to get status" not in capsys.readouterr().err
+
+    def test_retry_no_wait(self, mocker, capsys):
+        m_subp = mocker.patch(
+            f"{M_PATH}subp.subp",
+            side_effect=subp.ProcessExecutionError(
+                "Message recipient disconnected from message bus without"
+                " replying"
+            ),
+        )
+        mocker.patch("time.time", side_effect=[1, 2, 50])
+        assert (
+            _get_error_or_running_from_systemd_with_retry(
+                UXAppStatus.ERROR, wait=False
+            )
+            is None
+        )
+        assert 1 == m_subp.call_count
+        assert (
+            "Failed to get status from systemd. "
+            "Cloud-init status may be inaccurate."
+        ) in capsys.readouterr().err