Skip to content

Commit

Permalink
[PSU daemon] Support PSU power threshold checking (sonic-net#288)
Browse files Browse the repository at this point in the history
  • Loading branch information
stephenxs committed Nov 21, 2022
1 parent 707a720 commit ed818f8
Show file tree
Hide file tree
Showing 3 changed files with 213 additions and 12 deletions.
52 changes: 51 additions & 1 deletion sonic-psud/scripts/psud
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ PSU_INFO_VOLTAGE_MAX_TH_FIELD = 'voltage_max_threshold'
PSU_INFO_VOLTAGE_MIN_TH_FIELD = 'voltage_min_threshold'
PSU_INFO_CURRENT_FIELD = 'current'
PSU_INFO_POWER_FIELD = 'power'
PSU_INFO_POWER_OVERLOAD = 'power_overload'
PSU_INFO_POWER_WARNING_SUPPRESS_THRESHOLD = 'power_warning_suppress_threshold'
PSU_INFO_POWER_CRITICAL_THRESHOLD = 'power_critical_threshold'
PSU_INFO_FRU_FIELD = 'is_replaceable'
PSU_INFO_IN_VOLTAGE_FIELD = 'input_voltage'
PSU_INFO_IN_CURRENT_FIELD = 'input_current'
Expand Down Expand Up @@ -283,6 +286,8 @@ class PsuStatus(object):
self.power_good = True
self.voltage_good = True
self.temperature_good = True
self.check_psu_power_threshold = False
self.power_exceeded_threshold = False
self.logger = logger

def set_presence(self, presence):
Expand Down Expand Up @@ -339,6 +344,13 @@ class PsuStatus(object):
self.temperature_good = temperature_good
return True

def set_power_exceed_threshold(self, power_exceeded_threshold):
    """Store the power-exceeded-threshold flag and report whether it changed.

    :param power_exceeded_threshold: New value for the flag.
    :return: True if the stored flag was updated, False if the new value
             equals the one already stored (nothing is written then).
    """
    changed = not (power_exceeded_threshold == self.power_exceeded_threshold)
    if changed:
        self.power_exceeded_threshold = power_exceeded_threshold
    return changed

def is_ok(self):
    """Return the combined health of the PSU.

    The result is the short-circuit ``and`` of the presence, power-good,
    voltage-good and temperature-good flags. The power-exceeded-threshold
    flag is not part of this expression.
    """
    return self.presence and self.power_good and self.voltage_good and self.temperature_good

Expand Down Expand Up @@ -486,6 +498,8 @@ class DaemonPsud(daemon_base.DaemonBase):
'PSU absence warning cleared: {} is inserted back.'.format(name),
'PSU absence warning: {} is not present.'.format(name)
)
if not psu_status.presence:
psu_status.check_psu_power_threshold = False

if presence_changed or self.first_run:
# Have to update PSU fan data here because PSU presence status changed. If we don't
Expand All @@ -495,13 +509,46 @@ class DaemonPsud(daemon_base.DaemonBase):
# every 60 seconds, it may still treat PSU state to "OK" and PSU LED to "red".
self._update_psu_fan_data(psu, index)

if presence and psu_status.set_power_good(power_good):
power_good_changed = psu_status.set_power_good(power_good)
if presence and power_good_changed:
set_led = True
log_on_status_changed(self, psu_status.power_good,
'Power absence warning cleared: {} power is back to normal.'.format(name),
'Power absence warning: {} is out of power.'.format(name)
)

if presence and power_good_changed or self.first_run:
psu_status.check_psu_power_threshold = False
if psu_status.power_good:
# power_good has been updated and it is True, which means it was False
# Initialize power exceeding threshold state in this case
if (try_get(psu.get_psu_power_critical_threshold) and try_get(psu.get_psu_power_warning_suppress_threshold) and power != NOT_AVAILABLE):
psu_status.check_psu_power_threshold = True

power_exceeded_threshold = psu_status.power_exceeded_threshold
power_warning_suppress_threshold = try_get(psu.get_psu_power_warning_suppress_threshold, NOT_AVAILABLE)
power_critical_threshold = try_get(psu.get_psu_power_critical_threshold, NOT_AVAILABLE)
if psu_status.check_psu_power_threshold:
if power_warning_suppress_threshold == NOT_AVAILABLE or power_critical_threshold == NOT_AVAILABLE:
self.log_error("PSU power thresholds become invalid: threshold {} critical threshold {}".format(power_warning_suppress_threshold, power_critical_threshold))
psu_status.check_psu_power_threshold = False
psu_status.power_exceeded_threshold = False
elif psu_status.power_exceeded_threshold:
# The falling threshold is the warning threshold
if power < power_warning_suppress_threshold:
# Clear alarm
power_exceeded_threshold = False
else:
# The rising threshold is the critical threshold
if power >= power_critical_threshold:
# Raise alarm
power_exceeded_threshold = True

if psu_status.set_power_exceed_threshold(power_exceeded_threshold):
log_on_status_changed(self, not psu_status.power_exceeded_threshold,
'PSU power warning cleared: {} power {} is back to normal.'.format(name, power),
'PSU power warning: {} power {} exceeds critical threshold {}.'.format(name, power, power_critical_threshold))

if presence and psu_status.set_voltage(voltage, voltage_high_threshold, voltage_low_threshold):
set_led = True
log_on_status_changed(self, psu_status.voltage_good,
Expand Down Expand Up @@ -532,6 +579,9 @@ class DaemonPsud(daemon_base.DaemonBase):
(PSU_INFO_VOLTAGE_MAX_TH_FIELD, str(voltage_high_threshold)),
(PSU_INFO_CURRENT_FIELD, str(current)),
(PSU_INFO_POWER_FIELD, str(power)),
(PSU_INFO_POWER_WARNING_SUPPRESS_THRESHOLD, str(power_warning_suppress_threshold)),
(PSU_INFO_POWER_CRITICAL_THRESHOLD, str(power_critical_threshold)),
(PSU_INFO_POWER_OVERLOAD, str(power_exceeded_threshold)),
(PSU_INFO_FRU_FIELD, str(is_replaceable)),
(PSU_INFO_IN_CURRENT_FIELD, str(in_current)),
(PSU_INFO_IN_VOLTAGE_FIELD, str(in_voltage)),
Expand Down
6 changes: 6 additions & 0 deletions sonic-psud/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,12 @@ def set_status_led(self, color):
self._status_led_color = color
return True

def get_psu_power_critical_threshold(self):
    """Mock implementation of the critical power threshold getter.

    :raises NotImplementedError: always; this mock provides no value.
    """
    raise NotImplementedError

def get_psu_power_warning_suppress_threshold(self):
    """Mock implementation of the warning-suppress power threshold getter.

    :raises NotImplementedError: always; this mock provides no value.
    """
    raise NotImplementedError

# Methods inherited from DeviceBase class and related setters
def get_name(self):
    """Return the name stored in ``self._name``."""
    return self._name
Expand Down
167 changes: 156 additions & 11 deletions sonic-psud/tests/test_DaemonPsud.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,16 +143,7 @@ def test_update_psu_data(self):
expected_calls = [mock.call("Failed to update PSU data - Test message")] * 2
assert daemon_psud.log_warning.mock_calls == expected_calls

@mock.patch('psud._wrapper_get_psu_presence', mock.MagicMock())
@mock.patch('psud._wrapper_get_psu_status', mock.MagicMock())
def test_update_single_psu_data(self):
psud._wrapper_get_psu_presence.return_value = True
psud._wrapper_get_psu_status.return_value = True

psu1 = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234')
psud.platform_chassis = MockChassis()
psud.platform_chassis._psu_list.append(psu1)

def _construct_expected_fvp(self, power=100.0, power_warning_suppress_threshold='N/A', power_critical_threshold='N/A', power_overload=False):
expected_fvp = psud.swsscommon.FieldValuePairs(
[(psud.PSU_INFO_MODEL_FIELD, 'Fake Model'),
(psud.PSU_INFO_SERIAL_FIELD, '12345678'),
Expand All @@ -163,17 +154,171 @@ def test_update_single_psu_data(self):
(psud.PSU_INFO_VOLTAGE_MIN_TH_FIELD, '11.0'),
(psud.PSU_INFO_VOLTAGE_MAX_TH_FIELD, '13.0'),
(psud.PSU_INFO_CURRENT_FIELD, '8.0'),
(psud.PSU_INFO_POWER_FIELD, '100.0'),
(psud.PSU_INFO_POWER_FIELD, str(power)),
(psud.PSU_INFO_POWER_WARNING_SUPPRESS_THRESHOLD, str(power_warning_suppress_threshold)),
(psud.PSU_INFO_POWER_CRITICAL_THRESHOLD, str(power_critical_threshold)),
(psud.PSU_INFO_POWER_OVERLOAD, str(power_overload)),
(psud.PSU_INFO_FRU_FIELD, 'True'),
(psud.PSU_INFO_IN_VOLTAGE_FIELD, '220.25'),
(psud.PSU_INFO_IN_CURRENT_FIELD, '0.72'),
(psud.PSU_INFO_POWER_MAX_FIELD, 'N/A'),
])
return expected_fvp

@mock.patch('psud._wrapper_get_psu_presence', mock.MagicMock())
@mock.patch('psud._wrapper_get_psu_status', mock.MagicMock())
def test_update_single_psu_data(self):
    """Check the fields written for one PSU that has no power thresholds.

    The presence and status wrappers are patched to report True. After a
    single update the PSU table must receive the default expected
    field/value pairs, and power threshold checking must stay disabled.
    """
    psud._wrapper_get_psu_presence.return_value = True
    psud._wrapper_get_psu_status.return_value = True

    psu_index = 1
    fake_psu = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234')
    chassis = MockChassis()
    psud.platform_chassis = chassis
    psud.platform_chassis._psu_list.append(fake_psu)

    # Defaults of _construct_expected_fvp describe a PSU without thresholds.
    wanted_fields = self._construct_expected_fvp()

    daemon = psud.DaemonPsud(SYSLOG_IDENTIFIER)
    daemon.psu_tbl = mock.MagicMock()
    daemon._update_single_psu_data(psu_index, fake_psu)

    table_key = psud.PSU_INFO_KEY_TEMPLATE.format(psu_index)
    daemon.psu_tbl.set.assert_called_with(table_key, wanted_fields)

    status = daemon.psu_status_dict[psu_index]
    assert not status.check_psu_power_threshold

@mock.patch('psud.daemon_base.db_connect', mock.MagicMock())
def test_power_threshold(self):
    """Exercise PSU power threshold checking end to end.

    The PSU's threshold getters are mocked to return 110.0 (warning-suppress)
    and 120.0 (critical). The test then verifies that:

    * the alarm is raised when power goes above the critical threshold and
      is cleared only once power drops below the warning-suppress threshold
      (hysteresis), while the PSU status LED stays green throughout;
    * threshold checking is switched off when the PSU loses power or is
      removed, and switched back on when it recovers;
    * threshold checking is switched off when either threshold getter
      starts raising NotImplementedError.
    """
    psu = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234')
    psud.platform_chassis = MockChassis()
    psud.platform_chassis._psu_list.append(psu)

    daemon_psud = psud.DaemonPsud(SYSLOG_IDENTIFIER)

    daemon_psud.psu_tbl = mock.MagicMock()
    psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=120.0)
    psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(return_value=110.0)

    def assert_threshold_state(checking, exceeded):
        # Compare by truthiness, like the plain `assert x` / `assert not x` form.
        status = daemon_psud.psu_status_dict[1]
        assert bool(status.check_psu_power_threshold) is checking
        assert bool(status.power_exceeded_threshold) is exceeded

    def assert_published(power, overload):
        # assert_called_with only inspects the most recent call on the mock,
        # so this is checked right after the data update and before the LED
        # refresh.
        expected_fvp = self._construct_expected_fvp(power, 110.0, 120.0, overload)
        daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)

    def assert_led(color):
        # Always refresh the LED first so the assertion sees the current state.
        daemon_psud._update_led_color()
        assert color == psu.get_status_led()

    # Normal start. All good and all thresholds are supported
    # Power is in normal range (below warning threshold)
    daemon_psud._update_single_psu_data(1, psu)
    assert_threshold_state(checking=True, exceeded=False)
    assert_published(100.0, False)
    assert_led(psu.STATUS_LED_COLOR_GREEN)

    daemon_psud.first_run = False

    # (power to apply, whether the alarm is expected afterwards)
    power_sweep = [
        # Power is increasing across the warning threshold. No alarm
        # Normal => (warning, critical)
        (115.0, False),
        # Power is increasing across the critical threshold. Alarm raised
        # (warning, critical) => (critical, )
        (125.0, True),
        # Power is decreasing across the critical threshold. Alarm not cleared
        # (critical, ) => (warning, critical)
        (115.0, True),
        # Power is decreasing across the warning threshold. Alarm cleared
        # (warning, critical) => Normal
        (105.0, False),
        # Power is increasing across both thresholds. Alarm raised
        # Normal => (critical, )
        (125.0, True),
        # Power is decreasing across both thresholds. Alarm cleared
        # (critical, ) => Normal
        (105.0, False),
    ]
    for power, alarm_expected in power_sweep:
        psu.set_power(power)
        daemon_psud._update_single_psu_data(1, psu)
        assert_threshold_state(checking=True, exceeded=alarm_expected)
        assert_published(power, alarm_expected)
        # Exceeding the power threshold does not turn the PSU LED red.
        assert_led(psu.STATUS_LED_COLOR_GREEN)

    # (setter, new value, threshold checking expected, LED colour expected)
    availability_changes = [
        # PSU power becomes down
        (psu.set_status, False, False, psu.STATUS_LED_COLOR_RED),
        # PSU power becomes up
        (psu.set_status, True, True, psu.STATUS_LED_COLOR_GREEN),
        # PSU becomes absent
        (psu.set_presence, False, False, psu.STATUS_LED_COLOR_RED),
        # PSU becomes present
        (psu.set_presence, True, True, psu.STATUS_LED_COLOR_GREEN),
    ]
    for setter, value, checking_expected, led_expected in availability_changes:
        setter(value)
        daemon_psud._update_single_psu_data(1, psu)
        daemon_psud._update_led_color()
        assert_threshold_state(checking=checking_expected, exceeded=False)
        assert led_expected == psu.get_status_led()

    # Thresholds become invalid on the fly: critical threshold unavailable
    psu.get_psu_power_critical_threshold = mock.MagicMock(side_effect=NotImplementedError(''))
    daemon_psud._update_single_psu_data(1, psu)
    assert_threshold_state(checking=False, exceeded=False)

    # Restore the critical threshold and re-enable checking by hand
    psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=120.0)
    daemon_psud.psu_status_dict[1].check_psu_power_threshold = True
    daemon_psud._update_single_psu_data(1, psu)
    assert_threshold_state(checking=True, exceeded=False)

    # Thresholds become invalid on the fly: warning-suppress threshold unavailable
    psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(side_effect=NotImplementedError(''))
    daemon_psud._update_single_psu_data(1, psu)
    assert_threshold_state(checking=False, exceeded=False)

def test_set_psu_led(self):
mock_logger = mock.MagicMock()
Expand Down

0 comments on commit ed818f8

Please sign in to comment.