Skip to content

Commit

Permalink
add new PCIe Persistent error logging to fpgainfo events (#2953)
Browse files Browse the repository at this point in the history
* add new PCIe Persistent error logging to fpgainfo events

Support the new PCIe persistent error log in fpgainfo events for the N6000; the log is enabled in BMC version 11.5.

0x0430 Magic Num- PCIE Errors (Updated) 0x53696D12 in process context in main cycle PCIe error log
0x0434 Time stamp low Low timestamp 1
0x0438 Time stamp high High timestamp 2
0x043C PCIe Link Status PCIE Link Status
0x0440 PCIE Uncorrectable Err Status PCIE Uncorrectable Err Status
0x0444 PCIE Uncorrectable Err Mask PCIE Uncorrectable Err Mask
0x0448 PCIE Uncorrectable Err Severity PCIE Uncorrectable Err Severity
0x044C PCIE Correctable Err Status PCIE Correctable Err Status
0x0450 PCIE Correctable Err Mask PCIE Correctable Err Mask
0x0454 PCIE Cap And Ctrl PCIE Cap And Ctrl
0x0458 PCIE Header Log DW 1 PCIE Header Log DW 1
0x045C PCIE Header Log DW 2 PCIE Header Log DW 2
0x0460 PCIE Header Log DW 3 PCIE Header Log DW 3
---------

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
  • Loading branch information
anandaravuri committed Jun 1, 2023
1 parent 108557a commit 646c88f
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 11 deletions.
145 changes: 134 additions & 11 deletions libraries/libboard/board_n6000/board_event_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ enum bel_magic {
BEL_POWER_OFF_STATUS = 0x53696C34,
BEL_SENSORS_STATE = 0x53696C56,
BEL_SENSORS_STATUS = 0x53696C78,
BEL_PCI_ERROR_STATUS = 0x53696C9A
BEL_PCI_ERROR_STATUS = 0x53696C9A,
BEL_PCI_V1_ERROR_STATUS = 0x53696D12
};

enum bel_power_regulator {
Expand Down Expand Up @@ -298,7 +299,7 @@ static void reserved_bit(const char *label, uint32_t value, size_t offset)
printf(" " BEL_LABEL_FMT "*** RESERVED BIT [%lu] IS NOT ZERO: %d\n", 46, label, offset, bit);
}

static void bel_print_power_on_status(struct bel_power_on_status *status, struct bel_timeof_day *timeof_day, bool print_bits)
void bel_print_power_on_status(struct bel_power_on_status *status, struct bel_timeof_day *timeof_day, bool print_bits)
{
if (status->header.magic != BEL_POWER_ON_STATUS)
return;
Expand Down Expand Up @@ -456,7 +457,7 @@ static void bel_print_sensor_alert(uint32_t sensor_alert, size_t offset)
for (i = 0; i < last; i++, info++)
bel_print_fail(info->label, sensor_alert, i);
}
static void bel_print_power_off_status(struct bel_power_off_status *status, bool print_bits)
void bel_print_power_off_status(struct bel_power_off_status *status, bool print_bits)
{
if (status->header.magic != BEL_POWER_OFF_STATUS)
return;
Expand Down Expand Up @@ -605,7 +606,7 @@ static void bel_print_power_off_status(struct bel_power_off_status *status, bool
bel_print_sensor_alert(status->sensor_alert_1, 64);
}

static size_t bel_print_sensor(struct bel_sensor_state *state, size_t last)
size_t bel_print_sensor(struct bel_sensor_state *state, size_t last)
{
struct bel_sensor_info *info = NULL;
size_t next = last + 1;
Expand All @@ -617,6 +618,9 @@ static size_t bel_print_sensor(struct bel_sensor_state *state, size_t last)
if (info->id == state->id)
break;

if (state->id == 0)
break;

next = (next + 1) % ARRAY_SIZE(bel_sensor_info);
}

Expand All @@ -635,7 +639,7 @@ static size_t bel_print_sensor(struct bel_sensor_state *state, size_t last)
return next;
}

static void bel_print_sensors_state(struct bel_sensors_state *state)
void bel_print_sensors_state(struct bel_sensors_state *state)
{
size_t idx = -1;
size_t i;
Expand All @@ -649,7 +653,7 @@ static void bel_print_sensors_state(struct bel_sensors_state *state)
idx = bel_print_sensor(&state->sensor_state[i], idx);
}

static void bel_print_sensors_status_ext(const char *label, struct bel_ext_status *status, size_t idx)
void bel_print_sensors_status_ext(const char *label, struct bel_ext_status *status, size_t idx)
{
struct bel_sensor_info *info = &bel_power_regulator_info[idx];
char l[32];
Expand Down Expand Up @@ -690,7 +694,7 @@ static void bel_print_sensors_status_ext(const char *label, struct bel_ext_statu
bel_print_bit("Invalid/Unsupported Command", status->cml, 7);
}

static void bel_print_sensors_status(struct bel_sensors_status *status)
void bel_print_sensors_status(struct bel_sensors_status *status)
{
if (status->header.magic != BEL_SENSORS_STATUS)
return;
Expand All @@ -706,7 +710,7 @@ static void bel_print_sensors_status(struct bel_sensors_status *status)
bel_print_value("ED8401 Status", status->ed8401_status);
}

static void bel_print_max10_seu(struct bel_max10_seu *status)
void bel_print_max10_seu(struct bel_max10_seu *status)
{
if (status->header.magic != BEL_MAX10_SEU_STATUS)
return;
Expand All @@ -715,7 +719,7 @@ static void bel_print_max10_seu(struct bel_max10_seu *status)

}

static void bel_print_timeof_day(struct bel_timeof_day *timeof_day)
void bel_print_timeof_day(struct bel_timeof_day *timeof_day)
{
if (timeof_day->header.magic != BEL_TIMEOF_DAY_STATUS)
return;
Expand All @@ -727,7 +731,7 @@ static void bel_print_timeof_day(struct bel_timeof_day *timeof_day)
bel_print_value("TimeOfDay offset high", timeof_day->timeofday_offset_high);
}

static void bel_print_fpga_seu(struct bel_fpga_seu *status)
void bel_print_fpga_seu(struct bel_fpga_seu *status)
{
if (status->header.magic != BEL_FPGA_SEU_STATUS)
return;
Expand All @@ -736,7 +740,7 @@ static void bel_print_fpga_seu(struct bel_fpga_seu *status)
bel_print_bit("FPGA SEU error status", status->fpga_seu, 1);
}

static void bel_print_pci_error_status(struct bel_pci_error_status *status, bool print_bits)
void bel_print_pci_error_status(struct bel_pci_error_status *status, bool print_bits)
{
if (status->header.magic != BEL_PCI_ERROR_STATUS)
return;
Expand Down Expand Up @@ -779,6 +783,124 @@ static void bel_print_pci_error_status(struct bel_pci_error_status *status, bool

}

/**
 * Print the "v1" PCIe persistent error log record of a board event.
 *
 * Returns without printing anything unless the record header carries the
 * BEL_PCI_V1_ERROR_STATUS magic. Otherwise prints the record header followed
 * by each logged register value; when print_bits is set, the individual
 * bits/fields of each register are decoded as well.
 *
 * @param status      record to print; must not be NULL
 * @param print_bits  true to also decode the bits of every register
 */
void bel_print_pci_v1_error_status(struct bel_pcie_v1_error_status *status, bool print_bits)
{
	if (status->header.magic != BEL_PCI_V1_ERROR_STATUS)
		return;

	bel_print_header("PCI Error Status Time", &status->header);

	// PCIe Link Status
	bel_print_value("PCIe Link Status", status->pcie_link_status);
	if (print_bits) {
		bel_print_field("Current Link Speed", status->pcie_link_status, 0, 3);
		// Bits 9:4 of the PCIe Link Status register hold the negotiated
		// link *width*, not a speed.
		bel_print_field("Negotiated Link Width", status->pcie_link_status, 4, 9);
		bel_print_bit("Link Training ", status->pcie_link_status, 11);
		bel_print_bit("Slot Clock Configuration", status->pcie_link_status, 12);
		bel_print_bit("Data link layer link active", status->pcie_link_status, 13);
		bel_print_bit("Link Bandwidth Management Status", status->pcie_link_status, 14);
		bel_print_bit("Link Autonomous Management Status", status->pcie_link_status, 15);
	}

	// PCIe Uncorrectable Error Status
	bel_print_value("PCIe Uncorrectable Error", status->pcie_uncorr_err);
	if (print_bits) {
		bel_print_bit("Data Link Protocol error Status", status->pcie_uncorr_err, 4);
		bel_print_bit("Surprise down error Status", status->pcie_uncorr_err, 5);
		bel_print_bit("Poisoned TLP received", status->pcie_uncorr_err, 12);
		bel_print_bit("Flow Control Protocol Errors Status", status->pcie_uncorr_err, 13);
		bel_print_bit("Completion Timeout Status", status->pcie_uncorr_err, 14);
		bel_print_bit("Completer Abort error Status", status->pcie_uncorr_err, 15);
		bel_print_bit("Unexpected Completion Status", status->pcie_uncorr_err, 16);
		bel_print_bit("Receiver Overflow Status", status->pcie_uncorr_err, 17);
		bel_print_bit("Malformed TLP Status", status->pcie_uncorr_err, 18);
		bel_print_bit("ECRC Error Status", status->pcie_uncorr_err, 19);
		bel_print_bit("Unsupported Request Error Status", status->pcie_uncorr_err, 20);
		bel_print_bit("ACS Violation Status", status->pcie_uncorr_err, 21);
		bel_print_bit("Uncorrectable Internal Error Status", status->pcie_uncorr_err, 22);
		bel_print_bit("MC Blocked TLP Status", status->pcie_uncorr_err, 23);
		bel_print_bit("AtomicOp Egress Blocked Status", status->pcie_uncorr_err, 24);
		bel_print_bit("TLP Prefix Blocked Status", status->pcie_uncorr_err, 25);
		bel_print_bit("Poisoned TLP Egress Blocked Status", status->pcie_uncorr_err, 26);
	}

	// PCIe Uncorrectable Err Mask
	// The bits must be decoded from the mask register itself, not from the
	// uncorrectable error status register.
	bel_print_value("PCIe Uncorrectable Err Mask", status->pcie_uncorr_err_mask);
	if (print_bits) {
		bel_print_bit("Data Link Protocol error", status->pcie_uncorr_err_mask, 4);
		bel_print_bit("Surprise down error", status->pcie_uncorr_err_mask, 5);
		bel_print_bit("Poisoned TLP received", status->pcie_uncorr_err_mask, 12);
		bel_print_bit("Flow Control Protocol Errors", status->pcie_uncorr_err_mask, 13);
		bel_print_bit("Completion Timeout", status->pcie_uncorr_err_mask, 14);
		bel_print_bit("Completer Abort error", status->pcie_uncorr_err_mask, 15);
		bel_print_bit("Unexpected Completion", status->pcie_uncorr_err_mask, 16);
		bel_print_bit("Receiver Overflow", status->pcie_uncorr_err_mask, 17);
		bel_print_bit("Malformed TLP", status->pcie_uncorr_err_mask, 18);
		bel_print_bit("ECRC Error", status->pcie_uncorr_err_mask, 19);
		bel_print_bit("Unsupported Request Error", status->pcie_uncorr_err_mask, 20);
		bel_print_bit("ACS Violation", status->pcie_uncorr_err_mask, 21);
		bel_print_bit("Uncorrectable Internal Error", status->pcie_uncorr_err_mask, 22);
		bel_print_bit("MC Blocked TLP", status->pcie_uncorr_err_mask, 23);
		bel_print_bit("AtomicOp Egress Blocked", status->pcie_uncorr_err_mask, 24);
		bel_print_bit("TLP Prefix Blocked", status->pcie_uncorr_err_mask, 25);
		bel_print_bit("Poisoned TLP Egress Blocked", status->pcie_uncorr_err_mask, 26);
	}

	// PCIe Uncorrectable Err Severity
	bel_print_value("PCIe Uncorrectable Err Severity", status->pcie_uncorr_err_severity);
	if (print_bits) {
		bel_print_bit("Data Link Protocol error", status->pcie_uncorr_err_severity, 4);
		bel_print_bit("Surprise Down Error", status->pcie_uncorr_err_severity, 5);
		bel_print_bit("Poisoned TLP", status->pcie_uncorr_err_severity, 12);
		bel_print_bit("Flow Control protocol error", status->pcie_uncorr_err_severity, 13);
		bel_print_bit("Completion Timeout", status->pcie_uncorr_err_severity, 14);
		bel_print_bit("Completer Abort (CA) was transmitted", status->pcie_uncorr_err_severity, 15);
		bel_print_bit("Unexpected Completion was received", status->pcie_uncorr_err_severity, 16);
		bel_print_bit("Receiver Overflow", status->pcie_uncorr_err_severity, 17);
		bel_print_bit("Malformed TLP Received", status->pcie_uncorr_err_severity, 18);
		bel_print_bit("ECRC Error Detected", status->pcie_uncorr_err_severity, 19);
		bel_print_bit("Unsupported Request Received", status->pcie_uncorr_err_severity, 20);
	}

	// PCIe Correctable Err Status
	bel_print_value("PCIe Correctable Err Status", status->pcie_corr_err_status);
	if (print_bits) {
		bel_print_bit("Receiver Error status", status->pcie_corr_err_status, 0);
		bel_print_bit("Bad TLP status", status->pcie_corr_err_status, 6);
		bel_print_bit("Bad DLLP status", status->pcie_corr_err_status, 7);
		bel_print_bit("Replay Number Rollover status", status->pcie_corr_err_status, 8);
		bel_print_bit("Replay timer Timeout status", status->pcie_corr_err_status, 12);
		bel_print_bit("Advisory Non-Fatal Error status", status->pcie_corr_err_status, 13);
		bel_print_bit("Corrected internal error status", status->pcie_corr_err_status, 14);
	}

	// PCIe Correctable Err Mask
	// As above: decode the mask register, not the correctable status register.
	bel_print_value("PCIe Correctable Err Mask", status->pcie_corr_err_mask);
	if (print_bits) {
		bel_print_bit("Receiver Error", status->pcie_corr_err_mask, 0);
		bel_print_bit("Bad TLP", status->pcie_corr_err_mask, 6);
		bel_print_bit("Bad DLLP", status->pcie_corr_err_mask, 7);
		bel_print_bit("Replay Number Rollover", status->pcie_corr_err_mask, 8);
		bel_print_bit("Replay timer Timeout", status->pcie_corr_err_mask, 12);
		bel_print_bit("Advisory Non-Fatal Error", status->pcie_corr_err_mask, 13);
		bel_print_bit("Corrected internal error", status->pcie_corr_err_mask, 14);
	}

	// PCIe Cap And Ctrl
	bel_print_value("PCIe Cap And Ctrl", status->pcie_cap_ctrl);

	// PCIe Header Log DW 1..4: each label prints its own dword.
	bel_print_value("PCIE Header Log DW1", status->pcie_header_log1);
	bel_print_value("PCIE Header Log DW2", status->pcie_header_log2);
	bel_print_value("PCIE Header Log DW3", status->pcie_header_log3);
	bel_print_value("PCIE Header Log DW4", status->pcie_header_log4);
}

uint32_t bel_ptr_count(void)
{
return BEL_BLOCK_COUNT;
Expand Down Expand Up @@ -845,6 +967,7 @@ void bel_print(struct bel_event *event, bool print_sensors, bool print_bits)
bel_print_sensors_state(&event->sensors_state);
bel_print_sensors_status(&event->sensors_status);
}
bel_print_pci_v1_error_status(&event->pcie_v1_error_status, print_bits);

}

Expand Down
17 changes: 17 additions & 0 deletions libraries/libboard/board_n6000/board_event_log.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ struct bel_sensors_status {
struct bel_ext_status ir38063;
struct bel_ext_status isl68220;
uint32_t ed8401_status;
uint32_t reserved[16];
} __attribute__((__packed__));

struct bel_timeof_day {
Expand All @@ -125,6 +126,21 @@ struct bel_pci_error_status {
uint32_t reserved[7];
} __attribute__((__packed__));

/*
 * "v1" PCIe persistent error log record (magic BEL_PCI_V1_ERROR_STATUS,
 * 0x53696D12). Packed so the fields follow each other without padding.
 * The offsets in the comments below are the ones listed in the commit
 * description for this record.
 */
struct bel_pcie_v1_error_status {
struct bel_header header; /* 0x0430.. presumably magic + timestamp low/high; struct bel_header is defined elsewhere - confirm */
uint32_t pcie_link_status; /* 0x043C PCIe Link Status */
uint32_t pcie_uncorr_err; /* 0x0440 PCIe Uncorrectable Err Status */
uint32_t pcie_uncorr_err_mask; /* 0x0444 PCIe Uncorrectable Err Mask */
uint32_t pcie_uncorr_err_severity; /* 0x0448 PCIe Uncorrectable Err Severity */
uint32_t pcie_corr_err_status; /* 0x044C PCIe Correctable Err Status */
uint32_t pcie_corr_err_mask; /* 0x0450 PCIe Correctable Err Mask */
uint32_t pcie_cap_ctrl; /* 0x0454 PCIe Cap And Ctrl */
uint32_t pcie_header_log1; /* 0x0458 PCIe Header Log DW 1 */
uint32_t pcie_header_log2; /* 0x045C PCIe Header Log DW 2 */
uint32_t pcie_header_log3; /* 0x0460 PCIe Header Log DW 3 */
uint32_t pcie_header_log4; /* NOTE(review): DW 4 is not in the commit's offset list; presumably 0x0464 - confirm */
} __attribute__((__packed__)) ;

struct bel_event {
union {
struct {
Expand All @@ -136,6 +152,7 @@ struct bel_event {
struct bel_power_off_status power_off_status;
struct bel_sensors_state sensors_state;
struct bel_sensors_status sensors_status;
struct bel_pcie_v1_error_status pcie_v1_error_status;
};
uint32_t data[1];
};
Expand Down
76 changes: 76 additions & 0 deletions tests/board/test_board_n6000.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

#include "libboard/board_common/board_common.h"
#include "libboard/board_n6000/board_n6000.h"
#include "libboard/board_n6000/board_event_log.h"

using namespace opae::testing;

Expand Down Expand Up @@ -73,6 +74,25 @@ ssize_t board_n6000_c_p::eintr_write(int fd, void *buf, size_t count)
return total_written;
}

// Prototypes of the event-log print routines defined in
// libboard/board_n6000/board_event_log.c. They are declared here with C
// linkage so this C++ test can call them directly.
extern "C" {
void bel_print_pci_error_status(struct bel_pci_error_status* status,
bool print_bits);
void bel_print_timeof_day(struct bel_timeof_day* timeof_day);
void bel_print_fpga_seu(struct bel_fpga_seu* status);
void bel_print_max10_seu(struct bel_max10_seu* status);
void bel_print_sensors_status(struct bel_sensors_status* status);
void bel_print_sensors_state(struct bel_sensors_state* state);
void bel_print_power_off_status(struct bel_power_off_status* status,
bool print_bits);
void bel_print_power_on_status(struct bel_power_on_status* status,
struct bel_timeof_day* timeof_day,
bool print_bits);
// Top-level printer taking a whole event record.
void bel_print(struct bel_event* event, bool print_sensors, bool print_bits);
void bel_print_pci_v1_error_status(struct bel_pcie_v1_error_status* status, bool print_bits);
}



fpga_result board_n6000_c_p::write_sysfs_file(const char *file,
void *buf, size_t count) {
fpga_result res = FPGA_OK;
Expand Down Expand Up @@ -440,6 +460,62 @@ TEST_P(board_dfl_n6000_c_p, board_n6000_12) {
EXPECT_EQ(print_mac_info(device_token_), FPGA_OK);
}

/**
 * @test board_n6000_13
 * @brief Tests: prints event log functions
 * @details Calls every event-log print routine on a zero-filled record whose
 *          header carries the matching magic number, and checks that none of
 *          them throws. <br>
 */
TEST_P(board_dfl_n6000_c_p, board_n6000_13) {
  struct bel_pci_error_status pcie_status;
  memset(&pcie_status, 0x0, sizeof(pcie_status));
  pcie_status.header.magic = 0x53696C9A;
  EXPECT_NO_THROW(bel_print_pci_error_status(&pcie_status, true));

  struct bel_timeof_day timeof_day;
  memset(&timeof_day, 0x0, sizeof(timeof_day));
  timeof_day.header.magic = 0x53696CF0;
  EXPECT_NO_THROW(bel_print_timeof_day(&timeof_day));

  struct bel_max10_seu max10_seu;
  memset(&max10_seu, 0x0, sizeof(max10_seu));
  max10_seu.header.magic = 0x53696CBC;
  EXPECT_NO_THROW(bel_print_max10_seu(&max10_seu));

  struct bel_fpga_seu fpga_seu;
  memset(&fpga_seu, 0x0, sizeof(fpga_seu));
  fpga_seu.header.magic = 0x53696CDE;
  EXPECT_NO_THROW(bel_print_fpga_seu(&fpga_seu));

  struct bel_power_off_status power_off_status;
  memset(&power_off_status, 0x0, sizeof(power_off_status));
  power_off_status.header.magic = 0x53696C34;
  EXPECT_NO_THROW(bel_print_power_off_status(&power_off_status, true));

  struct bel_power_on_status power_on_status;
  memset(&power_on_status, 0x0, sizeof(power_on_status));
  power_on_status.header.magic = 0x53696C12;
  EXPECT_NO_THROW(bel_print_power_on_status(&power_on_status, &timeof_day, true));

  struct bel_sensors_status sensors_status;
  memset(&sensors_status, 0x0, sizeof(sensors_status));
  sensors_status.header.magic = 0x53696C78;
  EXPECT_NO_THROW(bel_print_sensors_status(&sensors_status));

  struct bel_sensors_state sensors_state;
  memset(&sensors_state, 0x0, sizeof(sensors_state));
  sensors_state.header.magic = 0x53696C56;
  EXPECT_NO_THROW(bel_print_sensors_state(&sensors_state));

  // Zero-fill the event before handing it to bel_print(): an uninitialized
  // stack object would make the printers read indeterminate bytes.
  struct bel_event event;
  memset(&event, 0x0, sizeof(event));
  EXPECT_NO_THROW(bel_print(&event, true, true));

  struct bel_pcie_v1_error_status pcie_v1_error_status;
  memset(&pcie_v1_error_status, 0x0, sizeof(pcie_v1_error_status));
  pcie_v1_error_status.header.magic = 0x53696D12;
  EXPECT_NO_THROW(bel_print_pci_v1_error_status(&pcie_v1_error_status, true));
}

GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(board_dfl_n6000_c_p);
INSTANTIATE_TEST_SUITE_P(board_dfl_n6000_c, board_dfl_n6000_c_p,
::testing::ValuesIn(test_platform::mock_platforms({
Expand Down
1 change: 1 addition & 0 deletions tests/framework/mock/test_system.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <thread>
#include <mutex>
#include <atomic>
#include <stdexcept>
#include "platform/fpga_hw.h"
#include <glob.h>

Expand Down
Binary file modified tests/framework/mock_sys_dfl_n6000_sku0_nlb0.tar.gz
Binary file not shown.

0 comments on commit 646c88f

Please sign in to comment.