From fb1741950a4436e3f8fb03b0444025a8d5c4c872 Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Mon, 16 May 2022 14:00:45 +1000 Subject: [PATCH 01/14] 14677 mlxcx NULL deref panic due to race in mlxcx_cmd_taskq Change-Id: If82cbd13b21fac25c929afa096e0bdb53c26e46d --- usr/src/uts/common/io/mlxcx/mlxcx.h | 7 +- usr/src/uts/common/io/mlxcx/mlxcx_cmd.c | 83 +++++++++++++++--------- usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 6 ++ 3 files changed, 59 insertions(+), 37 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index c2843790cca8..c8896f22f15d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -318,12 +318,7 @@ typedef struct mlxcx_cmd_queue { uint8_t mcmd_size_l2; uint8_t mcmd_stride_l2; uint_t mcmd_size; - /* - * The mask has a bit for each command slot, there are a maximum - * of 32 slots. When the bit is set in the mask, it indicates - * the slot is available. - */ - uint32_t mcmd_mask; + uint8_t mcmd_next; /* next command slot */ mlxcx_cmd_t *mcmd_active[MLXCX_CMD_MAX]; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c index 2183413d2bc6..65c714f1c7ce 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c @@ -569,7 +569,7 @@ mlxcx_cmd_queue_init(mlxcx_t *mlxp) return (B_FALSE); } - cmd->mcmd_mask = (uint32_t)((1ULL << cmd->mcmd_size) - 1); + cmd->mcmd_next = 0; mutex_init(&cmd->mcmd_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&cmd->mcmd_cv, NULL, CV_DRIVER, NULL); @@ -840,31 +840,34 @@ mlxcx_cmd_copy_output(mlxcx_cmd_ent_t *ent, mlxcx_cmd_t *cmd) } static uint_t -mlxcx_cmd_reserve_slot(mlxcx_cmd_queue_t *cmdq) +mlxcx_cmd_reserve_slot(mlxcx_cmd_queue_t *cmdq, mlxcx_cmd_t *cmd) { - uint_t slot; - + uint_t i, slot; + ASSERT(mutex_owned(&cmd->mlcmd_lock)); mutex_enter(&cmdq->mcmd_lock); - slot = ddi_ffs(cmdq->mcmd_mask); - while (slot == 0) { + while (1) { + for (i = 0; i < MLXCX_CMD_MAX; ++i) { + slot = (cmdq->mcmd_next + i) % MLXCX_CMD_MAX; + if (cmdq->mcmd_active[slot] == NULL) + break; + } + if (cmdq->mcmd_active[slot] == NULL) { + cmdq->mcmd_active[slot] = cmd; + cmdq->mcmd_next = slot + 1; + mutex_exit(&cmdq->mcmd_lock); + return (slot); + } cv_wait(&cmdq->mcmd_cv, &cmdq->mcmd_lock); - slot = ddi_ffs(cmdq->mcmd_mask); } - - cmdq->mcmd_mask &= ~(1U << --slot); - - ASSERT3P(cmdq->mcmd_active[slot], ==, NULL); - - mutex_exit(&cmdq->mcmd_lock); - - return (slot); } static void -mlxcx_cmd_release_slot(mlxcx_cmd_queue_t *cmdq, uint_t slot) +mlxcx_cmd_release_slot(mlxcx_cmd_queue_t *cmdq, uint_t slot, mlxcx_cmd_t *cmd) { + ASSERT(mutex_owned(&cmd->mlcmd_lock)); mutex_enter(&cmdq->mcmd_lock); - cmdq->mcmd_mask |= 1U << slot; + ASSERT3P(cmdq->mcmd_active[slot], ==, cmd); + cmdq->mcmd_active[slot] = NULL; cv_broadcast(&cmdq->mcmd_cv); mutex_exit(&cmdq->mcmd_lock); } @@ -876,6 +879,8 @@ mlxcx_cmd_done(mlxcx_cmd_t *cmd, uint_t slot) mlxcx_cmd_queue_t *cmdq = &mlxp->mlx_cmd; mlxcx_cmd_ent_t *ent; + ASSERT(mutex_owned(&cmd->mlcmd_lock)); + /* * Command is done. Save relevant data. Once we broadcast on the CV and * drop the lock, we must not touch it again. 
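The heart of the fix is visible in the hunks above: slot bookkeeping moves
off the mcmd_mask free-bit mask and onto mcmd_active[] itself, so claiming
a slot and publishing its command pointer become a single step under
mcmd_lock, and mlxcx_cmd_release_slot() likewise NULLs the entry and
broadcasts under the same lock. Previously the taskq cleared a mask bit
under mcmd_lock but stored into mcmd_active[] afterwards with no lock held
(and NULLed it on completion the same way), leaving a window in which the
interrupt-side lookup could find a NULL or stale entry: the panic in the
synopsis. A condensed model of the new discipline (illustrative only; the
real code is in the hunks above):

	uint_t
	reserve_slot(mlxcx_cmd_queue_t *q, mlxcx_cmd_t *c)
	{
		uint_t i, slot;

		mutex_enter(&q->mcmd_lock);
		for (;;) {
			for (i = 0; i < MLXCX_CMD_MAX; i++) {
				slot = (q->mcmd_next + i) % MLXCX_CMD_MAX;
				if (q->mcmd_active[slot] == NULL) {
					/* claim and publish in one step */
					q->mcmd_active[slot] = c;
					q->mcmd_next = slot + 1;
					mutex_exit(&q->mcmd_lock);
					return (slot);
				}
			}
			cv_wait(&q->mcmd_cv, &q->mcmd_lock);
		}
	}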
@@ -885,17 +890,16 @@ mlxcx_cmd_done(mlxcx_cmd_t *cmd, uint_t slot) ent = (mlxcx_cmd_ent_t *)(cmdq->mcmd_dma.mxdb_va + (slot << cmdq->mcmd_stride_l2)); - mutex_enter(&cmd->mlcmd_lock); cmd->mlcmd_status = MLXCX_CMD_STATUS(ent->mce_status); if (cmd->mlcmd_status == 0) mlxcx_cmd_copy_output(ent, cmd); cmd->mlcmd_state = MLXCX_CMD_S_DONE; cv_broadcast(&cmd->mlcmd_cv); - mutex_exit(&cmd->mlcmd_lock); - cmdq->mcmd_active[slot] = NULL; - mlxcx_cmd_release_slot(cmdq, slot); + mlxcx_cmd_release_slot(cmdq, slot, cmd); + + mutex_exit(&cmd->mlcmd_lock); } static void @@ -907,14 +911,14 @@ mlxcx_cmd_taskq(void *arg) mlxcx_cmd_ent_t *ent; uint_t poll, slot; - ASSERT3S(cmd->mlcmd_op, !=, 0); + mutex_enter(&cmd->mlcmd_lock); + + VERIFY3S(cmd->mlcmd_op, !=, 0); - slot = mlxcx_cmd_reserve_slot(cmdq); + slot = mlxcx_cmd_reserve_slot(cmdq, cmd); ent = (mlxcx_cmd_ent_t *)(cmdq->mcmd_dma.mxdb_va + (slot << cmdq->mcmd_stride_l2)); - cmdq->mcmd_active[slot] = cmd; - /* * Command queue is currently ours as we set busy. */ @@ -924,15 +928,25 @@ mlxcx_cmd_taskq(void *arg) ent->mce_out_length = to_be32(cmd->mlcmd_outlen); ent->mce_token = cmd->mlcmd_token; ent->mce_sig = 0; - ent->mce_status = MLXCX_CMD_HW_OWNED; mlxcx_cmd_prep_input(ent, cmd); mlxcx_cmd_prep_output(ent, cmd); + + /* + * Ensure all of the other fields of the entry are written before + * we switch the owner to hardware (the device might start executing + * right away) + */ + membar_producer(); + ent->mce_status = MLXCX_CMD_HW_OWNED; + MLXCX_DMA_SYNC(cmdq->mcmd_dma, DDI_DMA_SYNC_FORDEV); mlxcx_put32(mlxp, MLXCX_ISS_CMD_DOORBELL, 1 << slot); - if (!cmd->mlcmd_poll) + if (!cmd->mlcmd_poll) { + mutex_exit(&cmd->mlcmd_lock); return; + } for (poll = 0; poll < mlxcx_cmd_tries; poll++) { delay(drv_usectohz(mlxcx_cmd_delay)); @@ -947,21 +961,21 @@ mlxcx_cmd_taskq(void *arg) */ if (poll == mlxcx_cmd_tries) { - mutex_enter(&cmd->mlcmd_lock); cmd->mlcmd_status = MLXCX_CMD_R_TIMEOUT; cmd->mlcmd_state = MLXCX_CMD_S_ERROR; cv_broadcast(&cmd->mlcmd_cv); + + mlxcx_cmd_release_slot(cmdq, slot, cmd); + mutex_exit(&cmd->mlcmd_lock); mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_NO_RESPONSE); - cmdq->mcmd_active[slot] = NULL; - mlxcx_cmd_release_slot(cmdq, slot); - return; } mlxcx_cmd_done(cmd, slot); + /* mlxcx_cmd_done releases mlcmd_lock */ } void @@ -980,10 +994,17 @@ mlxcx_cmd_completion(mlxcx_t *mlxp, mlxcx_eventq_ent_t *ent) comp_vec &= ~(1U << --slot); cmd = cmdq->mcmd_active[slot]; + + /* + * This field is never modified, so we shouldn't need to hold + * mlcmd_lock before checking it. 
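+		 * (mlcmd_poll is assigned when the command is submitted,
+		 * before it is published via mcmd_active[], and nothing
+		 * changes it afterwards.)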
+ */ if (cmd->mlcmd_poll) continue; + mutex_enter(&cmd->mlcmd_lock); mlxcx_cmd_done(cmd, slot); + /* mlxcx_cmd_done releases mlcmd_lock */ } } diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 7711c501288e..2708884413c0 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -1453,6 +1453,12 @@ mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first) ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ); ASSERT(mutex_owned(&mlwq->mlwq_mtx)); + /* + * Make sure all prior stores are flushed out before we update the + * counter: hardware can immediately start executing after this write + * (the doorbell below just makes sure it's awake) + */ + membar_producer(); mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc); ASSERT(mlwq->mlwq_cq != NULL); From 448f2403eb6d44b294369b260bf1c57a25a206d1 Mon Sep 17 00:00:00 2001 From: Dan McDonald Date: Mon, 29 Jan 2024 15:52:46 -0500 Subject: [PATCH 02/14] xxxxx mlxcx needs DDI_DEVICE_ATTR_V1 --- usr/src/uts/common/io/mlxcx/mlxcx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index 12f80cf76830..f8effecbfd80 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -1259,7 +1259,7 @@ mlxcx_regs_map(mlxcx_t *mlxp) * device. */ bzero(&da, sizeof (ddi_device_acc_attr_t)); - da.devacc_attr_version = DDI_DEVICE_ATTR_V0; + da.devacc_attr_version = DDI_DEVICE_ATTR_V1; da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { From 9bb6578d766c800b240092e4340e58af98306104 Mon Sep 17 00:00:00 2001 From: Dan McDonald Date: Thu, 28 Sep 2023 11:14:56 -0400 Subject: [PATCH 03/14] inline ddi_{get,put} wrappers --- usr/src/uts/common/io/mlxcx/mlxcx.c | 55 -------------------------- usr/src/uts/common/io/mlxcx/mlxcx.h | 61 +++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 63 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index f8effecbfd80..4f0e3f47bc8d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -650,61 +650,6 @@ mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 
va_end(ap); } -uint16_t -mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); -} - -uint32_t -mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); -} - -uint64_t -mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); -} - -void -mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); -} - -void -mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); -} - -void -mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) -{ - /* - * The UAR is always inside the first BAR, which we mapped as - * mlx_regs - */ - uintptr_t addr = off + (uintptr_t)mlu->mlu_base + - (uintptr_t)mlxp->mlx_regs_base; - ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); -} - -void -mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) -{ - uintptr_t addr = off + (uintptr_t)mlu->mlu_base + - (uintptr_t)mlxp->mlx_regs_base; - ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); -} - static void mlxcx_fm_fini(mlxcx_t *mlxp) { diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index c8896f22f15d..43be3603fe8a 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -1239,17 +1239,62 @@ struct mlxcx { }; /* - * Register access + * Register access. Use static inlines. 
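+ * Keeping these in the header lets the compiler inline them into hot
+ * paths such as doorbell writes and EQ arming, rather than paying a
+ * function call for every register access.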
*/ -extern uint16_t mlxcx_get16(mlxcx_t *, uintptr_t); -extern uint32_t mlxcx_get32(mlxcx_t *, uintptr_t); -extern uint64_t mlxcx_get64(mlxcx_t *, uintptr_t); +static inline uint16_t +mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); +} + +static inline uint32_t +mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); +} + +static inline uint64_t +mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); +} + +static inline void +mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); +} -extern void mlxcx_put32(mlxcx_t *, uintptr_t, uint32_t); -extern void mlxcx_put64(mlxcx_t *, uintptr_t, uint64_t); +static inline void +mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); +} + +static inline void +mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) +{ + /* + * The UAR is always inside the first BAR, which we mapped as + * mlx_regs + */ + uintptr_t addr = off + (uintptr_t)mlu->mlu_base + + (uintptr_t)mlxp->mlx_regs_base; + ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); +} -extern void mlxcx_uar_put32(mlxcx_t *, mlxcx_uar_t *, uintptr_t, uint32_t); -extern void mlxcx_uar_put64(mlxcx_t *, mlxcx_uar_t *, uintptr_t, uint64_t); +static inline void +mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) +{ + uintptr_t addr = off + (uintptr_t)mlu->mlu_base + + (uintptr_t)mlxp->mlx_regs_base; + ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); +} /* * Logging functions. From 518ed3f53838a174c8048b448773fc411e10bb56 Mon Sep 17 00:00:00 2001 From: Gordon Ross Date: Fri, 9 Feb 2024 14:19:53 -0500 Subject: [PATCH 04/14] xxxxx mlxcx should lower DMA threshold Reviewed by: Jerry Jelinek Reviewed by: Sam Zaydel --- usr/src/uts/common/io/mlxcx/mlxcx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 43be3603fe8a..17b6dc2fad2a 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -167,7 +167,7 @@ extern "C" { * How big does an mblk have to be before we dma_bind() it instead of * bcopying? */ -#define MLXCX_TX_BIND_THRESHOLD_DFLT 2048 +#define MLXCX_TX_BIND_THRESHOLD_DFLT 512 /* * How often to check the status of completion queues for overflow and From e5984ebfce2a53d3f0a54e52014ebe9ff19150fe Mon Sep 17 00:00:00 2001 From: Robert Mustacchi Date: Fri, 24 Feb 2023 23:19:43 +0000 Subject: [PATCH 05/14] 15445 mlxcx MAC_PROP_MEDIA support Change-Id: I4b63e2b4ccb6f0a11c25e69cb99459e28a994b6c --- usr/src/uts/common/io/mlxcx/mlxcx_gld.c | 121 ++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index c01fc94a4eff..bac7f7a99546 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -13,6 +13,7 @@ * Copyright (c) 2021, the University of Queensland * Copyright 2020 RackTop Systems, Inc. * Copyright 2023 MNX Cloud, Inc. 
+ * Copyright 2023 Oxide Computer Company */ /* @@ -29,6 +30,7 @@ #include #include +#include /* Need these for mac_vlan_header_info() */ #include @@ -200,6 +202,119 @@ mlxcx_link_fec_cap(link_fec_t fec, mlxcx_pplm_fec_caps_t *pfecp) return (B_TRUE); } +static mac_ether_media_t +mlxcx_mac_media(mlxcx_port_t *port) +{ + switch (port->mlp_oper_status) { + case MLXCX_PORT_STATUS_UP: + case MLXCX_PORT_STATUS_UP_ONCE: + break; + case MLXCX_PORT_STATUS_DOWN: + return (ETHER_MEDIA_NONE); + case MLXCX_PORT_STATUS_DISABLED: + return (ETHER_MEDIA_UNKNOWN); + } + + switch (port->mlp_oper_proto) { + case MLXCX_PROTO_SGMII: + return (ETHER_MEDIA_1000_SGMII); + case MLXCX_PROTO_1000BASE_KX: + return (ETHER_MEDIA_1000BASE_KX); + case MLXCX_PROTO_10GBASE_CX4: + return (ETHER_MEDIA_10GBASE_CX4); + case MLXCX_PROTO_10GBASE_KX4: + return (ETHER_MEDIA_10GBASE_KX4); + case MLXCX_PROTO_10GBASE_KR: + return (ETHER_MEDIA_10GBASE_KR); + case MLXCX_PROTO_40GBASE_CR4: + return (ETHER_MEDIA_40GBASE_CR4); + case MLXCX_PROTO_40GBASE_KR4: + return (ETHER_MEDIA_40GBASE_KR4); + case MLXCX_PROTO_SGMII_100BASE: + return (ETHER_MEDIA_100_SGMII); + case MLXCX_PROTO_10GBASE_CR: + return (ETHER_MEDIA_10GBASE_CR); + case MLXCX_PROTO_10GBASE_SR: + return (ETHER_MEDIA_10GBASE_SR); + case MLXCX_PROTO_10GBASE_ER_LR: + return (ETHER_MEDIA_10GBASE_LR); + case MLXCX_PROTO_40GBASE_SR4: + return (ETHER_MEDIA_40GBASE_SR4); + case MLXCX_PROTO_40GBASE_LR4_ER4: + return (ETHER_MEDIA_40GBASE_LR4); + case MLXCX_PROTO_50GBASE_SR2: + return (ETHER_MEDIA_50GBASE_SR2); + case MLXCX_PROTO_100GBASE_CR4: + return (ETHER_MEDIA_100GBASE_CR4); + case MLXCX_PROTO_100GBASE_SR4: + return (ETHER_MEDIA_100GBASE_SR4); + case MLXCX_PROTO_100GBASE_KR4: + return (ETHER_MEDIA_100GBASE_KR4); + case MLXCX_PROTO_25GBASE_CR: + return (ETHER_MEDIA_25GBASE_CR); + case MLXCX_PROTO_25GBASE_KR: + return (ETHER_MEDIA_25GBASE_KR); + case MLXCX_PROTO_25GBASE_SR: + return (ETHER_MEDIA_25GBASE_SR); + case MLXCX_PROTO_50GBASE_CR2: + return (ETHER_MEDIA_50GBASE_CR2); + case MLXCX_PROTO_50GBASE_KR2: + return (ETHER_MEDIA_50GBASE_KR2); + default: + /* FALLTHRU */ + break; + } + + switch (port->mlp_ext_oper_proto) { + case MLXCX_EXTPROTO_SGMII_100BASE: + return (ETHER_MEDIA_100_SGMII); + case MLXCX_EXTPROTO_1000BASE_X_SGMII: + return (ETHER_MEDIA_1000_SGMII); + case MLXCX_EXTPROTO_5GBASE_R: + return (ETHER_MEDIA_5000BASE_KR); /* XXX KEBE ASKS use _KR ? */ + case MLXCX_EXTPROTO_10GBASE_XFI_XAUI_1: + return (ETHER_MEDIA_10G_XAUI); + case MLXCX_EXTPROTO_40GBASE_XLAUI_4_XLPPI_4: + return (ETHER_MEDIA_40G_XLPPI); + case MLXCX_EXTPROTO_25GAUI_1_25GBASE_CR_KR: + return (ETHER_MEDIA_25G_AUI); + case MLXCX_EXTPROTO_50GAUI_2_LAUI_2_50GBASE_CR2_KR2: + case MLXCX_EXTPROTO_50GAUI_1_LAUI_1_50GBASE_CR_KR: + /* No type for 50G AUI as far as I can see. */ + return (ETHER_MEDIA_UNKNOWN); + case MLXCX_EXTPROTO_CAUI_4_100GBASE_CR4_KR4: + return (ETHER_MEDIA_100GBASE_CAUI4); + case MLXCX_EXTPROTO_100GAUI_2_100GBASE_CR2_KR2: + case MLXCX_EXTPROTO_100GAUI_1_100GBASE_CR_KR: + /* No type for 100G AUI as far as I can see. */ + return (ETHER_MEDIA_UNKNOWN); + /* + * NOTE: These report unsupported but keeping them in active code for + * detection purposes. 
+ */ + case MLXCX_EXTPROTO_200GAUI_4_200GBASE_CR4_KR4: + return (ETHER_MEDIA_200GAUI_4); + case MLXCX_EXTPROTO_200GAUI_2_200GBASE_CR2_KR2: + return (ETHER_MEDIA_200GAUI_2); + case MLXCX_EXTPROTO_400GAUI_8_400GBASE_CR8: + return (ETHER_MEDIA_400GAUI_8); + case MLXCX_EXTPROTO_400GAUI_4_400GBASE_CR4: + return (ETHER_MEDIA_400GAUI_4); + default: + /* + * There ARE legitimate single-bit values we don't support, + * and should just return 0 immediately. We will ASSERT() + * that it's a single-bit value, however. + */ + /* This check should work okay for 0 too. */ + ASSERT0((uint32_t)port->mlp_ext_oper_proto & + ((uint32_t)port->mlp_ext_oper_proto - 1U)); + break; + } + + return (ETHER_MEDIA_UNKNOWN); +} + static int mlxcx_mac_stat_rfc_2863(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat, uint64_t *val) @@ -340,6 +455,9 @@ mlxcx_mac_stat(void *arg, uint_t stat, uint64_t *val) case MAC_STAT_NORCVBUF: *val = port->mlp_stats.mlps_rx_drops; break; + case ETHER_STAT_XCVR_INUSE: + *val = (uint64_t)mlxcx_mac_media(port); + break; default: ret = ENOTSUP; } @@ -1453,6 +1571,9 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, *(link_state_t *)pr_val = LINK_STATE_UNKNOWN; } break; + case MAC_PROP_MEDIA: + *(mac_ether_media_t *)pr_val = mlxcx_mac_media(port); + break; case MAC_PROP_AUTONEG: if (pr_valsize < sizeof (uint8_t)) { ret = EOVERFLOW; From 3f2b2d415d7c39767aba9d3e79e56e8a4236ad97 Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Tue, 31 Oct 2023 13:12:43 +1000 Subject: [PATCH 06/14] mlxcx: batch up work under bufshard locks in ISR, don't take and release for every packet Change-Id: I569b38c507db3a51c97dd4a58afab1d17f4b7caf --- usr/src/uts/common/io/mlxcx/mlxcx.c | 27 +++ usr/src/uts/common/io/mlxcx/mlxcx.h | 20 +- usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 8 +- usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 229 ++++++++++++++++++++++- 4 files changed, 276 insertions(+), 8 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index 4f0e3f47bc8d..501529a0daf4 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -761,6 +761,7 @@ mlxcx_teardown_bufs(mlxcx_t *mlxp) list_destroy(&mlxp->mlx_buf_shards); kmem_cache_destroy(mlxp->mlx_bufs_cache); + kmem_cache_destroy(mlxp->mlx_mbrm_cache); } static void @@ -1379,6 +1380,26 @@ mlxcx_bufs_cache_destr(void *arg, void *cookie) list_destroy(&b->mlb_tx_chain); } +static int +mlxcx_mbrm_cache_constr(void *arg, void *cookie, int kmflags) +{ + mlxcx_t *mlxp = cookie; + mlxcx_buf_return_mblk_t *mbrm = arg; + (void)mlxp; + bzero(mbrm, sizeof (mlxcx_buf_return_mblk_t)); + return (0); +} + +static void +mlxcx_mbrm_cache_destr(void *arg, void *cookie) +{ + mlxcx_t *mlxp = cookie; + mlxcx_buf_return_mblk_t *mbrm = arg; + (void)mlxp; + VERIFY3P(mbrm->mbrm_mp, ==, NULL); + VERIFY(!list_link_active(&mbrm->mbrm_entry)); +} + mlxcx_buf_shard_t * mlxcx_mlbs_create(mlxcx_t *mlxp) { @@ -1412,6 +1433,12 @@ mlxcx_setup_bufs(mlxcx_t *mlxp) sizeof (mlxcx_buffer_t), sizeof (uint64_t), mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, NULL, mlxp, NULL, 0); + (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_mbrm_cache", + ddi_get_instance(mlxp->mlx_dip)); + mlxp->mlx_mbrm_cache = kmem_cache_create(namebuf, + sizeof (mlxcx_buf_return_mblk_t), sizeof (uint64_t), + mlxcx_mbrm_cache_constr, mlxcx_mbrm_cache_destr, + NULL, mlxp, NULL, 0); list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), offsetof(mlxcx_buf_shard_t, mlbs_entry)); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h 
b/usr/src/uts/common/io/mlxcx/mlxcx.h index 17b6dc2fad2a..77569e9a5266 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -1225,6 +1225,7 @@ struct mlxcx { mlxcx_ring_group_t *mlx_tx_groups; kmem_cache_t *mlx_bufs_cache; + kmem_cache_t *mlx_mbrm_cache; list_t mlx_buf_shards; ddi_periodic_t mlx_eq_checktimer; @@ -1238,6 +1239,23 @@ struct mlxcx { mlxcx_temp_sensor_t *mlx_temp_sensors; }; +typedef struct mlxcx_buf_return_mblk { + list_node_t mbrm_entry; + mblk_t *mbrm_mp; +} mlxcx_buf_return_mblk_t; + +#define MLXCX_BUF_RETURN_BATCH_SHARDS 4 +typedef struct mlxcx_buf_return_batch { + uint mbrb_n[MLXCX_BUF_RETURN_BATCH_SHARDS]; + mlxcx_buf_shard_t *mbrb_shard[MLXCX_BUF_RETURN_BATCH_SHARDS]; + list_t mbrb_list[MLXCX_BUF_RETURN_BATCH_SHARDS]; + list_t mbrb_mblks; +} mlxcx_buf_return_batch_t; + +extern void mlxcx_buf_return_batch_init(mlxcx_buf_return_batch_t *); +extern void mlxcx_buf_return_batch_flush(mlxcx_t *, mlxcx_buf_return_batch_t *); + + /* * Register access. Use static inlines. */ @@ -1410,7 +1428,7 @@ extern void mlxcx_teardown_rx_group(mlxcx_t *, mlxcx_ring_group_t *); extern void mlxcx_teardown_tx_group(mlxcx_t *, mlxcx_ring_group_t *); extern void mlxcx_tx_completion(mlxcx_t *, mlxcx_completion_queue_t *, - mlxcx_completionq_ent_t *, mlxcx_buffer_t *); + mlxcx_completionq_ent_t *, mlxcx_buffer_t *, mlxcx_buf_return_batch_t *); extern mblk_t *mlxcx_rx_completion(mlxcx_t *, mlxcx_completion_queue_t *, mlxcx_completionq_ent_t *, mlxcx_buffer_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index e2f51411719d..2b2ac78334bb 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -874,6 +874,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, uint_t rx_frames = 0; uint_t comp_cnt = 0; int64_t wqebbs, bufcnt; + mlxcx_buf_return_batch_t rbatch; *mpp = NULL; @@ -886,6 +887,8 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, nmp = cmp = mp = NULL; + mlxcx_buf_return_batch_init(&rbatch); + wqebbs = 0; bufcnt = 0; for (cent = mlxcx_cq_next(mlcq); cent != NULL; @@ -979,7 +982,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, switch (mlcq->mlcq_wq->mlwq_type) { case MLXCX_WQ_TYPE_SENDQ: - mlxcx_tx_completion(mlxp, mlcq, cent, buf); + mlxcx_tx_completion(mlxp, mlcq, cent, buf, &rbatch); break; case MLXCX_WQ_TYPE_RECVQ: nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); @@ -1006,6 +1009,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, * high->low water mark. 
*/ if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) { + mlxcx_buf_return_batch_flush(mlxp, &rbatch); mlxcx_update_cqci(mlxp, mlcq); /* * Both these variables are incremented using @@ -1024,6 +1028,8 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, break; } + mlxcx_buf_return_batch_flush(mlxp, &rbatch); + if (comp_cnt > 0) { mlxcx_update_cqci(mlxp, mlcq); atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 2708884413c0..13d7930b5460 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -1997,16 +1997,22 @@ mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); } +static void mlxcx_buf_return_batch_push(mlxcx_t *mlxp, + mlxcx_buf_return_batch_t *mbrb, mlxcx_buffer_t *b); +static void mlxcx_buf_return_batch_push_chain(mlxcx_t *mlxp, + mlxcx_buf_return_batch_t *mbrb, mlxcx_buffer_t *b0, boolean_t keepmp); + void mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, - mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) + mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf, + mlxcx_buf_return_batch_t *mbrb) { ASSERT(mutex_owned(&mlcq->mlcq_mtx)); if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) { mlxcx_completionq_error_ent_t *eent = (mlxcx_completionq_error_ent_t *)ent; mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); mlxcx_check_sq(mlxp, mlcq->mlcq_wq); mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); @@ -2015,24 +2021,24 @@ mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) { mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); return; } if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) { mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x", ent->mlcqe_send_wqe_opcode); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); return; } if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); return; } - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); } mblk_t * @@ -2496,6 +2502,29 @@ mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp) mlxcx_buf_return(mlxp, b0); } +static void +mlxcx_buf_return_batch_push_chain(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, + mlxcx_buffer_t *b0, boolean_t keepmp) +{ + mlxcx_buffer_t *b; + + if (b0->mlb_tx_head != b0) { + mlxcx_buf_return_batch_push(mlxp, mbrb, b0); + return; + } + + b = list_head(&b0->mlb_tx_chain); + while (b != NULL) { + mlxcx_buf_return_batch_push(mlxp, mbrb, b); + b = list_next(&b0->mlb_tx_chain, b); + } + if (keepmp) { + b0->mlb_tx_mp = NULL; + b0->mlb_tx_head = NULL; + } + mlxcx_buf_return_batch_push(mlxp, mbrb, b0); +} + inline void mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr) { @@ -2504,6 +2533,194 @@ mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr) s->mlbs_hiwat2 = 3 * (s->mlbs_ntotal / 4); } +static void mlxcx_buf_return_batch_flush_shard(mlxcx_t 
*, + mlxcx_buf_return_batch_t *, uint); + +static void +mlxcx_buf_return_batch_push(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, + mlxcx_buffer_t *b) +{ + uint i, found = 0; + uint min_n, min_n_i; + mlxcx_buf_shard_t *s = b->mlb_shard; + + VERIFY(!list_link_active(&b->mlb_cq_entry)); + + /* + * Are we already spooling up buffers for this shard? If so, add it + * to that existing list + */ + for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + if (mbrb->mbrb_shard[i] == s) { + found = 1; + break; + } + } + if (!found) { + /* Do we have any unused shard slots? If so, use that. */ + for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + if (mbrb->mbrb_shard[i] == NULL) { + mbrb->mbrb_shard[i] = s; + found = 1; + break; + } + } + } + if (!found) { + /* Otherwise evict the least popular shard. */ + min_n = mbrb->mbrb_n[0]; + min_n_i = 0; + for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + if (mbrb->mbrb_n[i] < min_n) { + min_n = mbrb->mbrb_n[i]; + min_n_i = i; + } + } + mlxcx_buf_return_batch_flush_shard(mlxp, mbrb, min_n_i); + i = min_n_i; + found = 1; + } + ASSERT(found); + + ++mbrb->mbrb_n[i]; + list_insert_tail(&mbrb->mbrb_list[i], b); +} + +void +mlxcx_buf_return_batch_init(mlxcx_buf_return_batch_t *mbrb) +{ + uint i; + list_create(&mbrb->mbrb_mblks, sizeof (mlxcx_buf_return_mblk_t), + offsetof(mlxcx_buf_return_mblk_t, mbrm_entry)); + for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + mbrb->mbrb_shard[i] = NULL; + list_create(&mbrb->mbrb_list[i], sizeof (mlxcx_buffer_t), + offsetof(mlxcx_buffer_t, mlb_cq_entry)); + } +} + +static void +mlxcx_buf_return_step1(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, + mlxcx_buffer_t *b) +{ + mlxcx_buffer_t *txhead = b->mlb_tx_head; + mlxcx_buf_shard_t *s = b->mlb_shard; + mlxcx_buf_return_mblk_t *mbrm; + mblk_t *mp = b->mlb_tx_mp; + + VERIFY3U(b->mlb_state, !=, MLXCX_BUFFER_FREE); + ASSERT3P(b->mlb_mlx, ==, mlxp); + + b->mlb_wqe_index = 0; + b->mlb_tx_mp = NULL; + b->mlb_used = 0; + b->mlb_wqebbs = 0; + if (txhead == b) { + mbrm = kmem_cache_alloc(mlxp->mlx_mbrm_cache, KM_SLEEP); + mbrm->mbrm_mp = mp; + list_insert_tail(&mbrb->mbrb_mblks, mbrm); + } + ASSERT(list_is_empty(&b->mlb_tx_chain)); + + if (b->mlb_foreign) { + if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) { + mlxcx_dma_unbind(mlxp, &b->mlb_dma); + } + } +} + +static void +mlxcx_buf_return_step2(mlxcx_t *mlxp, mlxcx_buffer_t *b) +{ + mlxcx_buffer_state_t oldstate = b->mlb_state; + mlxcx_buffer_t *txhead = b->mlb_tx_head; + mlxcx_buf_shard_t *s = b->mlb_shard; + mblk_t *mp = b->mlb_tx_mp; + + ASSERT(mutex_owned(&s->mlbs_mtx)); + + b->mlb_state = MLXCX_BUFFER_FREE; + b->mlb_tx_head = NULL; + + switch (oldstate) { + case MLXCX_BUFFER_INIT: + mlxcx_bufshard_adjust_total(s, 1); + break; + case MLXCX_BUFFER_ON_WQ: + list_remove(&s->mlbs_busy, b); + break; + case MLXCX_BUFFER_ON_LOAN: + ASSERT(!b->mlb_foreign); + --s->mlbs_nloaned; + list_remove(&s->mlbs_loaned, b); + if (s->mlbs_state == MLXCX_SHARD_DRAINING) { + /* + * When we're draining, Eg during mac_stop(), + * we destroy the buffer immediately rather than + * recycling it. Otherwise we risk leaving it + * on the free list and leaking it. + */ + list_insert_tail(&s->mlbs_free, b); + mlxcx_buf_destroy(mlxp, b); + /* + * Teardown might be waiting for loaned list to empty. 
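+			 * (It waits on mlbs_free_nonempty, hence the
+			 * broadcast below even on this destroy path.)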
+ */ + cv_broadcast(&s->mlbs_free_nonempty); + return; + } + break; + case MLXCX_BUFFER_FREE: + VERIFY(0); + break; + case MLXCX_BUFFER_ON_CHAIN: + ASSERT(txhead != NULL); + list_remove(&txhead->mlb_tx_chain, b); + list_remove(&s->mlbs_busy, b); + break; + } + + list_insert_tail(&s->mlbs_free, b); + cv_broadcast(&s->mlbs_free_nonempty); +} + +static void +mlxcx_buf_return_batch_flush_shard(mlxcx_t *mlxp, + mlxcx_buf_return_batch_t *mbrb, uint i) +{ + mlxcx_buffer_t *b; + mlxcx_buf_return_mblk_t *mbrm; + + b = list_head(&mbrb->mbrb_list[i]); + while (b != NULL) { + mlxcx_buf_return_step1(mlxp, mbrb, b); + b = list_next(&mbrb->mbrb_list[i], b); + } + mutex_enter(&mbrb->mbrb_shard[i]->mlbs_mtx); + while ((b = list_remove_head(&mbrb->mbrb_list[i]))) { + mlxcx_buf_return_step2(mlxp, b); + } + mutex_exit(&mbrb->mbrb_shard[i]->mlbs_mtx); + while ((mbrm = list_remove_head(&mbrb->mbrb_mblks))) { + freemsg(mbrm->mbrm_mp); + mbrm->mbrm_mp = NULL; + kmem_cache_free(mlxp->mlx_mbrm_cache, mbrm); + } + + mbrb->mbrb_shard[i] = NULL; + mbrb->mbrb_n[i] = 0; +} + +void +mlxcx_buf_return_batch_flush(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb) +{ + uint i; + for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + if (mbrb->mbrb_shard[i] == NULL) + continue; + mlxcx_buf_return_batch_flush_shard(mlxp, mbrb, i); + } +} + void mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) { From 206c0205242f06e97a7cf4a873541246c2c4a13c Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Tue, 7 Nov 2023 06:01:40 +0000 Subject: [PATCH 07/14] mlxcx: bufbmtx very hot thanks to sq_ring_dbell --- usr/src/uts/common/io/mlxcx/mlxcx.h | 1 + usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 1 + usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 26 +++++++++++++++++------- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 77569e9a5266..5b94dd4b78de 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -624,6 +624,7 @@ typedef struct mlxcx_completion_queue { list_t mlcq_buffers; kmutex_t mlcq_bufbmtx; list_t mlcq_buffers_b; + uint64_t mlcq_bufbgen; uint_t mlcq_check_disarm_cnt; uint64_t mlcq_check_disarm_cc; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index 2b2ac78334bb..656f3a497307 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -942,6 +942,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, list_move_tail(&mlcq->mlcq_buffers, &mlcq->mlcq_buffers_b); added = B_TRUE; + ++mlcq->mlcq_bufbgen; } mutex_exit(&mlcq->mlcq_bufbmtx); if (added) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 13d7930b5460..5e269fe3dcda 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -1559,6 +1559,7 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, mlxcx_buffer_t *b; ddi_fm_error_t err; boolean_t rv; + uint64_t bufbgen; ASSERT(mutex_owned(&mlwq->mlwq_mtx)); ASSERT3P(b0->mlb_tx_head, ==, b0); @@ -1700,21 +1701,32 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, } /* - * Hold the bufmtx whilst ringing the doorbell, to prevent - * the buffer from being moved to another list, so we can - * safely remove it should the ring fail. + * Stash the bufbgen counter, which is incremented every time + * buffers_b is merged into buffers. 
This lets us easily tell which + * list we need to take the buffer back from if we fail in + * sq_ring_dbell (which will only happen if everything is going pretty + * badly). */ mutex_enter(&cq->mlcq_bufbmtx); - + bufbgen = cq->mlcq_bufbgen; list_insert_tail(&cq->mlcq_buffers_b, b0); + mutex_exit(&cq->mlcq_bufbmtx); + if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) { atomic_inc_64(&cq->mlcq_bufcnt); } else { - list_remove(&cq->mlcq_buffers_b, b0); + mutex_enter(&cq->mlcq_bufbmtx); + if (bufbgen == cq->mlcq_bufbgen) { + list_remove(&cq->mlcq_buffers_b, b0); + mutex_exit(&cq->mlcq_bufbmtx); + } else { + mutex_exit(&cq->mlcq_bufbmtx); + mutex_enter(&cq->mlcq_mtx); + list_remove(&cq->mlcq_buffers, b0); + mutex_exit(&cq->mlcq_mtx); + } } - mutex_exit(&cq->mlcq_bufbmtx); - return (rv); } From f8aaea43f321bde7bc5fe6319bde41da99ba4285 Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Wed, 8 Nov 2023 03:19:13 +0000 Subject: [PATCH 08/14] mlxcx: multi-TIS and bf alternation --- usr/src/uts/common/io/mlxcx/mlxcx.h | 4 +++- usr/src/uts/common/io/mlxcx/mlxcx_reg.h | 4 ++-- usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 30 +++++++++++++++--------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 5b94dd4b78de..809652b35be6 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -887,6 +887,8 @@ typedef enum { MLXCX_TIRS_PER_GROUP } mlxcx_tir_role_t; +#define MLXCX_TIS_PER_GROUP 8 + typedef struct { avl_node_t mlgm_group_entry; list_node_t mlgm_fe_entry; @@ -911,7 +913,7 @@ struct mlxcx_ring_group { mac_group_handle_t mlg_mac_hdl; union { - mlxcx_tis_t mlg_tis; + mlxcx_tis_t mlg_tis[MLXCX_TIS_PER_GROUP]; mlxcx_tir_t mlg_tir[MLXCX_TIRS_PER_GROUP]; }; mlxcx_port_t *mlg_port; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 2265bd054d07..fa4eda06c639 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -71,8 +71,8 @@ #define MLXCX_UAR_EQ_NOARM 0x0048 /* Number of blue flame reg pairs per UAR */ -#define MLXCX_BF_PER_UAR 2 -#define MLXCX_BF_PER_UAR_MASK 0x1 +#define MLXCX_BF_PER_UAR 4 +#define MLXCX_BF_PER_UAR_MASK (MLXCX_BF_PER_UAR - 1) #define MLXCX_BF_SIZE 0x100 #define MLXCX_BF_BASE 0x0800 diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 5e269fe3dcda..53c53a62a64a 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -685,12 +685,16 @@ mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) g->mlg_state &= ~MLXCX_GROUP_WQS; } - if ((g->mlg_state & MLXCX_GROUP_TIRTIS) && - g->mlg_tis.mltis_state & MLXCX_TIS_CREATED && - !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) { - if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) { - mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring", - g->mlg_tis.mltis_num); + if ((g->mlg_state & MLXCX_GROUP_TIRTIS)) { + for (i = 0; i < MLXCX_TIS_PER_GROUP; ++i) { + if (!(g->mlg_tis[i].mltis_state & MLXCX_TIS_CREATED)) + continue; + if (g->mlg_tis[i].mltis_state & MLXCX_TIS_DESTROYED) + continue; + if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis[i])) { + mlxcx_warn(mlxp, "failed to destroy tis %u for " + "tx ring", g->mlg_tis[i].mltis_num); + } } } g->mlg_state &= ~MLXCX_GROUP_TIRTIS; @@ -1324,6 +1328,7 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) mlxcx_completion_queue_t *cq; mlxcx_work_queue_t *sq; uint_t i; + mlxcx_tis_t *tis; 
ASSERT3S(g->mlg_state, ==, 0); @@ -1341,11 +1346,13 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP); g->mlg_state |= MLXCX_GROUP_WQS; - g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom; + for (i = 0; i < MLXCX_TIS_PER_GROUP; ++i) { + g->mlg_tis[i].mltis_tdom = &mlxp->mlx_tdom; - if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) { - mutex_exit(&g->mlg_mtx); - return (B_FALSE); + if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis[i])) { + mutex_exit(&g->mlg_mtx); + return (B_FALSE); + } } g->mlg_state |= MLXCX_GROUP_TIRTIS; @@ -1370,7 +1377,8 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) cq->mlcq_stats = &g->mlg_port->mlp_stats; sq = &g->mlg_wqs[i]; - if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) { + tis = &g->mlg_tis[i % MLXCX_TIS_PER_GROUP]; + if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, tis, sq)) { mutex_exit(&g->mlg_mtx); return (B_FALSE); } From b2f543d6e33d24d277b2958085d7d1523233ff0c Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Wed, 8 Nov 2023 00:10:02 +0000 Subject: [PATCH 09/14] mlxcx: prepare sqes outside wq lock --- usr/src/uts/common/io/mlxcx/mlxcx.h | 10 + usr/src/uts/common/io/mlxcx/mlxcx_gld.c | 7 +- usr/src/uts/common/io/mlxcx/mlxcx_reg.h | 2 + usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 267 +++++++++++++++-------- 4 files changed, 189 insertions(+), 97 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 809652b35be6..37582316063d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -571,6 +571,14 @@ typedef struct mlxcx_buffer { mlxcx_dma_buffer_t mlb_dma; mblk_t *mlb_mp; frtn_t mlb_frtn; + + /* spooled up sendq entries ready to push into the ring */ + union { + mlxcx_sendq_ent_t *mlb_sqe; + mlxcx_sendq_extra_ent_t *mlb_esqe; + }; + size_t mlb_sqe_size; + uint_t mlb_sqe_count; } mlxcx_buffer_t; typedef enum { @@ -1423,6 +1431,8 @@ extern boolean_t mlxcx_sq_add_buffer(mlxcx_t *, mlxcx_work_queue_t *, uint8_t *, size_t, uint32_t, mlxcx_buffer_t *); extern boolean_t mlxcx_sq_add_nop(mlxcx_t *, mlxcx_work_queue_t *); extern void mlxcx_rq_refill(mlxcx_t *, mlxcx_work_queue_t *); +extern void mlxcx_buf_prepare_sqe(mlxcx_t *, mlxcx_work_queue_t *, + mlxcx_buffer_t *, uint8_t *, size_t, uint32_t); extern void mlxcx_teardown_groups(mlxcx_t *); extern void mlxcx_wq_teardown(mlxcx_t *, mlxcx_work_queue_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index bac7f7a99546..e98680b64800 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -676,6 +676,9 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) return (mp); } + (void) mlxcx_buf_prepare_sqe(mlxp, sq, b, inline_hdrs, inline_hdrlen, + chkflags); + mutex_enter(&sq->mlwq_mtx); VERIFY3U(sq->mlwq_inline_mode, <=, MLXCX_ETH_INLINE_L2); cq = sq->mlwq_cq; @@ -721,6 +724,8 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) goto blocked; } + mutex_exit(&sq->mlwq_mtx); + /* * Now that we've successfully enqueued the rest of the packet, * free any mblks that we cut off while inlining headers. 
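With the SQEs now assembled into b->mlb_sqe before the send queue lock is
taken, mlwq_mtx only has to cover the ring-state checks and the copy of the
prepared entries into the ring, which is why the mutex_exit() moves ahead of
the header-mblk cleanup here. The resulting lock scope, as a simplified
sketch (names from this patch; blocked and error paths omitted):

	(void) mlxcx_buf_prepare_sqe(mlxp, sq, b, inline_hdrs,
	    inline_hdrlen, chkflags);			/* no lock held */
	mutex_enter(&sq->mlwq_mtx);
	ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen,
	    chkflags, b);				/* ring bookkeeping */
	mutex_exit(&sq->mlwq_mtx);
	/* ... free any mblks clipped off while inlining headers ... */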
@@ -730,8 +735,6 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) freeb(mp); } - mutex_exit(&sq->mlwq_mtx); - return (NULL); blocked: diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index fa4eda06c639..88b2d0a0f061 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -404,6 +404,8 @@ typedef enum { #define MLXCX_WQE_OCTOWORD 16 #define MLXCX_SQE_MAX_DS ((1 << 6) - 1) + +#define MLXCX_SQE_BUF 4 /* * Calculate the max number of address pointers in a single ethernet * send message. This is the remainder from MLXCX_SQE_MAX_DS diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 53c53a62a64a..40ab48833e07 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -1555,16 +1555,11 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags, mlxcx_buffer_t *b0) { - uint_t index, first, ents; + uint_t index, first, ents, j; mlxcx_completion_queue_t *cq; mlxcx_sendq_ent_t *ent0; mlxcx_sendq_extra_ent_t *ent; - mlxcx_wqe_data_seg_t *seg; - uint_t ptri, nptr; - const ddi_dma_cookie_t *c; - size_t rem; uint64_t wqebb_used; - mlxcx_buffer_t *b; ddi_fm_error_t err; boolean_t rv; uint64_t bufbgen; @@ -1574,45 +1569,6 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ); cq = mlwq->mlwq_cq; - index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); - ent0 = &mlwq->mlwq_send_ent[index]; - b0->mlb_wqe_index = mlwq->mlwq_pc; - ents = 1; - - first = index; - - bzero(ent0, sizeof (mlxcx_sendq_ent_t)); - ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND; - ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); - ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index); - - set_bits8(&ent0->mlsqe_control.mlcs_flags, - MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS); - set_bits8(&ent0->mlsqe_control.mlcs_flags, - MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); - - VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers)); - set_bits16(&ent0->mlsqe_eth.mles_szflags, - MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen); - if (inlinelen > 0) { - bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers, - inlinelen); - } - - ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) / - MLXCX_WQE_OCTOWORD; - - if (chkflags & HCK_IPV4_HDRCKSUM) { - ASSERT(mlxp->mlx_caps->mlc_checksum); - set_bit8(&ent0->mlsqe_eth.mles_csflags, - MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM); - } - if (chkflags & HCK_FULLCKSUM) { - ASSERT(mlxp->mlx_caps->mlc_checksum); - set_bit8(&ent0->mlsqe_eth.mles_csflags, - MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); - } - /* * mlwq_wqebb_used is only incremented whilst holding * the mlwq_mtx mutex, but it is decremented (atomically) in @@ -1623,65 +1579,66 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, */ wqebb_used = mlwq->mlwq_wqebb_used; - b = b0; - ptri = 0; - nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); - seg = ent0->mlsqe_data; - while (b != NULL) { - rem = b->mlb_used; + if ((b0->mlb_wqebbs + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); - c = NULL; - while (rem > 0 && - (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { - if (ptri >= nptr) { - if ((ents + wqebb_used) >= mlwq->mlwq_nents) - return (B_FALSE); + index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); + first = index; + ents = 0; - index = (mlwq->mlwq_pc + ents) & - (mlwq->mlwq_nents - 
1); - ent = &mlwq->mlwq_send_extra_ent[index]; - ++ents; + if (b0->mlb_sqe == NULL || b0->mlb_wqebbs == 0) + return (B_FALSE); - seg = ent->mlsqe_data; - ptri = 0; - nptr = sizeof (ent->mlsqe_data) / - sizeof (mlxcx_wqe_data_seg_t); - } + /* + * Don't let a multi-WQEBB send request wrap around the ring -- if + * it looks like we need to do that, pad with NOPs to the end. + */ + if (index + b0->mlb_wqebbs > mlwq->mlwq_nents) { + while (index != 0) { + if ((ents + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); - seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); - if (c->dmac_size > rem) { - seg->mlds_byte_count = to_be32(rem); - rem = 0; - } else { - seg->mlds_byte_count = to_be32(c->dmac_size); - rem -= c->dmac_size; - } - seg->mlds_address = to_be64(c->dmac_laddress); - ++seg; - ++ptri; - ++ent0->mlsqe_control.mlcs_ds; + ent0 = &mlwq->mlwq_send_ent[index]; - ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, - MLXCX_SQE_MAX_DS); - } + bzero(ent0, sizeof (mlxcx_sendq_ent_t)); + ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP; + ent0->mlsqe_control.mlcs_qp_or_sq = + to_be24(mlwq->mlwq_num); + ent0->mlsqe_control.mlcs_wqe_index = + to_be16(mlwq->mlwq_pc + ents); - if (b == b0) { - b = list_head(&b0->mlb_tx_chain); - } else { - b = list_next(&b0->mlb_tx_chain, b); + set_bits8(&ent0->mlsqe_control.mlcs_flags, + MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE); + set_bits8(&ent0->mlsqe_control.mlcs_flags, + MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); + + ent0->mlsqe_control.mlcs_ds = 1; + + ++ents; + index = (mlwq->mlwq_pc + ents) & (mlwq->mlwq_nents - 1); } } - b0->mlb_wqebbs = ents; - mlwq->mlwq_pc += ents; - atomic_add_64(&mlwq->mlwq_wqebb_used, ents); + ent0 = &mlwq->mlwq_send_ent[index]; + b0->mlb_wqe_index = mlwq->mlwq_pc + ents; + ++ents; - for (; ptri < nptr; ++ptri, ++seg) { - seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); - seg->mlds_byte_count = to_be32(0); - seg->mlds_address = to_be64(0); + bcopy(&b0->mlb_sqe[0], ent0, sizeof (*ent0)); + ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index); + + for (j = 1; j < b0->mlb_wqebbs; ++j) { + if ((ents + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); + index = (mlwq->mlwq_pc + ents) & + (mlwq->mlwq_nents - 1); + ++ents; + ent = &mlwq->mlwq_send_extra_ent[index]; + bcopy(&b0->mlb_esqe[j], ent, sizeof (*ent)); } + mlwq->mlwq_pc += ents; + atomic_add_64(&mlwq->mlwq_wqebb_used, ents); + /* * Make sure the workqueue entry is flushed out before updating * the doorbell. 
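A worked example of the no-wrap rule above, with illustrative numbers:
suppose mlwq_nents is 64, the producer counter currently maps to slot 62,
and the prepared send needs 4 WQEBBs. Slots 62 and 63 each get a 1-WQEBB
NOP (ds = 1, completion mode CQE_ALWAYS), and the real WQE starts at slot
0, so no single work request ever straddles the end of the ring.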
@@ -2281,7 +2238,8 @@ mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz) ASSERT3U(b->mlb_dma.mxdb_len, >=, sz); bcopy(rptr, b->mlb_dma.mxdb_va, sz); - MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV); + (void)ddi_dma_sync(b->mlb_dma.mxdb_dma_handle, 0, sz, + DDI_DMA_SYNC_FORDEV); ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err, DDI_FME_VERSION); @@ -2339,6 +2297,120 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, return (b); } +void +mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, + mlxcx_buffer_t *b0, uint8_t *inlinehdrs, size_t inlinelen, + uint32_t chkflags) +{ + mlxcx_sendq_ent_t *ent0; + mlxcx_sendq_extra_ent_t *ent; + mlxcx_wqe_data_seg_t *seg; + uint_t ents, ptri, nptr; + const ddi_dma_cookie_t *c; + size_t rem; + mlxcx_buffer_t *b; + + ASSERT3P(b0->mlb_tx_head, ==, b0); + ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ); + + if (b0->mlb_sqe == NULL) { + b0->mlb_sqe_count = MLXCX_SQE_BUF; + b0->mlb_sqe_size = b0->mlb_sqe_count * + sizeof (mlxcx_sendq_ent_t); + b0->mlb_sqe = kmem_zalloc(b0->mlb_sqe_size, KM_SLEEP); + } + + ents = 1; + ent0 = &b0->mlb_sqe[0]; + + bzero(ent0, sizeof (mlxcx_sendq_ent_t)); + ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND; + ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); + /* mlcs_wqe_index set by mlxcx_sq_add_buffer */ + + set_bits8(&ent0->mlsqe_control.mlcs_flags, + MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE); + set_bits8(&ent0->mlsqe_control.mlcs_flags, + MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); + + VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers)); + set_bits16(&ent0->mlsqe_eth.mles_szflags, + MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen); + if (inlinelen > 0) { + bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers, + inlinelen); + } + + ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) / + MLXCX_WQE_OCTOWORD; + + if (chkflags & HCK_IPV4_HDRCKSUM) { + ASSERT(mlxp->mlx_caps->mlc_checksum); + set_bit8(&ent0->mlsqe_eth.mles_csflags, + MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM); + } + if (chkflags & HCK_FULLCKSUM) { + ASSERT(mlxp->mlx_caps->mlc_checksum); + set_bit8(&ent0->mlsqe_eth.mles_csflags, + MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); + } + + b = b0; + ptri = 0; + nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); + seg = ent0->mlsqe_data; + while (b != NULL) { + rem = b->mlb_used; + + c = NULL; + while (rem > 0 && + (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { + if (ptri >= nptr) { + if (ents > b0->mlb_sqe_count) + return; + + ent = &b0->mlb_esqe[ents]; + ++ents; + + seg = ent->mlsqe_data; + ptri = 0; + nptr = sizeof (ent->mlsqe_data) / + sizeof (mlxcx_wqe_data_seg_t); + } + + seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); + if (c->dmac_size > rem) { + seg->mlds_byte_count = to_be32(rem); + rem = 0; + } else { + seg->mlds_byte_count = to_be32(c->dmac_size); + rem -= c->dmac_size; + } + seg->mlds_address = to_be64(c->dmac_laddress); + ++seg; + ++ptri; + ++ent0->mlsqe_control.mlcs_ds; + + ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, + MLXCX_SQE_MAX_DS); + } + + if (b == b0) { + b = list_head(&b0->mlb_tx_chain); + } else { + b = list_next(&b0->mlb_tx_chain, b); + } + } + + b0->mlb_wqebbs = ents; + + for (; ptri < nptr; ++ptri, ++seg) { + seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); + seg->mlds_byte_count = to_be32(0); + seg->mlds_address = to_be64(0); + } +} + uint_t mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) @@ -2624,7 +2696,6 @@ 
mlxcx_buf_return_step1(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, mlxcx_buffer_t *b) { mlxcx_buffer_t *txhead = b->mlb_tx_head; - mlxcx_buf_shard_t *s = b->mlb_shard; mlxcx_buf_return_mblk_t *mbrm; mblk_t *mp = b->mlb_tx_mp; @@ -2655,7 +2726,6 @@ mlxcx_buf_return_step2(mlxcx_t *mlxp, mlxcx_buffer_t *b) mlxcx_buffer_state_t oldstate = b->mlb_state; mlxcx_buffer_t *txhead = b->mlb_tx_head; mlxcx_buf_shard_t *s = b->mlb_shard; - mblk_t *mp = b->mlb_tx_mp; ASSERT(mutex_owned(&s->mlbs_mtx)); @@ -2839,6 +2909,13 @@ mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b) mlxcx_bufshard_adjust_total(s, -1); } + if (b->mlb_sqe != NULL) { + kmem_free(b->mlb_sqe, b->mlb_sqe_size); + b->mlb_sqe = NULL; + b->mlb_sqe_size = 0; + b->mlb_sqe_count = 0; + } + /* * This is going back to the kmem cache, so it needs to be set up in * the same way we expect a new buffer to come out (state INIT, other From ca01060726b02014a3ae8a72581feb7b918f5dba Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Wed, 15 Nov 2023 03:37:44 +0000 Subject: [PATCH 10/14] mlxcx: lso support --- usr/src/uts/common/io/mlxcx/mlxcx.h | 43 ++++--- usr/src/uts/common/io/mlxcx/mlxcx_cmd.c | 2 +- usr/src/uts/common/io/mlxcx/mlxcx_gld.c | 82 ++++++++++--- usr/src/uts/common/io/mlxcx/mlxcx_reg.h | 10 +- usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 144 +++++++++++++++++------ 5 files changed, 210 insertions(+), 71 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 37582316063d..15ebccd441fd 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -647,14 +647,15 @@ typedef struct mlxcx_completion_queue { } mlxcx_completion_queue_t; typedef enum { - MLXCX_WQ_ALLOC = 1 << 0, - MLXCX_WQ_CREATED = 1 << 1, - MLXCX_WQ_STARTED = 1 << 2, - MLXCX_WQ_DESTROYED = 1 << 3, - MLXCX_WQ_TEARDOWN = 1 << 4, - MLXCX_WQ_BUFFERS = 1 << 5, - MLXCX_WQ_REFILLING = 1 << 6, - MLXCX_WQ_BLOCKED_MAC = 1 << 7 + MLXCX_WQ_INIT = 1 << 0, + MLXCX_WQ_ALLOC = 1 << 1, + MLXCX_WQ_CREATED = 1 << 2, + MLXCX_WQ_STARTED = 1 << 3, + MLXCX_WQ_DESTROYED = 1 << 4, + MLXCX_WQ_TEARDOWN = 1 << 5, + MLXCX_WQ_BUFFERS = 1 << 6, + MLXCX_WQ_REFILLING = 1 << 7, + MLXCX_WQ_BLOCKED_MAC = 1 << 8 } mlxcx_workq_state_t; typedef enum { @@ -1255,12 +1256,15 @@ typedef struct mlxcx_buf_return_mblk { mblk_t *mbrm_mp; } mlxcx_buf_return_mblk_t; -#define MLXCX_BUF_RETURN_BATCH_SHARDS 4 +#define MLXCX_BRB_SHARDS 4 +#define MLXCX_BRB_INLINE_MBLKS 8 typedef struct mlxcx_buf_return_batch { - uint mbrb_n[MLXCX_BUF_RETURN_BATCH_SHARDS]; - mlxcx_buf_shard_t *mbrb_shard[MLXCX_BUF_RETURN_BATCH_SHARDS]; - list_t mbrb_list[MLXCX_BUF_RETURN_BATCH_SHARDS]; + uint mbrb_n[MLXCX_BRB_SHARDS]; + mlxcx_buf_shard_t *mbrb_shard[MLXCX_BRB_SHARDS]; + list_t mbrb_list[MLXCX_BRB_SHARDS]; list_t mbrb_mblks; + mblk_t *mbrb_inline_mblk[MLXCX_BRB_INLINE_MBLKS]; + uint mbrb_inline_mblks; } mlxcx_buf_return_batch_t; extern void mlxcx_buf_return_batch_init(mlxcx_buf_return_batch_t *); @@ -1428,11 +1432,20 @@ extern boolean_t mlxcx_rq_add_buffer(mlxcx_t *, mlxcx_work_queue_t *, extern boolean_t mlxcx_rq_add_buffers(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **, size_t); extern boolean_t mlxcx_sq_add_buffer(mlxcx_t *, mlxcx_work_queue_t *, - uint8_t *, size_t, uint32_t, mlxcx_buffer_t *); + mlxcx_buffer_t *); extern boolean_t mlxcx_sq_add_nop(mlxcx_t *, mlxcx_work_queue_t *); extern void mlxcx_rq_refill(mlxcx_t *, mlxcx_work_queue_t *); -extern void mlxcx_buf_prepare_sqe(mlxcx_t *, mlxcx_work_queue_t *, - mlxcx_buffer_t *, uint8_t *, size_t, 
uint32_t); + +typedef struct mlxcx_tx_ctx { + uint8_t mtc_inline_hdrs[MLXCX_MAX_INLINE_HEADERLEN]; + size_t mtc_inline_hdrlen; + uint32_t mtc_chkflags; + uint32_t mtc_mss; + uint32_t mtc_lsoflags; +} mlxcx_tx_ctx_t; + +extern boolean_t mlxcx_buf_prepare_sqe(mlxcx_t *, mlxcx_work_queue_t *, + mlxcx_buffer_t *, const mlxcx_tx_ctx_t *); extern void mlxcx_teardown_groups(mlxcx_t *); extern void mlxcx_wq_teardown(mlxcx_t *, mlxcx_work_queue_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c index 65c714f1c7ce..dcf91021c9ac 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c @@ -3746,7 +3746,7 @@ CTASSERT(sizeof (mlxcx_completionq_error_ent_t) == CTASSERT(sizeof (mlxcx_wqe_control_seg_t) == (1 << 4)); CTASSERT(offsetof(mlxcx_wqe_eth_seg_t, mles_inline_headers) == 0x0e); -CTASSERT(sizeof (mlxcx_wqe_eth_seg_t) == (1 << 5)); +CTASSERT(sizeof (mlxcx_wqe_eth_seg_t) == (1 << 4)); CTASSERT(sizeof (mlxcx_wqe_data_seg_t) == (1 << 4)); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index e98680b64800..e8f545c2d73a 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -627,30 +627,58 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) mlxcx_t *mlxp = sq->mlwq_mlx; mlxcx_completion_queue_t *cq; mlxcx_buffer_t *b; - mac_header_info_t mhi; + mac_ether_offload_info_t meoi; mblk_t *kmp, *nmp; - uint8_t inline_hdrs[MLXCX_MAX_INLINE_HEADERLEN]; - size_t inline_hdrlen, rem, off; - uint32_t chkflags = 0; + size_t rem, off; boolean_t ok; size_t take = 0; uint_t bcount; + mlxcx_tx_ctx_t ctx; VERIFY(mp->b_next == NULL); - mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &chkflags); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &ctx.mtc_chkflags); + mac_lso_get(mp, &ctx.mtc_mss, &ctx.mtc_lsoflags); - if (mac_vlan_header_info(mlxp->mlx_mac_hdl, mp, &mhi) != 0) { + if (mac_ether_offload_info(mp, &meoi) != 0 || + (meoi.meoi_flags & MEOI_L2INFO_SET) == 0) { /* * We got given a frame without a valid L2 header on it. We * can't really transmit that (mlx parts don't like it), so * we will just drop it on the floor. */ + mlxcx_warn(mlxp, "!tried to tx packet with no valid L2 header;" + " dropping it on the floor"); freemsg(mp); return (NULL); } - inline_hdrlen = rem = mhi.mhi_hdrsize; + ctx.mtc_inline_hdrlen = meoi.meoi_l2hlen; + + /* + * If we're doing LSO, we need to find the end of the TCP header, and + * inline up to that point. 
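+	 * The device replicates the inlined bytes at the front of every
+	 * segment it emits, so for LSO they must span the full L2 + L3 +
+	 * L4 headers rather than just L2.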
+ */ + if (ctx.mtc_lsoflags & HW_LSO) { + if ((meoi.meoi_flags & MEOI_L3INFO_SET) == 0 || + (meoi.meoi_flags & MEOI_L4INFO_SET) == 0) { + mlxcx_warn(mlxp, "!tried to tx LSO packet with no " + "valid L3/L4 headers; dropping it on the floor"); + freemsg(mp); + return (NULL); + } + ctx.mtc_inline_hdrlen += meoi.meoi_l3hlen + meoi.meoi_l4hlen; + } + + if (ctx.mtc_inline_hdrlen > MLXCX_MAX_INLINE_HEADERLEN) { + mlxcx_warn(mlxp, "!tried to tx LSO packet with headers that " + "are too long (%u bytes, max is %u); dropping it on the " + "floor", ctx.mtc_inline_hdrlen, MLXCX_MAX_INLINE_HEADERLEN); + freemsg(mp); + return (NULL); + } + + rem = ctx.mtc_inline_hdrlen; kmp = mp; off = 0; @@ -661,7 +689,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) take = sz; if (take > rem) take = rem; - bcopy(kmp->b_rptr, inline_hdrs + off, take); + bcopy(kmp->b_rptr, ctx.mtc_inline_hdrs + off, take); rem -= take; off += take; if (take == sz) { @@ -676,8 +704,12 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) return (mp); } - (void) mlxcx_buf_prepare_sqe(mlxp, sq, b, inline_hdrs, inline_hdrlen, - chkflags); + if (!mlxcx_buf_prepare_sqe(mlxp, sq, b, &ctx)) { + mlxcx_warn(mlxp, "!tried to tx packet that couldn't fit in " + "an SQE, dropping"); + freemsg(mp); + return (NULL); + } mutex_enter(&sq->mlwq_mtx); VERIFY3U(sq->mlwq_inline_mode, <=, MLXCX_ETH_INLINE_L2); @@ -716,8 +748,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) goto blocked; } - ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen, - chkflags, b); + ok = mlxcx_sq_add_buffer(mlxp, sq, b); if (!ok) { atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC); atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); @@ -1247,6 +1278,7 @@ mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data) mac_capab_rings_t *cap_rings; mac_capab_led_t *cap_leds; mac_capab_transceiver_t *cap_txr; + mac_capab_lso_t *cap_lso; uint_t i, n = 0; switch (cap) { @@ -1279,10 +1311,10 @@ mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; case MAC_CAPAB_HCKSUM: - if (mlxp->mlx_caps->mlc_checksum) { - *(uint32_t *)cap_data = HCKSUM_INET_FULL_V4 | - HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM; - } + if (!mlxp->mlx_caps->mlc_checksum) + return (B_FALSE); + *(uint32_t *)cap_data = HCKSUM_INET_FULL_V4 | + HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM; break; case MAC_CAPAB_LED: @@ -1303,6 +1335,24 @@ mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data) cap_txr->mct_read = mlxcx_mac_txr_read; break; + case MAC_CAPAB_LSO: + cap_lso = cap_data; + + if (!mlxp->mlx_caps->mlc_lso) + return (B_FALSE); + + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | + LSO_TX_BASIC_TCP_IPV6; + /* + * Cap LSO sends at 64k due to limitations in the TCP stack + * (full length needs to fit in an IP header apparently) + */ + cap_lso->lso_basic_tcp_ipv4.lso_max = + MIN(mlxp->mlx_caps->mlc_max_lso_size, UINT16_MAX); + cap_lso->lso_basic_tcp_ipv6.lso_max = + MIN(mlxp->mlx_caps->mlc_max_lso_size, UINT16_MAX); + break; + default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 88b2d0a0f061..b826838f2e06 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -405,7 +405,7 @@ typedef enum { #define MLXCX_WQE_OCTOWORD 16 #define MLXCX_SQE_MAX_DS ((1 << 6) - 1) -#define MLXCX_SQE_BUF 4 +#define MLXCX_SQE_BUF 16 /* * Calculate the max number of address pointers in a single ethernet * send message. 
This is the remainder from MLXCX_SQE_MAX_DS @@ -458,16 +458,16 @@ typedef enum { /* CSTYLED */ #define MLXCX_SQE_ETH_INLINE_HDR_SZ (bitdef_t){0, 0x03ff} #define MLXCX_SQE_ETH_SZFLAG_VLAN (1 << 15) -#define MLXCX_MAX_INLINE_HEADERLEN 64 +#define MLXCX_MAX_INLINE_HEADERLEN (2 + MLXCX_WQE_OCTOWORD * 12) typedef struct { uint8_t mles_rsvd[4]; bits8_t mles_csflags; uint8_t mles_rsvd2[1]; - uint16_t mles_mss; + uint16be_t mles_mss; uint8_t mles_rsvd3[4]; bits16_t mles_szflags; - uint8_t mles_inline_headers[18]; + uint8_t mles_inline_headers[2]; } mlxcx_wqe_eth_seg_t; typedef struct { @@ -481,7 +481,7 @@ typedef struct { typedef struct { mlxcx_wqe_control_seg_t mlsqe_control; mlxcx_wqe_eth_seg_t mlsqe_eth; - mlxcx_wqe_data_seg_t mlsqe_data[1]; + mlxcx_wqe_data_seg_t mlsqe_data[2]; } mlxcx_sendq_ent_t; typedef struct { diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 40ab48833e07..6bff4f2fc9c0 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -188,6 +188,9 @@ mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { mlxcx_completion_queue_t *mlcq; + if (!(mlwq->mlwq_state & MLXCX_WQ_INIT)) + return; + /* * If something is holding the lock on a long operation like a * refill, setting this flag asks them to exit early if possible. @@ -242,6 +245,7 @@ mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mutex_exit(&mlcq->mlcq_mtx); mutex_destroy(&mlwq->mlwq_mtx); + mlwq->mlwq_state &= ~MLXCX_WQ_INIT; } void @@ -400,6 +404,7 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq, DDI_INTR_PRI(mlxp->mlx_intr_pri)); list_insert_tail(&mlxp->mlx_wqs, wq); + wq->mlwq_state |= MLXCX_WQ_INIT; mutex_enter(&wq->mlwq_mtx); @@ -444,6 +449,7 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, DDI_INTR_PRI(mlxp->mlx_intr_pri)); list_insert_tail(&mlxp->mlx_wqs, wq); + wq->mlwq_state |= MLXCX_WQ_INIT; mutex_enter(&wq->mlwq_mtx); @@ -667,6 +673,8 @@ mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) if (g->mlg_state & MLXCX_GROUP_WQS) { for (i = 0; i < g->mlg_nwqs; ++i) { wq = &g->mlg_wqs[i]; + if (!(wq->mlwq_state & MLXCX_WQ_INIT)) + continue; mutex_enter(&wq->mlwq_mtx); cq = wq->mlwq_cq; if (wq->mlwq_state & MLXCX_WQ_STARTED && @@ -1371,8 +1379,10 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } if (!mlxcx_cq_setup(mlxp, eq, &cq, - mlxp->mlx_props.mldp_cq_size_shift)) + mlxp->mlx_props.mldp_cq_size_shift)) { + mutex_exit(&g->mlg_mtx); return (B_FALSE); + } cq->mlcq_stats = &g->mlg_port->mlp_stats; @@ -1552,7 +1562,6 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) boolean_t mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, - uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags, mlxcx_buffer_t *b0) { uint_t index, first, ents, j; @@ -2002,7 +2011,8 @@ mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, return; } - if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) { + if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND && + ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_LSO) { mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x", ent->mlcqe_send_wqe_opcode); mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); @@ -2189,6 +2199,11 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, b->mlb_foreign = B_TRUE; mlxcx_dma_buf_attr(mlxp, &attr); + /* + * Foreign bufs are used on the sendq and can have more pointers than + * standard bufs (which can be used on sq or rq). 
+ */ + attr.dma_attr_sgllen = MLXCX_SQE_MAX_PTRS; ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE); if (!ret) { @@ -2196,6 +2211,11 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, return (B_FALSE); } + /* All foreign bufs get an SQE buf automatically. */ + b->mlb_sqe_count = MLXCX_SQE_BUF; + b->mlb_sqe_size = b->mlb_sqe_count * sizeof (mlxcx_sendq_ent_t); + b->mlb_sqe = kmem_zalloc(b->mlb_sqe_size, KM_SLEEP); + *bp = b; return (B_TRUE); @@ -2284,8 +2304,7 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, if (b == NULL) return (NULL); - ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, - B_FALSE); + ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_TRUE); if (!ret) { mlxcx_buf_return(mlxp, b); @@ -2297,17 +2316,16 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, return (b); } -void +boolean_t mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, - mlxcx_buffer_t *b0, uint8_t *inlinehdrs, size_t inlinelen, - uint32_t chkflags) + mlxcx_buffer_t *b0, const mlxcx_tx_ctx_t *ctx) { mlxcx_sendq_ent_t *ent0; mlxcx_sendq_extra_ent_t *ent; mlxcx_wqe_data_seg_t *seg; uint_t ents, ptri, nptr; const ddi_dma_cookie_t *c; - size_t rem; + size_t rem, take, off; mlxcx_buffer_t *b; ASSERT3P(b0->mlb_tx_head, ==, b0); @@ -2333,32 +2351,74 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, set_bits8(&ent0->mlsqe_control.mlcs_flags, MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); - VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers)); - set_bits16(&ent0->mlsqe_eth.mles_szflags, - MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen); - if (inlinelen > 0) { - bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers, - inlinelen); - } - ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) / MLXCX_WQE_OCTOWORD; + ptri = 0; + seg = ent0->mlsqe_data; + nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); + + VERIFY3U(ctx->mtc_inline_hdrlen, <=, MLXCX_MAX_INLINE_HEADERLEN); + set_bits16(&ent0->mlsqe_eth.mles_szflags, + MLXCX_SQE_ETH_INLINE_HDR_SZ, ctx->mtc_inline_hdrlen); + if (ctx->mtc_inline_hdrlen > 0) { + ASSERT3U(ctx->mtc_inline_hdrlen, >, + sizeof (ent0->mlsqe_eth.mles_inline_headers)); + rem = ctx->mtc_inline_hdrlen; + off = 0; + + off += sizeof (ent0->mlsqe_eth.mles_inline_headers); + rem -= sizeof (ent0->mlsqe_eth.mles_inline_headers); + + while (rem > 0) { + if (ptri >= nptr) { + if (ents >= b0->mlb_sqe_count) + return (B_FALSE); - if (chkflags & HCK_IPV4_HDRCKSUM) { + ent = &b0->mlb_esqe[ents]; + ++ents; + + seg = ent->mlsqe_data; + ptri = 0; + nptr = sizeof (ent->mlsqe_data) / + sizeof (mlxcx_wqe_data_seg_t); + } + take = sizeof (mlxcx_wqe_data_seg_t); + if (take > rem) + take = rem; + off += take; + rem -= take; + + ++seg; + ++ptri; + ++ent0->mlsqe_control.mlcs_ds; + + ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, + MLXCX_SQE_MAX_DS); + } + + bcopy(ctx->mtc_inline_hdrs, + ent0->mlsqe_eth.mles_inline_headers, + ctx->mtc_inline_hdrlen); + } + + if (ctx->mtc_chkflags & HCK_IPV4_HDRCKSUM) { ASSERT(mlxp->mlx_caps->mlc_checksum); set_bit8(&ent0->mlsqe_eth.mles_csflags, MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM); } - if (chkflags & HCK_FULLCKSUM) { + if (ctx->mtc_chkflags & HCK_FULLCKSUM) { ASSERT(mlxp->mlx_caps->mlc_checksum); set_bit8(&ent0->mlsqe_eth.mles_csflags, MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); } + if (ctx->mtc_lsoflags & HW_LSO) { + ASSERT(mlxp->mlx_caps->mlc_lso); + ASSERT3U(ctx->mctx_inline_hdrlen, >, 0); + ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_LSO; + ent0->mlsqe_eth.mles_mss = 
to_be16(ctx->mtc_mss); + } b = b0; - ptri = 0; - nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); - seg = ent0->mlsqe_data; while (b != NULL) { rem = b->mlb_used; @@ -2366,8 +2426,8 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, while (rem > 0 && (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { if (ptri >= nptr) { - if (ents > b0->mlb_sqe_count) - return; + if (ents >= b0->mlb_sqe_count) + return (B_FALSE); ent = &b0->mlb_esqe[ents]; ++ents; @@ -2402,13 +2462,15 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, } } - b0->mlb_wqebbs = ents; - for (; ptri < nptr; ++ptri, ++seg) { seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); seg->mlds_byte_count = to_be32(0); seg->mlds_address = to_be64(0); } + + b0->mlb_wqebbs = ents; + + return (B_TRUE); } uint_t @@ -2642,7 +2704,7 @@ mlxcx_buf_return_batch_push(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, * Are we already spooling up buffers for this shard? If so, add it * to that existing list */ - for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { if (mbrb->mbrb_shard[i] == s) { found = 1; break; @@ -2650,7 +2712,7 @@ mlxcx_buf_return_batch_push(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, } if (!found) { /* Do we have any unused shard slots? If so, use that. */ - for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { if (mbrb->mbrb_shard[i] == NULL) { mbrb->mbrb_shard[i] = s; found = 1; @@ -2662,7 +2724,7 @@ mlxcx_buf_return_batch_push(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, /* Otherwise evict the least popular shard. */ min_n = mbrb->mbrb_n[0]; min_n_i = 0; - for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { if (mbrb->mbrb_n[i] < min_n) { min_n = mbrb->mbrb_n[i]; min_n_i = i; @@ -2684,8 +2746,12 @@ mlxcx_buf_return_batch_init(mlxcx_buf_return_batch_t *mbrb) uint i; list_create(&mbrb->mbrb_mblks, sizeof (mlxcx_buf_return_mblk_t), offsetof(mlxcx_buf_return_mblk_t, mbrm_entry)); - for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + mbrb->mbrb_inline_mblks = 0; + for (i = 0; i < MLXCX_BRB_INLINE_MBLKS; ++i) + mbrb->mbrb_inline_mblk[i] = NULL; + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { mbrb->mbrb_shard[i] = NULL; + mbrb->mbrb_n[i] = 0; list_create(&mbrb->mbrb_list[i], sizeof (mlxcx_buffer_t), offsetof(mlxcx_buffer_t, mlb_cq_entry)); } @@ -2707,9 +2773,13 @@ mlxcx_buf_return_step1(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, b->mlb_used = 0; b->mlb_wqebbs = 0; if (txhead == b) { - mbrm = kmem_cache_alloc(mlxp->mlx_mbrm_cache, KM_SLEEP); - mbrm->mbrm_mp = mp; - list_insert_tail(&mbrb->mbrb_mblks, mbrm); + if (mbrb->mbrb_inline_mblks >= MLXCX_BRB_INLINE_MBLKS) { + mbrm = kmem_cache_alloc(mlxp->mlx_mbrm_cache, KM_SLEEP); + mbrm->mbrm_mp = mp; + list_insert_tail(&mbrb->mbrb_mblks, mbrm); + } else { + mbrb->mbrb_inline_mblk[mbrb->mbrb_inline_mblks++] = mp; + } } ASSERT(list_is_empty(&b->mlb_tx_chain)); @@ -2779,6 +2849,7 @@ mlxcx_buf_return_batch_flush_shard(mlxcx_t *mlxp, { mlxcx_buffer_t *b; mlxcx_buf_return_mblk_t *mbrm; + uint j; b = list_head(&mbrb->mbrb_list[i]); while (b != NULL) { @@ -2790,6 +2861,11 @@ mlxcx_buf_return_batch_flush_shard(mlxcx_t *mlxp, mlxcx_buf_return_step2(mlxp, b); } mutex_exit(&mbrb->mbrb_shard[i]->mlbs_mtx); + for (j = 0; j < mbrb->mbrb_inline_mblks; ++j) { + freemsg(mbrb->mbrb_inline_mblk[j]); + mbrb->mbrb_inline_mblk[j] = NULL; + } + mbrb->mbrb_inline_mblks = 0; while ((mbrm = list_remove_head(&mbrb->mbrb_mblks))) { 
freemsg(mbrm->mbrm_mp); mbrm->mbrm_mp = NULL; @@ -2804,7 +2880,7 @@ void mlxcx_buf_return_batch_flush(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb) { uint i; - for (i = 0; i < MLXCX_BUF_RETURN_BATCH_SHARDS; ++i) { + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { if (mbrb->mbrb_shard[i] == NULL) continue; mlxcx_buf_return_batch_flush_shard(mlxp, mbrb, i); From cc586c2760b1f92f4c9dabfbdd00f2c2eec21439 Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Mon, 4 Dec 2023 02:30:03 +0000 Subject: [PATCH 11/14] mlxcx should wait to free inlined header mblks until tx is done --- usr/src/uts/common/io/mlxcx/mlxcx.h | 2 +- usr/src/uts/common/io/mlxcx/mlxcx_gld.c | 13 ++----------- usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 6 +++--- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 15ebccd441fd..380c58f8ab5c 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -1416,7 +1416,7 @@ extern void mlxcx_shard_ready(mlxcx_buf_shard_t *); extern void mlxcx_shard_draining(mlxcx_buf_shard_t *); extern uint_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, - mblk_t *, size_t, mlxcx_buffer_t **); + mblk_t *, mblk_t *, size_t, mlxcx_buffer_t **); extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index e8f545c2d73a..db9212e0fbb5 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -628,7 +628,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) mlxcx_completion_queue_t *cq; mlxcx_buffer_t *b; mac_ether_offload_info_t meoi; - mblk_t *kmp, *nmp; + mblk_t *kmp; size_t rem, off; boolean_t ok; size_t take = 0; @@ -698,7 +698,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) } } - bcount = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b); + bcount = mlxcx_buf_bind_or_copy(mlxp, sq, mp, kmp, take, &b); if (bcount == 0) { atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); return (mp); @@ -757,15 +757,6 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) mutex_exit(&sq->mlwq_mtx); - /* - * Now that we've successfully enqueued the rest of the packet, - * free any mblks that we cut off while inlining headers. - */ - for (; mp != kmp; mp = nmp) { - nmp = mp->b_cont; - freeb(mp); - } - return (NULL); blocked: diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 6bff4f2fc9c0..6ba73e1f681b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -2475,7 +2475,7 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, - mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) + mblk_t *mp0, mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) { mlxcx_buffer_t *b, *b0 = NULL; boolean_t first = B_TRUE; @@ -2498,7 +2498,7 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, if (!first) b->mlb_state = MLXCX_BUFFER_ON_CHAIN; - b->mlb_tx_mp = mp; + b->mlb_tx_mp = first ? 
mp0 : mp;
 		b->mlb_tx_head = b0;
 		b->mlb_used = MBLKL(mp) - offset;
 
@@ -2531,7 +2531,7 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
 			freemsg(mp);
 			return (0);
 		}
-		freemsg(mpb);
+		freemsg(mp0);
 
 		b0->mlb_tx_mp = mp;
 		b0->mlb_tx_head = b0;

From 6a16c08d472d07368d19c7b000eaf4e082d2cec0 Mon Sep 17 00:00:00 2001
From: Alex Wilson
Date: Tue, 12 Dec 2023 02:57:57 +0000
Subject: [PATCH 12/14] mlxcx: bump max pointers for wq mem, allow deeper rings

---
 usr/src/uts/common/io/mlxcx/mlxcx.c     | 58 +++++++++------
 usr/src/uts/common/io/mlxcx/mlxcx_cmd.c | 94 +++++++++++++++----------
 usr/src/uts/common/io/mlxcx/mlxcx_reg.h |  4 +-
 3 files changed, 95 insertions(+), 61 deletions(-)

diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c
index 501529a0daf4..01b0ebdb6afc 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.c
@@ -1490,11 +1490,12 @@ mlxcx_eq_check(void *arg)
 {
 	mlxcx_t *mlxp = (mlxcx_t *)arg;
 	mlxcx_event_queue_t *eq;
-	mlxcx_eventq_ctx_t ctx;
+	mlxcx_eventq_ctx_t *ctx;
 	const char *str;
 	uint_t i;
 
+	ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
 		eq = &mlxp->mlx_eqs[i];
@@ -1508,11 +1509,11 @@ mlxcx_eq_check(void *arg)
 		 */
 		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);
 
-		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
+		if (!mlxcx_cmd_query_eq(mlxp, eq, ctx))
 			continue;
 
 		str = "???";
-		switch (ctx.mleqc_status) {
+		switch (ctx->mleqc_status) {
 		case MLXCX_EQ_STATUS_OK:
 			break;
 		case MLXCX_EQ_STATUS_WRITE_FAILURE:
@@ -1520,14 +1521,14 @@ mlxcx_eq_check(void *arg)
 			break;
 		}
 
-		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
+		if (ctx->mleqc_status != MLXCX_EQ_STATUS_OK) {
 			mlxcx_fm_qstate_ereport(mlxp, "event",
-			    eq->mleq_num, str, ctx.mleqc_status);
+			    eq->mleq_num, str, ctx->mleqc_status);
 			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
-			    eq->mleq_intr_index, ctx.mleqc_status, str);
+			    eq->mleq_intr_index, ctx->mleqc_status, str);
 		}
 
-		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
+		if (ctx->mleqc_state != MLXCX_EQ_ST_ARMED &&
 		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
 			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
 			    ++eq->mleq_check_disarm_cnt >= 3) {
@@ -1541,6 +1542,8 @@ mlxcx_eq_check(void *arg)
 			eq->mleq_check_disarm_cnt = 0;
 		}
 	}
+
+	kmem_free(ctx, sizeof (*ctx));
 }
 
 static void
@@ -1548,10 +1551,12 @@ mlxcx_cq_check(void *arg)
 {
 	mlxcx_t *mlxp = (mlxcx_t *)arg;
 	mlxcx_completion_queue_t *cq;
-	mlxcx_completionq_ctx_t ctx;
+	mlxcx_completionq_ctx_t *ctx;
 	const char *str, *type;
 	uint_t v;
 
+	ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+
 	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
 	    cq = list_next(&mlxp->mlx_cqs, cq)) {
 
@@ -1569,7 +1574,7 @@ mlxcx_cq_check(void *arg)
 		if (cq->mlcq_fm_repd_qstate)
 			continue;
 
-		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
+		if (!mlxcx_cmd_query_cq(mlxp, cq, ctx))
 			continue;
 
 		if (cq->mlcq_wq != NULL) {
@@ -1585,7 +1590,7 @@ mlxcx_cq_check(void *arg)
 		}
 
 		str = "???";
-		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
+		v = get_bits32(ctx->mlcqc_flags, MLXCX_CQ_CTX_STATUS);
 		switch (v) {
 		case MLXCX_CQC_STATUS_OK:
 			break;
@@ -1608,7 +1613,7 @@ mlxcx_cq_check(void *arg)
 			cq->mlcq_fm_repd_qstate = B_TRUE;
 		}
 
-		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
+		v = get_bits32(ctx->mlcqc_flags, MLXCX_CQ_CTX_STATE);
 		if (v != MLXCX_CQC_STATE_ARMED &&
 		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
 		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
@@ -1624,19 +1629,25 @@ mlxcx_cq_check(void *arg)
 			cq->mlcq_check_disarm_cc = 0;
 		}
 	}
+
+	kmem_free(ctx, sizeof (*ctx));
 }
 
 void
 mlxcx_check_sq(mlxcx_t *mlxp, 
mlxcx_work_queue_t *sq) { - mlxcx_sq_ctx_t ctx; + mlxcx_sq_ctx_t *ctx; mlxcx_sq_state_t state; - if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) + ctx = kmem_zalloc(sizeof (mlxcx_sq_ctx_t), KM_SLEEP); + + if (!mlxcx_cmd_query_sq(mlxp, sq, ctx)) { + kmem_free(ctx, sizeof (*ctx)); return; + } - ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); - state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); + ASSERT3U(from_be24(ctx->mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); + state = get_bits32(ctx->mlsqc_flags, MLXCX_SQ_STATE); switch (state) { case MLXCX_SQ_STATE_RST: if (sq->mlwq_state & MLXCX_WQ_STARTED) { @@ -1663,20 +1674,25 @@ mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) sq->mlwq_fm_repd_qstate = B_TRUE; break; } + + kmem_free(ctx, sizeof (mlxcx_sq_ctx_t)); } void mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) { - mlxcx_rq_ctx_t ctx; + mlxcx_rq_ctx_t *ctx; mlxcx_rq_state_t state; + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) + if (!mlxcx_cmd_query_rq(mlxp, rq, ctx)) { + kmem_free(ctx, sizeof (*ctx)); return; + } - ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); - state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); + ASSERT3U(from_be24(ctx->mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); + state = get_bits32(ctx->mlrqc_flags, MLXCX_RQ_STATE); switch (state) { case MLXCX_RQ_STATE_RST: if (rq->mlwq_state & MLXCX_WQ_STARTED) { @@ -1703,6 +1719,8 @@ mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) rq->mlwq_fm_repd_qstate = B_TRUE; break; } + + kmem_free(ctx, sizeof (*ctx)); } static void diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c index dcf91021c9ac..8a1e9b5d57e2 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c @@ -2146,7 +2146,7 @@ boolean_t mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_eq_in_t in; + mlxcx_cmd_create_eq_in_t *in; mlxcx_cmd_create_eq_out_t out; boolean_t ret; mlxcx_eventq_ctx_t *ctx; @@ -2154,7 +2154,7 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_eq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mleq->mleq_mtx)); @@ -2162,15 +2162,15 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) VERIFY0(mleq->mleq_state & MLXCX_EQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_eq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_eq_head, MLXCX_OP_CREATE_EQ, 0); - ctx = &in.mlxi_create_eq_context; + ctx = &in->mlxi_create_eq_context; ctx->mleqc_uar_page = to_be24(mleq->mleq_uar->mlu_num); ctx->mleqc_log_eq_size = mleq->mleq_entshift; ctx->mleqc_intr = mleq->mleq_intr_index; - in.mlxi_create_eq_event_bitmask = to_be64(mleq->mleq_events); + in->mlxi_create_eq_event_bitmask = to_be64(mleq->mleq_events); npages = 0; c = NULL; @@ -2180,7 +2180,7 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) while (rem > 0) { ASSERT3U(pa & 0xfff, ==, 0); ASSERT3U(rem, >=, MLXCX_HW_PAGE_SIZE); - in.mlxi_create_eq_pas[npages++] = to_be64(pa); + in->mlxi_create_eq_pas[npages++] = to_be64(pa); rem -= MLXCX_HW_PAGE_SIZE; pa += MLXCX_HW_PAGE_SIZE; } @@ -2190,8 +2190,9 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) insize = offsetof(mlxcx_cmd_create_eq_in_t, mlxi_create_eq_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, 
insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -2202,6 +2203,7 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) mleq->mleq_num = out.mlxo_create_eq_eqn; } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -2211,11 +2213,11 @@ mlxcx_cmd_query_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_eq_in_t in; - mlxcx_cmd_query_eq_out_t out; + mlxcx_cmd_query_eq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_eq_out_t), KM_SLEEP); VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); VERIFY(mleq->mleq_state & MLXCX_EQ_CREATED); @@ -2226,18 +2228,20 @@ mlxcx_cmd_query_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq, in.mlxi_query_eq_eqn = mleq->mleq_num; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_eq_context, ctxp, + bcopy(&out->mlxo_query_eq_context, ctxp, sizeof (mlxcx_eventq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2310,7 +2314,7 @@ boolean_t mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_cq_in_t in; + mlxcx_cmd_create_cq_in_t *in; mlxcx_cmd_create_cq_out_t out; boolean_t ret; mlxcx_completionq_ctx_t *ctx; @@ -2318,7 +2322,7 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_cq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mlcq->mlcq_mtx)); @@ -2326,10 +2330,10 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) VERIFY0(mlcq->mlcq_state & MLXCX_CQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_cq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_cq_head, MLXCX_OP_CREATE_CQ, 0); - ctx = &in.mlxi_create_cq_context; + ctx = &in->mlxi_create_cq_context; ctx->mlcqc_uar_page = to_be24(mlcq->mlcq_uar->mlu_num); ctx->mlcqc_log_cq_size = mlcq->mlcq_entshift; ctx->mlcqc_eqn = mlcq->mlcq_eq->mleq_num; @@ -2348,7 +2352,7 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) while (rem > 0) { ASSERT3U(pa & 0xfff, ==, 0); ASSERT3U(rem, >=, MLXCX_HW_PAGE_SIZE); - in.mlxi_create_cq_pas[npages++] = to_be64(pa); + in->mlxi_create_cq_pas[npages++] = to_be64(pa); rem -= MLXCX_HW_PAGE_SIZE; pa += MLXCX_HW_PAGE_SIZE; } @@ -2358,8 +2362,9 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) insize = offsetof(mlxcx_cmd_create_cq_in_t, mlxi_create_cq_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -2370,6 +2375,7 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) mlcq->mlcq_num = from_be24(out.mlxo_create_cq_cqn); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -2379,11 +2385,11 @@ mlxcx_cmd_query_rq(mlxcx_t *mlxp, 
mlxcx_work_queue_t *mlwq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_rq_in_t in; - mlxcx_cmd_query_rq_out_t out; + mlxcx_cmd_query_rq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_rq_out_t), KM_SLEEP); VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC); VERIFY(mlwq->mlwq_state & MLXCX_WQ_CREATED); @@ -2395,18 +2401,20 @@ mlxcx_cmd_query_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, in.mlxi_query_rq_rqn = to_be24(mlwq->mlwq_num); - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_rq_context, ctxp, + bcopy(&out->mlxo_query_rq_context, ctxp, sizeof (mlxcx_rq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2416,11 +2424,11 @@ mlxcx_cmd_query_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_sq_in_t in; - mlxcx_cmd_query_sq_out_t out; + mlxcx_cmd_query_sq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_sq_out_t), KM_SLEEP); VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC); VERIFY(mlwq->mlwq_state & MLXCX_WQ_CREATED); @@ -2432,18 +2440,20 @@ mlxcx_cmd_query_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, in.mlxi_query_sq_sqn = to_be24(mlwq->mlwq_num); - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_sq_context, ctxp, + bcopy(&out->mlxo_query_sq_context, ctxp, sizeof (mlxcx_sq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2453,11 +2463,11 @@ mlxcx_cmd_query_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_cq_in_t in; - mlxcx_cmd_query_cq_out_t out; + mlxcx_cmd_query_cq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_cq_out_t), KM_SLEEP); VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC); VERIFY(mlcq->mlcq_state & MLXCX_CQ_CREATED); @@ -2468,18 +2478,20 @@ mlxcx_cmd_query_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, in.mlxi_query_cq_cqn = to_be24(mlcq->mlcq_num); - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_cq_context, ctxp, + bcopy(&out->mlxo_query_cq_context, ctxp, sizeof (mlxcx_completionq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2522,7 +2534,7 @@ boolean_t mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_rq_in_t in; + mlxcx_cmd_create_rq_in_t *in; mlxcx_cmd_create_rq_out_t out; boolean_t ret; mlxcx_rq_ctx_t *ctx; @@ -2530,7 +2542,7 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_rq_in_t), 
KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mlwq->mlwq_mtx)); @@ -2539,10 +2551,10 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) VERIFY0(mlwq->mlwq_state & MLXCX_WQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_rq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_rq_head, MLXCX_OP_CREATE_RQ, 0); - ctx = &in.mlxi_create_rq_context; + ctx = &in->mlxi_create_rq_context; set_bit32(&ctx->mlrqc_flags, MLXCX_RQ_FLAGS_RLKEY); set_bit32(&ctx->mlrqc_flags, MLXCX_RQ_FLAGS_FLUSH_IN_ERROR); @@ -2579,8 +2591,9 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) offsetof(mlxcx_workq_ctx_t, mlwqc_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -2591,6 +2604,7 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mlwq->mlwq_num = from_be24(out.mlxo_create_rq_rqn); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -3399,7 +3413,7 @@ boolean_t mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_sq_in_t in; + mlxcx_cmd_create_sq_in_t *in; mlxcx_cmd_create_sq_out_t out; boolean_t ret; mlxcx_sq_ctx_t *ctx; @@ -3407,7 +3421,7 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_sq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mlwq->mlwq_mtx)); @@ -3416,10 +3430,10 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) VERIFY0(mlwq->mlwq_state & MLXCX_WQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_sq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_sq_head, MLXCX_OP_CREATE_SQ, 0); - ctx = &in.mlxi_create_sq_context; + ctx = &in->mlxi_create_sq_context; set_bit32(&ctx->mlsqc_flags, MLXCX_SQ_FLAGS_RLKEY); set_bit32(&ctx->mlsqc_flags, MLXCX_SQ_FLAGS_FLUSH_IN_ERROR); @@ -3462,8 +3476,9 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) offsetof(mlxcx_workq_ctx_t, mlwqc_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -3474,6 +3489,7 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mlwq->mlwq_num = from_be24(out.mlxo_create_sq_sqn); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index b826838f2e06..d38bd6deb77e 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -642,7 +642,7 @@ typedef enum { .bit_shift = 25, \ .bit_mask = 0x06000000 } -#define MLXCX_WORKQ_CTX_MAX_ADDRESSES 128 +#define MLXCX_WORKQ_CTX_MAX_ADDRESSES 1024 typedef struct mlxcx_workq_ctx { bits32_t mlwqc_flags; @@ -1590,7 +1590,7 @@ typedef struct { /* * This is an artificial limit that we're imposing on our actions. 
*/ -#define MLXCX_CREATE_QUEUE_MAX_PAGES 128 +#define MLXCX_CREATE_QUEUE_MAX_PAGES 1024 typedef struct { mlxcx_cmd_in_t mlxi_create_eq_head; From 37df6b6a66132de7c8c1b218d8b7072bd3c251ef Mon Sep 17 00:00:00 2001 From: Alex Wilson Date: Mon, 4 Dec 2023 02:28:51 +0000 Subject: [PATCH 13/14] mlxcx: add tx latency timers with DEBUG --- usr/src/uts/common/io/mlxcx/mlxcx.h | 38 +++++++++++++++++++ usr/src/uts/common/io/mlxcx/mlxcx_gld.c | 28 ++++++++++++++ usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 2 + usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 47 +++++++++++++++++++++++- 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 380c58f8ab5c..3acfd9abb73f 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -246,6 +246,21 @@ extern uint_t mlxcx_stuck_intr_count; */ #define MLXCX_FUNC_ID_MAX 0 +#if defined(DEBUG) +#define MLXCX_PERF_TIMERS +#endif + +#if defined(MLXCX_PERF_TIMERS) +static inline void +mlxcx_ptimer(hrtime_t *arr, uint idx) +{ + arr[idx] = gethrtime(); +} +#define MLXCX_PTIMER(A, I) mlxcx_ptimer(A, I) +#else +#define MLXCX_PTIMER(A, I) +#endif + /* * Forwards */ @@ -547,6 +562,25 @@ typedef struct mlxcx_buf_shard { kcondvar_t mlbs_free_nonempty; } mlxcx_buf_shard_t; +typedef enum { + MLXCX_BUF_TIMER_PRE_RING_TX, + MLXCX_BUF_TIMER_POST_OFFLOAD_INFO, + MLXCX_BUF_TIMER_POST_INLINE_BCOPY, + MLXCX_BUF_TIMER_POST_BUF_BIND_COPY, + MLXCX_BUF_TIMER_POST_SQE_BUF, + MLXCX_BUF_TIMER_POST_PREPARE_SQE_INLINE, + MLXCX_BUF_TIMER_POST_PREPARE_SQE, + MLXCX_BUF_TIMER_POST_WQ_MTX, + MLXCX_BUF_TIMER_POST_SQE_IN_RING, + MLXCX_BUF_TIMER_POST_SQ_ADD_BUF, + MLXCX_BUF_TIMER_PRE_TX_COMP, + MLXCX_BUF_TIMER_PRE_STEP2, + MLXCX_BUF_TIMER_COPY_TOTAL, + MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL, + MLXCX_BUF_TIMER_BIND_MBLK_TOTAL, + MLXCX_BUF_TIMER_MAX +} mlxcx_buf_timer_t; + typedef struct mlxcx_buffer { mlxcx_buf_shard_t *mlb_shard; list_node_t mlb_entry; @@ -579,6 +613,10 @@ typedef struct mlxcx_buffer { }; size_t mlb_sqe_size; uint_t mlb_sqe_count; + +#if defined(MLXCX_PERF_TIMERS) + hrtime_t mlb_t[MLXCX_BUF_TIMER_MAX]; +#endif } mlxcx_buffer_t; typedef enum { diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index db9212e0fbb5..8f75141a8867 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -634,9 +634,18 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) size_t take = 0; uint_t bcount; mlxcx_tx_ctx_t ctx; +#if defined(MLXCX_PERF_TIMERS) + hrtime_t times[MLXCX_BUF_TIMER_MAX]; + uint i; +#endif VERIFY(mp->b_next == NULL); +#if defined(MLXCX_PERF_TIMERS) + bzero(times, sizeof (times)); + times[MLXCX_BUF_TIMER_PRE_RING_TX] = gethrtime(); +#endif + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &ctx.mtc_chkflags); mac_lso_get(mp, &ctx.mtc_mss, &ctx.mtc_lsoflags); @@ -653,6 +662,10 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) return (NULL); } +#if defined(MLXCX_PERF_TIMERS) + times[MLXCX_BUF_TIMER_POST_OFFLOAD_INFO] = gethrtime(); +#endif + ctx.mtc_inline_hdrlen = meoi.meoi_l2hlen; /* @@ -698,12 +711,22 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) } } + MLXCX_PTIMER(times, MLXCX_BUF_TIMER_POST_INLINE_BCOPY); + bcount = mlxcx_buf_bind_or_copy(mlxp, sq, mp, kmp, take, &b); if (bcount == 0) { atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); return (mp); } + MLXCX_PTIMER(times, MLXCX_BUF_TIMER_POST_BUF_BIND_COPY); + +#if defined(MLXCX_PERF_TIMERS) + /* Copy our temporary timers over to the buffer_t */ + for (i = 0; 
i <= MLXCX_BUF_TIMER_POST_BUF_BIND_COPY; ++i) + b->mlb_t[i] = times[i]; +#endif + if (!mlxcx_buf_prepare_sqe(mlxp, sq, b, &ctx)) { mlxcx_warn(mlxp, "!tried to tx packet that couldn't fit in " "an SQE, dropping"); @@ -711,10 +734,14 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) return (NULL); } + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_POST_PREPARE_SQE); + mutex_enter(&sq->mlwq_mtx); VERIFY3U(sq->mlwq_inline_mode, <=, MLXCX_ETH_INLINE_L2); cq = sq->mlwq_cq; + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_POST_WQ_MTX); + /* * state is a single int, so read-only access without the CQ lock * should be fine. @@ -756,6 +783,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) } mutex_exit(&sq->mlwq_mtx); + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_POST_SQ_ADD_BUF); return (NULL); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index 656f3a497307..8739f628e419 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -981,6 +981,8 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, list_remove(&mlcq->mlcq_buffers, buf); bufcnt++; + MLXCX_PTIMER(buf->mlb_t, MLXCX_BUF_TIMER_PRE_TX_COMP); + switch (mlcq->mlcq_wq->mlwq_type) { case MLXCX_WQ_TYPE_SENDQ: mlxcx_tx_completion(mlxp, mlcq, cent, buf, &rbatch); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 6ba73e1f681b..9ff2d0c87eb4 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -1674,6 +1674,8 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, return (B_FALSE); } + MLXCX_PTIMER(b0->mlb_t, MLXCX_BUF_TIMER_POST_SQE_IN_RING); + /* * Stash the bufbgen counter, which is incremented every time * buffers_b is merged into buffers. 
This lets us easily tell which @@ -2285,6 +2287,11 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, size_t sz; boolean_t ret; +#if defined(MLXCX_PERF_TIMERS) + hrtime_t t0, t1; + t0 = gethrtime(); +#endif + rptr = mp->b_rptr; sz = MBLKL(mp); @@ -2299,17 +2306,37 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) { b = mlxcx_copy_data(mlxp, wq, rptr, sz); +#if defined(MLXCX_PERF_TIMERS) + t1 = gethrtime(); + b->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL] += t1 - t0; +#endif } else { b = mlxcx_buf_take_foreign(mlxp, wq); if (b == NULL) return (NULL); +#if defined(MLXCX_PERF_TIMERS) + t1 = gethrtime(); + b->mlb_t[MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL] += t1 - t0; + t0 = t1; +#endif ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_TRUE); +#if defined(MLXCX_PERF_TIMERS) + t1 = gethrtime(); + b->mlb_t[MLXCX_BUF_TIMER_BIND_MBLK_TOTAL] += t1 - t0; + t0 = t1; +#endif + if (!ret) { mlxcx_buf_return(mlxp, b); b = mlxcx_copy_data(mlxp, wq, rptr, sz); + +#if defined(MLXCX_PERF_TIMERS) + t1 = gethrtime(); + b->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL] += t1 - t0; +#endif } } @@ -2338,6 +2365,8 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, b0->mlb_sqe = kmem_zalloc(b0->mlb_sqe_size, KM_SLEEP); } + MLXCX_PTIMER(b0->mlb_t, MLXCX_BUF_TIMER_POST_SQE_BUF); + ents = 1; ent0 = &b0->mlb_sqe[0]; @@ -2418,6 +2447,8 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, ent0->mlsqe_eth.mles_mss = to_be16(ctx->mtc_mss); } + MLXCX_PTIMER(b0->mlb_t, MLXCX_BUF_TIMER_POST_PREPARE_SQE_INLINE); + b = b0; while (b != NULL) { rem = b->mlb_used; @@ -2502,8 +2533,17 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, b->mlb_tx_head = b0; b->mlb_used = MBLKL(mp) - offset; - if (!first) + if (!first) { list_insert_tail(&b0->mlb_tx_chain, b); +#if defined(MLXCX_PERF_TIMERS) + b0->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL] += + b->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL]; + b0->mlb_t[MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL] += + b->mlb_t[MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL]; + b0->mlb_t[MLXCX_BUF_TIMER_BIND_MBLK_TOTAL] += + b->mlb_t[MLXCX_BUF_TIMER_BIND_MBLK_TOTAL]; +#endif + } first = B_FALSE; offset = 0; @@ -2839,6 +2879,10 @@ mlxcx_buf_return_step2(mlxcx_t *mlxp, mlxcx_buffer_t *b) break; } +#if defined(MLXCX_PERF_TIMERS) + bzero(b->mlb_t, sizeof (b->mlb_t)); +#endif + list_insert_tail(&s->mlbs_free, b); cv_broadcast(&s->mlbs_free_nonempty); } @@ -2858,6 +2902,7 @@ mlxcx_buf_return_batch_flush_shard(mlxcx_t *mlxp, } mutex_enter(&mbrb->mbrb_shard[i]->mlbs_mtx); while ((b = list_remove_head(&mbrb->mbrb_list[i]))) { + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_PRE_STEP2); mlxcx_buf_return_step2(mlxp, b); } mutex_exit(&mbrb->mbrb_shard[i]->mlbs_mtx); From 3d151f2015d701051be95c1f944a142af475b854 Mon Sep 17 00:00:00 2001 From: Dan McDonald Date: Fri, 9 Feb 2024 18:38:02 -0500 Subject: [PATCH 14/14] Typo in LSO ASSERT() statement --- usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 9ff2d0c87eb4..2c939e797287 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -2442,7 +2442,7 @@ mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, } if (ctx->mtc_lsoflags & HW_LSO) { ASSERT(mlxp->mlx_caps->mlc_lso); - ASSERT3U(ctx->mctx_inline_hdrlen, >, 0); + ASSERT3U(ctx->mtc_inline_hdrlen, >, 0); ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_LSO; 
ent0->mlsqe_eth.mles_mss = to_be16(ctx->mtc_mss); }
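
A note for reviewers on the arithmetic in mlxcx_buf_prepare_sqe() above: a single send can now spread across several send-queue entries, with inline header bytes beyond the 2-byte mles_inline_headers field spilling into 16-byte data segments ahead of the per-cookie pointer segments. The standalone user-space sketch below models that accounting; it is illustrative only, not driver code. The figure of four data segments per extra entry (the mlxcx_sendq_extra_ent_t layout is not shown in this series) and the helper name tx_wqebbs are assumptions.

/*
 * Model of the send-queue entry accounting in mlxcx_buf_prepare_sqe():
 * the first 64-byte WQEBB carries the control segment (16 bytes), the
 * eth segment (16 bytes, ending in the 2 inline header bytes) and two
 * data segments; each extra WQEBB is assumed to carry four.
 */
#include <stdio.h>
#include <stddef.h>

#define	SEG_SZ			16	/* sizeof (mlxcx_wqe_data_seg_t) */
#define	FIRST_ENT_SEGS		2	/* mlsqe_data[2] in mlxcx_sendq_ent_t */
#define	EXTRA_ENT_SEGS		4	/* assumed segs per extra WQEBB */
#define	ETH_INLINE_BYTES	2	/* mles_inline_headers[2] */

static unsigned int
tx_wqebbs(size_t inline_hdrlen, unsigned int ncookies)
{
	size_t spill;
	unsigned int segs;

	/* Inline header bytes past the eth segment consume data segments. */
	spill = (inline_hdrlen > ETH_INLINE_BYTES) ?
	    inline_hdrlen - ETH_INLINE_BYTES : 0;
	segs = (unsigned int)((spill + SEG_SZ - 1) / SEG_SZ);

	/* One pointer segment per DMA cookie of the payload. */
	segs += ncookies;

	if (segs <= FIRST_ENT_SEGS)
		return (1);
	return (1 + (segs - FIRST_ENT_SEGS + EXTRA_ENT_SEGS - 1) /
	    EXTRA_ENT_SEGS);
}

int
main(void)
{
	/*
	 * An LSO frame inlining 14 + 20 + 20 = 54 header bytes with 5
	 * payload cookies needs 4 spill + 5 pointer = 9 segments, i.e.
	 * 1 + ceil(7 / 4) = 3 WQEBBs.
	 */
	printf("%u WQEBBs\n", tx_wqebbs(54, 5));
	return (0);
}

Under these assumptions a maximally inlined header of MLXCX_MAX_INLINE_HEADERLEN bytes (2 + 16 * 12 = 194) consumes twelve data segments by itself, which lines up with MLXCX_SQE_BUF growing from 4 to 16 entries per buffer and with mlxcx_buf_prepare_sqe() now returning B_FALSE, checked at its call sites, when a packet cannot fit.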