diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index 12f80cf76830..01b0ebdb6afc 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -650,61 +650,6 @@ mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) va_end(ap); } -uint16_t -mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); -} - -uint32_t -mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); -} - -uint64_t -mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); -} - -void -mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); -} - -void -mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) -{ - uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; - ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); -} - -void -mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) -{ - /* - * The UAR is always inside the first BAR, which we mapped as - * mlx_regs - */ - uintptr_t addr = off + (uintptr_t)mlu->mlu_base + - (uintptr_t)mlxp->mlx_regs_base; - ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); -} - -void -mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) -{ - uintptr_t addr = off + (uintptr_t)mlu->mlu_base + - (uintptr_t)mlxp->mlx_regs_base; - ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); -} - static void mlxcx_fm_fini(mlxcx_t *mlxp) { @@ -816,6 +761,7 @@ mlxcx_teardown_bufs(mlxcx_t *mlxp) list_destroy(&mlxp->mlx_buf_shards); kmem_cache_destroy(mlxp->mlx_bufs_cache); + kmem_cache_destroy(mlxp->mlx_mbrm_cache); } static void @@ -1259,7 +1205,7 @@ mlxcx_regs_map(mlxcx_t *mlxp) * device. 
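An aside on the DDI_DEVICE_ATTR_V0 -> V1 bump just below: only V1 defines the devacc_attr_access field, which is what lets an FM-capable driver request flagged error handling on its register mapping. A minimal sketch of the attribute setup this hunk is building toward (illustrative, not copied verbatim from the driver):

	ddi_device_acc_attr_t da;

	bzero(&da, sizeof (da));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V1;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	/* Only meaningful with V1: the driver checks the handle itself. */
	da.devacc_attr_access = DDI_FLAGERR_ACC;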
*/ bzero(&da, sizeof (ddi_device_acc_attr_t)); - da.devacc_attr_version = DDI_DEVICE_ATTR_V0; + da.devacc_attr_version = DDI_DEVICE_ATTR_V1; da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { @@ -1434,6 +1380,26 @@ mlxcx_bufs_cache_destr(void *arg, void *cookie) list_destroy(&b->mlb_tx_chain); } +static int +mlxcx_mbrm_cache_constr(void *arg, void *cookie, int kmflags) +{ + mlxcx_t *mlxp = cookie; + mlxcx_buf_return_mblk_t *mbrm = arg; + (void)mlxp; + bzero(mbrm, sizeof (mlxcx_buf_return_mblk_t)); + return (0); +} + +static void +mlxcx_mbrm_cache_destr(void *arg, void *cookie) +{ + mlxcx_t *mlxp = cookie; + mlxcx_buf_return_mblk_t *mbrm = arg; + (void)mlxp; + VERIFY3P(mbrm->mbrm_mp, ==, NULL); + VERIFY(!list_link_active(&mbrm->mbrm_entry)); +} + mlxcx_buf_shard_t * mlxcx_mlbs_create(mlxcx_t *mlxp) { @@ -1467,6 +1433,12 @@ mlxcx_setup_bufs(mlxcx_t *mlxp) sizeof (mlxcx_buffer_t), sizeof (uint64_t), mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, NULL, mlxp, NULL, 0); + (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_mbrm_cache", + ddi_get_instance(mlxp->mlx_dip)); + mlxp->mlx_mbrm_cache = kmem_cache_create(namebuf, + sizeof (mlxcx_buf_return_mblk_t), sizeof (uint64_t), + mlxcx_mbrm_cache_constr, mlxcx_mbrm_cache_destr, + NULL, mlxp, NULL, 0); list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), offsetof(mlxcx_buf_shard_t, mlbs_entry)); @@ -1518,11 +1490,12 @@ mlxcx_eq_check(void *arg) { mlxcx_t *mlxp = (mlxcx_t *)arg; mlxcx_event_queue_t *eq; - mlxcx_eventq_ctx_t ctx; + mlxcx_eventq_ctx_t *ctx; const char *str; - uint_t i; + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + for (i = 0; i < mlxp->mlx_intr_count; ++i) { eq = &mlxp->mlx_eqs[i]; @@ -1536,11 +1509,11 @@ mlxcx_eq_check(void *arg) */ ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED); - if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) + if (!mlxcx_cmd_query_eq(mlxp, eq, ctx)) continue; str = "???"; - switch (ctx.mleqc_status) { + switch (ctx->mleqc_status) { case MLXCX_EQ_STATUS_OK: break; case MLXCX_EQ_STATUS_WRITE_FAILURE: @@ -1548,14 +1521,14 @@ mlxcx_eq_check(void *arg) break; } - if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { + if (ctx->mleqc_status != MLXCX_EQ_STATUS_OK) { mlxcx_fm_qstate_ereport(mlxp, "event", - eq->mleq_num, str, ctx.mleqc_status); + eq->mleq_num, str, ctx->mleqc_status); mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", - eq->mleq_intr_index, ctx.mleqc_status, str); + eq->mleq_intr_index, ctx->mleqc_status, str); } - if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && + if (ctx->mleqc_state != MLXCX_EQ_ST_ARMED && (eq->mleq_state & MLXCX_EQ_ARMED)) { if (eq->mleq_cc == eq->mleq_check_disarm_cc && ++eq->mleq_check_disarm_cnt >= 3) { @@ -1569,6 +1542,8 @@ mlxcx_eq_check(void *arg) eq->mleq_check_disarm_cnt = 0; } } + + kmem_free(ctx, sizeof (*ctx)); } static void @@ -1576,10 +1551,12 @@ mlxcx_cq_check(void *arg) { mlxcx_t *mlxp = (mlxcx_t *)arg; mlxcx_completion_queue_t *cq; - mlxcx_completionq_ctx_t ctx; + mlxcx_completionq_ctx_t *ctx; const char *str, *type; uint_t v; + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; cq = list_next(&mlxp->mlx_cqs, cq)) { @@ -1597,7 +1574,7 @@ mlxcx_cq_check(void *arg) if (cq->mlcq_fm_repd_qstate) continue; - if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) + if (!mlxcx_cmd_query_cq(mlxp, cq, ctx)) continue; if (cq->mlcq_wq != NULL) { @@ -1613,7 +1590,7 @@ mlxcx_cq_check(void *arg) } str = "???"; - v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); + v = 
get_bits32(ctx->mlcqc_flags, MLXCX_CQ_CTX_STATUS); switch (v) { case MLXCX_CQC_STATUS_OK: break; @@ -1636,7 +1613,7 @@ mlxcx_cq_check(void *arg) cq->mlcq_fm_repd_qstate = B_TRUE; } - v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); + v = get_bits32(ctx->mlcqc_flags, MLXCX_CQ_CTX_STATE); if (v != MLXCX_CQC_STATE_ARMED && (cq->mlcq_state & MLXCX_CQ_ARMED) && !(cq->mlcq_state & MLXCX_CQ_POLLING)) { @@ -1652,19 +1629,25 @@ mlxcx_cq_check(void *arg) cq->mlcq_check_disarm_cc = 0; } } + + kmem_free(ctx, sizeof (*ctx)); } void mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) { - mlxcx_sq_ctx_t ctx; + mlxcx_sq_ctx_t *ctx; mlxcx_sq_state_t state; - if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) + ctx = kmem_zalloc(sizeof (mlxcx_sq_ctx_t), KM_SLEEP); + + if (!mlxcx_cmd_query_sq(mlxp, sq, ctx)) { + kmem_free(ctx, sizeof (*ctx)); return; + } - ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); - state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); + ASSERT3U(from_be24(ctx->mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); + state = get_bits32(ctx->mlsqc_flags, MLXCX_SQ_STATE); switch (state) { case MLXCX_SQ_STATE_RST: if (sq->mlwq_state & MLXCX_WQ_STARTED) { @@ -1691,20 +1674,25 @@ mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) sq->mlwq_fm_repd_qstate = B_TRUE; break; } + + kmem_free(ctx, sizeof (mlxcx_sq_ctx_t)); } void mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) { - mlxcx_rq_ctx_t ctx; + mlxcx_rq_ctx_t *ctx; mlxcx_rq_state_t state; + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) + if (!mlxcx_cmd_query_rq(mlxp, rq, ctx)) { + kmem_free(ctx, sizeof (*ctx)); return; + } - ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); - state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); + ASSERT3U(from_be24(ctx->mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); + state = get_bits32(ctx->mlrqc_flags, MLXCX_RQ_STATE); switch (state) { case MLXCX_RQ_STATE_RST: if (rq->mlwq_state & MLXCX_WQ_STARTED) { @@ -1731,6 +1719,8 @@ mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) rq->mlwq_fm_repd_qstate = B_TRUE; break; } + + kmem_free(ctx, sizeof (*ctx)); } static void diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index c2843790cca8..3acfd9abb73f 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -167,7 +167,7 @@ extern "C" { * How big does an mblk have to be before we dma_bind() it instead of * bcopying? */ -#define MLXCX_TX_BIND_THRESHOLD_DFLT 2048 +#define MLXCX_TX_BIND_THRESHOLD_DFLT 512 /* * How often to check the status of completion queues for overflow and @@ -246,6 +246,21 @@ extern uint_t mlxcx_stuck_intr_count; */ #define MLXCX_FUNC_ID_MAX 0 +#if defined(DEBUG) +#define MLXCX_PERF_TIMERS +#endif + +#if defined(MLXCX_PERF_TIMERS) +static inline void +mlxcx_ptimer(hrtime_t *arr, uint idx) +{ + arr[idx] = gethrtime(); +} +#define MLXCX_PTIMER(A, I) mlxcx_ptimer(A, I) +#else +#define MLXCX_PTIMER(A, I) +#endif + /* * Forwards */ @@ -318,12 +333,7 @@ typedef struct mlxcx_cmd_queue { uint8_t mcmd_size_l2; uint8_t mcmd_stride_l2; uint_t mcmd_size; - /* - * The mask has a bit for each command slot, there are a maximum - * of 32 slots. When the bit is set in the mask, it indicates - * the slot is available. 
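For reference, the mask being removed here was consumed with ddi_ffs(), which returns the 1-based index of the lowest set bit, so freed low-numbered slots were always reused first; the replacement mcmd_next scheme (see mlxcx_cmd_reserve_slot() later in this patch) rotates through the slots and makes mcmd_active[] the single source of truth. The old idiom, sketched:

	uint32_t mask = 0x0000000cU;	/* slots 2 and 3 free */
	int slot = ddi_ffs(mask);	/* 1-based: returns 3, i.e. slot 2 */
	mask &= ~(1U << --slot);	/* claim slot 2 */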
- */ - uint32_t mcmd_mask; + uint8_t mcmd_next; /* next command slot */ mlxcx_cmd_t *mcmd_active[MLXCX_CMD_MAX]; @@ -552,6 +562,25 @@ typedef struct mlxcx_buf_shard { kcondvar_t mlbs_free_nonempty; } mlxcx_buf_shard_t; +typedef enum { + MLXCX_BUF_TIMER_PRE_RING_TX, + MLXCX_BUF_TIMER_POST_OFFLOAD_INFO, + MLXCX_BUF_TIMER_POST_INLINE_BCOPY, + MLXCX_BUF_TIMER_POST_BUF_BIND_COPY, + MLXCX_BUF_TIMER_POST_SQE_BUF, + MLXCX_BUF_TIMER_POST_PREPARE_SQE_INLINE, + MLXCX_BUF_TIMER_POST_PREPARE_SQE, + MLXCX_BUF_TIMER_POST_WQ_MTX, + MLXCX_BUF_TIMER_POST_SQE_IN_RING, + MLXCX_BUF_TIMER_POST_SQ_ADD_BUF, + MLXCX_BUF_TIMER_PRE_TX_COMP, + MLXCX_BUF_TIMER_PRE_STEP2, + MLXCX_BUF_TIMER_COPY_TOTAL, + MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL, + MLXCX_BUF_TIMER_BIND_MBLK_TOTAL, + MLXCX_BUF_TIMER_MAX +} mlxcx_buf_timer_t; + typedef struct mlxcx_buffer { mlxcx_buf_shard_t *mlb_shard; list_node_t mlb_entry; @@ -576,6 +605,18 @@ typedef struct mlxcx_buffer { mlxcx_dma_buffer_t mlb_dma; mblk_t *mlb_mp; frtn_t mlb_frtn; + + /* spooled up sendq entries ready to push into the ring */ + union { + mlxcx_sendq_ent_t *mlb_sqe; + mlxcx_sendq_extra_ent_t *mlb_esqe; + }; + size_t mlb_sqe_size; + uint_t mlb_sqe_count; + +#if defined(MLXCX_PERF_TIMERS) + hrtime_t mlb_t[MLXCX_BUF_TIMER_MAX]; +#endif } mlxcx_buffer_t; typedef enum { @@ -629,6 +670,7 @@ typedef struct mlxcx_completion_queue { list_t mlcq_buffers; kmutex_t mlcq_bufbmtx; list_t mlcq_buffers_b; + uint64_t mlcq_bufbgen; uint_t mlcq_check_disarm_cnt; uint64_t mlcq_check_disarm_cc; @@ -643,14 +685,15 @@ typedef struct mlxcx_completion_queue { } mlxcx_completion_queue_t; typedef enum { - MLXCX_WQ_ALLOC = 1 << 0, - MLXCX_WQ_CREATED = 1 << 1, - MLXCX_WQ_STARTED = 1 << 2, - MLXCX_WQ_DESTROYED = 1 << 3, - MLXCX_WQ_TEARDOWN = 1 << 4, - MLXCX_WQ_BUFFERS = 1 << 5, - MLXCX_WQ_REFILLING = 1 << 6, - MLXCX_WQ_BLOCKED_MAC = 1 << 7 + MLXCX_WQ_INIT = 1 << 0, + MLXCX_WQ_ALLOC = 1 << 1, + MLXCX_WQ_CREATED = 1 << 2, + MLXCX_WQ_STARTED = 1 << 3, + MLXCX_WQ_DESTROYED = 1 << 4, + MLXCX_WQ_TEARDOWN = 1 << 5, + MLXCX_WQ_BUFFERS = 1 << 6, + MLXCX_WQ_REFILLING = 1 << 7, + MLXCX_WQ_BLOCKED_MAC = 1 << 8 } mlxcx_workq_state_t; typedef enum { @@ -891,6 +934,8 @@ typedef enum { MLXCX_TIRS_PER_GROUP } mlxcx_tir_role_t; +#define MLXCX_TIS_PER_GROUP 8 + typedef struct { avl_node_t mlgm_group_entry; list_node_t mlgm_fe_entry; @@ -915,7 +960,7 @@ struct mlxcx_ring_group { mac_group_handle_t mlg_mac_hdl; union { - mlxcx_tis_t mlg_tis; + mlxcx_tis_t mlg_tis[MLXCX_TIS_PER_GROUP]; mlxcx_tir_t mlg_tir[MLXCX_TIRS_PER_GROUP]; }; mlxcx_port_t *mlg_port; @@ -1230,6 +1275,7 @@ struct mlxcx { mlxcx_ring_group_t *mlx_tx_groups; kmem_cache_t *mlx_bufs_cache; + kmem_cache_t *mlx_mbrm_cache; list_t mlx_buf_shards; ddi_periodic_t mlx_eq_checktimer; @@ -1243,18 +1289,83 @@ struct mlxcx { mlxcx_temp_sensor_t *mlx_temp_sensors; }; +typedef struct mlxcx_buf_return_mblk { + list_node_t mbrm_entry; + mblk_t *mbrm_mp; +} mlxcx_buf_return_mblk_t; + +#define MLXCX_BRB_SHARDS 4 +#define MLXCX_BRB_INLINE_MBLKS 8 +typedef struct mlxcx_buf_return_batch { + uint mbrb_n[MLXCX_BRB_SHARDS]; + mlxcx_buf_shard_t *mbrb_shard[MLXCX_BRB_SHARDS]; + list_t mbrb_list[MLXCX_BRB_SHARDS]; + list_t mbrb_mblks; + mblk_t *mbrb_inline_mblk[MLXCX_BRB_INLINE_MBLKS]; + uint mbrb_inline_mblks; +} mlxcx_buf_return_batch_t; + +extern void mlxcx_buf_return_batch_init(mlxcx_buf_return_batch_t *); +extern void mlxcx_buf_return_batch_flush(mlxcx_t *, mlxcx_buf_return_batch_t *); + + /* - * Register access + * Register access. Use static inlines. 
*/ -extern uint16_t mlxcx_get16(mlxcx_t *, uintptr_t); -extern uint32_t mlxcx_get32(mlxcx_t *, uintptr_t); -extern uint64_t mlxcx_get64(mlxcx_t *, uintptr_t); +static inline uint16_t +mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); +} -extern void mlxcx_put32(mlxcx_t *, uintptr_t, uint32_t); -extern void mlxcx_put64(mlxcx_t *, uintptr_t, uint64_t); +static inline uint32_t +mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); +} + +static inline uint64_t +mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); +} + +static inline void +mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); +} + +static inline void +mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) +{ + uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; + ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); +} -extern void mlxcx_uar_put32(mlxcx_t *, mlxcx_uar_t *, uintptr_t, uint32_t); -extern void mlxcx_uar_put64(mlxcx_t *, mlxcx_uar_t *, uintptr_t, uint64_t); +static inline void +mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) +{ + /* + * The UAR is always inside the first BAR, which we mapped as + * mlx_regs + */ + uintptr_t addr = off + (uintptr_t)mlu->mlu_base + + (uintptr_t)mlxp->mlx_regs_base; + ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); +} + +static inline void +mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) +{ + uintptr_t addr = off + (uintptr_t)mlu->mlu_base + + (uintptr_t)mlxp->mlx_regs_base; + ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); +} /* * Logging functions. 
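Moving these accessors into the header as static inlines removes a function call from some very hot paths (a doorbell write happens on every packet and every command). Call sites do not change; e.g. the command doorbell ring in mlxcx_cmd.c is still:

	mlxcx_put32(mlxp, MLXCX_ISS_CMD_DOORBELL, 1 << slot);

and still goes through ddi_put32(), so FM access-handle checking is preserved.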
@@ -1343,7 +1454,7 @@ extern void mlxcx_shard_ready(mlxcx_buf_shard_t *); extern void mlxcx_shard_draining(mlxcx_buf_shard_t *); extern uint_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, - mblk_t *, size_t, mlxcx_buffer_t **); + mblk_t *, mblk_t *, size_t, mlxcx_buffer_t **); extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); @@ -1359,10 +1470,21 @@ extern boolean_t mlxcx_rq_add_buffer(mlxcx_t *, mlxcx_work_queue_t *, extern boolean_t mlxcx_rq_add_buffers(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **, size_t); extern boolean_t mlxcx_sq_add_buffer(mlxcx_t *, mlxcx_work_queue_t *, - uint8_t *, size_t, uint32_t, mlxcx_buffer_t *); + mlxcx_buffer_t *); extern boolean_t mlxcx_sq_add_nop(mlxcx_t *, mlxcx_work_queue_t *); extern void mlxcx_rq_refill(mlxcx_t *, mlxcx_work_queue_t *); +typedef struct mlxcx_tx_ctx { + uint8_t mtc_inline_hdrs[MLXCX_MAX_INLINE_HEADERLEN]; + size_t mtc_inline_hdrlen; + uint32_t mtc_chkflags; + uint32_t mtc_mss; + uint32_t mtc_lsoflags; +} mlxcx_tx_ctx_t; + +extern boolean_t mlxcx_buf_prepare_sqe(mlxcx_t *, mlxcx_work_queue_t *, + mlxcx_buffer_t *, const mlxcx_tx_ctx_t *); + extern void mlxcx_teardown_groups(mlxcx_t *); extern void mlxcx_wq_teardown(mlxcx_t *, mlxcx_work_queue_t *); extern void mlxcx_cq_teardown(mlxcx_t *, mlxcx_completion_queue_t *); @@ -1370,7 +1492,7 @@ extern void mlxcx_teardown_rx_group(mlxcx_t *, mlxcx_ring_group_t *); extern void mlxcx_teardown_tx_group(mlxcx_t *, mlxcx_ring_group_t *); extern void mlxcx_tx_completion(mlxcx_t *, mlxcx_completion_queue_t *, - mlxcx_completionq_ent_t *, mlxcx_buffer_t *); + mlxcx_completionq_ent_t *, mlxcx_buffer_t *, mlxcx_buf_return_batch_t *); extern mblk_t *mlxcx_rx_completion(mlxcx_t *, mlxcx_completion_queue_t *, mlxcx_completionq_ent_t *, mlxcx_buffer_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c index 2183413d2bc6..8a1e9b5d57e2 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c @@ -569,7 +569,7 @@ mlxcx_cmd_queue_init(mlxcx_t *mlxp) return (B_FALSE); } - cmd->mcmd_mask = (uint32_t)((1ULL << cmd->mcmd_size) - 1); + cmd->mcmd_next = 0; mutex_init(&cmd->mcmd_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&cmd->mcmd_cv, NULL, CV_DRIVER, NULL); @@ -840,31 +840,34 @@ mlxcx_cmd_copy_output(mlxcx_cmd_ent_t *ent, mlxcx_cmd_t *cmd) } static uint_t -mlxcx_cmd_reserve_slot(mlxcx_cmd_queue_t *cmdq) +mlxcx_cmd_reserve_slot(mlxcx_cmd_queue_t *cmdq, mlxcx_cmd_t *cmd) { - uint_t slot; - + uint_t i, slot; + ASSERT(mutex_owned(&cmd->mlcmd_lock)); mutex_enter(&cmdq->mcmd_lock); - slot = ddi_ffs(cmdq->mcmd_mask); - while (slot == 0) { + while (1) { + for (i = 0; i < MLXCX_CMD_MAX; ++i) { + slot = (cmdq->mcmd_next + i) % MLXCX_CMD_MAX; + if (cmdq->mcmd_active[slot] == NULL) + break; + } + if (cmdq->mcmd_active[slot] == NULL) { + cmdq->mcmd_active[slot] = cmd; + cmdq->mcmd_next = slot + 1; + mutex_exit(&cmdq->mcmd_lock); + return (slot); + } cv_wait(&cmdq->mcmd_cv, &cmdq->mcmd_lock); - slot = ddi_ffs(cmdq->mcmd_mask); } - - cmdq->mcmd_mask &= ~(1U << --slot); - - ASSERT3P(cmdq->mcmd_active[slot], ==, NULL); - - mutex_exit(&cmdq->mcmd_lock); - - return (slot); } static void -mlxcx_cmd_release_slot(mlxcx_cmd_queue_t *cmdq, uint_t slot) +mlxcx_cmd_release_slot(mlxcx_cmd_queue_t *cmdq, uint_t slot, mlxcx_cmd_t *cmd) { + ASSERT(mutex_owned(&cmd->mlcmd_lock)); mutex_enter(&cmdq->mcmd_lock); - cmdq->mcmd_mask |= 1U << slot; + 
ASSERT3P(cmdq->mcmd_active[slot], ==, cmd); + cmdq->mcmd_active[slot] = NULL; cv_broadcast(&cmdq->mcmd_cv); mutex_exit(&cmdq->mcmd_lock); } @@ -876,6 +879,8 @@ mlxcx_cmd_done(mlxcx_cmd_t *cmd, uint_t slot) mlxcx_cmd_queue_t *cmdq = &mlxp->mlx_cmd; mlxcx_cmd_ent_t *ent; + ASSERT(mutex_owned(&cmd->mlcmd_lock)); + /* * Command is done. Save relevant data. Once we broadcast on the CV and * drop the lock, we must not touch it again. @@ -885,17 +890,16 @@ mlxcx_cmd_done(mlxcx_cmd_t *cmd, uint_t slot) ent = (mlxcx_cmd_ent_t *)(cmdq->mcmd_dma.mxdb_va + (slot << cmdq->mcmd_stride_l2)); - mutex_enter(&cmd->mlcmd_lock); cmd->mlcmd_status = MLXCX_CMD_STATUS(ent->mce_status); if (cmd->mlcmd_status == 0) mlxcx_cmd_copy_output(ent, cmd); cmd->mlcmd_state = MLXCX_CMD_S_DONE; cv_broadcast(&cmd->mlcmd_cv); - mutex_exit(&cmd->mlcmd_lock); - cmdq->mcmd_active[slot] = NULL; - mlxcx_cmd_release_slot(cmdq, slot); + mlxcx_cmd_release_slot(cmdq, slot, cmd); + + mutex_exit(&cmd->mlcmd_lock); } static void @@ -907,14 +911,14 @@ mlxcx_cmd_taskq(void *arg) mlxcx_cmd_ent_t *ent; uint_t poll, slot; - ASSERT3S(cmd->mlcmd_op, !=, 0); + mutex_enter(&cmd->mlcmd_lock); + + VERIFY3S(cmd->mlcmd_op, !=, 0); - slot = mlxcx_cmd_reserve_slot(cmdq); + slot = mlxcx_cmd_reserve_slot(cmdq, cmd); ent = (mlxcx_cmd_ent_t *)(cmdq->mcmd_dma.mxdb_va + (slot << cmdq->mcmd_stride_l2)); - cmdq->mcmd_active[slot] = cmd; - /* * Command queue is currently ours as we set busy. */ @@ -924,15 +928,25 @@ mlxcx_cmd_taskq(void *arg) ent->mce_out_length = to_be32(cmd->mlcmd_outlen); ent->mce_token = cmd->mlcmd_token; ent->mce_sig = 0; - ent->mce_status = MLXCX_CMD_HW_OWNED; mlxcx_cmd_prep_input(ent, cmd); mlxcx_cmd_prep_output(ent, cmd); + + /* + * Ensure all of the other fields of the entry are written before + * we switch the owner to hardware (the device might start executing + * right away) + */ + membar_producer(); + ent->mce_status = MLXCX_CMD_HW_OWNED; + MLXCX_DMA_SYNC(cmdq->mcmd_dma, DDI_DMA_SYNC_FORDEV); mlxcx_put32(mlxp, MLXCX_ISS_CMD_DOORBELL, 1 << slot); - if (!cmd->mlcmd_poll) + if (!cmd->mlcmd_poll) { + mutex_exit(&cmd->mlcmd_lock); return; + } for (poll = 0; poll < mlxcx_cmd_tries; poll++) { delay(drv_usectohz(mlxcx_cmd_delay)); @@ -947,21 +961,21 @@ mlxcx_cmd_taskq(void *arg) */ if (poll == mlxcx_cmd_tries) { - mutex_enter(&cmd->mlcmd_lock); cmd->mlcmd_status = MLXCX_CMD_R_TIMEOUT; cmd->mlcmd_state = MLXCX_CMD_S_ERROR; cv_broadcast(&cmd->mlcmd_cv); + + mlxcx_cmd_release_slot(cmdq, slot, cmd); + mutex_exit(&cmd->mlcmd_lock); mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_NO_RESPONSE); - cmdq->mcmd_active[slot] = NULL; - mlxcx_cmd_release_slot(cmdq, slot); - return; } mlxcx_cmd_done(cmd, slot); + /* mlxcx_cmd_done releases mlcmd_lock */ } void @@ -980,10 +994,17 @@ mlxcx_cmd_completion(mlxcx_t *mlxp, mlxcx_eventq_ent_t *ent) comp_vec &= ~(1U << --slot); cmd = cmdq->mcmd_active[slot]; + + /* + * This field is never modified, so we shouldn't need to hold + * mlcmd_lock before checking it. 
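The membar_producer() added above is the standard publish pattern for descriptor rings handed to hardware through an ownership field. Sketched generically (field names here are illustrative, not the driver's):

	ent->len = to_be32(len);	/* payload stores first */
	ent->token = tok;
	membar_producer();		/* order payload before the handoff */
	ent->owner = OWNER_HW;		/* device may parse the entry now */

Without the barrier, the compiler (or a weakly-ordered CPU) could make the ownership store visible before the payload stores, letting the device read a half-written entry.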
+ */ if (cmd->mlcmd_poll) continue; + mutex_enter(&cmd->mlcmd_lock); mlxcx_cmd_done(cmd, slot); + /* mlxcx_cmd_done releases mlcmd_lock */ } } @@ -2125,7 +2146,7 @@ boolean_t mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_eq_in_t in; + mlxcx_cmd_create_eq_in_t *in; mlxcx_cmd_create_eq_out_t out; boolean_t ret; mlxcx_eventq_ctx_t *ctx; @@ -2133,7 +2154,7 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_eq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mleq->mleq_mtx)); @@ -2141,15 +2162,15 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) VERIFY0(mleq->mleq_state & MLXCX_EQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_eq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_eq_head, MLXCX_OP_CREATE_EQ, 0); - ctx = &in.mlxi_create_eq_context; + ctx = &in->mlxi_create_eq_context; ctx->mleqc_uar_page = to_be24(mleq->mleq_uar->mlu_num); ctx->mleqc_log_eq_size = mleq->mleq_entshift; ctx->mleqc_intr = mleq->mleq_intr_index; - in.mlxi_create_eq_event_bitmask = to_be64(mleq->mleq_events); + in->mlxi_create_eq_event_bitmask = to_be64(mleq->mleq_events); npages = 0; c = NULL; @@ -2159,7 +2180,7 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) while (rem > 0) { ASSERT3U(pa & 0xfff, ==, 0); ASSERT3U(rem, >=, MLXCX_HW_PAGE_SIZE); - in.mlxi_create_eq_pas[npages++] = to_be64(pa); + in->mlxi_create_eq_pas[npages++] = to_be64(pa); rem -= MLXCX_HW_PAGE_SIZE; pa += MLXCX_HW_PAGE_SIZE; } @@ -2169,8 +2190,9 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) insize = offsetof(mlxcx_cmd_create_eq_in_t, mlxi_create_eq_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -2181,6 +2203,7 @@ mlxcx_cmd_create_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) mleq->mleq_num = out.mlxo_create_eq_eqn; } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -2190,11 +2213,11 @@ mlxcx_cmd_query_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_eq_in_t in; - mlxcx_cmd_query_eq_out_t out; + mlxcx_cmd_query_eq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_eq_out_t), KM_SLEEP); VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); VERIFY(mleq->mleq_state & MLXCX_EQ_CREATED); @@ -2205,18 +2228,20 @@ mlxcx_cmd_query_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq, in.mlxi_query_eq_eqn = mleq->mleq_num; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_eq_context, ctxp, + bcopy(&out->mlxo_query_eq_context, ctxp, sizeof (mlxcx_eventq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2289,7 +2314,7 @@ boolean_t mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_cq_in_t in; + mlxcx_cmd_create_cq_in_t *in; mlxcx_cmd_create_cq_out_t out; boolean_t 
ret; mlxcx_completionq_ctx_t *ctx; @@ -2297,7 +2322,7 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_cq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mlcq->mlcq_mtx)); @@ -2305,10 +2330,10 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) VERIFY0(mlcq->mlcq_state & MLXCX_CQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_cq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_cq_head, MLXCX_OP_CREATE_CQ, 0); - ctx = &in.mlxi_create_cq_context; + ctx = &in->mlxi_create_cq_context; ctx->mlcqc_uar_page = to_be24(mlcq->mlcq_uar->mlu_num); ctx->mlcqc_log_cq_size = mlcq->mlcq_entshift; ctx->mlcqc_eqn = mlcq->mlcq_eq->mleq_num; @@ -2327,7 +2352,7 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) while (rem > 0) { ASSERT3U(pa & 0xfff, ==, 0); ASSERT3U(rem, >=, MLXCX_HW_PAGE_SIZE); - in.mlxi_create_cq_pas[npages++] = to_be64(pa); + in->mlxi_create_cq_pas[npages++] = to_be64(pa); rem -= MLXCX_HW_PAGE_SIZE; pa += MLXCX_HW_PAGE_SIZE; } @@ -2337,8 +2362,9 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) insize = offsetof(mlxcx_cmd_create_cq_in_t, mlxi_create_cq_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -2349,6 +2375,7 @@ mlxcx_cmd_create_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) mlcq->mlcq_num = from_be24(out.mlxo_create_cq_cqn); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -2358,11 +2385,11 @@ mlxcx_cmd_query_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_rq_in_t in; - mlxcx_cmd_query_rq_out_t out; + mlxcx_cmd_query_rq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_rq_out_t), KM_SLEEP); VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC); VERIFY(mlwq->mlwq_state & MLXCX_WQ_CREATED); @@ -2374,18 +2401,20 @@ mlxcx_cmd_query_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, in.mlxi_query_rq_rqn = to_be24(mlwq->mlwq_num); - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_rq_context, ctxp, + bcopy(&out->mlxo_query_rq_context, ctxp, sizeof (mlxcx_rq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2395,11 +2424,11 @@ mlxcx_cmd_query_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_sq_in_t in; - mlxcx_cmd_query_sq_out_t out; + mlxcx_cmd_query_sq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_sq_out_t), KM_SLEEP); VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC); VERIFY(mlwq->mlwq_state & MLXCX_WQ_CREATED); @@ -2411,18 +2440,20 @@ mlxcx_cmd_query_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, in.mlxi_query_sq_sqn = to_be24(mlwq->mlwq_num); - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof 
(*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_sq_context, ctxp, + bcopy(&out->mlxo_query_sq_context, ctxp, sizeof (mlxcx_sq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2432,11 +2463,11 @@ mlxcx_cmd_query_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, { mlxcx_cmd_t cmd; mlxcx_cmd_query_cq_in_t in; - mlxcx_cmd_query_cq_out_t out; + mlxcx_cmd_query_cq_out_t *out; boolean_t ret; bzero(&in, sizeof (in)); - bzero(&out, sizeof (out)); + out = kmem_zalloc(sizeof (mlxcx_cmd_query_cq_out_t), KM_SLEEP); VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC); VERIFY(mlcq->mlcq_state & MLXCX_CQ_CREATED); @@ -2447,18 +2478,20 @@ mlxcx_cmd_query_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, in.mlxi_query_cq_cqn = to_be24(mlcq->mlcq_num); - if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, &in, sizeof (in), out, sizeof (*out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); ret = mlxcx_cmd_evaluate(mlxp, &cmd); if (ret) { - bcopy(&out.mlxo_query_cq_context, ctxp, + bcopy(&out->mlxo_query_cq_context, ctxp, sizeof (mlxcx_completionq_ctx_t)); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(out, sizeof (*out)); return (ret); } @@ -2501,7 +2534,7 @@ boolean_t mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_rq_in_t in; + mlxcx_cmd_create_rq_in_t *in; mlxcx_cmd_create_rq_out_t out; boolean_t ret; mlxcx_rq_ctx_t *ctx; @@ -2509,7 +2542,7 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_rq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); ASSERT(mutex_owned(&mlwq->mlwq_mtx)); @@ -2518,10 +2551,10 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) VERIFY0(mlwq->mlwq_state & MLXCX_WQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_rq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_rq_head, MLXCX_OP_CREATE_RQ, 0); - ctx = &in.mlxi_create_rq_context; + ctx = &in->mlxi_create_rq_context; set_bit32(&ctx->mlrqc_flags, MLXCX_RQ_FLAGS_RLKEY); set_bit32(&ctx->mlrqc_flags, MLXCX_RQ_FLAGS_FLUSH_IN_ERROR); @@ -2558,8 +2591,9 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) offsetof(mlxcx_workq_ctx_t, mlwqc_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -2570,6 +2604,7 @@ mlxcx_cmd_create_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mlwq->mlwq_num = from_be24(out.mlxo_create_rq_rqn); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -3378,7 +3413,7 @@ boolean_t mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { mlxcx_cmd_t cmd; - mlxcx_cmd_create_sq_in_t in; + mlxcx_cmd_create_sq_in_t *in; mlxcx_cmd_create_sq_out_t out; boolean_t ret; mlxcx_sq_ctx_t *ctx; @@ -3386,7 +3421,7 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) const ddi_dma_cookie_t *c; uint64_t pa, npages; - bzero(&in, sizeof (in)); + in = kmem_zalloc(sizeof (mlxcx_cmd_create_sq_in_t), KM_SLEEP); bzero(&out, sizeof (out)); 
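Aside: the same stack-to-heap pattern is applied to every large command struct in this patch, and every exit path must pair a kmem_free() with the kmem_zalloc(). In outline:

	in = kmem_zalloc(sizeof (*in), KM_SLEEP);
	if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) {
		mlxcx_cmd_fini(mlxp, &cmd);
		kmem_free(in, sizeof (*in));	/* error path */
		return (B_FALSE);
	}
	/* ... wait and evaluate ... */
	kmem_free(in, sizeof (*in));		/* success path */
	return (ret);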
ASSERT(mutex_owned(&mlwq->mlwq_mtx)); @@ -3395,10 +3430,10 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) VERIFY0(mlwq->mlwq_state & MLXCX_WQ_CREATED); mlxcx_cmd_init(mlxp, &cmd); - mlxcx_cmd_in_header_init(&cmd, &in.mlxi_create_sq_head, + mlxcx_cmd_in_header_init(&cmd, &in->mlxi_create_sq_head, MLXCX_OP_CREATE_SQ, 0); - ctx = &in.mlxi_create_sq_context; + ctx = &in->mlxi_create_sq_context; set_bit32(&ctx->mlsqc_flags, MLXCX_SQ_FLAGS_RLKEY); set_bit32(&ctx->mlsqc_flags, MLXCX_SQ_FLAGS_FLUSH_IN_ERROR); @@ -3441,8 +3476,9 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) offsetof(mlxcx_workq_ctx_t, mlwqc_pas) + sizeof (uint64_t) * npages; - if (!mlxcx_cmd_send(mlxp, &cmd, &in, insize, &out, sizeof (out))) { + if (!mlxcx_cmd_send(mlxp, &cmd, in, insize, &out, sizeof (out))) { mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (B_FALSE); } mlxcx_cmd_wait(&cmd); @@ -3453,6 +3489,7 @@ mlxcx_cmd_create_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mlwq->mlwq_num = from_be24(out.mlxo_create_sq_sqn); } mlxcx_cmd_fini(mlxp, &cmd); + kmem_free(in, sizeof (*in)); return (ret); } @@ -3725,7 +3762,7 @@ CTASSERT(sizeof (mlxcx_completionq_error_ent_t) == CTASSERT(sizeof (mlxcx_wqe_control_seg_t) == (1 << 4)); CTASSERT(offsetof(mlxcx_wqe_eth_seg_t, mles_inline_headers) == 0x0e); -CTASSERT(sizeof (mlxcx_wqe_eth_seg_t) == (1 << 5)); +CTASSERT(sizeof (mlxcx_wqe_eth_seg_t) == (1 << 4)); CTASSERT(sizeof (mlxcx_wqe_data_seg_t) == (1 << 4)); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index c01fc94a4eff..8f75141a8867 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -13,6 +13,7 @@ * Copyright (c) 2021, the University of Queensland * Copyright 2020 RackTop Systems, Inc. * Copyright 2023 MNX Cloud, Inc. 
+ * Copyright 2023 Oxide Computer Company */ /* @@ -29,6 +30,7 @@ #include #include +#include /* Need these for mac_vlan_header_info() */ #include @@ -200,6 +202,119 @@ mlxcx_link_fec_cap(link_fec_t fec, mlxcx_pplm_fec_caps_t *pfecp) return (B_TRUE); } +static mac_ether_media_t +mlxcx_mac_media(mlxcx_port_t *port) +{ + switch (port->mlp_oper_status) { + case MLXCX_PORT_STATUS_UP: + case MLXCX_PORT_STATUS_UP_ONCE: + break; + case MLXCX_PORT_STATUS_DOWN: + return (ETHER_MEDIA_NONE); + case MLXCX_PORT_STATUS_DISABLED: + return (ETHER_MEDIA_UNKNOWN); + } + + switch (port->mlp_oper_proto) { + case MLXCX_PROTO_SGMII: + return (ETHER_MEDIA_1000_SGMII); + case MLXCX_PROTO_1000BASE_KX: + return (ETHER_MEDIA_1000BASE_KX); + case MLXCX_PROTO_10GBASE_CX4: + return (ETHER_MEDIA_10GBASE_CX4); + case MLXCX_PROTO_10GBASE_KX4: + return (ETHER_MEDIA_10GBASE_KX4); + case MLXCX_PROTO_10GBASE_KR: + return (ETHER_MEDIA_10GBASE_KR); + case MLXCX_PROTO_40GBASE_CR4: + return (ETHER_MEDIA_40GBASE_CR4); + case MLXCX_PROTO_40GBASE_KR4: + return (ETHER_MEDIA_40GBASE_KR4); + case MLXCX_PROTO_SGMII_100BASE: + return (ETHER_MEDIA_100_SGMII); + case MLXCX_PROTO_10GBASE_CR: + return (ETHER_MEDIA_10GBASE_CR); + case MLXCX_PROTO_10GBASE_SR: + return (ETHER_MEDIA_10GBASE_SR); + case MLXCX_PROTO_10GBASE_ER_LR: + return (ETHER_MEDIA_10GBASE_LR); + case MLXCX_PROTO_40GBASE_SR4: + return (ETHER_MEDIA_40GBASE_SR4); + case MLXCX_PROTO_40GBASE_LR4_ER4: + return (ETHER_MEDIA_40GBASE_LR4); + case MLXCX_PROTO_50GBASE_SR2: + return (ETHER_MEDIA_50GBASE_SR2); + case MLXCX_PROTO_100GBASE_CR4: + return (ETHER_MEDIA_100GBASE_CR4); + case MLXCX_PROTO_100GBASE_SR4: + return (ETHER_MEDIA_100GBASE_SR4); + case MLXCX_PROTO_100GBASE_KR4: + return (ETHER_MEDIA_100GBASE_KR4); + case MLXCX_PROTO_25GBASE_CR: + return (ETHER_MEDIA_25GBASE_CR); + case MLXCX_PROTO_25GBASE_KR: + return (ETHER_MEDIA_25GBASE_KR); + case MLXCX_PROTO_25GBASE_SR: + return (ETHER_MEDIA_25GBASE_SR); + case MLXCX_PROTO_50GBASE_CR2: + return (ETHER_MEDIA_50GBASE_CR2); + case MLXCX_PROTO_50GBASE_KR2: + return (ETHER_MEDIA_50GBASE_KR2); + default: + /* FALLTHRU */ + break; + } + + switch (port->mlp_ext_oper_proto) { + case MLXCX_EXTPROTO_SGMII_100BASE: + return (ETHER_MEDIA_100_SGMII); + case MLXCX_EXTPROTO_1000BASE_X_SGMII: + return (ETHER_MEDIA_1000_SGMII); + case MLXCX_EXTPROTO_5GBASE_R: + return (ETHER_MEDIA_5000BASE_KR); /* XXX KEBE ASKS use _KR ? */ + case MLXCX_EXTPROTO_10GBASE_XFI_XAUI_1: + return (ETHER_MEDIA_10G_XAUI); + case MLXCX_EXTPROTO_40GBASE_XLAUI_4_XLPPI_4: + return (ETHER_MEDIA_40G_XLPPI); + case MLXCX_EXTPROTO_25GAUI_1_25GBASE_CR_KR: + return (ETHER_MEDIA_25G_AUI); + case MLXCX_EXTPROTO_50GAUI_2_LAUI_2_50GBASE_CR2_KR2: + case MLXCX_EXTPROTO_50GAUI_1_LAUI_1_50GBASE_CR_KR: + /* No type for 50G AUI as far as I can see. */ + return (ETHER_MEDIA_UNKNOWN); + case MLXCX_EXTPROTO_CAUI_4_100GBASE_CR4_KR4: + return (ETHER_MEDIA_100GBASE_CAUI4); + case MLXCX_EXTPROTO_100GAUI_2_100GBASE_CR2_KR2: + case MLXCX_EXTPROTO_100GAUI_1_100GBASE_CR_KR: + /* No type for 100G AUI as far as I can see. */ + return (ETHER_MEDIA_UNKNOWN); + /* + * NOTE: These report unsupported but keeping them in active code for + * detection purposes. 
+ */ + case MLXCX_EXTPROTO_200GAUI_4_200GBASE_CR4_KR4: + return (ETHER_MEDIA_200GAUI_4); + case MLXCX_EXTPROTO_200GAUI_2_200GBASE_CR2_KR2: + return (ETHER_MEDIA_200GAUI_2); + case MLXCX_EXTPROTO_400GAUI_8_400GBASE_CR8: + return (ETHER_MEDIA_400GAUI_8); + case MLXCX_EXTPROTO_400GAUI_4_400GBASE_CR4: + return (ETHER_MEDIA_400GAUI_4); + default: + /* + * There ARE legitimate single-bit values we don't support, + * and should just return 0 immediately. We will ASSERT() + * that it's a single-bit value, however. + */ + /* This check should work okay for 0 too. */ + ASSERT0((uint32_t)port->mlp_ext_oper_proto & + ((uint32_t)port->mlp_ext_oper_proto - 1U)); + break; + } + + return (ETHER_MEDIA_UNKNOWN); +} + static int mlxcx_mac_stat_rfc_2863(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat, uint64_t *val) @@ -340,6 +455,9 @@ mlxcx_mac_stat(void *arg, uint_t stat, uint64_t *val) case MAC_STAT_NORCVBUF: *val = port->mlp_stats.mlps_rx_drops; break; + case ETHER_STAT_XCVR_INUSE: + *val = (uint64_t)mlxcx_mac_media(port); + break; default: ret = ENOTSUP; } @@ -509,30 +627,71 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) mlxcx_t *mlxp = sq->mlwq_mlx; mlxcx_completion_queue_t *cq; mlxcx_buffer_t *b; - mac_header_info_t mhi; - mblk_t *kmp, *nmp; - uint8_t inline_hdrs[MLXCX_MAX_INLINE_HEADERLEN]; - size_t inline_hdrlen, rem, off; - uint32_t chkflags = 0; + mac_ether_offload_info_t meoi; + mblk_t *kmp; + size_t rem, off; boolean_t ok; size_t take = 0; uint_t bcount; + mlxcx_tx_ctx_t ctx; +#if defined(MLXCX_PERF_TIMERS) + hrtime_t times[MLXCX_BUF_TIMER_MAX]; + uint i; +#endif VERIFY(mp->b_next == NULL); - mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &chkflags); +#if defined(MLXCX_PERF_TIMERS) + bzero(times, sizeof (times)); + times[MLXCX_BUF_TIMER_PRE_RING_TX] = gethrtime(); +#endif + + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &ctx.mtc_chkflags); + mac_lso_get(mp, &ctx.mtc_mss, &ctx.mtc_lsoflags); - if (mac_vlan_header_info(mlxp->mlx_mac_hdl, mp, &mhi) != 0) { + if (mac_ether_offload_info(mp, &meoi) != 0 || + (meoi.meoi_flags & MEOI_L2INFO_SET) == 0) { /* * We got given a frame without a valid L2 header on it. We * can't really transmit that (mlx parts don't like it), so * we will just drop it on the floor. */ + mlxcx_warn(mlxp, "!tried to tx packet with no valid L2 header;" + " dropping it on the floor"); + freemsg(mp); + return (NULL); + } + +#if defined(MLXCX_PERF_TIMERS) + times[MLXCX_BUF_TIMER_POST_OFFLOAD_INFO] = gethrtime(); +#endif + + ctx.mtc_inline_hdrlen = meoi.meoi_l2hlen; + + /* + * If we're doing LSO, we need to find the end of the TCP header, and + * inline up to that point. 
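A worked example of the new inline length for a plain TCP/IPv4 frame: meoi_l2hlen = 14, meoi_l3hlen = 20, meoi_l4hlen = 20 (no TCP options), so LSO inlines 54 bytes, comfortably under the enlarged MLXCX_MAX_INLINE_HEADERLEN of 2 + 16 * 12 = 194 bytes defined later in this patch; even IPv6 with TCP timestamps (14 + 40 + 32 = 86) fits easily. Frames whose headers exceed 194 bytes hit the drop path below.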
+ */ + if (ctx.mtc_lsoflags & HW_LSO) { + if ((meoi.meoi_flags & MEOI_L3INFO_SET) == 0 || + (meoi.meoi_flags & MEOI_L4INFO_SET) == 0) { + mlxcx_warn(mlxp, "!tried to tx LSO packet with no " + "valid L3/L4 headers; dropping it on the floor"); + freemsg(mp); + return (NULL); + } + ctx.mtc_inline_hdrlen += meoi.meoi_l3hlen + meoi.meoi_l4hlen; + } + + if (ctx.mtc_inline_hdrlen > MLXCX_MAX_INLINE_HEADERLEN) { + mlxcx_warn(mlxp, "!tried to tx LSO packet with headers that " + "are too long (%u bytes, max is %u); dropping it on the " + "floor", ctx.mtc_inline_hdrlen, MLXCX_MAX_INLINE_HEADERLEN); freemsg(mp); return (NULL); } - inline_hdrlen = rem = mhi.mhi_hdrsize; + rem = ctx.mtc_inline_hdrlen; kmp = mp; off = 0; @@ -543,7 +702,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) take = sz; if (take > rem) take = rem; - bcopy(kmp->b_rptr, inline_hdrs + off, take); + bcopy(kmp->b_rptr, ctx.mtc_inline_hdrs + off, take); rem -= take; off += take; if (take == sz) { @@ -552,16 +711,37 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) } } - bcount = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b); + MLXCX_PTIMER(times, MLXCX_BUF_TIMER_POST_INLINE_BCOPY); + + bcount = mlxcx_buf_bind_or_copy(mlxp, sq, mp, kmp, take, &b); if (bcount == 0) { atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); return (mp); } + MLXCX_PTIMER(times, MLXCX_BUF_TIMER_POST_BUF_BIND_COPY); + +#if defined(MLXCX_PERF_TIMERS) + /* Copy our temporary timers over to the buffer_t */ + for (i = 0; i <= MLXCX_BUF_TIMER_POST_BUF_BIND_COPY; ++i) + b->mlb_t[i] = times[i]; +#endif + + if (!mlxcx_buf_prepare_sqe(mlxp, sq, b, &ctx)) { + mlxcx_warn(mlxp, "!tried to tx packet that couldn't fit in " + "an SQE, dropping"); + freemsg(mp); + return (NULL); + } + + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_POST_PREPARE_SQE); + mutex_enter(&sq->mlwq_mtx); VERIFY3U(sq->mlwq_inline_mode, <=, MLXCX_ETH_INLINE_L2); cq = sq->mlwq_cq; + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_POST_WQ_MTX); + /* * state is a single int, so read-only access without the CQ lock * should be fine. @@ -595,24 +775,15 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) goto blocked; } - ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen, - chkflags, b); + ok = mlxcx_sq_add_buffer(mlxp, sq, b); if (!ok) { atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC); atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); goto blocked; } - /* - * Now that we've successfully enqueued the rest of the packet, - * free any mblks that we cut off while inlining headers. 
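Structurally, the tx path now builds the complete SQE (mlxcx_buf_prepare_sqe()) before taking mlwq_mtx, so the critical section shrinks to copying the prebuilt WQEBBs into the ring and ringing the doorbell. The shape of the pattern, with hypothetical helper names:

	if (!prepare_descriptors(buf, &ctx))	/* expensive; no lock held */
		goto drop;
	mutex_enter(&sq->mlwq_mtx);
	ok = publish_to_ring(sq, buf);		/* short: bcopy + doorbell */
	mutex_exit(&sq->mlwq_mtx);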
- */ - for (; mp != kmp; mp = nmp) { - nmp = mp->b_cont; - freeb(mp); - } - mutex_exit(&sq->mlwq_mtx); + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_POST_SQ_ADD_BUF); return (NULL); @@ -1126,6 +1297,7 @@ mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data) mac_capab_rings_t *cap_rings; mac_capab_led_t *cap_leds; mac_capab_transceiver_t *cap_txr; + mac_capab_lso_t *cap_lso; uint_t i, n = 0; switch (cap) { @@ -1158,10 +1330,10 @@ mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; case MAC_CAPAB_HCKSUM: - if (mlxp->mlx_caps->mlc_checksum) { - *(uint32_t *)cap_data = HCKSUM_INET_FULL_V4 | - HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM; - } + if (!mlxp->mlx_caps->mlc_checksum) + return (B_FALSE); + *(uint32_t *)cap_data = HCKSUM_INET_FULL_V4 | + HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM; break; case MAC_CAPAB_LED: @@ -1182,6 +1354,24 @@ mlxcx_mac_getcapab(void *arg, mac_capab_t cap, void *cap_data) cap_txr->mct_read = mlxcx_mac_txr_read; break; + case MAC_CAPAB_LSO: + cap_lso = cap_data; + + if (!mlxp->mlx_caps->mlc_lso) + return (B_FALSE); + + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | + LSO_TX_BASIC_TCP_IPV6; + /* + * Cap LSO sends at 64k due to limitations in the TCP stack + * (full length needs to fit in an IP header apparently) + */ + cap_lso->lso_basic_tcp_ipv4.lso_max = + MIN(mlxp->mlx_caps->mlc_max_lso_size, UINT16_MAX); + cap_lso->lso_basic_tcp_ipv6.lso_max = + MIN(mlxp->mlx_caps->mlc_max_lso_size, UINT16_MAX); + break; + default: return (B_FALSE); } @@ -1453,6 +1643,9 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, *(link_state_t *)pr_val = LINK_STATE_UNKNOWN; } break; + case MAC_PROP_MEDIA: + *(mac_ether_media_t *)pr_val = mlxcx_mac_media(port); + break; case MAC_PROP_AUTONEG: if (pr_valsize < sizeof (uint8_t)) { ret = EOVERFLOW; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index e2f51411719d..8739f628e419 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -874,6 +874,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, uint_t rx_frames = 0; uint_t comp_cnt = 0; int64_t wqebbs, bufcnt; + mlxcx_buf_return_batch_t rbatch; *mpp = NULL; @@ -886,6 +887,8 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, nmp = cmp = mp = NULL; + mlxcx_buf_return_batch_init(&rbatch); + wqebbs = 0; bufcnt = 0; for (cent = mlxcx_cq_next(mlcq); cent != NULL; @@ -939,6 +942,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, list_move_tail(&mlcq->mlcq_buffers, &mlcq->mlcq_buffers_b); added = B_TRUE; + ++mlcq->mlcq_bufbgen; } mutex_exit(&mlcq->mlcq_bufbmtx); if (added) @@ -977,9 +981,11 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, list_remove(&mlcq->mlcq_buffers, buf); bufcnt++; + MLXCX_PTIMER(buf->mlb_t, MLXCX_BUF_TIMER_PRE_TX_COMP); + switch (mlcq->mlcq_wq->mlwq_type) { case MLXCX_WQ_TYPE_SENDQ: - mlxcx_tx_completion(mlxp, mlcq, cent, buf); + mlxcx_tx_completion(mlxp, mlcq, cent, buf, &rbatch); break; case MLXCX_WQ_TYPE_RECVQ: nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); @@ -1006,6 +1012,7 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, * high->low water mark. 
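mlxcx_process_cq() now parks completed tx buffers in a mlxcx_buf_return_batch_t rather than returning each one (and taking its shard lock) individually; the batch keeps per-shard lists, so a flush should cost roughly one mutex acquisition per shard instead of one per buffer. Intended lifecycle, using the API added in mlxcx.h:

	mlxcx_buf_return_batch_t rbatch;

	mlxcx_buf_return_batch_init(&rbatch);
	/* for each tx completion: */
	mlxcx_tx_completion(mlxp, mlcq, cent, buf, &rbatch);
	/* before updating the consumer index, and again on exit: */
	mlxcx_buf_return_batch_flush(mlxp, &rbatch);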
*/ if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) { + mlxcx_buf_return_batch_flush(mlxp, &rbatch); mlxcx_update_cqci(mlxp, mlcq); /* * Both these variables are incremented using @@ -1024,6 +1031,8 @@ mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, break; } + mlxcx_buf_return_batch_flush(mlxp, &rbatch); + if (comp_cnt > 0) { mlxcx_update_cqci(mlxp, mlcq); atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 2265bd054d07..d38bd6deb77e 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -71,8 +71,8 @@ #define MLXCX_UAR_EQ_NOARM 0x0048 /* Number of blue flame reg pairs per UAR */ -#define MLXCX_BF_PER_UAR 2 -#define MLXCX_BF_PER_UAR_MASK 0x1 +#define MLXCX_BF_PER_UAR 4 +#define MLXCX_BF_PER_UAR_MASK (MLXCX_BF_PER_UAR - 1) #define MLXCX_BF_SIZE 0x100 #define MLXCX_BF_BASE 0x0800 @@ -404,6 +404,8 @@ typedef enum { #define MLXCX_WQE_OCTOWORD 16 #define MLXCX_SQE_MAX_DS ((1 << 6) - 1) + +#define MLXCX_SQE_BUF 16 /* * Calculate the max number of address pointers in a single ethernet * send message. This is the remainder from MLXCX_SQE_MAX_DS @@ -456,16 +458,16 @@ typedef enum { /* CSTYLED */ #define MLXCX_SQE_ETH_INLINE_HDR_SZ (bitdef_t){0, 0x03ff} #define MLXCX_SQE_ETH_SZFLAG_VLAN (1 << 15) -#define MLXCX_MAX_INLINE_HEADERLEN 64 +#define MLXCX_MAX_INLINE_HEADERLEN (2 + MLXCX_WQE_OCTOWORD * 12) typedef struct { uint8_t mles_rsvd[4]; bits8_t mles_csflags; uint8_t mles_rsvd2[1]; - uint16_t mles_mss; + uint16be_t mles_mss; uint8_t mles_rsvd3[4]; bits16_t mles_szflags; - uint8_t mles_inline_headers[18]; + uint8_t mles_inline_headers[2]; } mlxcx_wqe_eth_seg_t; typedef struct { @@ -479,7 +481,7 @@ typedef struct { typedef struct { mlxcx_wqe_control_seg_t mlsqe_control; mlxcx_wqe_eth_seg_t mlsqe_eth; - mlxcx_wqe_data_seg_t mlsqe_data[1]; + mlxcx_wqe_data_seg_t mlsqe_data[2]; } mlxcx_sendq_ent_t; typedef struct { @@ -640,7 +642,7 @@ typedef enum { .bit_shift = 25, \ .bit_mask = 0x06000000 } -#define MLXCX_WORKQ_CTX_MAX_ADDRESSES 128 +#define MLXCX_WORKQ_CTX_MAX_ADDRESSES 1024 typedef struct mlxcx_workq_ctx { bits32_t mlwqc_flags; @@ -1588,7 +1590,7 @@ typedef struct { /* * This is an artificial limit that we're imposing on our actions. */ -#define MLXCX_CREATE_QUEUE_MAX_PAGES 128 +#define MLXCX_CREATE_QUEUE_MAX_PAGES 1024 typedef struct { mlxcx_cmd_in_t mlxi_create_eq_head; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 7711c501288e..2c939e797287 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -188,6 +188,9 @@ mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { mlxcx_completion_queue_t *mlcq; + if (!(mlwq->mlwq_state & MLXCX_WQ_INIT)) + return; + /* * If something is holding the lock on a long operation like a * refill, setting this flag asks them to exit early if possible. 
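The new MLXCX_WQ_INIT flag makes teardown idempotent and safe on a queue whose setup never completed, which the error paths in mlxcx_tx_group_setup() below can now reach. The guard in outline:

	if (!(mlwq->mlwq_state & MLXCX_WQ_INIT))
		return;				/* nothing was set up */
	/* ... normal teardown ... */
	mlwq->mlwq_state &= ~MLXCX_WQ_INIT;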
@@ -242,6 +245,7 @@ mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mutex_exit(&mlcq->mlcq_mtx); mutex_destroy(&mlwq->mlwq_mtx); + mlwq->mlwq_state &= ~MLXCX_WQ_INIT; } void @@ -400,6 +404,7 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq, DDI_INTR_PRI(mlxp->mlx_intr_pri)); list_insert_tail(&mlxp->mlx_wqs, wq); + wq->mlwq_state |= MLXCX_WQ_INIT; mutex_enter(&wq->mlwq_mtx); @@ -444,6 +449,7 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, DDI_INTR_PRI(mlxp->mlx_intr_pri)); list_insert_tail(&mlxp->mlx_wqs, wq); + wq->mlwq_state |= MLXCX_WQ_INIT; mutex_enter(&wq->mlwq_mtx); @@ -667,6 +673,8 @@ mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) if (g->mlg_state & MLXCX_GROUP_WQS) { for (i = 0; i < g->mlg_nwqs; ++i) { wq = &g->mlg_wqs[i]; + if (!(wq->mlwq_state & MLXCX_WQ_INIT)) + continue; mutex_enter(&wq->mlwq_mtx); cq = wq->mlwq_cq; if (wq->mlwq_state & MLXCX_WQ_STARTED && @@ -685,12 +693,16 @@ mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) g->mlg_state &= ~MLXCX_GROUP_WQS; } - if ((g->mlg_state & MLXCX_GROUP_TIRTIS) && - g->mlg_tis.mltis_state & MLXCX_TIS_CREATED && - !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) { - if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) { - mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring", - g->mlg_tis.mltis_num); + if ((g->mlg_state & MLXCX_GROUP_TIRTIS)) { + for (i = 0; i < MLXCX_TIS_PER_GROUP; ++i) { + if (!(g->mlg_tis[i].mltis_state & MLXCX_TIS_CREATED)) + continue; + if (g->mlg_tis[i].mltis_state & MLXCX_TIS_DESTROYED) + continue; + if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis[i])) { + mlxcx_warn(mlxp, "failed to destroy tis %u for " + "tx ring", g->mlg_tis[i].mltis_num); + } } } g->mlg_state &= ~MLXCX_GROUP_TIRTIS; @@ -1324,6 +1336,7 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) mlxcx_completion_queue_t *cq; mlxcx_work_queue_t *sq; uint_t i; + mlxcx_tis_t *tis; ASSERT3S(g->mlg_state, ==, 0); @@ -1341,11 +1354,13 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP); g->mlg_state |= MLXCX_GROUP_WQS; - g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom; + for (i = 0; i < MLXCX_TIS_PER_GROUP; ++i) { + g->mlg_tis[i].mltis_tdom = &mlxp->mlx_tdom; - if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) { - mutex_exit(&g->mlg_mtx); - return (B_FALSE); + if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis[i])) { + mutex_exit(&g->mlg_mtx); + return (B_FALSE); + } } g->mlg_state |= MLXCX_GROUP_TIRTIS; @@ -1364,13 +1379,16 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } if (!mlxcx_cq_setup(mlxp, eq, &cq, - mlxp->mlx_props.mldp_cq_size_shift)) + mlxp->mlx_props.mldp_cq_size_shift)) { + mutex_exit(&g->mlg_mtx); return (B_FALSE); + } cq->mlcq_stats = &g->mlg_port->mlp_stats; sq = &g->mlg_wqs[i]; - if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) { + tis = &g->mlg_tis[i % MLXCX_TIS_PER_GROUP]; + if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, tis, sq)) { mutex_exit(&g->mlg_mtx); return (B_FALSE); } @@ -1453,6 +1471,12 @@ mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first) ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ); ASSERT(mutex_owned(&mlwq->mlwq_mtx)); + /* + * Make sure all prior stores are flushed out before we update the + * counter: hardware can immediately start executing after this write + * (the doorbell below just makes sure it's awake) + */ + membar_producer(); mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc); ASSERT(mlwq->mlwq_cq != NULL); @@ -1538,66 
+1562,22 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) boolean_t mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, - uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags, mlxcx_buffer_t *b0) { - uint_t index, first, ents; + uint_t index, first, ents, j; mlxcx_completion_queue_t *cq; mlxcx_sendq_ent_t *ent0; mlxcx_sendq_extra_ent_t *ent; - mlxcx_wqe_data_seg_t *seg; - uint_t ptri, nptr; - const ddi_dma_cookie_t *c; - size_t rem; uint64_t wqebb_used; - mlxcx_buffer_t *b; ddi_fm_error_t err; boolean_t rv; + uint64_t bufbgen; ASSERT(mutex_owned(&mlwq->mlwq_mtx)); ASSERT3P(b0->mlb_tx_head, ==, b0); ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ); cq = mlwq->mlwq_cq; - index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); - ent0 = &mlwq->mlwq_send_ent[index]; - b0->mlb_wqe_index = mlwq->mlwq_pc; - ents = 1; - - first = index; - - bzero(ent0, sizeof (mlxcx_sendq_ent_t)); - ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND; - ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); - ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index); - - set_bits8(&ent0->mlsqe_control.mlcs_flags, - MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS); - set_bits8(&ent0->mlsqe_control.mlcs_flags, - MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); - - VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers)); - set_bits16(&ent0->mlsqe_eth.mles_szflags, - MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen); - if (inlinelen > 0) { - bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers, - inlinelen); - } - - ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) / - MLXCX_WQE_OCTOWORD; - - if (chkflags & HCK_IPV4_HDRCKSUM) { - ASSERT(mlxp->mlx_caps->mlc_checksum); - set_bit8(&ent0->mlsqe_eth.mles_csflags, - MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM); - } - if (chkflags & HCK_FULLCKSUM) { - ASSERT(mlxp->mlx_caps->mlc_checksum); - set_bit8(&ent0->mlsqe_eth.mles_csflags, - MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); - } - /* * mlwq_wqebb_used is only incremented whilst holding * the mlwq_mtx mutex, but it is decremented (atomically) in @@ -1608,65 +1588,66 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, */ wqebb_used = mlwq->mlwq_wqebb_used; - b = b0; - ptri = 0; - nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); - seg = ent0->mlsqe_data; - while (b != NULL) { - rem = b->mlb_used; + if ((b0->mlb_wqebbs + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); - c = NULL; - while (rem > 0 && - (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { - if (ptri >= nptr) { - if ((ents + wqebb_used) >= mlwq->mlwq_nents) - return (B_FALSE); + index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); + first = index; + ents = 0; - index = (mlwq->mlwq_pc + ents) & - (mlwq->mlwq_nents - 1); - ent = &mlwq->mlwq_send_extra_ent[index]; - ++ents; + if (b0->mlb_sqe == NULL || b0->mlb_wqebbs == 0) + return (B_FALSE); - seg = ent->mlsqe_data; - ptri = 0; - nptr = sizeof (ent->mlsqe_data) / - sizeof (mlxcx_wqe_data_seg_t); - } + /* + * Don't let a multi-WQEBB send request wrap around the ring -- if + * it looks like we need to do that, pad with NOPs to the end. 
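A worked example of the no-wrap rule, assuming mlwq_nents == 1024: if the producer counter sits at ring index 1022 and the prepared message needs 4 WQEBBs, then 1022 + 4 > 1024, so indexes 1022 and 1023 are filled with single-WQEBB NOPs and the real message starts at index 0. The device is never asked to parse a multi-WQEBB entry that straddles the end of the ring.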
+ */ + if (index + b0->mlb_wqebbs > mlwq->mlwq_nents) { + while (index != 0) { + if ((ents + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); - seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); - if (c->dmac_size > rem) { - seg->mlds_byte_count = to_be32(rem); - rem = 0; - } else { - seg->mlds_byte_count = to_be32(c->dmac_size); - rem -= c->dmac_size; - } - seg->mlds_address = to_be64(c->dmac_laddress); - ++seg; - ++ptri; - ++ent0->mlsqe_control.mlcs_ds; + ent0 = &mlwq->mlwq_send_ent[index]; - ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, - MLXCX_SQE_MAX_DS); - } + bzero(ent0, sizeof (mlxcx_sendq_ent_t)); + ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP; + ent0->mlsqe_control.mlcs_qp_or_sq = + to_be24(mlwq->mlwq_num); + ent0->mlsqe_control.mlcs_wqe_index = + to_be16(mlwq->mlwq_pc + ents); - if (b == b0) { - b = list_head(&b0->mlb_tx_chain); - } else { - b = list_next(&b0->mlb_tx_chain, b); + set_bits8(&ent0->mlsqe_control.mlcs_flags, + MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE); + set_bits8(&ent0->mlsqe_control.mlcs_flags, + MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); + + ent0->mlsqe_control.mlcs_ds = 1; + + ++ents; + index = (mlwq->mlwq_pc + ents) & (mlwq->mlwq_nents - 1); } } - b0->mlb_wqebbs = ents; - mlwq->mlwq_pc += ents; - atomic_add_64(&mlwq->mlwq_wqebb_used, ents); + ent0 = &mlwq->mlwq_send_ent[index]; + b0->mlb_wqe_index = mlwq->mlwq_pc + ents; + ++ents; - for (; ptri < nptr; ++ptri, ++seg) { - seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); - seg->mlds_byte_count = to_be32(0); - seg->mlds_address = to_be64(0); + bcopy(&b0->mlb_sqe[0], ent0, sizeof (*ent0)); + ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index); + + for (j = 1; j < b0->mlb_wqebbs; ++j) { + if ((ents + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); + index = (mlwq->mlwq_pc + ents) & + (mlwq->mlwq_nents - 1); + ++ents; + ent = &mlwq->mlwq_send_extra_ent[index]; + bcopy(&b0->mlb_esqe[j], ent, sizeof (*ent)); } + mlwq->mlwq_pc += ents; + atomic_add_64(&mlwq->mlwq_wqebb_used, ents); + /* * Make sure the workqueue entry is flushed out before updating * the doorbell. @@ -1693,22 +1674,35 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, return (B_FALSE); } + MLXCX_PTIMER(b0->mlb_t, MLXCX_BUF_TIMER_POST_SQE_IN_RING); + /* - * Hold the bufmtx whilst ringing the doorbell, to prevent - * the buffer from being moved to another list, so we can - * safely remove it should the ring fail. + * Stash the bufbgen counter, which is incremented every time + * buffers_b is merged into buffers. This lets us easily tell which + * list we need to take the buffer back from if we fail in + * sq_ring_dbell (which will only happen if everything is going pretty + * badly). 
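The generation counter resolves an awkward interaction: the failure path needs to pull the buffer back, but by the time the doorbell has failed, mlxcx_process_cq() may already have merged mlcq_buffers_b into mlcq_buffers (incrementing mlcq_bufbgen under mlcq_bufbmtx, as added earlier). Snapshotting the counter at insert time says which list to search, without holding the mutex across the doorbell. Generic shape (locking around the removals elided):

	mutex_enter(&bmtx);
	gen = gen_counter;			/* snapshot at insert */
	list_insert_tail(&side_list, obj);
	mutex_exit(&bmtx);

	if (!publish(obj)) {			/* rare failure */
		if (gen == gen_counter)		/* never merged */
			list_remove(&side_list, obj);
		else				/* merged meanwhile */
			list_remove(&main_list, obj);
	}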
*/ mutex_enter(&cq->mlcq_bufbmtx); - + bufbgen = cq->mlcq_bufbgen; list_insert_tail(&cq->mlcq_buffers_b, b0); + mutex_exit(&cq->mlcq_bufbmtx); + if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) { atomic_inc_64(&cq->mlcq_bufcnt); } else { - list_remove(&cq->mlcq_buffers_b, b0); + mutex_enter(&cq->mlcq_bufbmtx); + if (bufbgen == cq->mlcq_bufbgen) { + list_remove(&cq->mlcq_buffers_b, b0); + mutex_exit(&cq->mlcq_bufbmtx); + } else { + mutex_exit(&cq->mlcq_bufbmtx); + mutex_enter(&cq->mlcq_mtx); + list_remove(&cq->mlcq_buffers, b0); + mutex_exit(&cq->mlcq_mtx); + } } - mutex_exit(&cq->mlcq_bufbmtx); - return (rv); } @@ -1991,16 +1985,22 @@ mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); } +static void mlxcx_buf_return_batch_push(mlxcx_t *mlxp, + mlxcx_buf_return_batch_t *mbrb, mlxcx_buffer_t *b); +static void mlxcx_buf_return_batch_push_chain(mlxcx_t *mlxp, + mlxcx_buf_return_batch_t *mbrb, mlxcx_buffer_t *b0, boolean_t keepmp); + void mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, - mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) + mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf, + mlxcx_buf_return_batch_t *mbrb) { ASSERT(mutex_owned(&mlcq->mlcq_mtx)); if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) { mlxcx_completionq_error_ent_t *eent = (mlxcx_completionq_error_ent_t *)ent; mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); mlxcx_check_sq(mlxp, mlcq->mlcq_wq); mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); @@ -2009,24 +2009,25 @@ mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) { mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); return; } - if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) { + if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND && + ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_LSO) { mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x", ent->mlcqe_send_wqe_opcode); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); return; } if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); return; } - mlxcx_buf_return_chain(mlxp, buf, B_FALSE); + mlxcx_buf_return_batch_push_chain(mlxp, mbrb, buf, B_FALSE); } mblk_t * @@ -2200,6 +2201,11 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, b->mlb_foreign = B_TRUE; mlxcx_dma_buf_attr(mlxp, &attr); + /* + * Foreign bufs are used on the sendq and can have more pointers than + * standard bufs (which can be used on sq or rq). + */ + attr.dma_attr_sgllen = MLXCX_SQE_MAX_PTRS; ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE); if (!ret) { @@ -2207,6 +2213,11 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, return (B_FALSE); } + /* All foreign bufs get an SQE buf automatically. 
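+	 * This way mlxcx_buf_prepare_sqe() should not have to allocate
+	 * SQE space at transmit time for bound mblks.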
+	 */
+	b->mlb_sqe_count = MLXCX_SQE_BUF;
+	b->mlb_sqe_size = b->mlb_sqe_count * sizeof (mlxcx_sendq_ent_t);
+	b->mlb_sqe = kmem_zalloc(b->mlb_sqe_size, KM_SLEEP);
+
 	*bp = b;
 
 	return (B_TRUE);
@@ -2249,7 +2260,8 @@ mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
 	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
 	bcopy(rptr, b->mlb_dma.mxdb_va, sz);
-	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
+	(void) ddi_dma_sync(b->mlb_dma.mxdb_dma_handle, 0, sz,
+	    DDI_DMA_SYNC_FORDEV);
 
 	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
 	    DDI_FME_VERSION);
@@ -2275,6 +2287,11 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
 	size_t sz;
 	boolean_t ret;
 
+#if defined(MLXCX_PERF_TIMERS)
+	hrtime_t t0, t1;
+	t0 = gethrtime();
+#endif
+
 	rptr = mp->b_rptr;
 	sz = MBLKL(mp);
 
@@ -2289,27 +2306,207 @@ mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
 	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
 		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+#if defined(MLXCX_PERF_TIMERS)
+		/* mlxcx_copy_data() can fail and return NULL. */
+		if (b != NULL) {
+			t1 = gethrtime();
+			b->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL] += t1 - t0;
+		}
+#endif
 	} else {
 		b = mlxcx_buf_take_foreign(mlxp, wq);
 		if (b == NULL)
 			return (NULL);
+#if defined(MLXCX_PERF_TIMERS)
+		t1 = gethrtime();
+		b->mlb_t[MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL] += t1 - t0;
+		t0 = t1;
+#endif
 
-		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
-		    B_FALSE);
+		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_TRUE);
+
+#if defined(MLXCX_PERF_TIMERS)
+		t1 = gethrtime();
+		b->mlb_t[MLXCX_BUF_TIMER_BIND_MBLK_TOTAL] += t1 - t0;
+		t0 = t1;
+#endif
 
 		if (!ret) {
 			mlxcx_buf_return(mlxp, b);
 			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+
+#if defined(MLXCX_PERF_TIMERS)
+			if (b != NULL) {
+				t1 = gethrtime();
+				b->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL] +=
+				    t1 - t0;
+			}
+#endif
 		}
 	}
 
 	return (b);
 }
 
+boolean_t
+mlxcx_buf_prepare_sqe(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
+    mlxcx_buffer_t *b0, const mlxcx_tx_ctx_t *ctx)
+{
+	mlxcx_sendq_ent_t *ent0;
+	mlxcx_sendq_extra_ent_t *ent;
+	mlxcx_wqe_data_seg_t *seg;
+	uint_t ents, ptri, nptr;
+	const ddi_dma_cookie_t *c;
+	size_t rem, take, off;
+	mlxcx_buffer_t *b;
+
+	ASSERT3P(b0->mlb_tx_head, ==, b0);
+	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
+
+	if (b0->mlb_sqe == NULL) {
+		b0->mlb_sqe_count = MLXCX_SQE_BUF;
+		b0->mlb_sqe_size = b0->mlb_sqe_count *
+		    sizeof (mlxcx_sendq_ent_t);
+		b0->mlb_sqe = kmem_zalloc(b0->mlb_sqe_size, KM_SLEEP);
+	}
+
+	MLXCX_PTIMER(b0->mlb_t, MLXCX_BUF_TIMER_POST_SQE_BUF);
+
+	ents = 1;
+	ent0 = &b0->mlb_sqe[0];
+
+	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
+	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
+	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
+	/* mlcs_wqe_index set by mlxcx_sq_add_buffer */
+
+	set_bits8(&ent0->mlsqe_control.mlcs_flags,
+	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
+	set_bits8(&ent0->mlsqe_control.mlcs_flags,
+	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);
+
+	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
+	    MLXCX_WQE_OCTOWORD;
+	ptri = 0;
+	seg = ent0->mlsqe_data;
+	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
+
+	VERIFY3U(ctx->mtc_inline_hdrlen, <=, MLXCX_MAX_INLINE_HEADERLEN);
+	set_bits16(&ent0->mlsqe_eth.mles_szflags,
+	    MLXCX_SQE_ETH_INLINE_HDR_SZ, ctx->mtc_inline_hdrlen);
+	if (ctx->mtc_inline_hdrlen > 0) {
+		/*
+		 * The unconditional subtraction below needs at least a
+		 * full mles_inline_headers worth of header, so >= (not >)
+		 * is the precondition here.
+		 */
+		ASSERT3U(ctx->mtc_inline_hdrlen, >=,
+		    sizeof (ent0->mlsqe_eth.mles_inline_headers));
+		rem = ctx->mtc_inline_hdrlen;
+		off = 0;
+
+		off += sizeof (ent0->mlsqe_eth.mles_inline_headers);
+		rem -= sizeof (ent0->mlsqe_eth.mles_inline_headers);
+
+		while (rem > 0) {
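+			/*
+			 * These data segment slots are only reserved here
+			 * (we advance seg/ptri and bump mlcs_ds without
+			 * writing the segments themselves); the spilled
+			 * header bytes are bcopy()ed over this space in
+			 * one go below.
+			 */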
if (ptri >= nptr) { + if (ents >= b0->mlb_sqe_count) + return (B_FALSE); + + ent = &b0->mlb_esqe[ents]; + ++ents; + + seg = ent->mlsqe_data; + ptri = 0; + nptr = sizeof (ent->mlsqe_data) / + sizeof (mlxcx_wqe_data_seg_t); + } + take = sizeof (mlxcx_wqe_data_seg_t); + if (take > rem) + take = rem; + off += take; + rem -= take; + + ++seg; + ++ptri; + ++ent0->mlsqe_control.mlcs_ds; + + ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, + MLXCX_SQE_MAX_DS); + } + + bcopy(ctx->mtc_inline_hdrs, + ent0->mlsqe_eth.mles_inline_headers, + ctx->mtc_inline_hdrlen); + } + + if (ctx->mtc_chkflags & HCK_IPV4_HDRCKSUM) { + ASSERT(mlxp->mlx_caps->mlc_checksum); + set_bit8(&ent0->mlsqe_eth.mles_csflags, + MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM); + } + if (ctx->mtc_chkflags & HCK_FULLCKSUM) { + ASSERT(mlxp->mlx_caps->mlc_checksum); + set_bit8(&ent0->mlsqe_eth.mles_csflags, + MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); + } + if (ctx->mtc_lsoflags & HW_LSO) { + ASSERT(mlxp->mlx_caps->mlc_lso); + ASSERT3U(ctx->mtc_inline_hdrlen, >, 0); + ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_LSO; + ent0->mlsqe_eth.mles_mss = to_be16(ctx->mtc_mss); + } + + MLXCX_PTIMER(b0->mlb_t, MLXCX_BUF_TIMER_POST_PREPARE_SQE_INLINE); + + b = b0; + while (b != NULL) { + rem = b->mlb_used; + + c = NULL; + while (rem > 0 && + (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { + if (ptri >= nptr) { + if (ents >= b0->mlb_sqe_count) + return (B_FALSE); + + ent = &b0->mlb_esqe[ents]; + ++ents; + + seg = ent->mlsqe_data; + ptri = 0; + nptr = sizeof (ent->mlsqe_data) / + sizeof (mlxcx_wqe_data_seg_t); + } + + seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); + if (c->dmac_size > rem) { + seg->mlds_byte_count = to_be32(rem); + rem = 0; + } else { + seg->mlds_byte_count = to_be32(c->dmac_size); + rem -= c->dmac_size; + } + seg->mlds_address = to_be64(c->dmac_laddress); + ++seg; + ++ptri; + ++ent0->mlsqe_control.mlcs_ds; + + ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, + MLXCX_SQE_MAX_DS); + } + + if (b == b0) { + b = list_head(&b0->mlb_tx_chain); + } else { + b = list_next(&b0->mlb_tx_chain, b); + } + } + + for (; ptri < nptr; ++ptri, ++seg) { + seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); + seg->mlds_byte_count = to_be32(0); + seg->mlds_address = to_be64(0); + } + + b0->mlb_wqebbs = ents; + + return (B_TRUE); +} + uint_t mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, - mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) + mblk_t *mp0, mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) { mlxcx_buffer_t *b, *b0 = NULL; boolean_t first = B_TRUE; @@ -2332,12 +2529,21 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, if (!first) b->mlb_state = MLXCX_BUFFER_ON_CHAIN; - b->mlb_tx_mp = mp; + b->mlb_tx_mp = first ? 
mp0 : mp; b->mlb_tx_head = b0; b->mlb_used = MBLKL(mp) - offset; - if (!first) + if (!first) { list_insert_tail(&b0->mlb_tx_chain, b); +#if defined(MLXCX_PERF_TIMERS) + b0->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL] += + b->mlb_t[MLXCX_BUF_TIMER_COPY_TOTAL]; + b0->mlb_t[MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL] += + b->mlb_t[MLXCX_BUF_TIMER_TAKE_FOREIGN_TOTAL]; + b0->mlb_t[MLXCX_BUF_TIMER_BIND_MBLK_TOTAL] += + b->mlb_t[MLXCX_BUF_TIMER_BIND_MBLK_TOTAL]; +#endif + } first = B_FALSE; offset = 0; @@ -2365,7 +2571,7 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, freemsg(mp); return (0); } - freemsg(mpb); + freemsg(mp0); b0->mlb_tx_mp = mp; b0->mlb_tx_head = b0; @@ -2490,6 +2696,29 @@ mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp) mlxcx_buf_return(mlxp, b0); } +static void +mlxcx_buf_return_batch_push_chain(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, + mlxcx_buffer_t *b0, boolean_t keepmp) +{ + mlxcx_buffer_t *b; + + if (b0->mlb_tx_head != b0) { + mlxcx_buf_return_batch_push(mlxp, mbrb, b0); + return; + } + + b = list_head(&b0->mlb_tx_chain); + while (b != NULL) { + mlxcx_buf_return_batch_push(mlxp, mbrb, b); + b = list_next(&b0->mlb_tx_chain, b); + } + if (keepmp) { + b0->mlb_tx_mp = NULL; + b0->mlb_tx_head = NULL; + } + mlxcx_buf_return_batch_push(mlxp, mbrb, b0); +} + inline void mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr) { @@ -2498,6 +2727,211 @@ mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr) s->mlbs_hiwat2 = 3 * (s->mlbs_ntotal / 4); } +static void mlxcx_buf_return_batch_flush_shard(mlxcx_t *, + mlxcx_buf_return_batch_t *, uint); + +static void +mlxcx_buf_return_batch_push(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb, + mlxcx_buffer_t *b) +{ + uint i, found = 0; + uint min_n, min_n_i; + mlxcx_buf_shard_t *s = b->mlb_shard; + + VERIFY(!list_link_active(&b->mlb_cq_entry)); + + /* + * Are we already spooling up buffers for this shard? If so, add it + * to that existing list + */ + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { + if (mbrb->mbrb_shard[i] == s) { + found = 1; + break; + } + } + if (!found) { + /* Do we have any unused shard slots? If so, use that. */ + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { + if (mbrb->mbrb_shard[i] == NULL) { + mbrb->mbrb_shard[i] = s; + found = 1; + break; + } + } + } + if (!found) { + /* Otherwise evict the least popular shard. 
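+		 * That is, the slot with the fewest queued buffers, so the
+		 * busiest shards keep the benefit of batching.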
+		 */
+		min_n = mbrb->mbrb_n[0];
+		min_n_i = 0;
+		for (i = 0; i < MLXCX_BRB_SHARDS; ++i) {
+			if (mbrb->mbrb_n[i] < min_n) {
+				min_n = mbrb->mbrb_n[i];
+				min_n_i = i;
+			}
+		}
+		mlxcx_buf_return_batch_flush_shard(mlxp, mbrb, min_n_i);
+		/*
+		 * The flush clears the slot's shard pointer, so claim it
+		 * for this buffer's shard before queueing onto it.
+		 */
+		mbrb->mbrb_shard[min_n_i] = s;
+		i = min_n_i;
+		found = 1;
+	}
+	ASSERT(found);
+
+	++mbrb->mbrb_n[i];
+	list_insert_tail(&mbrb->mbrb_list[i], b);
+}
+
+void
+mlxcx_buf_return_batch_init(mlxcx_buf_return_batch_t *mbrb)
+{
+	uint i;
+	list_create(&mbrb->mbrb_mblks, sizeof (mlxcx_buf_return_mblk_t),
+	    offsetof(mlxcx_buf_return_mblk_t, mbrm_entry));
+	mbrb->mbrb_inline_mblks = 0;
+	for (i = 0; i < MLXCX_BRB_INLINE_MBLKS; ++i)
+		mbrb->mbrb_inline_mblk[i] = NULL;
+	for (i = 0; i < MLXCX_BRB_SHARDS; ++i) {
+		mbrb->mbrb_shard[i] = NULL;
+		mbrb->mbrb_n[i] = 0;
+		list_create(&mbrb->mbrb_list[i], sizeof (mlxcx_buffer_t),
+		    offsetof(mlxcx_buffer_t, mlb_cq_entry));
+	}
+}
+
+static void
+mlxcx_buf_return_step1(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb,
+    mlxcx_buffer_t *b)
+{
+	mlxcx_buffer_t *txhead = b->mlb_tx_head;
+	mlxcx_buf_return_mblk_t *mbrm;
+	mblk_t *mp = b->mlb_tx_mp;
+
+	VERIFY3U(b->mlb_state, !=, MLXCX_BUFFER_FREE);
+	ASSERT3P(b->mlb_mlx, ==, mlxp);
+
+	b->mlb_wqe_index = 0;
+	b->mlb_tx_mp = NULL;
+	b->mlb_used = 0;
+	b->mlb_wqebbs = 0;
+	if (txhead == b) {
+		if (mbrb->mbrb_inline_mblks >= MLXCX_BRB_INLINE_MBLKS) {
+			mbrm = kmem_cache_alloc(mlxp->mlx_mbrm_cache, KM_SLEEP);
+			mbrm->mbrm_mp = mp;
+			list_insert_tail(&mbrb->mbrb_mblks, mbrm);
+		} else {
+			mbrb->mbrb_inline_mblk[mbrb->mbrb_inline_mblks++] = mp;
+		}
+	}
+	/*
+	 * A chain head may still have members linked on its mlb_tx_chain
+	 * here: those are only unlinked in step 2. Only non-head buffers
+	 * are guaranteed an empty chain of their own at this point.
+	 */
+	if (txhead != NULL && txhead != b)
+		ASSERT(list_is_empty(&b->mlb_tx_chain));
+
+	if (b->mlb_foreign) {
+		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
+			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
+		}
+	}
+}
+
+static void
+mlxcx_buf_return_step2(mlxcx_t *mlxp, mlxcx_buffer_t *b)
+{
+	mlxcx_buffer_state_t oldstate = b->mlb_state;
+	mlxcx_buffer_t *txhead = b->mlb_tx_head;
+	mlxcx_buf_shard_t *s = b->mlb_shard;
+
+	ASSERT(mutex_owned(&s->mlbs_mtx));
+
+	b->mlb_state = MLXCX_BUFFER_FREE;
+	b->mlb_tx_head = NULL;
+
+	switch (oldstate) {
+	case MLXCX_BUFFER_INIT:
+		mlxcx_bufshard_adjust_total(s, 1);
+		break;
+	case MLXCX_BUFFER_ON_WQ:
+		list_remove(&s->mlbs_busy, b);
+		break;
+	case MLXCX_BUFFER_ON_LOAN:
+		ASSERT(!b->mlb_foreign);
+		--s->mlbs_nloaned;
+		list_remove(&s->mlbs_loaned, b);
+		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
+			/*
+			 * When we're draining, e.g. during mac_stop(),
+			 * we destroy the buffer immediately rather than
+			 * recycling it. Otherwise we risk leaving it
+			 * on the free list and leaking it.
+			 */
+			list_insert_tail(&s->mlbs_free, b);
+			mlxcx_buf_destroy(mlxp, b);
+			/*
+			 * Teardown might be waiting for loaned list to empty.
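+			 * Wake any such waiter even though this buffer
+			 * went straight to destruction rather than onto
+			 * the free list.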
+ */ + cv_broadcast(&s->mlbs_free_nonempty); + return; + } + break; + case MLXCX_BUFFER_FREE: + VERIFY(0); + break; + case MLXCX_BUFFER_ON_CHAIN: + ASSERT(txhead != NULL); + list_remove(&txhead->mlb_tx_chain, b); + list_remove(&s->mlbs_busy, b); + break; + } + +#if defined(MLXCX_PERF_TIMERS) + bzero(b->mlb_t, sizeof (b->mlb_t)); +#endif + + list_insert_tail(&s->mlbs_free, b); + cv_broadcast(&s->mlbs_free_nonempty); +} + +static void +mlxcx_buf_return_batch_flush_shard(mlxcx_t *mlxp, + mlxcx_buf_return_batch_t *mbrb, uint i) +{ + mlxcx_buffer_t *b; + mlxcx_buf_return_mblk_t *mbrm; + uint j; + + b = list_head(&mbrb->mbrb_list[i]); + while (b != NULL) { + mlxcx_buf_return_step1(mlxp, mbrb, b); + b = list_next(&mbrb->mbrb_list[i], b); + } + mutex_enter(&mbrb->mbrb_shard[i]->mlbs_mtx); + while ((b = list_remove_head(&mbrb->mbrb_list[i]))) { + MLXCX_PTIMER(b->mlb_t, MLXCX_BUF_TIMER_PRE_STEP2); + mlxcx_buf_return_step2(mlxp, b); + } + mutex_exit(&mbrb->mbrb_shard[i]->mlbs_mtx); + for (j = 0; j < mbrb->mbrb_inline_mblks; ++j) { + freemsg(mbrb->mbrb_inline_mblk[j]); + mbrb->mbrb_inline_mblk[j] = NULL; + } + mbrb->mbrb_inline_mblks = 0; + while ((mbrm = list_remove_head(&mbrb->mbrb_mblks))) { + freemsg(mbrm->mbrm_mp); + mbrm->mbrm_mp = NULL; + kmem_cache_free(mlxp->mlx_mbrm_cache, mbrm); + } + + mbrb->mbrb_shard[i] = NULL; + mbrb->mbrb_n[i] = 0; +} + +void +mlxcx_buf_return_batch_flush(mlxcx_t *mlxp, mlxcx_buf_return_batch_t *mbrb) +{ + uint i; + for (i = 0; i < MLXCX_BRB_SHARDS; ++i) { + if (mbrb->mbrb_shard[i] == NULL) + continue; + mlxcx_buf_return_batch_flush_shard(mlxp, mbrb, i); + } +} + void mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) { @@ -2596,6 +3030,13 @@ mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b) mlxcx_bufshard_adjust_total(s, -1); } + if (b->mlb_sqe != NULL) { + kmem_free(b->mlb_sqe, b->mlb_sqe_size); + b->mlb_sqe = NULL; + b->mlb_sqe_size = 0; + b->mlb_sqe_count = 0; + } + /* * This is going back to the kmem cache, so it needs to be set up in * the same way we expect a new buffer to come out (state INIT, other