diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 97665ca53..71c420f04 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2707,7 +2707,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884 + image: ghcr.io/semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 1356ecbac..754846912 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # DeepSeek-V4-Pro single-node TRTLLM recipe for B300. The configured image -# already contains NVIDIA/TensorRT-LLM@feat/deepseek_v4; do not build TRTLLM at +# already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at # runtime from this benchmark path. source "$(dirname "$0")/../benchmark_lib.sh" @@ -101,10 +101,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" fi -# DeepSeek-V4-Pro has hidden size 7168. The current TRTLLM fused-HC MHC -# path corrupts eval generations for this shape; keep eval servers on the -# unfused path until the fused kernel is guarded or supports 7168. 
-export TRTLLM_MHC_ENABLE_FUSED_HC=0 +export TRTLLM_MHC_ENABLE_FUSED_HC="${TRTLLM_MHC_ENABLE_FUSED_HC:-1}" echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC" start_gpu_monitor --output "$PWD/gpu_metrics.csv" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4098a580a..5d02725da 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2148,3 +2148,10 @@ - "Disable TRTLLM fused MHC hyper-connection for eval servers via TRTLLM_MHC_ENABLE_FUSED_HC=0 because the current fused kernel corrupts DeepSeek-V4-Pro hidden size 7168 generations" - "Keep this as eval-only PR validation until the TensorRT-LLM fused MHC kernel is guarded or supports hidden size 7168" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 + +- config-keys: + - dsv4-fp4-b300-trt + description: + - "Update the TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:fix-mhc7168-eb20e9e" + - "Enable TRTLLM fused MHC by default now that the image includes the hidden-size 7168 fused-HC fix" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1270