From 82ab422285e1e8a617e0d3c4488bbcde4f84dc5f Mon Sep 17 00:00:00 2001 From: AmdSampsa Date: Wed, 17 Sep 2025 14:34:27 +0000 Subject: [PATCH 1/2] a few more configs for 2d grids --- torch/_inductor/runtime/triton_heuristics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 5c48a8ebfc295..951977af602e5 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2521,6 +2521,8 @@ def pointwise( triton_config_with_settings(size_hints, 64, 64), # ~8% better for fp16 triton_config_with_settings(size_hints, 256, 16), triton_config_with_settings(size_hints, 16, 256), + triton_config_with_settings(size_hints, 128, 16), # wrt: +10% for some kernels + triton_config_with_settings(size_hints, 32, 512), # wrt: +30% for some kernels triton_config_with_settings(size_hints, bs, 1), triton_config_with_settings(size_hints, 1, bs), *hinted_configs, From f97c7a91f36af6dd249e63809c44d21543ae7719 Mon Sep 17 00:00:00 2001 From: AmdSampsa Date: Fri, 19 Sep 2025 11:33:11 +0000 Subject: [PATCH 2/2] even more configs --- torch/_inductor/runtime/triton_heuristics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 951977af602e5..4cb9660ebe84c 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -2505,6 +2505,9 @@ def pointwise( triton_config_with_settings( size_hints, TRITON_MAX_BLOCK["X"], waves_per_eu=2 ), + triton_config_with_settings( + size_hints, 4096 # wrt: better than the max_block for some kernel + ), *hinted_configs, ] if len(size_hints) == 2: @@ -2518,10 +2521,12 @@ def pointwise( else: configs = [ triton_config_with_settings(size_hints, 32, 32), + triton_config_with_settings(size_hints, 64, 32), # wrt: better for some kernels triton_config_with_settings(size_hints, 64, 64), # ~8% better for fp16 triton_config_with_settings(size_hints, 256, 16), triton_config_with_settings(size_hints, 16, 256), triton_config_with_settings(size_hints, 128, 16), # wrt: +10% for some kernels + triton_config_with_settings(size_hints, 128, 32), # wrt: ..additional 10% more triton_config_with_settings(size_hints, 32, 512), # wrt: +30% for some kernels triton_config_with_settings(size_hints, bs, 1), triton_config_with_settings(size_hints, 1, bs),