From f0e104de3819e3894d66d556e6d297a9ebff6bd4 Mon Sep 17 00:00:00 2001 From: Jerry Mannil <65309407+jerrymannil@users.noreply.github.com> Date: Tue, 15 Jul 2025 20:10:43 -0700 Subject: [PATCH] [ROCm] cpp_extension allow user to override default flags (#152432) (#2374) cherry-pick of https://github.com/pytorch/pytorch/commit/e4adf5df39d9c472c7dcbac18efde29241e238f0 We need -fgpu-rdc for projects such as DeepEP + rocSHMEM. The default of -fno-gpu-rdc doesn't work for such cases. As per https://github.com/pytorch/pytorch/pull/152432#issuecomment-2840899088: "rocshmem shares the same global variable in different files, as DeepEP uses CUDAExtension to build the project https://github.com/deepseek-ai/DeepEP/blob/65e2a700f0330f3fb1c26f49a0250d1f9d0ac1e3/setup.py#L51 and depends on rocshmem, this -fgpu-rdc is needed. The current logic in PyTorch prevents users from overriding this flag." Pull Request resolved: https://github.com/pytorch/pytorch/pull/152432 Approved by: https://github.com/jeffdaily Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Co-authored-by: Jeff Daily --- torch/utils/cpp_extension.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index ebe109bdf335d..6a261bd294ec1 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -2110,11 +2110,18 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: # If cflags is given, there may already be user-provided arch flags in it - # (from `extra_compile_args`) + # (from `extra_compile_args`). If user also specified -fgpu-rdc or -fno-gpu-rdc, we + # assume they know what they're doing. Otherwise, we force -fno-gpu-rdc default. 
+ has_gpu_rdc_flag = False if cflags is not None: + has_custom_flags = False for flag in cflags: if 'amdgpu-target' in flag or 'offload-arch' in flag: - return ['-fno-gpu-rdc'] + has_custom_flags = True + elif 'gpu-rdc' in flag: + has_gpu_rdc_flag = True + if has_custom_flags: + return [] if has_gpu_rdc_flag else ['-fno-gpu-rdc'] # Use same defaults as used for building PyTorch # Allow env var to override, just like during initial cmake build. _archs = os.environ.get('PYTORCH_ROCM_ARCH', None) @@ -2127,7 +2134,7 @@ def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: else: archs = _archs.replace(' ', ';').split(';') flags = [f'--offload-arch={arch}' for arch in archs] - flags += ['-fno-gpu-rdc'] + flags += [] if has_gpu_rdc_flag else ['-fno-gpu-rdc'] return flags def _get_build_directory(name: str, verbose: bool) -> str: @@ -2312,8 +2319,8 @@ def _write_ninja_file_to_build_library(path, if with_cuda and IS_HIP_EXTENSION: cuda_flags = ['-DWITH_HIP'] + cflags + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS - cuda_flags += extra_cuda_cflags cuda_flags += _get_rocm_arch_flags(cuda_flags) + cuda_flags += extra_cuda_cflags elif with_cuda: cuda_flags = common_cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags() if IS_WINDOWS: