From b6ec433e9443fd57b8294fdf6c3dfc32b056d6dc Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Mon, 6 Jan 2025 14:07:00 -0600
Subject: [PATCH] Respect ROCR_VISIBLE_DEVICES on AMD GPU device discovery (#144026)

Respect ROCR_VISIBLE_DEVICES on AMD GPU device discovery (#142292)

Reland of #140320 after failing test on trunk. Fixes potential environment clobbering in test, makes ROCr+HIP devices (if specified together) more robust to index errors.

Fixes #140318

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142292
Approved by: https://github.com/jataylo, https://github.com/huydhn, https://github.com/jeffdaily

Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com>
Co-authored-by: Jeff Daily
(cherry picked from commit c0d710634fcce172490c3ace0de977829b38bc06)

Co-authored-by: Tal Ben-Nun
(cherry picked from commit 23e390c711670095a8a4a64cc01b0c86eb3af04e)
---
 test/test_cuda.py      |  2 ++
 torch/cuda/__init__.py | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index 33dfd95dab11e..62e5f85bb64bc 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3279,6 +3279,8 @@ def test_hip_device_count(self):
             {"CUDA_VISIBLE_DEVICES": "0", "HIP_VISIBLE_DEVICES": None},
             {"CUDA_VISIBLE_DEVICES": None, "HIP_VISIBLE_DEVICES": "0"},
             {"CUDA_VISIBLE_DEVICES": "0,1,2,3", "HIP_VISIBLE_DEVICES": "0"},
+            {"ROCR_VISIBLE_DEVICES": "1,2,3", "HIP_VISIBLE_DEVICES": "0"},
+            {"ROCR_VISIBLE_DEVICES": "0", "HIP_VISIBLE_DEVICES": None},
         ]
 
         for env_config in custom_envs:
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 5d7d77dcd4134..5d72bdf15e391 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -646,7 +646,25 @@ def _parse_visible_devices() -> Union[List[int], List[str]]:
 
     if torch.version.hip:
         hip_devices = os.getenv("HIP_VISIBLE_DEVICES")
-        if hip_devices is not None:
+        rocr_devices = os.getenv("ROCR_VISIBLE_DEVICES")
+
+        # You must take care if both HIP and ROCR env vars are set as they have
+        # different meanings. Both env vars accept either a list of ints or a
+        # list of UUIDs. The ROCR env var is processed first which then reduces
+        # the number of GPUs that HIP can select from.
+        if rocr_devices is not None:
+            rocr_count = len(rocr_devices.split(","))
+            if hip_devices is not None:
+                # sanity check if both env vars are set
+                if len(hip_devices.split(",")) > rocr_count:
+                    raise RuntimeError(
+                        "HIP_VISIBLE_DEVICES contains more devices than ROCR_VISIBLE_DEVICES"
+                    )
+                # HIP_VISIBLE_DEVICES is preferred over ROCR_VISIBLE_DEVICES
+                var = hip_devices
+            else:
+                return list(range(rocr_count))
+        elif hip_devices is not None:
             var = hip_devices
 
     if var is None: