From e0f617f4a5731acd108a12b519b81963e322ea0f Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Mon, 11 Feb 2019 17:18:13 -0600
Subject: [PATCH 1/5] Reworked changes for RCCL integration on top of latest
 PyTorch master

---
 CMakeLists.txt                            |  8 +-
 cmake/Dependencies.cmake                  |  6 ++
 cmake/Summary.cmake                       |  1 +
 cmake/public/LoadHIP.cmake                | 10 +++
 setup.py                                  | 11 +++
 tools/amd_build/build_amd.py              |  4 +-
 tools/amd_build/disabled_features.json    |  6 ++
 tools/amd_build/pyHIPIFY/constants.py     |  5 +-
 .../pyHIPIFY/cuda_to_hip_mappings.py      | 39 ++++++++++
 tools/amd_build/pyHIPIFY/hipify_python.py | 19 ++++-
 tools/amd_build/pyHIPIFY/rccl1_compat.h   | 53 +++++++++++++
 tools/build_pytorch_libs.py               |  6 ++
 tools/setup_helpers/dist_check.py         |  2 +-
 tools/setup_helpers/rccl.py               | 75 +++++++++++++++++++
 torch/CMakeLists.txt                      | 10 ++-
 torch/lib/c10d/CMakeLists.txt             | 39 +++++++++-
 torch/lib/c10d/ProcessGroupNCCL.cpp       | 16 ++++
 17 files changed, 300 insertions(+), 10 deletions(-)
 create mode 100644 tools/amd_build/pyHIPIFY/rccl1_compat.h
 create mode 100644 tools/setup_helpers/rccl.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ef8bc19e272..25dc3b3b06f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,7 +118,12 @@ option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
 option(USE_LMDB "Use LMDB" OFF)
 option(USE_METAL "Use Metal for iOS build" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
-option(USE_NCCL "Use NCCL" ON)
+cmake_dependent_option(
+    USE_NCCL "Use NCCL" ON
+    "USE_CUDA" OFF)
+cmake_dependent_option(
+    USE_RCCL "Use RCCL" ON
+    "USE_ROCM" OFF)
 option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
 option(USE_NNAPI "Use NNAPI" OFF)
 option(USE_NNPACK "Use NNPACK" ON)
@@ -170,6 +175,7 @@ if (BUILD_ATEN_ONLY)
   set(USE_GFLAGS OFF)
   set(USE_GLOG OFF)
   set(USE_NCCL OFF)
+  set(USE_RCCL OFF)
   set(USE_NNPACK OFF)
   set(USE_NUMPY OFF)
   set(USE_OPENCV OFF)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 7ab15fb5cf0f..532a714d9d54 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -810,12 +810,18 @@ if(USE_ROCM)
   set(Caffe2_HIP_INCLUDE
     ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $<INSTALL_INTERFACE:include> ${Caffe2_HIP_INCLUDE})
+  if(USE_RCCL)
+    list(APPEND Caffe2_HIP_INCLUDE ${rccl_INCLUDE_DIRS})
+  endif(USE_RCCL)

   # This is needed for library added by hip_add_library (same for hip_add_executable)
   hip_include_directories(${Caffe2_HIP_INCLUDE})

   set(Caffe2_HIP_DEPENDENCY_LIBS
     ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${hipsparse_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
+  if(USE_RCCL)
+    list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${PYTORCH_RCCL_LIBRARIES})
+  endif(USE_RCCL)

   # Note [rocblas & rocfft cmake bug]
   # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 0793526d64b3..283f994f513d 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -103,6 +103,7 @@ function (caffe2_print_configuration_summary)
   if(${USE_NCCL})
     message(STATUS "    USE_SYSTEM_NCCL   : ${USE_SYSTEM_NCCL}")
   endif()
+  message(STATUS "  USE_RCCL            : ${USE_RCCL}")
   message(STATUS "  USE_NNPACK          : ${USE_NNPACK}")
   message(STATUS "  USE_NUMPY           : ${USE_NUMPY}")
   message(STATUS "  USE_OBSERVERS       : ${USE_OBSERVERS}")
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 6e9fb6e81917..548241f1af70 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -100,6 +100,13 @@ ELSE()
   SET(HCC_AMDGPU_TARGET $ENV{HCC_AMDGPU_TARGET})
 ENDIF()

+# RCCL PATH
+IF(NOT DEFINED ENV{RCCL_PATH})
+  SET(RCCL_PATH ${ROCM_PATH}/rccl)
+ELSE()
+  SET(RCCL_PATH $ENV{RCCL_PATH})
+ENDIF()
+
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})

@@ -140,6 +147,7 @@ IF(HIP_FOUND)
   set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft)
   set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
   set(rocsparse_DIR ${ROCSPARSE_PATH}/lib/cmake/rocsparse)
+  set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl)

   find_package_and_print_version(rocrand REQUIRED)
   find_package_and_print_version(hiprand REQUIRED)
@@ -149,6 +157,7 @@ IF(HIP_FOUND)
   find_package_and_print_version(rocfft REQUIRED)
   #find_package_and_print_version(hipsparse REQUIRED)
   find_package_and_print_version(rocsparse REQUIRED)
+  find_package_and_print_version(rccl)

   # TODO: hip_hcc has an interface include flag "-hc" which is only
   # recognizable by hcc, but not gcc and clang. Right now in our
@@ -158,6 +167,7 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  FIND_LIBRARY(PYTORCH_RCCL_LIBRARIES ${rccl_LIBRARIES} HINTS ${RCCL_PATH}/lib)
   FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
   FIND_LIBRARY(rocsparse_LIBRARIES rocsparse HINTS ${ROCSPARSE_PATH}/lib)
   FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
diff --git a/setup.py b/setup.py
index c02a3811a456..4a3ed68d619d 100644
--- a/setup.py
+++ b/setup.py
@@ -134,6 +134,11 @@
 #   NCCL_INCLUDE_DIR
 #     specify where nccl is installed
 #
+#   RCCL_ROOT_DIR
+#   RCCL_LIB_DIR
+#   RCCL_INCLUDE_DIR
+#     specify where rccl is installed
+#
 #   NVTOOLSEXT_PATH (Windows only)
 #     specify where nvtoolsext is installed
 #
@@ -169,6 +174,7 @@
 from tools.setup_helpers.rocm import USE_ROCM
 from tools.setup_helpers.miopen import USE_MIOPEN, MIOPEN_LIBRARY, MIOPEN_INCLUDE_DIR
 from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_SYSTEM_LIB, NCCL_INCLUDE_DIR
+from tools.setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB
 from tools.setup_helpers.dist_check import USE_DISTRIBUTED

 ################################################################################
 # Parameters parsed from environment
@@ -343,6 +349,11 @@ def run(self):
                 report('-- Building NCCL library')
             else:
                 report('-- Not using NCCL')
+            if USE_RCCL:
+                print('-- Detected RCCL library at ' +
+                      RCCL_SYSTEM_LIB + ', ' + RCCL_INCLUDE_DIR)
+            else:
+                print('-- Not using RCCL')
             if USE_DISTRIBUTED:
                 report('-- Building with THD distributed package ')
                 if IS_LINUX:
diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py
index 5e152a0be21e..ab4401f538a8 100644
--- a/tools/amd_build/build_amd.py
+++ b/tools/amd_build/build_amd.py
@@ -80,12 +80,14 @@
     "csrc/autograd/profiler.h",
     "csrc/autograd/profiler.cpp",
     "csrc/cuda/cuda_check.h",
+    "torch/lib/c10d/ProcessGroupGloo.hpp",
+    "torch/lib/c10d/ProcessGroupGloo.cpp",
     # These files are compatible with both cuda and hip
     "csrc/autograd/engine.cpp"
 ]
 for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
     for filename in files:
-        if filename.endswith(".cpp") or filename.endswith(".h"):
+        if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"):
             source = os.path.join(root, filename)
             # Disabled files
             if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
diff --git a/tools/amd_build/disabled_features.json b/tools/amd_build/disabled_features.json
index c7228b7ad144..726e93fb8400 100644
--- a/tools/amd_build/disabled_features.json
+++ b/tools/amd_build/disabled_features.json
@@ -1,5 +1,11 @@
 {
     "disable_unsupported_hip_calls": [
+        {
+            "path": "torch/lib/c10d/NCCLUtils.hpp",
+            "s_constants": {
+                "<nccl.h>": "\"c10d/rccl1_compat.h\""
+            }
+        }
     ],
     "disabled_modules": [
     ],
diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py
index 6198031a9c8f..829b2ccc78b5 100644
--- a/tools/amd_build/pyHIPIFY/constants.py
+++ b/tools/amd_build/pyHIPIFY/constants.py
@@ -49,8 +49,9 @@
 API_BLAS = 39
 API_SPARSE = 40
 API_RAND = 41
-API_LAST = 42
-API_FFT = 43
+API_FFT = 42
+API_RCCL = 43
+API_LAST = 44

 HIP_UNSUPPORTED = 43
 API_PYTORCH = 1337
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 926a7976f48a..12eca85d8122 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -238,6 +238,10 @@
     ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)),
     ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)),
     ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)),
+    ("ncclResult_t", ("rcclResult_t", CONV_TYPE, API_RCCL)),
+    ("ncclComm_t", ("rcclComm_t", CONV_TYPE, API_RCCL)),
+    ("ncclDataType_t", ("rcclDataType_t", CONV_TYPE, API_RCCL)),
+    ("ncclRedOp_t", ("rcclRedOp_t", CONV_TYPE, API_RCCL)),
 ])

 CUDA_INCLUDE_MAP = collections.OrderedDict([
@@ -276,6 +280,7 @@
     ("cusparse.h", ("hipsparse.h", CONV_INCLUDE, API_RAND)),
     ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
     ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
+    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate
 ])

 CUDA_IDENTIFIER_MAP = collections.OrderedDict([
@@ -2172,6 +2177,40 @@
     ("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)),
     ("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)),
     ("cufftGetProperty", ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED)),
+    ("ncclGetErrorString", ("rcclGetErrorString", CONV_ERROR, API_RCCL)),
+    ("ncclCommInitAll", ("rcclCommInitAll", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclCommInitRank", ("rcclCommInitRank", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclCommDestroy", ("rcclCommDestroy", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclBcast", ("rcclBcast", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclReduce", ("rcclReduce", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclAllReduce", ("rcclAllReduce", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclAllGather", ("rcclAllGather", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclReduceScatter", ("rcclReduceScatter", CONV_SPECIAL_FUNC, API_RCCL, HIP_UNSUPPORTED)),
+    ("ncclSuccess", ("rcclSuccess", CONV_TYPE, API_RCCL)),
+    ("ncclChar", ("rcclChar", CONV_TYPE, API_RCCL)),
+    ("ncclInt8", ("rcclChar", CONV_TYPE, API_RCCL)),
+    ("ncclUint8", ("rcclChar", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int8 type
+    ("ncclInt", ("rcclInt", CONV_TYPE, API_RCCL)),
+    ("ncclInt32", ("rcclInt", CONV_TYPE, API_RCCL)),
+    ("ncclUint32", ("rcclInt", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int32 type
+    ("ncclInt64", ("rcclInt64", CONV_TYPE, API_RCCL)),
+    ("ncclUint64", ("rcclUint64", CONV_TYPE, API_RCCL)),
+    ("ncclHalf", ("rcclHalf", CONV_TYPE, API_RCCL)),
+    ("ncclFloat16", ("rcclHalf", CONV_TYPE, API_RCCL)),
+    ("ncclFloat", ("rcclFloat", CONV_TYPE, API_RCCL)),
("rcclFloat", CONV_TYPE, API_RCCL)), + ("ncclDouble", ("rcclDouble", CONV_TYPE, API_RCCL)), + ("ncclFloat64", ("rcclDouble", CONV_TYPE, API_RCCL)), + ("ncclSum", ("rcclSum", CONV_TYPE, API_RCCL)), + ("ncclProd", ("rcclProd", CONV_TYPE, API_RCCL)), + ("ncclMin", ("rcclMin", CONV_TYPE, API_RCCL)), + ("ncclMax", ("rcclMax", CONV_TYPE, API_RCCL)), + ("ncclUniqueId", ("rcclUniqueId", CONV_TYPE, API_RCCL)), + ("ncclGetUniqueId", ("rcclGetUniqueId", CONV_TYPE, API_RCCL)), + ("ncclGroupStart", ("rcclGroupStart", CONV_TYPE, API_RCCL)), + ("ncclGroupEnd", ("rcclGroupEnd", CONV_TYPE, API_RCCL)), + ("NCCL_UNIQUE_ID_BYTES", ("RCCL_UNIQUE_ID_BYTES", CONV_TYPE, API_RCCL)), + ("USE_NCCL", ("USE_RCCL", CONV_DEF, API_RCCL)), ]) CUDA_SPARSE_MAP = collections.OrderedDict([ diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/tools/amd_build/pyHIPIFY/hipify_python.py index db14d7cee824..b4a852032d3f 100755 --- a/tools/amd_build/pyHIPIFY/hipify_python.py +++ b/tools/amd_build/pyHIPIFY/hipify_python.py @@ -871,7 +871,8 @@ def pattern(self): CAFFE2_TRIE.add(src) CAFFE2_MAP[src] = dst RE_CAFFE2_PREPROCESSOR = re.compile(CAFFE2_TRIE.pattern()) -RE_PYTORCH_PREPROCESSOR = re.compile(r'\b{0}\b'.format(PYTORCH_TRIE.pattern())) +# Use \W instead of \b so that even if the pattern contains non-word characters, the replacement still succeeds +RE_PYTORCH_PREPROCESSOR = re.compile(r'(\W)({0})(\W)'.format(PYTORCH_TRIE.pattern())) RE_QUOTE_HEADER = re.compile(r'#include "([^"]+)"') RE_ANGLE_HEADER = re.compile(r'#include <([^>]+)>') @@ -892,7 +893,7 @@ def preprocessor(output_directory, filepath, stats): # unsupported_calls statistics reporting is broken atm if is_pytorch_file(filepath): def pt_repl(m): - return PYTORCH_MAP[m.group(0)] + return m.group(1) + PYTORCH_MAP[m.group(2)] + m.group(3) output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) else: def c2_repl(m): @@ -1423,3 +1424,17 @@ def hipify( output_directory, get_hip_file_path(filepath)), KernelTemplateParams) + + # copy rccl compat file to c10d + rccl_compat_file = "rccl1_compat.h" + rccl_compat_src_filepath = os.path.join(os.path.dirname(__file__), rccl_compat_file) + if not os.path.exists(rccl_compat_src_filepath): + print("ERROR: File does not exist: " + rccl_compat_src_filepath) + sys.exit(1) + rccl_compat_dst_dir = os.path.join(output_directory, "torch", "lib", "c10d") + if not os.path.exists(rccl_compat_dst_dir): + print("ERROR: Directory does not exist: " + rccl_compat_dst_dir) + sys.exit(1) + rccl_compat_dst_filepath = os.path.join(rccl_compat_dst_dir, rccl_compat_file) + shutil.copy(rccl_compat_src_filepath, rccl_compat_dst_filepath) + diff --git a/tools/amd_build/pyHIPIFY/rccl1_compat.h b/tools/amd_build/pyHIPIFY/rccl1_compat.h new file mode 100644 index 000000000000..b3da757c6fd8 --- /dev/null +++ b/tools/amd_build/pyHIPIFY/rccl1_compat.h @@ -0,0 +1,53 @@ +/************************************************************************* +* Copyright (c) 2017, AMD. All rights reserved. 
+/*************************************************************************
+* Copyright (c) 2017, AMD. All rights reserved.
+*
+************************************************************************/
+
+#ifndef RCCL1_COMPAT_H
+#define RCCL1_COMPAT_H
+
+#include <rccl.h>
+
+#ifndef RCCL_MAJOR // RCCL 1.x
+#define RCCL_MAJOR 1
+#define RCCL_MINOR 0
+
+#define rcclNumOps rccl_NUM_OPS
+#define rcclNumTypes rccl_NUM_TYPES
+
+static rcclResult_t rcclGroupStart() { return rcclSuccess; }
+static rcclResult_t rcclGroupEnd() { return rcclSuccess; }
+
+#define CHECKCOUNT(count) if (count > INT_MAX) return rcclInvalidArgument;
+
+/*
+static rcclResult_t rcclReduce(const void* sendbuff, void* recvbuff, size_t count, rcclDataType_t datatype,
+    rcclRedOp_t op, int root, rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(count);
+  return rcclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
+}
+static rcclResult_t rcclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(count);
+  return rcclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
+}
+static rcclResult_t rcclBcast(void* buff, size_t count, rcclDataType_t datatype, int root,
+    rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(count);
+  return rcclBcast(buff, (int)count, datatype, root, comm, stream);
+}
+static rcclResult_t rcclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm,
+    hipStream_t stream) {
+  CHECKCOUNT(recvcount);
+  return rcclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
+}
+*/
+static rcclResult_t rcclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    rcclDataType_t datatype, rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(sendcount);
+  return rcclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
+}
+#endif
+
+#endif
diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py
index fea64e79aa32..09048ab7cf48 100644
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@@ -13,6 +13,7 @@
 from .setup_helpers.cuda import USE_CUDA, CUDA_HOME
 from .setup_helpers.dist_check import USE_DISTRIBUTED, USE_GLOO_IBVERBS
 from .setup_helpers.nccl import USE_SYSTEM_NCCL, NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
+from .setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB
 from .setup_helpers.rocm import ROCM_HOME, ROCM_VERSION, USE_ROCM
 from .setup_helpers.nnpack import USE_NNPACK
 from .setup_helpers.qnnpack import USE_QNNPACK
@@ -171,6 +172,11 @@ def run_cmake(version,
                   NCCL_INCLUDE_DIR=NCCL_INCLUDE_DIR,
                   NCCL_ROOT_DIR=NCCL_ROOT_DIR,
                   NCCL_SYSTEM_LIB=NCCL_SYSTEM_LIB,
+                  USE_RCCL=USE_RCCL,
+                  RCCL_LIB_DIR=RCCL_LIB_DIR,
+                  RCCL_INCLUDE_DIR=RCCL_INCLUDE_DIR,
+                  RCCL_ROOT_DIR=RCCL_ROOT_DIR,
+                  RCCL_SYSTEM_LIB=RCCL_SYSTEM_LIB,
                   CAFFE2_STATIC_LINK_CUDA=check_env_flag('USE_CUDA_STATIC_LINK'),
                   USE_ROCM=USE_ROCM,
                   USE_NNPACK=USE_NNPACK,
diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 8859fe122775..823807234764 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -6,7 +6,7 @@
 from .cuda import USE_CUDA

 # On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS# and not check_env_flag("USE_ROCM")

 USE_GLOO_IBVERBS = False

 IB_DEVINFO_CMD = "ibv_devinfo"
diff --git a/tools/setup_helpers/rccl.py b/tools/setup_helpers/rccl.py
new file mode 100644
index 000000000000..560a84ec2181
--- /dev/null
+++ b/tools/setup_helpers/rccl.py
@@ -0,0 +1,75 @@
+import os
+import glob
+import warnings
+from itertools import chain
+
+from .env import IS_WINDOWS, IS_DARWIN, IS_CONDA, CONDA_DIR, check_negative_env_flag, \
+    gather_paths
+
+from .rocm import USE_ROCM, ROCM_HOME
+
+USE_RCCL = False
+RCCL_LIB_DIR = None
+RCCL_SYSTEM_LIB = None
+RCCL_INCLUDE_DIR = None
+RCCL_ROOT_DIR = None
+LIBRCCL_PREFIX = "librccl"
+if USE_ROCM and not IS_DARWIN and not IS_WINDOWS and not check_negative_env_flag('USE_RCCL'):
+    ENV_ROOT = os.getenv('RCCL_ROOT_DIR', None)
+    LIB_DIR = os.getenv('RCCL_LIB_DIR', None)
+    INCLUDE_DIR = os.getenv('RCCL_INCLUDE_DIR', None)
+
+    lib_paths = list(filter(bool, [
+        LIB_DIR,
+        ENV_ROOT,
+        os.path.join(ENV_ROOT, 'lib') if ENV_ROOT is not None else None,
+        os.path.join(ENV_ROOT, 'lib', 'x86_64-linux-gnu') if ENV_ROOT is not None else None,
+        os.path.join(ENV_ROOT, 'lib64') if ENV_ROOT is not None else None,
+        os.path.join(ROCM_HOME, 'lib'),
+        os.path.join(ROCM_HOME, 'lib64'),
+        '/usr/local/lib',
+        '/usr/lib/x86_64-linux-gnu/',
+        '/usr/lib/powerpc64le-linux-gnu/',
+        '/usr/lib/aarch64-linux-gnu/',
+        '/usr/lib',
+    ] + gather_paths([
+        'LIBRARY_PATH',
+    ]) + gather_paths([
+        'LD_LIBRARY_PATH',
+    ])))
+
+    include_paths = list(filter(bool, [
+        INCLUDE_DIR,
+        ENV_ROOT,
+        os.path.join(ENV_ROOT, 'include') if ENV_ROOT is not None else None,
+        os.path.join(ROCM_HOME, 'include'),
+        '/usr/local/include',
+        '/usr/include',
+    ]))
+
+    if IS_CONDA:
+        lib_paths.append(os.path.join(CONDA_DIR, 'lib'))
+    for path in lib_paths:
+        path = os.path.expanduser(path)
+        if path is None or not os.path.exists(path):
+            continue
+        if glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*')):
+            RCCL_LIB_DIR = path
+            # try to find an exact versioned .so/.dylib, rather than librccl.so
+            preferred_path = glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*[0-9]*'))
+            if len(preferred_path) == 0:
+                RCCL_SYSTEM_LIB = glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*'))[0]
+            else:
+                RCCL_SYSTEM_LIB = os.path.realpath(preferred_path[0])
+            break
+    for path in include_paths:
+        path = os.path.expanduser(path)
+        if path is None or not os.path.exists(path):
+            continue
+        if glob.glob(os.path.join(path, 'rccl.h')):
+            RCCL_INCLUDE_DIR = path
+            break
+
+    if RCCL_LIB_DIR is not None and RCCL_INCLUDE_DIR is not None:
+        USE_RCCL = True
+        RCCL_ROOT_DIR = os.path.commonprefix((RCCL_LIB_DIR, RCCL_INCLUDE_DIR))
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 7be2a601f69c..acfc5c792e31 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -678,7 +678,7 @@ if (BUILD_PYTHON)
     list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp)
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
     list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-    if (USE_CUDA)
+    if (USE_CUDA OR USE_ROCM)
       list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
     endif()
   endif()
@@ -694,6 +694,14 @@ if (BUILD_PYTHON)
     endif()
   endif()

+  if (USE_RCCL)
+    list(APPEND TORCH_PYTHON_SRCS
+      ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp
+      ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp)
+    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RCCL)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+  endif()
+
 add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi")
 # For Declarations.yaml dependency
 add_dependencies(torch_python_stubs ATEN_CPU_FILES_GEN_TARGET)
diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt
index 81dbbf3b93a5..b585779193dd 100644
--- a/torch/lib/c10d/CMakeLists.txt
+++ b/torch/lib/c10d/CMakeLists.txt
@@ -21,6 +21,21 @@ else()
   message(STATUS "Building C10D without CUDA support")
 endif()

+if(USE_ROCM)
+  find_package(HIP)
+  if(HIP_FOUND)
+    set(C10D_USE_ROCM true)
+    message(STATUS "Building C10D with ROCM support")
+    add_definitions(-DUSE_ROCM=1 -D__HIP_PLATFORM_HCC__=1)
+  else()
+    set(C10D_USE_ROCM false)
+    message(STATUS "HIP not found, building C10D without ROCM support")
+  endif()
+else()
+  set(C10D_USE_ROCM false)
+  message(STATUS "Building C10D without ROCM support")
+endif()
+
 find_package(MPI)
 if(MPI_FOUND)
   message(STATUS "MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}")
@@ -38,6 +53,10 @@ if(USE_NCCL)
   option(USE_C10D_NCCL "USE C10D NCCL" ON)
 endif()

+if(USE_RCCL)
+  option(USE_C10D_RCCL "USE C10D RCCL" ON)
+endif()
+
 if(MPI_FOUND)
   option(USE_C10D_MPI "USE C10D MPI" ON)
 endif()
@@ -56,6 +75,10 @@ if(C10D_USE_CUDA)
   set(C10D_LIBS
     caffe2_gpu
     )
+elseif(C10D_USE_ROCM)
+  set(C10D_LIBS
+    caffe2_hip
+    )
 else()
   set(C10D_LIBS
     caffe2
@@ -63,11 +86,18 @@ else()

-if(USE_C10D_NCCL)
+if(USE_C10D_NCCL OR USE_C10D_RCCL)
   list(APPEND C10D_SRCS ProcessGroupNCCL.cpp)
+endif()
+
+if(USE_NCCL)
   list(APPEND C10D_LIBS __caffe2_nccl)
 endif()

+if(USE_RCCL)
+  list(APPEND C10D_LIBS ${PYTORCH_RCCL_LIBRARIES})
+endif()
+
 if(USE_C10D_MPI)
   list(APPEND C10D_SRCS ProcessGroupMPI.cpp)
   list(APPEND C10D_LIBS ${MPI_LIBRARIES})
@@ -123,11 +153,16 @@ copy_header(Types.hpp)
 copy_header(Utils.hpp)
 copy_header(ProcessGroupGloo.hpp)

-if(USE_C10D_NCCL)
+if(USE_C10D_NCCL OR USE_C10D_RCCL)
   copy_header(ProcessGroupNCCL.hpp)
   copy_header(NCCLUtils.hpp)
 endif()

+if(USE_RCCL)
+  copy_header(rccl1_compat.h)
+  target_include_directories(c10d PUBLIC ${RCCL_INCLUDE_DIRS})
+endif()
+
 if(USE_C10D_MPI)
   target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH})
   copy_header(ProcessGroupMPI.hpp)
diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
index 5031e6fd05f0..b07bb0207e45 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.cpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -164,6 +164,22 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   processGroupID_ = std::to_string(processGroupCounterMap_[groupKey]);
   groupPgID_ = groupName_ + "_" + processGroupID_;
   pgUniqueNCCLIDCnt_[groupPgID_] = -1;
+
+#ifdef __HIP_PLATFORM_HCC__
+  int nDevices;
+  if (hipGetDeviceCount(&nDevices) != hipSuccess) {
+    hipGetLastError();
+    nDevices = 0;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    hipSetDevice(i);
+    // Enable peer access from current device to all other devices
+    for (int j = 0; j < nDevices; ++j) {
+      if (i==j) continue;
+      hipDeviceEnablePeerAccess(j, 0);
+    }
+  }
+#endif
 }

 ProcessGroupNCCL::~ProcessGroupNCCL() {
From 252e38e47480087c696efaa3b0017e087bb37263 Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 00:04:19 +0000
Subject: [PATCH 2/5] Remove NCCL dependency for ROCm

---
 torch/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index acfc5c792e31..f4c07e7b6925 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -699,7 +699,6 @@ if (BUILD_PYTHON)
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp
       ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp)
     list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RCCL)
-    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
   endif()

 add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi")
From 74de72046a3b71d71ee88459da50f5df952e4293 Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 21:43:12 +0000
Subject: [PATCH 3/5] Skip test_nn data_parallel unit tests on ROCm since they
 currently fail with RCCL

---
 test/test_nn.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test_nn.py b/test/test_nn.py
index 5adf5b5a249d..de6a4e61d91e 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -3237,6 +3237,7 @@ def test_broadcast_no_grad(self):
         self.assertFalse(output.requires_grad)

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_replicate(self):
         module = nn.Linear(10, 5).float().cuda()
         input = Variable(torch.randn(2, 10).float().cuda())
@@ -3345,6 +3346,7 @@ def local_test(out):
             local_test(out)

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_data_parallel_small_back(self):
         l = nn.Linear(10, 5).float().cuda()
         i = Variable(torch.randn(20, 10).float().cuda())
@@ -3391,6 +3393,7 @@ def forward(self, x):
         self.assertRaises(AssertionError, lambda: dp.data_parallel(l, i, (0, 1)))

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_data_parallel(self):
         l = nn.Linear(10, 5).float().cuda()
         i = Variable(torch.randn(20, 10).float().cuda(1))
@@ -3494,6 +3497,7 @@ def forward(self, *input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module(self, dtype=torch.float):
         l = nn.Linear(10, 5).to("cuda", dtype)
         i = torch.randn(20, 10, device="cuda", dtype=dtype)
@@ -3505,6 +3509,7 @@ def test_data_parallel_module(self, dtype=torch.float):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3524,6 +3529,7 @@ def forward(self, input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only_empty_list(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3543,6 +3549,7 @@ def forward(self, input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only_empty_dict(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3562,6 +3569,7 @@ def forward(self, input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only_empty_tuple(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3580,6 +3588,7 @@ def forward(self, input):
         self.assertEqual(out.data, expected_out, dtype2prec[dtype])

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_data_parallel_device_args(self):
         cuda0 = torch.device('cuda:0')
         cuda1 = torch.device('cuda:1')
From 37596159411c6686a4606b350d05a5b19321a5fa Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 23:05:12 +0000
Subject: [PATCH 4/5] Update some comments based on review

---
 tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py | 2 +-
 tools/amd_build/pyHIPIFY/rccl1_compat.h          | 5 -----
 tools/setup_helpers/dist_check.py                | 3 +--
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 12eca85d8122..8c6ed65f9d59 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -280,7 +280,7 @@
     ("cusparse.h", ("hipsparse.h", CONV_INCLUDE, API_RAND)),
     ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
     ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
-    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate
+    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to use "<"">" to differentiate
 ])

 CUDA_IDENTIFIER_MAP = collections.OrderedDict([
diff --git a/tools/amd_build/pyHIPIFY/rccl1_compat.h b/tools/amd_build/pyHIPIFY/rccl1_compat.h
index b3da757c6fd8..deb5dbd2292f 100644
--- a/tools/amd_build/pyHIPIFY/rccl1_compat.h
+++ b/tools/amd_build/pyHIPIFY/rccl1_compat.h
@@ -1,8 +1,3 @@
-/*************************************************************************
-* Copyright (c) 2017, AMD. All rights reserved.
-*
-************************************************************************/
-
 #ifndef RCCL1_COMPAT_H
 #define RCCL1_COMPAT_H

diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 823807234764..70925d1d8fb8 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -5,8 +5,7 @@
 from .env import IS_CONDA, IS_LINUX, IS_WINDOWS, CONDA_DIR, check_env_flag, check_negative_env_flag, gather_paths
 from .cuda import USE_CUDA

-# On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS# and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS

 USE_GLOO_IBVERBS = False

 IB_DEVINFO_CMD = "ibv_devinfo"
From 928174fe1e72f58cd1fd46ef6fdcfce6987e03dc Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 23:07:50 +0000
Subject: [PATCH 5/5] Skip test_cuda coalesced tests on ROCm since they
 currently fail with RCCL

---
 test/test_cuda.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index 26fc9fe5b417..930e262e3315 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -1076,6 +1076,7 @@ def test_broadcast_coalesced(self):
         self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
+    @skipIfRocm
     def test_broadcast_coalesced_dense_only(self):
         numel = 5
         num_bytes = numel * 8
@@ -1146,6 +1147,7 @@ def test_reduce_add_coalesced(self):
         self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
+    @skipIfRocm
     def test_reduce_add_coalesced_dense_only(self):
         numel = 5
         num_bytes = numel * 8
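
Note on the hipify regex change in PATCH 1/5: \b only matches at a word/non-word
transition, so a mapping key such as "<nccl.h>" -- which begins and ends with
non-word characters -- can never match when the trie pattern is wrapped in
\b...\b. Capturing the surrounding non-word characters with (\W)(...)(\W) and
re-emitting them in the replacement fixes this. A minimal sketch of the idea in
Python; the two-entry map below is a hypothetical stand-in for the trie-built
PYTORCH_TRIE/PYTORCH_MAP in hipify_python.py:

    import re

    # Hypothetical stand-in for the trie-generated mapping.
    PYTORCH_MAP = {"<nccl.h>": "<rccl.h>", "ncclComm_t": "rcclComm_t"}
    pattern = "|".join(re.escape(k) for k in PYTORCH_MAP)

    re_word = re.compile(r"\b({0})\b".format(pattern))
    re_nonword = re.compile(r"(\W)({0})(\W)".format(pattern))

    src = '#include <nccl.h>\nstatic ncclComm_t comm;\n'

    # \b finds no boundary next to "<" or ">", so only ncclComm_t is
    # rewritten here and the include line is left untouched.
    print(re_word.sub(lambda m: PYTORCH_MAP[m.group(1)], src))

    # Re-emitting the captured non-word neighbours rewrites both tokens,
    # mirroring pt_repl in the patch.
    print(re_nonword.sub(
        lambda m: m.group(1) + PYTORCH_MAP[m.group(2)] + m.group(3), src))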