From c1b07a24b8faeb299c058190805b409dc2323628 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Mon, 5 Nov 2018 15:51:02 -0600 Subject: [PATCH 01/11] Add rccl to HIP cmake --- cmake/Dependencies.cmake | 2 +- cmake/public/LoadHIP.cmake | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ac82134589d9..76ef21a1716e 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -633,7 +633,7 @@ if(NOT BUILD_ATEN_MOBILE) hip_include_directories(${Caffe2_HIP_INCLUDES}) set(Caffe2_HIP_DEPENDENCY_LIBS - ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES}) + ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${rccl_libraries} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES}) # TODO: There is a bug in rocblas and rocfft's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} and ${rocfft_LIBRARIES} respectively list(APPEND Caffe2_HIP_DEPENDENCY_LIBS roc::rocblas roc::rocfft) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index a95a04712470..7e9cb383c27e 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -94,6 +94,13 @@ ELSE() SET(MIOPEN_PATH $ENV{MIOPEN_PATH}) ENDIF() +# RCCL PATH +IF(NOT DEFINED ENV{RCCL_PATH}) + SET(RCCL_PATH ${ROCM_PATH}/rccl) +ELSE() + SET(RCCL_PATH $ENV{RCCL_PATH}) +ENDIF() + # Add HIP to the CMAKE Module Path set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) @@ -143,6 +150,7 @@ IF(HIP_FOUND) set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) set(rocsparse_DIR ${ROCSPARSE_PATH}/lib/cmake/rocsparse) + set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) find_package_and_print_version(rocrand REQUIRED) find_package_and_print_version(hiprand REQUIRED) @@ -152,6 +160,7 @@ IF(HIP_FOUND) find_package_and_print_version(rocfft REQUIRED) find_package_and_print_version(hipsparse REQUIRED) find_package_and_print_version(rocsparse REQUIRED) + find_package_and_print_version(rccl) #NOT REQUIRED FOR NOW UNTIL RCCL PACKAGE IS AVAILABLE FIND_LIBRARY(PYTORCH_HIP_HCC_LIBRARIES hip_hcc HINTS ${HIP_PATH}/lib) FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib) From c84296b7f1a830a211e3adae3fbff17fa7081ac2 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Tue, 6 Nov 2018 18:25:10 -0600 Subject: [PATCH 02/11] Setup rccl environment variables in cmake, python and sh files --- CMakeLists.txt | 8 +++- cmake/Dependencies.cmake | 8 +++- cmake/Summary.cmake | 1 + setup.py | 31 +++++++++++++-- tools/build_pytorch_libs.sh | 4 +- tools/setup_helpers/rccl.py | 75 +++++++++++++++++++++++++++++++++++++ 6 files changed, 119 insertions(+), 8 deletions(-) create mode 100644 tools/setup_helpers/rccl.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 8176344226a7..d4a295bd06a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,7 +93,12 @@ option(USE_LMDB "Use LMDB" ON) option(USE_METAL "Use Metal for iOS build" ON) option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON) option(USE_NATIVE_ARCH "Use -march=native" OFF) -option(USE_NCCL "Use NCCL" ON) +cmake_dependent_option( + USE_NCCL "Use NCCL" ON + "USE_CUDA" OFF) +cmake_dependent_option( + USE_RCCL "Use RCCL" ON + "USE_ROCM" OFF) option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) @@ -147,6 +152,7 @@ if (BUILD_ATEN_ONLY) set(USE_GFLAGS OFF) set(USE_GLOG OFF) set(USE_NCCL OFF) + set(USE_RCCL OFF) set(USE_NNPACK OFF) set(USE_NUMPY OFF) set(USE_OPENCV OFF) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 76ef21a1716e..8141a8ca4d75 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -628,12 +628,18 @@ if(NOT BUILD_ATEN_MOBILE) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) + if(USE_RCCL) + list(APPEND Caffe2_HIP_INCLUDES ${rccl_INCLUDE_DIRS}) + endif(USE_RCCL) # This is needed for library added by hip_add_library (same for hip_add_executable) hip_include_directories(${Caffe2_HIP_INCLUDES}) set(Caffe2_HIP_DEPENDENCY_LIBS - ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${rccl_libraries} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES}) + ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES}) + if(USE_RCCL) + list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${rccl_libraries}) + endif(USE_RCCL) # TODO: There is a bug in rocblas and rocfft's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} and ${rocfft_LIBRARIES} respectively list(APPEND Caffe2_HIP_DEPENDENCY_LIBS roc::rocblas roc::rocfft) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 8708d8cef9a3..557a3430d41a 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -105,6 +105,7 @@ function (caffe2_print_configuration_summary) if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") endif() + message(STATUS " USE_RCCL : ${USE_RCCL}") message(STATUS " USE_NNPACK : ${USE_NNPACK}") message(STATUS " USE_NUMPY : ${USE_NUMPY}") message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}") diff --git a/setup.py b/setup.py index ac96d329595c..c052901cd13b 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,11 @@ # NCCL_INCLUDE_DIR # specify where nccl is installed # +# RCCL_ROOT_DIR +# RCCL_LIB_DIR +# RCCL_INCLUDE_DIR +# specify where rccl is installed +# # MKLDNN_LIB_DIR # MKLDNN_LIBRARY # MKLDNN_INCLUDE_DIR @@ -177,6 +182,8 @@ def hotpatch_var(var, prefix='USE_'): MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR) from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \ NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB +from tools.setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, \ + RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB from tools.setup_helpers.mkldnn import (USE_MKLDNN, MKLDNN_LIBRARY, MKLDNN_LIB_DIR, MKLDNN_INCLUDE_DIR) from tools.setup_helpers.nnpack import USE_NNPACK @@ -376,6 +383,8 @@ def build_libs(libs): my_env['CMAKE_INSTALL'] = 'make install' if USE_SYSTEM_NCCL: my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR + if USE_RCCL: + my_env["RCCL_ROOT_DIR"] = RCCL_ROOT_DIR if USE_CUDA: my_env["CUDA_BIN_PATH"] = CUDA_HOME build_libs_cmd += ['--use-cuda'] @@ -620,6 +629,11 @@ def run(self): print('-- Building NCCL library') else: print('-- Not using NCCL') + if USE_RCCL: + print('-- Detected RCCL library at ' + + RCCL_SYSTEM_LIB + ', ' + RCCL_INCLUDE_DIR) + else: + print('-- Not using RCCL') if USE_DISTRIBUTED: print('-- Building with THD distributed package ') if IS_LINUX: @@ -1040,6 +1054,11 @@ def run(self): "torch/csrc/nn/THCUNN.cpp", ] +NCCL_SOURCES = [ + "torch/csrc/cuda/nccl.cpp", + "torch/csrc/cuda/python_nccl.cpp", + ] + if USE_NCCL: if USE_SYSTEM_NCCL: main_link_args += [NCCL_SYSTEM_LIB] @@ -1047,10 +1066,14 @@ def run(self): else: main_link_args += [NCCL_LIB] extra_compile_args += ['-DUSE_NCCL'] - main_sources += [ - "torch/csrc/cuda/nccl.cpp", - "torch/csrc/cuda/python_nccl.cpp", - ] + main_sources += NCCL_SOURCES + +if USE_RCCL: + main_link_args += [RCCL_SYSTEM_LIB] + include_dirs.append(RCCL_INCLUDE_DIR) + extra_compile_args += ['-DUSE_RCCL'] + main_sources += NCCL_SOURCES + if USE_CUDNN: main_libraries += [CUDNN_LIBRARY] # NOTE: these are at the front, in case there's another cuDNN in CUDA path diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 7379cedc0723..879cce16c808 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -139,8 +139,8 @@ else fi CPP_FLAGS=" -std=c++11 " THD_FLAGS="" -NCCL_ROOT_DIR=${NCCL_ROOT_DIR:-$INSTALL_DIR} if [[ $USE_CUDA -eq 1 ]]; then +NCCL_ROOT_DIR=${NCCL_ROOT_DIR:-$INSTALL_DIR} GLOO_FLAGS+="-DNCCL_ROOT_DIR=$NCCL_ROOT_DIR" fi # Gloo infiniband support @@ -346,7 +346,7 @@ function build_caffe2() { fi # This is needed by the aten tests built with caffe2 - if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.2" ]; then + if [[ $USE_CUDA -eq 1 ]] && [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.2" ]; then # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl find "${INSTALL_DIR}/lib" -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "lib/" fi diff --git a/tools/setup_helpers/rccl.py b/tools/setup_helpers/rccl.py new file mode 100644 index 000000000000..7056c8b3758b --- /dev/null +++ b/tools/setup_helpers/rccl.py @@ -0,0 +1,75 @@ +import os +import glob +import warnings +from itertools import chain + +from .env import IS_WINDOWS, IS_DARWIN, IS_CONDA, CONDA_DIR, check_negative_env_flag, \ + gather_paths + +from .rocm import USE_ROCM, ROCM_HOME + +USE_RCCL = False +RCCL_LIB_DIR = None +RCCL_SYSTEM_LIB = None +RCCL_INCLUDE_DIR = None +RCCL_ROOT_DIR = None +LIBRCCL_PREFIX = "librccl" +if USE_ROCM and not IS_DARWIN and not IS_WINDOWS and not check_negative_env_flag('USE_RCCL'): + ENV_ROOT = os.getenv('RCCL_ROOT_DIR', None) + LIB_DIR = os.getenv('RCCL_LIB_DIR', None) + INCLUDE_DIR = os.getenv('RCCL_INCLUDE_DIR', None) + + lib_paths = list(filter(bool, [ + LIB_DIR, + ENV_ROOT, + os.path.join(ENV_ROOT, 'lib') if ENV_ROOT is not None else None, + os.path.join(ENV_ROOT, 'lib', 'x86_64-linux-gnu') if ENV_ROOT is not None else None, + os.path.join(ENV_ROOT, 'lib64') if ENV_ROOT is not None else None, + os.path.join(ROCM_HOME, 'lib'), + os.path.join(ROCM_HOME, 'lib64'), + '/usr/local/lib', + '/usr/lib/x86_64-linux-gnu/', + '/usr/lib/powerpc64le-linux-gnu/', + '/usr/lib/aarch64-linux-gnu/', + '/usr/lib', + ] + gather_paths([ + 'LIBRARY_PATH', + ]) + gather_paths([ + 'LD_LIBRARY_PATH', + ]))) + + include_paths = list(filter(bool, [ + INCLUDE_DIR, + ENV_ROOT, + os.path.join(ENV_ROOT, 'include') if ENV_ROOT is not None else None, + os.path.join(ROCM_HOME, 'include'), + '/usr/local/include', + '/usr/include', + ])) + + if IS_CONDA: + lib_paths.append(os.path.join(CONDA_DIR, 'lib')) + for path in lib_paths: + path = os.path.expanduser(path) + if path is None or not os.path.exists(path): + continue + if glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*')): + RCCL_LIB_DIR = path + # try to find an exact versioned .so/.dylib, rather than librccl.so + preferred_path = glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*[0-9]*')) + if len(preferred_path) == 0: + RCCL_SYSTEM_LIB = glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*'))[0] + else: + RCCL_SYSTEM_LIB = os.path.realpath(preferred_path[0]) + break + for path in include_paths: + path = os.path.expanduser(path) + if path is None or not os.path.exists(path): + continue + if glob.glob(os.path.join(path, 'rccl.h')): + RCCL_INCLUDE_DIR = path + break + + if RCCL_LIB_DIR is not None and RCCL_INCLUDE_DIR is not None: + USE_RCCL = True + RCCL_ROOT_DIR = os.path.commonprefix((NCCL_LIB_DIR, NCCL_INCLUDE_DIR)) From 1f51ff2a15532f73fe9d7c4626b2430a70a5ae67 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Thu, 15 Nov 2018 14:29:30 -0600 Subject: [PATCH 03/11] Undo bad conflict merge from previous IFU --- setup.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/setup.py b/setup.py index 59f000175099..9468c7e46a90 100644 --- a/setup.py +++ b/setup.py @@ -119,11 +119,6 @@ # RCCL_INCLUDE_DIR # specify where rccl is installed # -# MKLDNN_LIB_DIR -# MKLDNN_LIBRARY -# MKLDNN_INCLUDE_DIR -# specify where MKLDNN is installed -# # NVTOOLSEXT_PATH (Windows only) # specify where nvtoolsext is installed # @@ -188,8 +183,6 @@ def hotpatch_var(var, prefix='USE_'): NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB from tools.setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, \ RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB -from tools.setup_helpers.mkldnn import (USE_MKLDNN, MKLDNN_LIBRARY, - MKLDNN_LIB_DIR, MKLDNN_INCLUDE_DIR) from tools.setup_helpers.nnpack import USE_NNPACK from tools.setup_helpers.qnnpack import USE_QNNPACK from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME From 49ca8edf6f892f3d4a5e07a45556fb60b96470ab Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Thu, 15 Nov 2018 18:53:12 -0600 Subject: [PATCH 04/11] Add hipification for nccl code; minor changes to use rccl in PyTorch build --- cmake/Dependencies.cmake | 2 +- cmake/public/LoadHIP.cmake | 2 +- tools/amd_build/pyHIPIFY/constants.py | 5 ++-- .../pyHIPIFY/cuda_to_hip_mappings.py | 29 +++++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 29376f3f212e..50f5dd82526c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -712,7 +712,7 @@ if(NOT BUILD_ATEN_MOBILE) set(Caffe2_HIP_DEPENDENCY_LIBS ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES}) if(USE_RCCL) - list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${rccl_libraries}) + list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${PYTORCH_RCCL_LIBRARIES}) endif(USE_RCCL) # TODO: There is a bug in rocblas and rocfft's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} and ${rocfft_LIBRARIES} respectively list(APPEND Caffe2_HIP_DEPENDENCY_LIBS diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 74340b42b831..187abddcde88 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -170,7 +170,7 @@ IF(HIP_FOUND) FIND_LIBRARY(PYTORCH_HIP_HCC_LIBRARIES hip_hcc HINTS ${HIP_PATH}/lib) FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib) - + FIND_LIBRARY(PYTORCH_RCCL_LIBRARIES ${rccl_LIBRARIES} HINTS ${RCCL_PATH}/lib) # Necessary includes for building PyTorch since we include HIP headers that depend on hcc/hsa headers. set(hcc_INCLUDE_DIRS ${HCC_PATH}/include) diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py index 9dc4b5ee3489..160e81359623 100644 --- a/tools/amd_build/pyHIPIFY/constants.py +++ b/tools/amd_build/pyHIPIFY/constants.py @@ -49,8 +49,9 @@ API_BLAS = 39 API_SPARSE = 40 API_RAND = 41 -API_LAST = 42 -API_FFT = 43 +API_FFT = 42 +API_RCCL = 43 +API_LAST = 44 HIP_UNSUPPORTED = 43 API_PYTORCH = 1337 diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index 91dbee2599ea..f42d6ebac459 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -238,6 +238,10 @@ ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)), ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)), ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)), + ("ncclResult_t", ("rcclResult_t", CONV_TYPE, API_RCCL)), + ("ncclComm_t", ("rcclComm_t", CONV_TYPE, API_RCCL)), + ("ncclDataType_t", ("rcclDataType_t", CONV_TYPE, API_RCCL)), + ("ncclRedOp_t", ("rcclRedOp_t", CONV_TYPE, API_RCCL)), ]) CUDA_INCLUDE_MAP = collections.OrderedDict([ @@ -277,6 +281,7 @@ ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)), ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)), ("#include ", ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED)), + ("", ("", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate ]) CUDA_IDENTIFIER_MAP = collections.OrderedDict([ @@ -2171,6 +2176,30 @@ ("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)), ("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)), ("cufftGetProperty", ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED)), + ("ncclGetErrorString", ("rcclGetErrorString", CONV_ERROR, API_RCCL)), + ("ncclCommInitAll", ("rcclCommInitAll", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclCommInitRank", ("rcclCommInitRank", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclCommDestroy", ("rcclCommDestroy", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclBcast", ("rcclBcast", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclReduce", ("rcclReduce", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclAllReduce", ("rcclAllReduce", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclAllGather", ("rcclAllGather", CONV_SPECIAL_FUNC, API_RCCL)), + ("ncclReduceScatter", ("rcclReduceScatter", CONV_SPECIAL_FUNC, API_RCCL, HIP_UNSUPPORTED)), + ("ncclSuccess", ("rcclSuccess", CONV_TYPE, API_RCCL)), + ("ncclFloat", ("rcclFloat", CONV_TYPE, API_RCCL)), + ("ncclHalf", ("rcclHalf", CONV_TYPE, API_RCCL)), + ("ncclDouble", ("rcclDouble", CONV_TYPE, API_RCCL)), + ("ncclInt64", ("rcclInt64", CONV_TYPE, API_RCCL)), + ("ncclInt", ("rcclInt", CONV_TYPE, API_RCCL)), + ("ncclChar", ("rcclChar", CONV_TYPE, API_RCCL)), + ("ncclSum", ("rcclSum", CONV_TYPE, API_RCCL)), + ("ncclProd", ("rcclProd", CONV_TYPE, API_RCCL)), + ("ncclMin", ("rcclMin", CONV_TYPE, API_RCCL)), + ("ncclMax", ("rcclMax", CONV_TYPE, API_RCCL)), + ("ncclUniqueId", ("rcclUniqueId", CONV_TYPE, API_RCCL)), + ("ncclGetUniqueId", ("rcclGetUniqueId", CONV_TYPE, API_RCCL)), + ("NCCL_UNIQUE_ID_BYTES", ("RCCL_UNIQUE_ID_BYTES", CONV_TYPE, API_RCCL)), + ("USE_NCCL", ("USE_RCCL", CONV_DEF, API_RCCL)), ]) CUDA_SPARSE_MAP = collections.OrderedDict([ From 59a68b30e113c0bc18d20624005b9343b4b141af Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Wed, 21 Nov 2018 12:50:33 -0600 Subject: [PATCH 05/11] Cmake and build-related changes to include direct rccl path in PyTorch build --- setup.py | 3 ++- torch/lib/c10d/CMakeLists.txt | 40 +++++++++++++++++++++++++++++++-- torch/lib/c10d/cmake/Def.hpp.in | 1 + 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 9468c7e46a90..bc08edfcba12 100644 --- a/setup.py +++ b/setup.py @@ -985,8 +985,9 @@ def run(self): main_sources.append('torch/csrc/distributed/c10d/init.cpp') main_link_args.append(C10D_LIB) main_link_args.append(GLOO_LIB) - if USE_CUDA: + if USE_CUDA or USE_ROCM: main_sources.append('torch/csrc/distributed/c10d/ddp.cpp') + if USE_CUDA: main_link_args.append(GLOO_CUDA_LIB) if USE_CUDA: diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 80a6d1ed14f4..bb8f10791ed0 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -21,6 +21,21 @@ else() message(STATUS "Building C10D without CUDA support") endif() +if(USE_ROCM) + find_package(HIP) + if(HIP_FOUND) + set(C10D_USE_ROCM true) + message(STATUS "Building C10D with ROCM support") + add_definitions(-DUSE_ROCM=1 -D__HIP_PLATFORM_HCC__=1) + else() + set(C10D_USE_ROCM false) + message(STATUS "HIP not found, building C10D without ROCM support") + endif() +else() + set(C10D_USE_ROCM false) + message(STATUS "Building C10D without ROCM support") +endif() + find_package(MPI) if(MPI_FOUND) message(STATUS "MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") @@ -38,6 +53,10 @@ if(USE_NCCL) option(USE_C10D_NCCL "USE C10D NCCL" ON) endif() +if(USE_RCCL) + option(USE_C10D_RCCL "USE C10D RCCL" ON) +endif() + if(MPI_FOUND) option(USE_C10D_MPI "USE C10D MPI" ON) endif() @@ -57,6 +76,11 @@ if(C10D_USE_CUDA) set(C10D_LIBS caffe2_gpu ) +elseif(C10D_USE_ROCM) + list(APPEND C10D_SRCS CUDAUtils.cpp) + set(C10D_LIBS + caffe2_hip + ) else() set(C10D_LIBS caffe2 @@ -64,11 +88,18 @@ else() endif() -if(USE_NCCL) +if(USE_NCCL OR USE_RCCL) list(APPEND C10D_SRCS ProcessGroupNCCL.cpp) +endif() + +if(USE_NCCL) list(APPEND C10D_LIBS __caffe2_nccl) endif() +if(USE_RCCL) + list(APPEND C10D_LIBS ${PYTORCH_RCCL_LIBRARIES}) +endif() + if(MPI_FOUND) list(APPEND C10D_SRCS ProcessGroupMPI.cpp) list(APPEND C10D_LIBS ${MPI_LIBRARIES}) @@ -119,11 +150,16 @@ copy_header(Types.hpp) copy_header(Utils.hpp) copy_header(ProcessGroupGloo.hpp) -if(USE_NCCL) +if(USE_NCCL OR USE_RCCL) copy_header(ProcessGroupNCCL.hpp) copy_header(NCCLUtils.hpp) endif() +if(USE_RCCL) + copy_header(rccl1_compat.h) + target_include_directories(c10d PUBLIC ${RCCL_INCLUDE_DIRS}) +endif() + if(MPI_FOUND) target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH}) copy_header(ProcessGroupMPI.hpp) diff --git a/torch/lib/c10d/cmake/Def.hpp.in b/torch/lib/c10d/cmake/Def.hpp.in index c56ce4fd95ff..f25e130faa34 100644 --- a/torch/lib/c10d/cmake/Def.hpp.in +++ b/torch/lib/c10d/cmake/Def.hpp.in @@ -1,4 +1,5 @@ #pragma once #cmakedefine USE_C10D_NCCL +#cmakedefine USE_C10D_RCCL #cmakedefine USE_C10D_MPI From 8db70df940de60e3f0fe95fb143912363849a1d0 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Wed, 21 Nov 2018 12:51:58 -0600 Subject: [PATCH 06/11] Hipify updates for rccl --- tools/amd_build/build_pytorch_amd.py | 4 ++-- .../pyHIPIFY/cuda_to_hip_mappings.py | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 07fdd684534a..3e3c58b42e66 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -29,10 +29,10 @@ # Make various replacements inside AMD_BUILD/torch directory ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp", - "csrc/cuda/cuda_check.h"] + "csrc/cuda/cuda_check.h", "torch/lib/c10d/ProcessGroupGloo.hpp", "torch/lib/c10d/ProcessGroupGloo.cpp"] for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")): for filename in files: - if filename.endswith(".cpp") or filename.endswith(".h"): + if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"): source = os.path.join(root, filename) # Disabled files if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False): diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index f42d6ebac459..7975443a9d14 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -2186,20 +2186,31 @@ ("ncclAllGather", ("rcclAllGather", CONV_SPECIAL_FUNC, API_RCCL)), ("ncclReduceScatter", ("rcclReduceScatter", CONV_SPECIAL_FUNC, API_RCCL, HIP_UNSUPPORTED)), ("ncclSuccess", ("rcclSuccess", CONV_TYPE, API_RCCL)), - ("ncclFloat", ("rcclFloat", CONV_TYPE, API_RCCL)), + ("ncclChar", ("rcclChar", CONV_TYPE, API_RCCL)), + ("ncclInt8", ("rcclChar", CONV_TYPE, API_RCCL)), + ("ncclUint8", ("rcclChar", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int8 type + ("ncclInt", ("rcclInt", CONV_TYPE, API_RCCL)), + ("ncclInt32", ("rcclInt", CONV_TYPE, API_RCCL)), + ("ncclUint32", ("rcclInt", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int32 type + ("ncclInt64", ("rcclInt64", CONV_TYPE, API_RCCL)), + ("ncclUint64", ("rcclUint64", CONV_TYPE, API_RCCL)), ("ncclHalf", ("rcclHalf", CONV_TYPE, API_RCCL)), + ("ncclFloat16", ("rcclHalf", CONV_TYPE, API_RCCL)), + ("ncclFloat", ("rcclFloat", CONV_TYPE, API_RCCL)), + ("ncclFloat32", ("rcclFloat", CONV_TYPE, API_RCCL)), ("ncclDouble", ("rcclDouble", CONV_TYPE, API_RCCL)), - ("ncclInt64", ("rcclInt64", CONV_TYPE, API_RCCL)), - ("ncclInt", ("rcclInt", CONV_TYPE, API_RCCL)), - ("ncclChar", ("rcclChar", CONV_TYPE, API_RCCL)), + ("ncclFloat64", ("rcclDouble", CONV_TYPE, API_RCCL)), ("ncclSum", ("rcclSum", CONV_TYPE, API_RCCL)), ("ncclProd", ("rcclProd", CONV_TYPE, API_RCCL)), ("ncclMin", ("rcclMin", CONV_TYPE, API_RCCL)), ("ncclMax", ("rcclMax", CONV_TYPE, API_RCCL)), ("ncclUniqueId", ("rcclUniqueId", CONV_TYPE, API_RCCL)), ("ncclGetUniqueId", ("rcclGetUniqueId", CONV_TYPE, API_RCCL)), + ("ncclGroupStart", ("rcclGroupStart", CONV_TYPE, API_RCCL)), + ("ncclGroupEnd", ("rcclGroupEnd", CONV_TYPE, API_RCCL)), ("NCCL_UNIQUE_ID_BYTES", ("RCCL_UNIQUE_ID_BYTES", CONV_TYPE, API_RCCL)), ("USE_NCCL", ("USE_RCCL", CONV_DEF, API_RCCL)), + ("USE_C10D_NCCL", ("USE_C10D_RCCL", CONV_DEF, API_RCCL)), ]) CUDA_SPARSE_MAP = collections.OrderedDict([ From 24141aacdea5b7b9cb1df86380d4393b8155eae5 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Wed, 21 Nov 2018 12:53:00 -0600 Subject: [PATCH 07/11] Add rccl1_compat.h to build to allow for use of NCCL2 api signature for collectives --- tools/amd_build/disabled_features.json | 6 +++ tools/amd_build/pyHIPIFY/hipify_python.py | 13 ++++++ tools/amd_build/pyHIPIFY/rccl1_compat.h | 53 +++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 tools/amd_build/pyHIPIFY/rccl1_compat.h diff --git a/tools/amd_build/disabled_features.json b/tools/amd_build/disabled_features.json index eb54531e30b6..f91d4ecfa6dc 100644 --- a/tools/amd_build/disabled_features.json +++ b/tools/amd_build/disabled_features.json @@ -95,6 +95,12 @@ "s_constants": { "inverse_indices_kernel<<<": "inverse_indices_kernel<<<" } + }, + { + "path": "torch/lib/c10d/NCCLUtils.hpp", + "s_constants": { + "": "\"c10d/rccl1_compat.h\"" + } } ], "disabled_modules": [ diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/tools/amd_build/pyHIPIFY/hipify_python.py index 241ef2c229a3..0ed733f26ed0 100755 --- a/tools/amd_build/pyHIPIFY/hipify_python.py +++ b/tools/amd_build/pyHIPIFY/hipify_python.py @@ -1370,6 +1370,19 @@ def hipify( extensions_to_hip_suffix=extensions_to_hip_suffix), KernelTemplateParams) + # copy rccl compat file to c10d + rccl_compat_file = "rccl1_compat.h" + rccl_compat_src_filepath = os.path.join(os.path.dirname(__file__), rccl_compat_file) + if not os.path.exists(rccl_compat_src_filepath): + print("ERROR: File does not exist: " + rccl_compat_src_filepath) + sys.exit(1) + rccl_compat_dst_dir = os.path.join(args.output_directory, "torch", "lib", "c10d") + if not os.path.exists(rccl_compat_dst_dir): + print("ERROR: Directory does not exist: " + rccl_compat_dst_dir) + sys.exit(1) + rccl_compat_dst_filepath = os.path.join(rccl_compat_dst_dir, rccl_compat_file) + shutil.copy(rccl_compat_src_filepath, rccl_compat_dst_filepath) + if __name__ == '__main__': main() diff --git a/tools/amd_build/pyHIPIFY/rccl1_compat.h b/tools/amd_build/pyHIPIFY/rccl1_compat.h new file mode 100644 index 000000000000..8cd6b3eeee49 --- /dev/null +++ b/tools/amd_build/pyHIPIFY/rccl1_compat.h @@ -0,0 +1,53 @@ +/************************************************************************* + * Copyright (c) 2017, AMD. All rights reserved. + * + ************************************************************************/ + +#ifndef RCCL1_COMPAT_H +#define RCCL1_COMPAT_H + +#include + +#ifndef RCCL_MAJOR // RCCL 1.x +#define RCCL_MAJOR 1 +#define RCCL_MINOR 0 + +#define rcclNumOps rccl_NUM_OPS +#define rcclNumTypes rccl_NUM_TYPES + +static rcclResult_t rcclGroupStart() { return rcclSuccess; } +static rcclResult_t rcclGroupEnd() { return rcclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return rcclInvalidArgument; + +/* +static rcclResult_t rcclReduce(const void* sendbuff, void* recvbuff, size_t count, rcclDataType_t datatype, + rcclRedOp_t op, int root, rcclComm_t comm, hipStream_t stream) { + CHECKCOUNT(count); + return rcclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static rcclResult_t rcclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm, hipStream_t stream) { + CHECKCOUNT(count); + return rcclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static rcclResult_t rcclBcast(void* buff, size_t count, rcclDataType_t datatype, int root, + rcclComm_t comm, hipStream_t stream) { + CHECKCOUNT(count); + return rcclBcast(buff, (int)count, datatype, root, comm, stream); +} +static rcclResult_t rcclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm, + hipStream_t stream) { + CHECKCOUNT(recvcount); + return rcclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +*/ +static rcclResult_t rcclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + rcclDataType_t datatype, rcclComm_t comm, hipStream_t stream) { + CHECKCOUNT(sendcount); + return rcclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif From 81164f2d176fb572e3a80fa46b06c85e80293949 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Thu, 6 Dec 2018 11:20:23 -0600 Subject: [PATCH 08/11] Update to match hipify script changes --- tools/amd_build/pyHIPIFY/hipify_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/tools/amd_build/pyHIPIFY/hipify_python.py index 0ed733f26ed0..68fdacffbf0c 100755 --- a/tools/amd_build/pyHIPIFY/hipify_python.py +++ b/tools/amd_build/pyHIPIFY/hipify_python.py @@ -1376,7 +1376,7 @@ def hipify( if not os.path.exists(rccl_compat_src_filepath): print("ERROR: File does not exist: " + rccl_compat_src_filepath) sys.exit(1) - rccl_compat_dst_dir = os.path.join(args.output_directory, "torch", "lib", "c10d") + rccl_compat_dst_dir = os.path.join(output_directory, "torch", "lib", "c10d") if not os.path.exists(rccl_compat_dst_dir): print("ERROR: Directory does not exist: " + rccl_compat_dst_dir) sys.exit(1) From 6224d5bef2cce637a9bd5d8ae8658f295337fe59 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Thu, 6 Dec 2018 11:49:27 -0600 Subject: [PATCH 09/11] Fix typo in variable names --- tools/setup_helpers/rccl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/setup_helpers/rccl.py b/tools/setup_helpers/rccl.py index 7056c8b3758b..560a84ec2181 100644 --- a/tools/setup_helpers/rccl.py +++ b/tools/setup_helpers/rccl.py @@ -72,4 +72,4 @@ if RCCL_LIB_DIR is not None and RCCL_INCLUDE_DIR is not None: USE_RCCL = True - RCCL_ROOT_DIR = os.path.commonprefix((NCCL_LIB_DIR, NCCL_INCLUDE_DIR)) + RCCL_ROOT_DIR = os.path.commonprefix((RCCL_LIB_DIR, RCCL_INCLUDE_DIR)) From d82b7beb756c930d067377e27564b50aaf86a237 Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Thu, 6 Dec 2018 12:34:14 -0600 Subject: [PATCH 10/11] Remove CUDAUtils.cpp from CMakeLists.txt since file has been removed from upstream repo --- torch/lib/c10d/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index bb8f10791ed0..c90831059fc0 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -77,7 +77,6 @@ if(C10D_USE_CUDA) caffe2_gpu ) elseif(C10D_USE_ROCM) - list(APPEND C10D_SRCS CUDAUtils.cpp) set(C10D_LIBS caffe2_hip ) From 119866990dcc2caeb0ccd95cb05cc1e017f450af Mon Sep 17 00:00:00 2001 From: jithunnair-amd Date: Sat, 15 Dec 2018 00:58:53 -0600 Subject: [PATCH 11/11] Enable peer-to-peer access for rccl collectives to work --- torch/lib/c10d/ProcessGroupNCCL.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index df614245365d..0a4e311a9ba8 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -152,6 +152,22 @@ ProcessGroupNCCL::ProcessGroupNCCL( ++processGroupCounter_; pgUniqueNCCLIDCnt_[processGroupCounter_] = -1; processGroupID_ = std::to_string(processGroupCounter_); + +#ifdef __HIP_PLATFORM_HCC__ + int nDevices; + if (hipGetDeviceCount(&nDevices) != hipSuccess) { + hipGetLastError(); + nDevices = 0; + } + for (int i = 0; i < nDevices; ++i) { + hipSetDevice(i); + // Enable peer access from current device to all other devices + for (int j = 0; j < nDevices; ++j) { + if (i==j) continue; + hipDeviceEnablePeerAccess(j, 0); + } + } +#endif } ProcessGroupNCCL::~ProcessGroupNCCL() {