Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,12 @@ option(USE_LMDB "Use LMDB" ON)
option(USE_METAL "Use Metal for iOS build" ON)
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
option(USE_NCCL "Use NCCL" ON)
# NOTE(review): cmake_dependent_option() requires include(CMakeDependentOption)
# earlier in this file — confirm that module is included.
# USE_NCCL defaults ON but is forced OFF whenever USE_CUDA is not set.
cmake_dependent_option(
USE_NCCL "Use NCCL" ON
"USE_CUDA" OFF)
# USE_RCCL defaults ON but is forced OFF whenever USE_ROCM is not set.
cmake_dependent_option(
USE_RCCL "Use RCCL" ON
"USE_ROCM" OFF)
option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
Expand Down Expand Up @@ -146,6 +151,7 @@ if (BUILD_ATEN_ONLY)
set(USE_GFLAGS OFF)
set(USE_GLOG OFF)
set(USE_NCCL OFF)
set(USE_RCCL OFF)
set(USE_NNPACK OFF)
set(USE_NUMPY OFF)
set(USE_OPENCV OFF)
Expand Down
6 changes: 6 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -702,12 +702,18 @@ if(NOT BUILD_ATEN_MOBILE)

set(Caffe2_HIP_INCLUDES
${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $<INSTALL_INTERFACE:include> ${Caffe2_HIP_INCLUDES})
# Add the RCCL headers only when the RCCL backend is enabled.
if(USE_RCCL)
  list(APPEND Caffe2_HIP_INCLUDES ${rccl_INCLUDE_DIRS})
endif()

# This is needed for library added by hip_add_library (same for hip_add_executable)
hip_include_directories(${Caffe2_HIP_INCLUDES})

set(Caffe2_HIP_DEPENDENCY_LIBS
${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES})
# Link the RCCL library only when the RCCL backend is enabled.
if(USE_RCCL)
  list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${PYTORCH_RCCL_LIBRARIES})
endif()
# TODO: There is a bug in rocblas and rocfft's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} and ${rocfft_LIBRARIES} respectively
list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
roc::rocblas roc::rocfft)
Expand Down
1 change: 1 addition & 0 deletions cmake/Summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ function (caffe2_print_configuration_summary)
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
endif()
message(STATUS " USE_RCCL : ${USE_RCCL}")
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")
Expand Down
11 changes: 10 additions & 1 deletion cmake/public/LoadHIP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ ELSE()
SET(MIOPEN_PATH $ENV{MIOPEN_PATH})
ENDIF()

# RCCL PATH: honor an explicit RCCL_PATH from the environment; otherwise
# default to the rccl tree under ROCM_PATH.
IF(DEFINED ENV{RCCL_PATH})
  SET(RCCL_PATH $ENV{RCCL_PATH})
ELSE()
  SET(RCCL_PATH ${ROCM_PATH}/rccl)
ENDIF()

IF(NOT DEFINED ENV{HCC_AMDGPU_TARGET})
SET(HCC_AMDGPU_TARGET gfx900)
ELSE()
Expand Down Expand Up @@ -149,6 +156,7 @@ IF(HIP_FOUND)
set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft)
set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
set(rocsparse_DIR ${ROCSPARSE_PATH}/lib/cmake/rocsparse)
set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl)

find_package_and_print_version(rocrand REQUIRED)
find_package_and_print_version(hiprand REQUIRED)
Expand All @@ -158,10 +166,11 @@ IF(HIP_FOUND)
find_package_and_print_version(rocfft REQUIRED)
find_package_and_print_version(hipsparse REQUIRED)
find_package_and_print_version(rocsparse REQUIRED)
find_package_and_print_version(rccl) #NOT REQUIRED FOR NOW UNTIL RCCL PACKAGE IS AVAILABLE

FIND_LIBRARY(PYTORCH_HIP_HCC_LIBRARIES hip_hcc HINTS ${HIP_PATH}/lib)
FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)

FIND_LIBRARY(PYTORCH_RCCL_LIBRARIES ${rccl_LIBRARIES} HINTS ${RCCL_PATH}/lib)

# Necessary includes for building PyTorch since we include HIP headers that depend on hcc/hsa headers.
set(hcc_INCLUDE_DIRS ${HCC_PATH}/include)
Expand Down
34 changes: 29 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@
# NCCL_INCLUDE_DIR
# specify where nccl is installed
#
# RCCL_ROOT_DIR
# RCCL_LIB_DIR
# RCCL_INCLUDE_DIR
# specify where rccl is installed
#
# NVTOOLSEXT_PATH (Windows only)
# specify where nvtoolsext is installed
#
Expand Down Expand Up @@ -176,6 +181,8 @@ def hotpatch_var(var, prefix='USE_'):
MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR)
from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_LIB_DIR, \
NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
from tools.setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, \
RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB
from tools.setup_helpers.nnpack import USE_NNPACK
from tools.setup_helpers.qnnpack import USE_QNNPACK
from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
Expand Down Expand Up @@ -385,6 +392,8 @@ def build_libs(libs):
my_env['CMAKE_INSTALL'] = 'make install'
if USE_SYSTEM_NCCL:
my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
if USE_RCCL:
my_env["RCCL_ROOT_DIR"] = RCCL_ROOT_DIR
if USE_CUDA:
my_env["CUDA_BIN_PATH"] = CUDA_HOME
build_libs_cmd += ['--use-cuda']
Expand Down Expand Up @@ -626,6 +635,11 @@ def run(self):
print('-- Building NCCL library')
else:
print('-- Not using NCCL')
if USE_RCCL:
print('-- Detected RCCL library at ' +
RCCL_SYSTEM_LIB + ', ' + RCCL_INCLUDE_DIR)
else:
print('-- Not using RCCL')
if USE_DISTRIBUTED:
print('-- Building with THD distributed package ')
if IS_LINUX:
Expand Down Expand Up @@ -971,8 +985,9 @@ def run(self):
main_sources.append('torch/csrc/distributed/c10d/init.cpp')
main_link_args.append(C10D_LIB)
main_link_args.append(GLOO_LIB)
if USE_CUDA:
if USE_CUDA or USE_ROCM:
main_sources.append('torch/csrc/distributed/c10d/ddp.cpp')
if USE_CUDA:
main_link_args.append(GLOO_CUDA_LIB)

if USE_CUDA:
Expand Down Expand Up @@ -1054,16 +1069,25 @@ def run(self):
"torch/csrc/nn/THCUNN.cpp",
]

NCCL_SOURCES = [
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/cuda/python_nccl.cpp",
]

if USE_NCCL:
if USE_SYSTEM_NCCL:
include_dirs.append(NCCL_INCLUDE_DIR)
else:
include_dirs.append("build/nccl/include")
extra_compile_args += ['-DUSE_NCCL']
main_sources += [
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/cuda/python_nccl.cpp",
]
main_sources += NCCL_SOURCES

if USE_RCCL:
main_link_args += [RCCL_SYSTEM_LIB]
include_dirs.append(RCCL_INCLUDE_DIR)
extra_compile_args += ['-DUSE_RCCL']
main_sources += NCCL_SOURCES

if USE_CUDNN:
main_libraries += [CUDNN_LIBRARY]
# NOTE: these are at the front, in case there's another cuDNN in CUDA path
Expand Down
4 changes: 2 additions & 2 deletions tools/amd_build/build_pytorch_amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@

# Make various replacements inside AMD_BUILD/torch directory
ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp",
"csrc/cuda/cuda_check.h"]
"csrc/cuda/cuda_check.h", "torch/lib/c10d/ProcessGroupGloo.hpp", "torch/lib/c10d/ProcessGroupGloo.cpp"]
for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
for filename in files:
if filename.endswith(".cpp") or filename.endswith(".h"):
if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"):
source = os.path.join(root, filename)
# Disabled files
if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
Expand Down
6 changes: 6 additions & 0 deletions tools/amd_build/disabled_features.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@
"s_constants": {
"inverse_indices_kernel<<<": "inverse_indices_kernel<scalar_t><<<"
}
},
{
"path": "torch/lib/c10d/NCCLUtils.hpp",
"s_constants": {
"<nccl.h>": "\"c10d/rccl1_compat.h\""
}
}
],
"disabled_modules": [
Expand Down
5 changes: 3 additions & 2 deletions tools/amd_build/pyHIPIFY/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@
API_BLAS = 39
API_SPARSE = 40
API_RAND = 41
API_LAST = 42
API_FFT = 43
# HIPIFY API categories — used as the third element of the mapping tuples
# in cuda_to_hip_mappings.py (e.g. (..., CONV_TYPE, API_RCCL)).
API_FFT = 42
API_RCCL = 43
API_LAST = 44

# NOTE(review): HIP_UNSUPPORTED now shares the value 43 with API_RCCL
# (before this change it collided with API_FFT). If these constants are
# ever compared against each other the overlap is a latent bug — confirm
# they are only ever used in distinct tuple positions.
HIP_UNSUPPORTED = 43
API_PYTORCH = 1337
Expand Down
40 changes: 40 additions & 0 deletions tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@
("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)),
("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)),
("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)),
("ncclResult_t", ("rcclResult_t", CONV_TYPE, API_RCCL)),
("ncclComm_t", ("rcclComm_t", CONV_TYPE, API_RCCL)),
("ncclDataType_t", ("rcclDataType_t", CONV_TYPE, API_RCCL)),
("ncclRedOp_t", ("rcclRedOp_t", CONV_TYPE, API_RCCL)),
])

CUDA_INCLUDE_MAP = collections.OrderedDict([
Expand Down Expand Up @@ -277,6 +281,7 @@
("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
("#include <nvfunctional>", ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED)),
("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate
])

CUDA_IDENTIFIER_MAP = collections.OrderedDict([
Expand Down Expand Up @@ -2171,6 +2176,41 @@
("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)),
("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)),
("cufftGetProperty", ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED)),
("ncclGetErrorString", ("rcclGetErrorString", CONV_ERROR, API_RCCL)),
("ncclCommInitAll", ("rcclCommInitAll", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclCommInitRank", ("rcclCommInitRank", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclCommDestroy", ("rcclCommDestroy", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclBcast", ("rcclBcast", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclReduce", ("rcclReduce", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclAllReduce", ("rcclAllReduce", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclAllGather", ("rcclAllGather", CONV_SPECIAL_FUNC, API_RCCL)),
("ncclReduceScatter", ("rcclReduceScatter", CONV_SPECIAL_FUNC, API_RCCL, HIP_UNSUPPORTED)),
("ncclSuccess", ("rcclSuccess", CONV_TYPE, API_RCCL)),
("ncclChar", ("rcclChar", CONV_TYPE, API_RCCL)),
("ncclInt8", ("rcclChar", CONV_TYPE, API_RCCL)),
("ncclUint8", ("rcclChar", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int8 type
("ncclInt", ("rcclInt", CONV_TYPE, API_RCCL)),
("ncclInt32", ("rcclInt", CONV_TYPE, API_RCCL)),
("ncclUint32", ("rcclInt", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int32 type
("ncclInt64", ("rcclInt64", CONV_TYPE, API_RCCL)),
("ncclUint64", ("rcclUint64", CONV_TYPE, API_RCCL)),
("ncclHalf", ("rcclHalf", CONV_TYPE, API_RCCL)),
("ncclFloat16", ("rcclHalf", CONV_TYPE, API_RCCL)),
("ncclFloat", ("rcclFloat", CONV_TYPE, API_RCCL)),
("ncclFloat32", ("rcclFloat", CONV_TYPE, API_RCCL)),
("ncclDouble", ("rcclDouble", CONV_TYPE, API_RCCL)),
("ncclFloat64", ("rcclDouble", CONV_TYPE, API_RCCL)),
("ncclSum", ("rcclSum", CONV_TYPE, API_RCCL)),
("ncclProd", ("rcclProd", CONV_TYPE, API_RCCL)),
("ncclMin", ("rcclMin", CONV_TYPE, API_RCCL)),
("ncclMax", ("rcclMax", CONV_TYPE, API_RCCL)),
("ncclUniqueId", ("rcclUniqueId", CONV_TYPE, API_RCCL)),
("ncclGetUniqueId", ("rcclGetUniqueId", CONV_TYPE, API_RCCL)),
("ncclGroupStart", ("rcclGroupStart", CONV_TYPE, API_RCCL)),
("ncclGroupEnd", ("rcclGroupEnd", CONV_TYPE, API_RCCL)),
("NCCL_UNIQUE_ID_BYTES", ("RCCL_UNIQUE_ID_BYTES", CONV_TYPE, API_RCCL)),
("USE_NCCL", ("USE_RCCL", CONV_DEF, API_RCCL)),
("USE_C10D_NCCL", ("USE_C10D_RCCL", CONV_DEF, API_RCCL)),
])

CUDA_SPARSE_MAP = collections.OrderedDict([
Expand Down
13 changes: 13 additions & 0 deletions tools/amd_build/pyHIPIFY/hipify_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -1370,6 +1370,19 @@ def hipify(
extensions_to_hip_suffix=extensions_to_hip_suffix),
KernelTemplateParams)

# copy rccl compat file to c10d
rccl_compat_file = "rccl1_compat.h"
rccl_compat_src_filepath = os.path.join(os.path.dirname(__file__), rccl_compat_file)
if not os.path.exists(rccl_compat_src_filepath):
print("ERROR: File does not exist: " + rccl_compat_src_filepath)
sys.exit(1)
rccl_compat_dst_dir = os.path.join(output_directory, "torch", "lib", "c10d")
if not os.path.exists(rccl_compat_dst_dir):
print("ERROR: Directory does not exist: " + rccl_compat_dst_dir)
sys.exit(1)
rccl_compat_dst_filepath = os.path.join(rccl_compat_dst_dir, rccl_compat_file)
shutil.copy(rccl_compat_src_filepath, rccl_compat_dst_filepath)


if __name__ == '__main__':
main()
53 changes: 53 additions & 0 deletions tools/amd_build/pyHIPIFY/rccl1_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*************************************************************************
 * Copyright (c) 2017, AMD. All rights reserved.
 *
 ************************************************************************/

// Compatibility shim that lets code written against the NCCL v2-style API
// build against RCCL 1.x: it back-fills the version macros, provides no-op
// group begin/end calls, and wraps rcclAllGather to adapt the v2 argument
// order to the RCCL 1.x signature. All of it is compiled only when
// RCCL_MAJOR is absent, i.e. when building against RCCL 1.x.
#ifndef RCCL1_COMPAT_H
#define RCCL1_COMPAT_H

#include <limits.h>  // INT_MAX, used by CHECKCOUNT below

#include <rccl.h>

#ifndef RCCL_MAJOR // RCCL 1.x (2.x defines RCCL_MAJOR itself)
#define RCCL_MAJOR 1
#define RCCL_MINOR 0

#define rcclNumOps rccl_NUM_OPS
#define rcclNumTypes rccl_NUM_TYPES

// RCCL 1.x has no grouped-call support; treat group begin/end as no-ops.
static rcclResult_t rcclGroupStart() { return rcclSuccess; }
static rcclResult_t rcclGroupEnd() { return rcclSuccess; }

// RCCL 1.x takes an int element count; reject counts that cannot be
// represented. The do/while wrapper and parenthesized argument keep the
// macro safe in all statement positions (e.g. an unbraced if/else).
#define CHECKCOUNT(count) \
  do { \
    if ((count) > INT_MAX) return rcclInvalidArgument; \
  } while (0)

// NOTE(review): the wrappers below are retained, disabled, from the
// original compat header — delete them if they are never re-enabled.
/*
static rcclResult_t rcclReduce(const void* sendbuff, void* recvbuff, size_t count, rcclDataType_t datatype,
  rcclRedOp_t op, int root, rcclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return rcclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
}
static rcclResult_t rcclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
  rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return rcclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
}
static rcclResult_t rcclBcast(void* buff, size_t count, rcclDataType_t datatype, int root,
  rcclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return rcclBcast(buff, (int)count, datatype, root, comm, stream);
}
static rcclResult_t rcclReduceScatter(const void* sendbuff, void* recvbuff,
  size_t recvcount, rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm,
  hipStream_t stream) {
  CHECKCOUNT(recvcount);
  return rcclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
}
*/

// Adapts the v2-style signature (recvbuff before count/datatype) to the
// RCCL 1.x library call, which takes (sendbuff, count, datatype, recvbuff,
// comm, stream). Overload resolution (C++) picks the int-count library
// function for the inner call, so this is not self-recursive.
static rcclResult_t rcclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    rcclDataType_t datatype, rcclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(sendcount);
  return rcclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
}
#endif // RCCL_MAJOR

#endif // RCCL1_COMPAT_H
10 changes: 10 additions & 0 deletions tools/build_pytorch_libs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ else
fi
CPP_FLAGS=" -std=c++11 "
THD_FLAGS=""
if [[ $USE_CUDA -eq 1 ]]; then
  # Let the caller override NCCL_ROOT_DIR; default to the local install dir.
  NCCL_ROOT_DIR=${NCCL_ROOT_DIR:-$INSTALL_DIR}
  # Append with a leading space so the flag stays separated from anything
  # already accumulated in GLOO_FLAGS (matches the USE_IBVERBS append below;
  # without it the flag can fuse with a preceding one into a single word).
  GLOO_FLAGS+=" -DNCCL_ROOT_DIR=$NCCL_ROOT_DIR"
fi
# Gloo infiniband support
if [[ $USE_GLOO_IBVERBS -eq 1 ]]; then
GLOO_FLAGS+=" -DUSE_IBVERBS=1"
Expand Down Expand Up @@ -248,6 +252,12 @@ function build_caffe2() {
# We need the vanilla cmake build to work.
fi

# This is needed by the aten tests built with caffe2
# If a locally built libnccl exists in the install tree but has not yet been
# mirrored into the build's lib/ directory, copy every libnccl.so* file there
# via $SYNC_COMMAND (presumably cp/rsync — defined earlier in this script).
# NOTE(review): the "lib/libnccl.so.2" probe assumes NCCL v2 sonames — confirm.
if [[ $USE_CUDA -eq 1 ]] && [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.2" ]; then
# $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl
find "${INSTALL_DIR}/lib" -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "lib/"
fi

${CMAKE_INSTALL} -j"$MAX_JOBS"
if ls build.ninja 2>&1 >/dev/null; then
# in cmake, .cu compilation involves generating certain intermediates
Expand Down
Loading