From e0f617f4a5731acd108a12b519b81963e322ea0f Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Mon, 11 Feb 2019 17:18:13 -0600
Subject: [PATCH 1/5] Reworked changes for RCCL integration on top of latest
 PyTorch master

---
 CMakeLists.txt                            |  8 +-
 cmake/Dependencies.cmake                  |  6 ++
 cmake/Summary.cmake                       |  1 +
 cmake/public/LoadHIP.cmake                | 10 +++
 setup.py                                  | 11 +++
 tools/amd_build/build_amd.py              |  4 +-
 tools/amd_build/disabled_features.json    |  6 ++
 tools/amd_build/pyHIPIFY/constants.py     |  5 +-
 .../pyHIPIFY/cuda_to_hip_mappings.py      | 39 ++++++++++
 tools/amd_build/pyHIPIFY/hipify_python.py | 19 ++++-
 tools/amd_build/pyHIPIFY/rccl1_compat.h   | 53 +++++++++++++
 tools/build_pytorch_libs.py               |  6 ++
 tools/setup_helpers/dist_check.py         |  2 +-
 tools/setup_helpers/rccl.py               | 75 +++++++++++++++++++
 torch/CMakeLists.txt                      | 10 ++-
 torch/lib/c10d/CMakeLists.txt             | 39 +++++++++-
 torch/lib/c10d/ProcessGroupNCCL.cpp       | 16 ++++
 17 files changed, 300 insertions(+), 10 deletions(-)
 create mode 100644 tools/amd_build/pyHIPIFY/rccl1_compat.h
 create mode 100644 tools/setup_helpers/rccl.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ef8bc19e272..25dc3b3b06f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,7 +118,12 @@ option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
 option(USE_LMDB "Use LMDB" OFF)
 option(USE_METAL "Use Metal for iOS build" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
-option(USE_NCCL "Use NCCL" ON)
+cmake_dependent_option(
+    USE_NCCL "Use NCCL" ON
+    "USE_CUDA" OFF)
+cmake_dependent_option(
+    USE_RCCL "Use RCCL" ON
+    "USE_ROCM" OFF)
 option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
 option(USE_NNAPI "Use NNAPI" OFF)
 option(USE_NNPACK "Use NNPACK" ON)
@@ -170,6 +175,7 @@ if (BUILD_ATEN_ONLY)
   set(USE_GFLAGS OFF)
   set(USE_GLOG OFF)
   set(USE_NCCL OFF)
+  set(USE_RCCL OFF)
   set(USE_NNPACK OFF)
   set(USE_NUMPY OFF)
   set(USE_OPENCV OFF)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 7ab15fb5cf0f..532a714d9d54 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -810,12 +810,18 @@ if(USE_ROCM)
   set(Caffe2_HIP_INCLUDE
     ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $<INSTALL_INTERFACE:include> ${Caffe2_HIP_INCLUDE})
+  if(USE_RCCL)
+    list(APPEND Caffe2_HIP_INCLUDE ${rccl_INCLUDE_DIRS})
+  endif(USE_RCCL)

   # This is needed for library added by hip_add_library (same for hip_add_executable)
   hip_include_directories(${Caffe2_HIP_INCLUDE})

   set(Caffe2_HIP_DEPENDENCY_LIBS
     ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${hipsparse_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
+  if(USE_RCCL)
+    list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${PYTORCH_RCCL_LIBRARIES})
+  endif(USE_RCCL)

   # Note [rocblas & rocfft cmake bug]
   # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 0793526d64b3..283f994f513d 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -103,6 +103,7 @@ function (caffe2_print_configuration_summary)
   if(${USE_NCCL})
     message(STATUS "    USE_SYSTEM_NCCL   : ${USE_SYSTEM_NCCL}")
   endif()
+  message(STATUS "  USE_RCCL            : ${USE_RCCL}")
   message(STATUS "  USE_NNPACK          : ${USE_NNPACK}")
   message(STATUS "  USE_NUMPY           : ${USE_NUMPY}")
   message(STATUS "  USE_OBSERVERS       : ${USE_OBSERVERS}")
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 6e9fb6e81917..548241f1af70 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -100,6 +100,13 @@ ELSE()
   SET(HCC_AMDGPU_TARGET $ENV{HCC_AMDGPU_TARGET})
 ENDIF()

+# RCCL PATH
+IF(NOT DEFINED ENV{RCCL_PATH})
+  SET(RCCL_PATH ${ROCM_PATH}/rccl)
+ELSE()
+  SET(RCCL_PATH $ENV{RCCL_PATH})
+ENDIF()
+
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})

@@ -140,6 +147,7 @@ IF(HIP_FOUND)
   set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft)
   set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
   set(rocsparse_DIR ${ROCSPARSE_PATH}/lib/cmake/rocsparse)
+  set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl)

   find_package_and_print_version(rocrand REQUIRED)
   find_package_and_print_version(hiprand REQUIRED)
@@ -149,6 +157,7 @@ IF(HIP_FOUND)
   find_package_and_print_version(rocfft REQUIRED)
   #find_package_and_print_version(hipsparse REQUIRED)
   find_package_and_print_version(rocsparse REQUIRED)
+  find_package_and_print_version(rccl)

   # TODO: hip_hcc has an interface include flag "-hc" which is only
   # recognizable by hcc, but not gcc and clang. Right now in our
@@ -158,6 +167,7 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  FIND_LIBRARY(PYTORCH_RCCL_LIBRARIES ${rccl_LIBRARIES} HINTS ${RCCL_PATH}/lib)
   FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
   FIND_LIBRARY(rocsparse_LIBRARIES rocsparse HINTS ${ROCSPARSE_PATH}/lib)
   FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
diff --git a/setup.py b/setup.py
index c02a3811a456..4a3ed68d619d 100644
--- a/setup.py
+++ b/setup.py
@@ -134,6 +134,11 @@
 #   NCCL_INCLUDE_DIR
 #     specify where nccl is installed
 #
+#   RCCL_ROOT_DIR
+#   RCCL_LIB_DIR
+#   RCCL_INCLUDE_DIR
+#     specify where rccl is installed
+#
 #   NVTOOLSEXT_PATH (Windows only)
 #     specify where nvtoolsext is installed
 #
@@ -169,6 +174,7 @@
 from tools.setup_helpers.rocm import USE_ROCM
 from tools.setup_helpers.miopen import USE_MIOPEN, MIOPEN_LIBRARY, MIOPEN_INCLUDE_DIR
 from tools.setup_helpers.nccl import USE_NCCL, USE_SYSTEM_NCCL, NCCL_SYSTEM_LIB, NCCL_INCLUDE_DIR
+from tools.setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB
 from tools.setup_helpers.dist_check import USE_DISTRIBUTED

 ################################################################################
 # Parameters parsed from environment
@@ -343,6 +349,11 @@ def run(self):
                 report('-- Building NCCL library')
             else:
                 report('-- Not using NCCL')
+            if USE_RCCL:
+                print('-- Detected RCCL library at ' +
+                      RCCL_SYSTEM_LIB + ', ' + RCCL_INCLUDE_DIR)
+            else:
+                print('-- Not using RCCL')
             if USE_DISTRIBUTED:
                 report('-- Building with THD distributed package ')
                 if IS_LINUX:
diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py
index 5e152a0be21e..ab4401f538a8 100644
--- a/tools/amd_build/build_amd.py
+++ b/tools/amd_build/build_amd.py
@@ -80,12 +80,14 @@
     "csrc/autograd/profiler.h",
     "csrc/autograd/profiler.cpp",
     "csrc/cuda/cuda_check.h",
+    "torch/lib/c10d/ProcessGroupGloo.hpp",
+    "torch/lib/c10d/ProcessGroupGloo.cpp",
     # These files are compatible with both cuda and hip
     "csrc/autograd/engine.cpp"
 ]
 for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
     for filename in files:
-        if filename.endswith(".cpp") or filename.endswith(".h"):
+        if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"):
             source = os.path.join(root, filename)
             # Disabled files
             if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
diff --git a/tools/amd_build/disabled_features.json b/tools/amd_build/disabled_features.json
index c7228b7ad144..726e93fb8400 100644
--- a/tools/amd_build/disabled_features.json
+++ b/tools/amd_build/disabled_features.json
@@ -1,5 +1,11 @@
 {
     "disable_unsupported_hip_calls": [
+        {
+            "path": "torch/lib/c10d/NCCLUtils.hpp",
+            "s_constants": {
+                "<nccl.h>": "\"c10d/rccl1_compat.h\""
+            }
+        }
     ],
     "disabled_modules": [
     ],
diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py
index 6198031a9c8f..829b2ccc78b5 100644
--- a/tools/amd_build/pyHIPIFY/constants.py
+++ b/tools/amd_build/pyHIPIFY/constants.py
@@ -49,8 +49,9 @@
 API_BLAS = 39
 API_SPARSE = 40
 API_RAND = 41
-API_LAST = 42
-API_FFT = 43
+API_FFT = 42
+API_RCCL = 43
+API_LAST = 44

 HIP_UNSUPPORTED = 43
 API_PYTORCH = 1337
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 926a7976f48a..12eca85d8122 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -238,6 +238,10 @@
     ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)),
     ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)),
     ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)),
+    ("ncclResult_t", ("rcclResult_t", CONV_TYPE, API_RCCL)),
+    ("ncclComm_t", ("rcclComm_t", CONV_TYPE, API_RCCL)),
+    ("ncclDataType_t", ("rcclDataType_t", CONV_TYPE, API_RCCL)),
+    ("ncclRedOp_t", ("rcclRedOp_t", CONV_TYPE, API_RCCL)),
 ])

 CUDA_INCLUDE_MAP = collections.OrderedDict([
@@ -276,6 +280,7 @@
     ("cusparse.h", ("hipsparse.h", CONV_INCLUDE, API_RAND)),
     ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
     ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
+    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate
 ])

 CUDA_IDENTIFIER_MAP = collections.OrderedDict([
@@ -2172,6 +2177,40 @@
     ("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)),
     ("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)),
     ("cufftGetProperty", ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED)),
+    ("ncclGetErrorString", ("rcclGetErrorString", CONV_ERROR, API_RCCL)),
+    ("ncclCommInitAll", ("rcclCommInitAll", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclCommInitRank", ("rcclCommInitRank", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclCommDestroy", ("rcclCommDestroy", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclBcast", ("rcclBcast", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclReduce", ("rcclReduce", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclAllReduce", ("rcclAllReduce", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclAllGather", ("rcclAllGather", CONV_SPECIAL_FUNC, API_RCCL)),
+    ("ncclReduceScatter", ("rcclReduceScatter", CONV_SPECIAL_FUNC, API_RCCL, HIP_UNSUPPORTED)),
+    ("ncclSuccess", ("rcclSuccess", CONV_TYPE, API_RCCL)),
+    ("ncclChar", ("rcclChar", CONV_TYPE, API_RCCL)),
+    ("ncclInt8", ("rcclChar", CONV_TYPE, API_RCCL)),
+    ("ncclUint8", ("rcclChar", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int8 type
+    ("ncclInt", ("rcclInt", CONV_TYPE, API_RCCL)),
+    ("ncclInt32", ("rcclInt", CONV_TYPE, API_RCCL)),
+    ("ncclUint32", ("rcclInt", CONV_TYPE, API_RCCL)), #FIXME: This should be mapped to an unsigned int32 type
+    ("ncclInt64", ("rcclInt64", CONV_TYPE, API_RCCL)),
+    ("ncclUint64", ("rcclUint64", CONV_TYPE, API_RCCL)),
+    ("ncclHalf", ("rcclHalf", CONV_TYPE, API_RCCL)),
+    ("ncclFloat16", ("rcclHalf", CONV_TYPE, API_RCCL)),
+    ("ncclFloat", ("rcclFloat", CONV_TYPE, API_RCCL)),
("rcclFloat", CONV_TYPE, API_RCCL)), + ("ncclDouble", ("rcclDouble", CONV_TYPE, API_RCCL)), + ("ncclFloat64", ("rcclDouble", CONV_TYPE, API_RCCL)), + ("ncclSum", ("rcclSum", CONV_TYPE, API_RCCL)), + ("ncclProd", ("rcclProd", CONV_TYPE, API_RCCL)), + ("ncclMin", ("rcclMin", CONV_TYPE, API_RCCL)), + ("ncclMax", ("rcclMax", CONV_TYPE, API_RCCL)), + ("ncclUniqueId", ("rcclUniqueId", CONV_TYPE, API_RCCL)), + ("ncclGetUniqueId", ("rcclGetUniqueId", CONV_TYPE, API_RCCL)), + ("ncclGroupStart", ("rcclGroupStart", CONV_TYPE, API_RCCL)), + ("ncclGroupEnd", ("rcclGroupEnd", CONV_TYPE, API_RCCL)), + ("NCCL_UNIQUE_ID_BYTES", ("RCCL_UNIQUE_ID_BYTES", CONV_TYPE, API_RCCL)), + ("USE_NCCL", ("USE_RCCL", CONV_DEF, API_RCCL)), ]) CUDA_SPARSE_MAP = collections.OrderedDict([ diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/tools/amd_build/pyHIPIFY/hipify_python.py index db14d7cee824..b4a852032d3f 100755 --- a/tools/amd_build/pyHIPIFY/hipify_python.py +++ b/tools/amd_build/pyHIPIFY/hipify_python.py @@ -871,7 +871,8 @@ def pattern(self): CAFFE2_TRIE.add(src) CAFFE2_MAP[src] = dst RE_CAFFE2_PREPROCESSOR = re.compile(CAFFE2_TRIE.pattern()) -RE_PYTORCH_PREPROCESSOR = re.compile(r'\b{0}\b'.format(PYTORCH_TRIE.pattern())) +# Use \W instead of \b so that even if the pattern contains non-word characters, the replacement still succeeds +RE_PYTORCH_PREPROCESSOR = re.compile(r'(\W)({0})(\W)'.format(PYTORCH_TRIE.pattern())) RE_QUOTE_HEADER = re.compile(r'#include "([^"]+)"') RE_ANGLE_HEADER = re.compile(r'#include <([^>]+)>') @@ -892,7 +893,7 @@ def preprocessor(output_directory, filepath, stats): # unsupported_calls statistics reporting is broken atm if is_pytorch_file(filepath): def pt_repl(m): - return PYTORCH_MAP[m.group(0)] + return m.group(1) + PYTORCH_MAP[m.group(2)] + m.group(3) output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) else: def c2_repl(m): @@ -1423,3 +1424,17 @@ def hipify( output_directory, get_hip_file_path(filepath)), KernelTemplateParams) + + # copy rccl compat file to c10d + rccl_compat_file = "rccl1_compat.h" + rccl_compat_src_filepath = os.path.join(os.path.dirname(__file__), rccl_compat_file) + if not os.path.exists(rccl_compat_src_filepath): + print("ERROR: File does not exist: " + rccl_compat_src_filepath) + sys.exit(1) + rccl_compat_dst_dir = os.path.join(output_directory, "torch", "lib", "c10d") + if not os.path.exists(rccl_compat_dst_dir): + print("ERROR: Directory does not exist: " + rccl_compat_dst_dir) + sys.exit(1) + rccl_compat_dst_filepath = os.path.join(rccl_compat_dst_dir, rccl_compat_file) + shutil.copy(rccl_compat_src_filepath, rccl_compat_dst_filepath) + diff --git a/tools/amd_build/pyHIPIFY/rccl1_compat.h b/tools/amd_build/pyHIPIFY/rccl1_compat.h new file mode 100644 index 000000000000..b3da757c6fd8 --- /dev/null +++ b/tools/amd_build/pyHIPIFY/rccl1_compat.h @@ -0,0 +1,53 @@ +/************************************************************************* +* Copyright (c) 2017, AMD. All rights reserved. 
+/*************************************************************************
+* Copyright (c) 2017, AMD. All rights reserved.
+*
+************************************************************************/
+
+#ifndef RCCL1_COMPAT_H
+#define RCCL1_COMPAT_H
+
+#include <rccl.h>
+
+#ifndef RCCL_MAJOR // RCCL 1.x
+#define RCCL_MAJOR 1
+#define RCCL_MINOR 0
+
+#define rcclNumOps rccl_NUM_OPS
+#define rcclNumTypes rccl_NUM_TYPES
+
+static rcclResult_t rcclGroupStart() { return rcclSuccess; }
+static rcclResult_t rcclGroupEnd() { return rcclSuccess; }
+
+#define CHECKCOUNT(count) if (count > INT_MAX) return rcclInvalidArgument;
+
+/*
+static rcclResult_t rcclReduce(const void* sendbuff, void* recvbuff, size_t count, rcclDataType_t datatype,
+    rcclRedOp_t op, int root, rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(count);
+  return rcclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
+}
+static rcclResult_t rcclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(count);
+  return rcclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
+}
+static rcclResult_t rcclBcast(void* buff, size_t count, rcclDataType_t datatype, int root,
+    rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(count);
+  return rcclBcast(buff, (int)count, datatype, root, comm, stream);
+}
+static rcclResult_t rcclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, rcclDataType_t datatype, rcclRedOp_t op, rcclComm_t comm,
+    hipStream_t stream) {
+  CHECKCOUNT(recvcount);
+  return rcclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
+}
+*/
+static rcclResult_t rcclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    rcclDataType_t datatype, rcclComm_t comm, hipStream_t stream) {
+  CHECKCOUNT(sendcount);
+  return rcclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
+}
+#endif
+
+#endif
diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py
index fea64e79aa32..09048ab7cf48 100644
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@@ -13,6 +13,7 @@
 from .setup_helpers.cuda import USE_CUDA, CUDA_HOME
 from .setup_helpers.dist_check import USE_DISTRIBUTED, USE_GLOO_IBVERBS
 from .setup_helpers.nccl import USE_SYSTEM_NCCL, NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
+from .setup_helpers.rccl import USE_RCCL, RCCL_LIB_DIR, RCCL_INCLUDE_DIR, RCCL_ROOT_DIR, RCCL_SYSTEM_LIB
 from .setup_helpers.rocm import ROCM_HOME, ROCM_VERSION, USE_ROCM
 from .setup_helpers.nnpack import USE_NNPACK
 from .setup_helpers.qnnpack import USE_QNNPACK
@@ -171,6 +172,11 @@ def run_cmake(version,
                   NCCL_INCLUDE_DIR=NCCL_INCLUDE_DIR,
                   NCCL_ROOT_DIR=NCCL_ROOT_DIR,
                   NCCL_SYSTEM_LIB=NCCL_SYSTEM_LIB,
+                  USE_RCCL=USE_RCCL,
+                  RCCL_LIB_DIR=RCCL_LIB_DIR,
+                  RCCL_INCLUDE_DIR=RCCL_INCLUDE_DIR,
+                  RCCL_ROOT_DIR=RCCL_ROOT_DIR,
+                  RCCL_SYSTEM_LIB=RCCL_SYSTEM_LIB,
                   CAFFE2_STATIC_LINK_CUDA=check_env_flag('USE_CUDA_STATIC_LINK'),
                   USE_ROCM=USE_ROCM,
                   USE_NNPACK=USE_NNPACK,
diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 8859fe122775..823807234764 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -6,7 +6,7 @@
 from .cuda import USE_CUDA

 # On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS# and not check_env_flag("USE_ROCM")

 USE_GLOO_IBVERBS = False

 IB_DEVINFO_CMD = "ibv_devinfo"
diff --git a/tools/setup_helpers/rccl.py b/tools/setup_helpers/rccl.py
new file mode 100644
index 000000000000..560a84ec2181
--- /dev/null
+++ b/tools/setup_helpers/rccl.py
@@ -0,0 +1,75 @@
+import os
+import glob
+import warnings
+from itertools import chain
+
+from .env import IS_WINDOWS, IS_DARWIN, IS_CONDA, CONDA_DIR, check_negative_env_flag, \
+    gather_paths
+
+from .rocm import USE_ROCM, ROCM_HOME
+
+USE_RCCL = False
+RCCL_LIB_DIR = None
+RCCL_SYSTEM_LIB = None
+RCCL_INCLUDE_DIR = None
+RCCL_ROOT_DIR = None
+LIBRCCL_PREFIX = "librccl"
+if USE_ROCM and not IS_DARWIN and not IS_WINDOWS and not check_negative_env_flag('USE_RCCL'):
+    ENV_ROOT = os.getenv('RCCL_ROOT_DIR', None)
+    LIB_DIR = os.getenv('RCCL_LIB_DIR', None)
+    INCLUDE_DIR = os.getenv('RCCL_INCLUDE_DIR', None)
+
+    lib_paths = list(filter(bool, [
+        LIB_DIR,
+        ENV_ROOT,
+        os.path.join(ENV_ROOT, 'lib') if ENV_ROOT is not None else None,
+        os.path.join(ENV_ROOT, 'lib', 'x86_64-linux-gnu') if ENV_ROOT is not None else None,
+        os.path.join(ENV_ROOT, 'lib64') if ENV_ROOT is not None else None,
+        os.path.join(ROCM_HOME, 'lib'),
+        os.path.join(ROCM_HOME, 'lib64'),
+        '/usr/local/lib',
+        '/usr/lib/x86_64-linux-gnu/',
+        '/usr/lib/powerpc64le-linux-gnu/',
+        '/usr/lib/aarch64-linux-gnu/',
+        '/usr/lib',
+    ] + gather_paths([
+        'LIBRARY_PATH',
+    ]) + gather_paths([
+        'LD_LIBRARY_PATH',
+    ])))
+
+    include_paths = list(filter(bool, [
+        INCLUDE_DIR,
+        ENV_ROOT,
+        os.path.join(ENV_ROOT, 'include') if ENV_ROOT is not None else None,
+        os.path.join(ROCM_HOME, 'include'),
+        '/usr/local/include',
+        '/usr/include',
+    ]))
+
+    if IS_CONDA:
+        lib_paths.append(os.path.join(CONDA_DIR, 'lib'))
+    for path in lib_paths:
+        path = os.path.expanduser(path)
+        if path is None or not os.path.exists(path):
+            continue
+        if glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*')):
+            RCCL_LIB_DIR = path
+            # try to find an exact versioned .so/.dylib, rather than librccl.so
+            preferred_path = glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*[0-9]*'))
+            if len(preferred_path) == 0:
+                RCCL_SYSTEM_LIB = glob.glob(os.path.join(path, LIBRCCL_PREFIX + '*'))[0]
+            else:
+                RCCL_SYSTEM_LIB = os.path.realpath(preferred_path[0])
+            break
+    for path in include_paths:
+        path = os.path.expanduser(path)
+        if path is None or not os.path.exists(path):
+            continue
+        if glob.glob(os.path.join(path, 'rccl.h')):
+            RCCL_INCLUDE_DIR = path
+            break
+
+    if RCCL_LIB_DIR is not None and RCCL_INCLUDE_DIR is not None:
+        USE_RCCL = True
+        RCCL_ROOT_DIR = os.path.commonprefix((RCCL_LIB_DIR, RCCL_INCLUDE_DIR))
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 7be2a601f69c..acfc5c792e31 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -678,7 +678,7 @@ if (BUILD_PYTHON)
     list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp)
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
     list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-    if (USE_CUDA)
+    if (USE_CUDA OR USE_ROCM)
       list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
     endif()
   endif()
@@ -694,6 +694,14 @@ if (BUILD_PYTHON)
     endif()
   endif()

+  if (USE_RCCL)
+    list(APPEND TORCH_PYTHON_SRCS
+      ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp
+      ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp)
+    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RCCL)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+  endif()
+
 add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi")
 # For Declarations.yaml dependency
 add_dependencies(torch_python_stubs ATEN_CPU_FILES_GEN_TARGET)
diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt
index 81dbbf3b93a5..b585779193dd 100644
--- a/torch/lib/c10d/CMakeLists.txt
+++ b/torch/lib/c10d/CMakeLists.txt
@@ -21,6 +21,21 @@ else()
   message(STATUS "Building C10D without CUDA support")
 endif()

+if(USE_ROCM)
+  find_package(HIP)
+  if(HIP_FOUND)
+    set(C10D_USE_ROCM true)
+    message(STATUS "Building C10D with ROCM support")
+    add_definitions(-DUSE_ROCM=1 -D__HIP_PLATFORM_HCC__=1)
+  else()
+    set(C10D_USE_ROCM false)
+    message(STATUS "HIP not found, building C10D without ROCM support")
+  endif()
+else()
+  set(C10D_USE_ROCM false)
+  message(STATUS "Building C10D without ROCM support")
+endif()
+
 find_package(MPI)
 if(MPI_FOUND)
   message(STATUS "MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}")
@@ -38,6 +53,10 @@ if(USE_NCCL)
   option(USE_C10D_NCCL "USE C10D NCCL" ON)
 endif()

+if(USE_RCCL)
+  option(USE_C10D_RCCL "USE C10D RCCL" ON)
+endif()
+
 if(MPI_FOUND)
   option(USE_C10D_MPI "USE C10D MPI" ON)
 endif()
@@ -56,6 +75,10 @@ if(C10D_USE_CUDA)
   set(C10D_LIBS
     caffe2_gpu
     )
+elseif(C10D_USE_ROCM)
+  set(C10D_LIBS
+    caffe2_hip
+    )
 else()
   set(C10D_LIBS
     caffe2
@@ -63,11 +86,18 @@ else()

-if(USE_C10D_NCCL)
+if(USE_C10D_NCCL OR USE_C10D_RCCL)
   list(APPEND C10D_SRCS ProcessGroupNCCL.cpp)
+endif()
+
+if(USE_NCCL)
   list(APPEND C10D_LIBS __caffe2_nccl)
 endif()

+if(USE_RCCL)
+  list(APPEND C10D_LIBS ${PYTORCH_RCCL_LIBRARIES})
+endif()
+
 if(USE_C10D_MPI)
   list(APPEND C10D_SRCS ProcessGroupMPI.cpp)
   list(APPEND C10D_LIBS ${MPI_LIBRARIES})
@@ -123,11 +153,16 @@ copy_header(Types.hpp)
 copy_header(Utils.hpp)
 copy_header(ProcessGroupGloo.hpp)

-if(USE_C10D_NCCL)
+if(USE_C10D_NCCL OR USE_C10D_RCCL)
   copy_header(ProcessGroupNCCL.hpp)
   copy_header(NCCLUtils.hpp)
 endif()

+if(USE_RCCL)
+  copy_header(rccl1_compat.h)
+  target_include_directories(c10d PUBLIC ${RCCL_INCLUDE_DIRS})
+endif()
+
 if(USE_C10D_MPI)
   target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH})
   copy_header(ProcessGroupMPI.hpp)
diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
index 5031e6fd05f0..b07bb0207e45 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.cpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -164,6 +164,22 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   processGroupID_ = std::to_string(processGroupCounterMap_[groupKey]);
   groupPgID_ = groupName_ + "_" + processGroupID_;
   pgUniqueNCCLIDCnt_[groupPgID_] = -1;
+
+#ifdef __HIP_PLATFORM_HCC__
+  int nDevices;
+  if (hipGetDeviceCount(&nDevices) != hipSuccess) {
+    hipGetLastError();
+    nDevices = 0;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    hipSetDevice(i);
+    // Enable peer access from current device to all other devices
+    for (int j = 0; j < nDevices; ++j) {
+      if (i==j) continue;
+      hipDeviceEnablePeerAccess(j, 0);
+    }
+  }
+#endif
 }

 ProcessGroupNCCL::~ProcessGroupNCCL() {
From 252e38e47480087c696efaa3b0017e087bb37263 Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 00:04:19 +0000
Subject: [PATCH 2/5] Remove NCCL dependency for ROCm

---
 torch/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index acfc5c792e31..f4c07e7b6925 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -699,7 +699,6 @@ if (BUILD_PYTHON)
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp
       ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp)
     list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RCCL)
-    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
   endif()

 add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi")
From 74de72046a3b71d71ee88459da50f5df952e4293 Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 21:43:12 +0000
Subject: [PATCH 3/5] Skip test_nn data_parallel unit tests on ROCm since they
 currently fail with RCCL

---
 test/test_nn.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test_nn.py b/test/test_nn.py
index 5adf5b5a249d..de6a4e61d91e 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -3237,6 +3237,7 @@ def test_broadcast_no_grad(self):
         self.assertFalse(output.requires_grad)

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_replicate(self):
         module = nn.Linear(10, 5).float().cuda()
         input = Variable(torch.randn(2, 10).float().cuda())
@@ -3345,6 +3346,7 @@ def local_test(out):
             local_test(out)

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_data_parallel_small_back(self):
         l = nn.Linear(10, 5).float().cuda()
         i = Variable(torch.randn(20, 10).float().cuda())
@@ -3391,6 +3393,7 @@ def forward(self, x):
         self.assertRaises(AssertionError, lambda: dp.data_parallel(l, i, (0, 1)))

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_data_parallel(self):
         l = nn.Linear(10, 5).float().cuda()
         i = Variable(torch.randn(20, 10).float().cuda(1))
@@ -3494,6 +3497,7 @@ def forward(self, *input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module(self, dtype=torch.float):
         l = nn.Linear(10, 5).to("cuda", dtype)
         i = torch.randn(20, 10, device="cuda", dtype=dtype)
@@ -3505,6 +3509,7 @@ def test_data_parallel_module(self, dtype=torch.float):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3524,6 +3529,7 @@ def forward(self, input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only_empty_list(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3543,6 +3549,7 @@ def forward(self, input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only_empty_dict(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3562,6 +3569,7 @@ def forward(self, input):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @repeat_test_for_types(ALL_TENSORTYPES)
+    @skipIfRocm
     def test_data_parallel_module_kwargs_only_empty_tuple(self, dtype=torch.float):
         class Net(nn.Module):
             def __init__(self):
@@ -3580,6 +3588,7 @@ def forward(self, input):
         self.assertEqual(out.data, expected_out, dtype2prec[dtype])

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_data_parallel_device_args(self):
         cuda0 = torch.device('cuda:0')
         cuda1 = torch.device('cuda:1')
From 37596159411c6686a4606b350d05a5b19321a5fa Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 23:05:12 +0000
Subject: [PATCH 4/5] Update some comments based on review

---
 tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py | 2 +-
 tools/amd_build/pyHIPIFY/rccl1_compat.h          | 5 -----
 tools/setup_helpers/dist_check.py                | 3 +--
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 12eca85d8122..8c6ed65f9d59 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -280,7 +280,7 @@
     ("cusparse.h", ("hipsparse.h", CONV_INCLUDE, API_RAND)),
     ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
     ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
-    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate
+    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RCCL)), #PyTorch also has a source file named "nccl.h", so we need to use "<"">" to differentiate
 ])

 CUDA_IDENTIFIER_MAP = collections.OrderedDict([
diff --git a/tools/amd_build/pyHIPIFY/rccl1_compat.h b/tools/amd_build/pyHIPIFY/rccl1_compat.h
index b3da757c6fd8..deb5dbd2292f 100644
--- a/tools/amd_build/pyHIPIFY/rccl1_compat.h
+++ b/tools/amd_build/pyHIPIFY/rccl1_compat.h
@@ -1,8 +1,3 @@
-/*************************************************************************
-* Copyright (c) 2017, AMD. All rights reserved.
-*
-************************************************************************/
-
 #ifndef RCCL1_COMPAT_H
 #define RCCL1_COMPAT_H

diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 823807234764..70925d1d8fb8 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -5,8 +5,7 @@
 from .env import IS_CONDA, IS_LINUX, IS_WINDOWS, CONDA_DIR, check_env_flag, check_negative_env_flag, gather_paths
 from .cuda import USE_CUDA

-# On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS# and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS

 USE_GLOO_IBVERBS = False

 IB_DEVINFO_CMD = "ibv_devinfo"
From 928174fe1e72f58cd1fd46ef6fdcfce6987e03dc Mon Sep 17 00:00:00 2001
From: Jithun Nair
Date: Wed, 13 Feb 2019 23:07:50 +0000
Subject: [PATCH 5/5] Skip test_cuda coalesced tests on ROCm since they
 currently fail with RCCL

---
 test/test_cuda.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index 26fc9fe5b417..930e262e3315 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -1076,6 +1076,7 @@ def test_broadcast_coalesced(self):
         self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
+    @skipIfRocm
     def test_broadcast_coalesced_dense_only(self):
         numel = 5
         num_bytes = numel * 8
@@ -1146,6 +1147,7 @@ def test_reduce_add_coalesced(self):
         self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
+    @skipIfRocm
     def test_reduce_add_coalesced_dense_only(self):
         numel = 5
         num_bytes = numel * 8
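
Note on the hipify regex change in PATCH 1/5: \b only matches at a word/non-word
transition, so a mapping key such as "<nccl.h>" -- which begins and ends with
non-word characters -- can never match when the trie pattern is wrapped in
\b...\b. Capturing the surrounding non-word characters with (\W)(...)(\W) and
re-emitting them in the replacement fixes this. A minimal sketch of the idea in
Python; the two-entry map below is a hypothetical stand-in for the trie-built
PYTORCH_TRIE/PYTORCH_MAP in hipify_python.py:

    import re

    # Hypothetical stand-in for the trie-generated mapping.
    PYTORCH_MAP = {"<nccl.h>": "<rccl.h>", "ncclComm_t": "rcclComm_t"}
    pattern = "|".join(re.escape(k) for k in PYTORCH_MAP)

    re_word = re.compile(r"\b({0})\b".format(pattern))
    re_nonword = re.compile(r"(\W)({0})(\W)".format(pattern))

    src = '#include <nccl.h>\nstatic ncclComm_t comm;\n'

    # \b finds no boundary next to "<" or ">", so only ncclComm_t is
    # rewritten here and the include line is left untouched.
    print(re_word.sub(lambda m: PYTORCH_MAP[m.group(1)], src))

    # Re-emitting the captured non-word neighbours rewrites both tokens,
    # mirroring pt_repl in the patch.
    print(re_nonword.sub(
        lambda m: m.group(1) + PYTORCH_MAP[m.group(2)] + m.group(3), src))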