Merged
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
f7888497a1eb9e98d4c07537f0d0bcfe180d1363
711e2a92522e0a9921ce58ae658571ca55c49b97
17 changes: 7 additions & 10 deletions .ci/docker/requirements-ci.txt
@@ -110,10 +110,8 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
numba==0.60.0 ; python_version == "3.9"
numba==0.61.2 ; python_version > "3.9"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@@ -135,6 +133,7 @@ numpy==2.0.2 ; python_version == "3.9"
numpy==2.1.2 ; python_version > "3.9"

pandas==2.2.3

#onnxruntime
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
#Pinned versions: 1.9.0
@@ -168,6 +167,7 @@ protobuf==5.29.4
#Pinned versions: 5.29.4
#test that import: test_tensorboard.py, test/onnx/*


psutil
#Description: information on running processes and system utilization
#Pinned versions:
@@ -263,11 +263,6 @@ tb-nightly==2.13.0a20230426
#Pinned versions:
#test that import:

tlparse==0.3.30
#Description: parse logs produced by torch.compile
#Pinned versions:
#test that import: dynamo/test_structured_trace.py

# needed by torchgen utils
typing-extensions>=4.10.0
#Description: type hints for python
@@ -326,7 +321,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
#Pinned versions: 1.4.1
#test that import:

lxml==5.3.0
lxml==5.3.0 ; python_version <= "3.12"
lxml==6.0.0 ; python_version == "3.13"
#Description: This is a requirement of unittest-xml-reporting

# Python-3.9 binaries
@@ -340,6 +336,7 @@ sympy==1.13.3

onnx==1.18.0
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal

#Pinned versions:
#test that import:

21 changes: 2 additions & 19 deletions aten/src/ATen/native/cuda/Blas.cpp
@@ -1205,7 +1205,6 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(

} // namespace


// Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1362,25 +1361,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
else {
TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
}
// Until more than bf16 is supported
// Until more than bf16 is supported.
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
"hipblaslt rowwise _scaled_mm only supports BFloat16 output");
}
else if (scaling_choice == ScalingType::BlockWise) {
#if ROCM_VERSION >= 70000
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");

TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");

TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
"Block-wise scaling only supports BFloat16 or Half output types");
#else
TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
#endif
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
[Collaborator comment on this hunk] @pragupta Sorry, it's not clear to me why we are removing these lines... Is the gfx950-specific code not needed?

}
#endif

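For orientation, here is a minimal, hypothetical sketch of driving the row-wise-scaled FP8 path that the retained check guards. It assumes the single-output `at::_scaled_mm` overload in recent PyTorch, uses unit scales instead of real amax-derived ones, and is not the repository's own code:

```cpp
#include <ATen/ATen.h>

// Hypothetical helper: FP8 row-wise-scaled matmul into BFloat16, the only
// output dtype the hipBLASLt row-wise path accepts per the TORCH_CHECK above.
at::Tensor rowwise_scaled_mm_bf16(const at::Tensor& a_hp,   // (M, K), e.g. float32
                                  const at::Tensor& b_hp) { // (N, K), e.g. float32
  // Quantize to Float8_e4m3fn; production code derives per-row scales from
  // amax first, and some ROCm architectures use the *_fnuz FP8 variants.
  at::Tensor a = a_hp.to(at::kFloat8_e4m3fn);
  at::Tensor b = b_hp.to(at::kFloat8_e4m3fn);
  // Row-wise ("axiswise") scaling: one FP32 scale per row of A and per
  // column of the (transposed) B. Unit scales keep the sketch simple.
  at::Tensor scale_a = at::ones({a_hp.size(0), 1}, a_hp.options().dtype(at::kFloat));
  at::Tensor scale_b = at::ones({1, b_hp.size(0)}, b_hp.options().dtype(at::kFloat));
  // _scaled_mm expects mat2 column-major, hence the transpose of row-major b.
  return at::_scaled_mm(a, b.t(), scale_a, scale_b,
                        /*bias=*/{}, /*scale_result=*/{},
                        /*out_dtype=*/at::kBFloat16);
}
```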
38 changes: 0 additions & 38 deletions aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -531,44 +531,6 @@ __global__ void elementwise_kernel(int N, func_t f) {
}
}

#ifdef USE_ROCM
template <int nt, int vt, typename func_t>
C10_LAUNCH_BOUNDS_2(nt, 4)
__global__ void elementwise_kernel_manual_unroll(int N, func_t f) {
int tid = threadIdx.x;
int nv = nt * vt;
int idx = nv * blockIdx.x + tid;
if ((idx + nt*(vt-1)) < N) {
f(idx, true);
} else {
#pragma unroll
for (int i = 0; i < vt; i++) {
if (idx < N) {
f(idx, false);
idx += nt;
}
}
}
}

template <int nt, int vt, typename func_t>
C10_LAUNCH_BOUNDS_2(nt, 4)
__global__ void elementwise_kernel_strided(int N, func_t f) {
int tid = threadIdx.x;
int idx = nt * vt * blockIdx.x + tid;
int step = nt * vt * gridDim.x;
while (idx < N) {
#pragma unroll
for (int i = 0; i < vt; i++) {
if ((idx + nt * i) < N) {
f(idx + nt * i);
}
}
idx += step;
}
}
#endif

template <int nt, int vt, typename func_t>
static void launch_legacy_kernel(int64_t N, const func_t& f) {
TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
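The two deleted kernels were ROCm-only specializations (a manual-unroll variant and a grid-strided variant) of the legacy elementwise launch path that remains in this file. As a rough sketch of that baseline pattern, assuming `f` maps a flat index to the per-element work:

```cuda
// Sketch of the legacy elementwise pattern: each block covers nt*vt elements,
// each thread handles vt of them, striding by the block's thread count nt.
template <int nt, int vt, typename func_t>
__global__ void elementwise_kernel_sketch(int N, func_t f) {
  int idx = nt * vt * blockIdx.x + threadIdx.x;  // first element owned by this thread
#pragma unroll
  for (int i = 0; i < vt; ++i) {
    if (idx < N) {
      f(idx);     // apply the elementwise functor
      idx += nt;  // next element in this thread's tile
    }
  }
}
```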
17 changes: 0 additions & 17 deletions aten/src/ATen/test/cuda_vectorized_test.cu
@@ -27,23 +27,6 @@ void reset_buffers() {
}
}

#if defined(USE_ROCM) && !defined(_WIN32)
TEST(TestLoops, HasSameArgTypes) {
// This is a compile-time unit test. If this file compiles without error,
// then the test passes and during runtime, we just need to return.
using namespace at::native::modern::detail;
using func1_t = int (*)(float, float);
using func2_t = int (*)(bool, float, float);
using func3_t = int (*)(float);
using func4_t = int (*)();
static_assert(has_same_arg_types<func1_t>::value, "func1_t has the same argument types");
static_assert(!has_same_arg_types<func2_t>::value, "func2_t does not have the same argument types");
static_assert(has_same_arg_types<func3_t>::value, "func3_t has the same argument types");
static_assert(has_same_arg_types<func4_t>::value, "func4_t has the same argument types");
return;
}
#endif

TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
char *ptr = reinterpret_cast<char *>(buffer1);

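The deleted test was compile-time only: the `static_assert`s fire during compilation, so a successful build is the passing result. A minimal sketch of a trait with the asserted behavior (hypothetical, not ATen's actual `has_same_arg_types`):

```cpp
#include <type_traits>

// True iff every parameter of a function-pointer type has the same type.
template <typename T>
struct has_same_arg_types;

template <typename R>
struct has_same_arg_types<R (*)()> : std::true_type {};  // zero args: trivially true

template <typename R, typename First, typename... Rest>
struct has_same_arg_types<R (*)(First, Rest...)>
    : std::bool_constant<(std::is_same_v<First, Rest> && ...)> {};

// Mirrors the removed assertions; an empty fold over Rest yields true,
// so single-argument functions qualify as well.
static_assert(has_same_arg_types<int (*)(float, float)>::value);
static_assert(!has_same_arg_types<int (*)(bool, float, float)>::value);
static_assert(has_same_arg_types<int (*)(float)>::value);
static_assert(has_same_arg_types<int (*)()>::value);
```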
12 changes: 2 additions & 10 deletions cmake/Dependencies.cmake
@@ -1159,14 +1159,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
endif()
if(USE_ROCM)
add_compile_options(-D__HIP_PLATFORM_AMD__=1)
set(TP_USE_ROCM ON CACHE BOOL "" FORCE)
set(TP_ENABLE_HIP_IPC OFF CACHE BOOL "" FORCE)
set(TP_ENABLE_HIP_XTH OFF CACHE BOOL "" FORCE)
set(TP_ENABLE_HIP_GDR OFF CACHE BOOL "" FORCE)
set(TP_ENABLE_IBV OFF CACHE BOOL "" FORCE)
endif()
set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
add_compile_options(-DTORCH_USE_LIBUV)
include_directories(BEFORE SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/tensorpipe/third_party/libuv/include)
@@ -1192,9 +1184,9 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(USE_CUDA)
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda)
elseif(USE_ROCM)
message(WARNING "TensorPipe is supported on ROCm")
message(WARNING "TensorPipe doesn't yet support ROCm")
# Not yet...
list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
# list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
endif()
endif()
endif()
2 changes: 1 addition & 1 deletion cmake/External/aotriton.cmake
@@ -81,7 +81,7 @@ if(NOT __AOTRITON_INCLUDED)
list(GET __AOTRITON_MANYLINUX_LIST ${__AOTRITON_ROCM_INDEX} __AOTRITON_MANYLINUX)
set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
string(CONCAT __AOTRITON_FILE "aotriton-"
"${__AOTRITON_VER_WITH_COMMIT}-${__AOTRITON_MANYLINUX}"
"${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
"_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
"-shared.tar.${__AOTRITON_Z}")
string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/" # @lint-ignore
22 changes: 10 additions & 12 deletions related_commits
@@ -1,12 +1,10 @@
ubuntu|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
centos|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
ubuntu|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
centos|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
ubuntu|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
centos|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
ubuntu|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
centos|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
ubuntu|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
centos|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
ubuntu|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
centos|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
ubuntu|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
centos|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
ubuntu|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
centos|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
ubuntu|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
centos|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
ubuntu|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao
centos|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao
6 changes: 1 addition & 5 deletions torch/csrc/distributed/rpc/tensorpipe_cuda.cpp
@@ -1,7 +1,7 @@
#include <torch/csrc/distributed/rpc/tensorpipe_agent.h>
#include <torch/csrc/distributed/rpc/tensorpipe_utils.h>

#if defined(USE_TENSORPIPE)
#if defined(USE_TENSORPIPE) && !defined(USE_ROCM)

#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
@@ -50,8 +50,6 @@ C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_gdr, makeCudaGdrChannel)

#endif

#if TENSORPIPE_HAS_CUDA_XTH_CHANNEL

std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
auto context = tensorpipe::channel::cuda_xth::create();
return std::make_unique<ChannelRegistration>(
@@ -61,8 +59,6 @@ std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
// The cuda_xth channel supports same-process GPU-to-GPU comm
C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_xth, makeCudaXthChannel)

#endif

std::unique_ptr<ChannelRegistration> makeCudaBasicChannel() {
auto context = tensorpipe::channel::cuda_basic::create(
tensorpipe::channel::basic::create());
12 changes: 2 additions & 10 deletions torch/headeronly/macros/Export.h
@@ -100,10 +100,10 @@
#define TORCH_API C10_IMPORT
#endif

// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API
// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_API
// belonging to the same library instead of just one TORCH_CUDA_API. Well, it
// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API
// and TORCH_CUDA_CU_API are artifacts of when we needed a split build to
// and TORCH_CUDA_API are artifacts of when we needed a split build to
// avoid relocation marker linking errors. The context is as follows:
//
// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
@@ -130,14 +130,6 @@
#define TORCH_CUDA_CU_API C10_IMPORT
#endif

#if defined(TORCH_HIP_BUILD_MAIN_LIB)
#define TORCH_HIP_CPP_API C10_EXPORT
#define TORCH_HIP_API C10_EXPORT
#else
#define TORCH_HIP_CPP_API C10_IMPORT
#define TORCH_HIP_API C10_IMPORT
#endif

#if defined(TORCH_XPU_BUILD_MAIN_LIB)
#define TORCH_XPU_API C10_EXPORT
#else
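The deleted TORCH_HIP_* block followed the same export/import pattern as the surrounding macros. For reference, a generic sketch of that pattern with a hypothetical MYLIB prefix (C10_EXPORT/C10_IMPORT resolve roughly as below: __declspec on Windows, visibility attributes on GCC/Clang):

```cpp
// Generic sketch of the build-main-lib export/import pattern (hypothetical names).
#if defined(_WIN32)
#define MYLIB_EXPORT __declspec(dllexport)
#define MYLIB_IMPORT __declspec(dllimport)
#else  // GCC/Clang: exported symbols get default visibility
#define MYLIB_EXPORT __attribute__((__visibility__("default")))
#define MYLIB_IMPORT __attribute__((__visibility__("default")))
#endif

// Translation units that *build* the library define MYLIB_BUILD_MAIN_LIB and
// export symbols; everything that merely includes the headers imports them.
#if defined(MYLIB_BUILD_MAIN_LIB)
#define MYLIB_API MYLIB_EXPORT
#else
#define MYLIB_API MYLIB_IMPORT
#endif

MYLIB_API void do_work();  // callable across the shared-library boundary
```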
1 change: 0 additions & 1 deletion torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -37,7 +37,6 @@
captured_output,
skip_if_lt_x_gpu,
tp_transports,
skip_if_rocm,
)
from torch.testing._internal.common_utils import (
get_cycles_per_ms,