Commit 0e2b5a9

Merge pull request tensorflow#34800 from ROCmSoftwarePlatform:google_upstream_rocm_csr_sparse_matrix_support

PiperOrigin-RevId: 289617600
Change-Id: Ic1aa3714126d7b867295ae386b6be643c1dc83e4
tensorflower-gardener committed Jan 14, 2020
2 parents 353cb74 + 5d1ccc1 commit 0e2b5a9
Showing 35 changed files with 1,030 additions and 423 deletions.
8 changes: 6 additions & 2 deletions tensorflow/core/kernels/BUILD
@@ -3480,14 +3480,18 @@ tf_kernel_library(

 tf_kernel_library(
     name = "cuda_sparse",
-    srcs = ["cuda_sparse.cc"],
+    srcs = if_cuda(["cuda_sparse.cc"]) + if_rocm(["rocm_sparse.cc"]),
     hdrs = ["cuda_sparse.h"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels:cuda_solvers",
         "//tensorflow/stream_executor/cuda:cusparse_lib",
-    ] + if_cuda(["@cub_archive//:cub"]),
+    ] + if_cuda([
+        "@cub_archive//:cub",
+    ]) + if_rocm([
+        "@local_config_rocm//rocm:hipsparse",
+    ]),
 )

LINALG_DEPS = [
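The conditional srcs above mean that only one backend implementation is compiled into the cuda_sparse target per build: cuda_sparse.cc in CUDA builds, rocm_sparse.cc (new in this commit, shown below) in ROCm builds, both behind the shared cuda_sparse.h interface. A minimal compilable sketch of that one-interface/two-backends selection pattern (illustrative only, not taken from the diff; GOOGLE_CUDA and TENSORFLOW_USE_ROCM are the macros TensorFlow uses for these builds):

#include <cstdio>

// Bazel's if_cuda()/if_rocm() picks which .cc file is compiled; the
// preprocessor guards keep the other backend's code out of the build.
#if TENSORFLOW_USE_ROCM
static const char* kGpuSparseBackend = "hipSPARSE (rocm_sparse.cc)";
#elif GOOGLE_CUDA
static const char* kGpuSparseBackend = "cuSPARSE (cuda_sparse.cc)";
#else
static const char* kGpuSparseBackend = "none (CPU-only build)";
#endif

int main() {
  std::printf("GpuSparse backend: %s\n", kGpuSparseBackend);
  return 0;
}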
259 changes: 129 additions & 130 deletions tensorflow/core/kernels/cuda_sparse.cc

Large diffs are not rendered by default.

212 changes: 149 additions & 63 deletions tensorflow/core/kernels/cuda_sparse.h

Large diffs are not rendered by default.
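Since the cuda_sparse.h diff is not rendered, here is a rough sketch of the unified wrapper it declares, reconstructed from the specializations in rocm_sparse.cc below (signatures abbreviated; this is not the verbatim header, and CUDA builds map the same calls onto cuSPARSE):

// Sketch only; reconstructed from the method specializations visible in
// rocm_sparse.cc below, not copied from cuda_sparse.h.
class GpuSparse {
 public:
  explicit GpuSparse(OpKernelContext* context);

  // Creates or looks up the per-stream hipSPARSE/cuSPARSE handle.
  Status Initialize();

  // COO <-> CSR index conversions.
  Status Coo2csr(const int* cooRowInd, int nnz, int m, int* csrRowPtr) const;
  Status Csr2coo(const int* csrRowPtr, int nnz, int m, int* cooRowInd) const;

  // CSR * dense vector (Csrmv), CSR * dense matrix (Csrmm),
  // CSR * CSR (CsrgemmNnz + Csrgemm), and CSR -> CSC conversion (Csr2csc),
  // each templated on Scalar and specialized per backend.
};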

330 changes: 330 additions & 0 deletions tensorflow/core/kernels/rocm_sparse.cc
@@ -0,0 +1,330 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if TENSORFLOW_USE_ROCM

#include <complex>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cuda_sparse.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {
namespace {

// A set of initialized handles to the underlying ROCm libraries used by
// GpuSparse. We maintain one such set of handles per unique stream.
class HipSparseHandles {
public:
explicit HipSparseHandles(hipStream_t stream)
: initialized_(false), stream_(stream) {}

HipSparseHandles(HipSparseHandles&& rhs)
: initialized_(rhs.initialized_),
stream_(std::move(rhs.stream_)),
hipsparse_handle_(rhs.hipsparse_handle_) {
rhs.initialized_ = false;
}

HipSparseHandles& operator=(HipSparseHandles&& rhs) {
if (this == &rhs) return *this;
Release();
stream_ = std::move(rhs.stream_);
hipsparse_handle_ = std::move(rhs.hipsparse_handle_);
initialized_ = rhs.initialized_;
rhs.initialized_ = false;
return *this;
}

~HipSparseHandles() { Release(); }

Status Initialize() {
if (initialized_) return Status::OK();
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseCreate(&hipsparse_handle_));
TF_RETURN_IF_GPUSPARSE_ERROR(
hipsparseSetStream(hipsparse_handle_, stream_));
initialized_ = true;
return Status::OK();
}

hipsparseHandle_t& handle() {
DCHECK(initialized_);
return hipsparse_handle_;
}

const hipsparseHandle_t& handle() const {
DCHECK(initialized_);
return hipsparse_handle_;
}

private:
void Release() {
if (initialized_) {
// This should never return anything other than success
auto err = hipsparseDestroy(hipsparse_handle_);
DCHECK(err == HIPSPARSE_STATUS_SUCCESS)
<< "Failed to destroy hipSPARSE instance.";
initialized_ = false;
}
}
bool initialized_;
hipStream_t stream_;
hipsparseHandle_t hipsparse_handle_;

TF_DISALLOW_COPY_AND_ASSIGN(HipSparseHandles);
};

// TODO(ebrevdo): Replace global mutex guarding CudaSparseHandles
// lookup with one of:
// 1. Adding the handle to the CudaStream structure; do the lookup there.
// 2. Add a thread-local cusparse, set it to the current stream
// upon each call.
// #1 seems like the cleanest option but will need to wait until this
// is moved into TF core.
static mutex handle_map_mutex(LINKER_INITIALIZED);

using HandleMap = std::unordered_map<hipStream_t, HipSparseHandles>;

// Returns a singleton map used for storing initialized handles for each unique
// cuda stream.
HandleMap* GetHandleMapSingleton() {
static HandleMap* cm = new HandleMap;
return cm;
}

} // namespace

GpuSparse::GpuSparse(OpKernelContext* context)
: initialized_(false), context_(context) {
auto hip_stream_ptr =
reinterpret_cast<const hipStream_t*>(context->op_device_context()
->stream()
->implementation()
->GpuStreamMemberHack());
DCHECK(hip_stream_ptr);
gpu_stream_ = *hip_stream_ptr;
}

Status GpuSparse::Initialize() {
HandleMap* handle_map = GetHandleMapSingleton();
DCHECK(handle_map);
mutex_lock lock(handle_map_mutex);
auto it = handle_map->find(gpu_stream_);
if (it == handle_map->end()) {
LOG(INFO) << "Creating GpuSparse handles for stream " << gpu_stream_;
// Previously unseen ROCm stream. Initialize a set of ROCm sparse library
// handles for it.
HipSparseHandles new_handles(gpu_stream_);
TF_RETURN_IF_ERROR(new_handles.Initialize());
it = handle_map->insert(std::make_pair(gpu_stream_, std::move(new_handles)))
.first;
}
gpusparse_handle_ = &it->second.handle();
initialized_ = true;
return Status::OK();
}

// Macro that specializes a sparse method for the numeric types supported by
// this ROCm port (currently only float and double).
#define TF_CALL_HIP_LAPACK_TYPES(m) m(float, S) m(double, D)

// Macros to construct hipsparse method names.
#define SPARSE_FN(method, sparse_prefix) hipsparse##sparse_prefix##method
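// E.g. SPARSE_FN(csrmm2, S) expands to hipsparseScsrmm2 and
// SPARSE_FN(csrmv, D) expands to hipsparseDcsrmv.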

Status GpuSparse::Coo2csr(const int* cooRowInd, int nnz, int m,
int* csrRowPtr) const {
DCHECK(initialized_);
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseXcoo2csr(*gpusparse_handle_, cooRowInd,
nnz, m, csrRowPtr,
HIPSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}

Status GpuSparse::Csr2coo(const int* csrRowPtr, int nnz, int m,
int* cooRowInd) const {
DCHECK(initialized_);
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseXcsr2coo(*gpusparse_handle_, csrRowPtr,
nnz, m, cooRowInd,
HIPSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}

template <typename Scalar, typename SparseFnT>
static inline Status CsrmmImpl(
SparseFnT op, OpKernelContext* context, hipsparseHandle_t hipsparse_handle,
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n,
int k, int nnz, const Scalar* alpha_host, const hipsparseMatDescr_t descrA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const Scalar* B, int ldb,
const Scalar* beta_host, Scalar* C, int ldc) {
TF_RETURN_IF_GPUSPARSE_ERROR(op(hipsparse_handle, transA, transB, m, n, k,
nnz, alpha_host, descrA, csrSortedValA,
csrSortedRowPtrA, csrSortedColIndA, B, ldb,
beta_host, C, ldc));
return Status::OK();
}

#define CSRMM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csrmm<Scalar>( \
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n, \
int k, int nnz, const Scalar* alpha_host, \
const hipsparseMatDescr_t descrA, const Scalar* csrSortedValA, \
const int* csrSortedRowPtrA, const int* csrSortedColIndA, \
const Scalar* B, int ldb, const Scalar* beta_host, Scalar* C, int ldc) \
const { \
DCHECK(initialized_); \
return CsrmmImpl(SPARSE_FN(csrmm2, sparse_prefix), context_, \
*gpusparse_handle_, transA, transB, m, n, k, nnz, \
alpha_host, descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, B, ldb, beta_host, C, ldc); \
}

TF_CALL_HIP_LAPACK_TYPES(CSRMM_INSTANCE);

template <typename Scalar, typename SparseFnT>
static inline Status CsrmvImpl(SparseFnT op, OpKernelContext* context,
hipsparseHandle_t hipsparse_handle,
hipsparseOperation_t transA, int m, int n,
int nnz, const Scalar* alpha_host,
const hipsparseMatDescr_t descrA,
const Scalar* csrSortedValA,
const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const Scalar* x,
const Scalar* beta_host, Scalar* y) {
TF_RETURN_IF_GPUSPARSE_ERROR(
op(hipsparse_handle, transA, m, n, nnz, alpha_host, descrA, csrSortedValA,
csrSortedRowPtrA, csrSortedColIndA, x, beta_host, y));
return Status::OK();
}

// TODO(ebrevdo,rmlarsen): Use csrmv_mp for all cases when available in CUDA 9.
#define CSRMV_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csrmv<Scalar>( \
hipsparseOperation_t transA, int m, int n, int nnz, \
const Scalar* alpha_host, const hipsparseMatDescr_t descrA, \
const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
const int* csrSortedColIndA, const Scalar* x, const Scalar* beta_host, \
Scalar* y) const { \
DCHECK(initialized_); \
return CsrmvImpl(SPARSE_FN(csrmv, sparse_prefix), context_, \
*gpusparse_handle_, transA, m, n, nnz, alpha_host, \
descrA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, x, beta_host, y); \
}

TF_CALL_HIP_LAPACK_TYPES(CSRMV_INSTANCE);

Status GpuSparse::CsrgemmNnz(
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n,
int k, const hipsparseMatDescr_t descrA, int nnzA,
const int* csrSortedRowPtrA, const int* csrSortedColIndA,
const hipsparseMatDescr_t descrB, int nnzB, const int* csrSortedRowPtrB,
const int* csrSortedColIndB, const hipsparseMatDescr_t descrC,
int* csrSortedRowPtrC, int* nnzTotalDevHostPtr) {
DCHECK(initialized_);
DCHECK(nnzTotalDevHostPtr != nullptr);
TF_RETURN_IF_GPUSPARSE_ERROR(hipsparseXcsrgemmNnz(
*gpusparse_handle_, transA, transB, m, n, k, descrA, nnzA,
csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
csrSortedColIndB, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr));
return Status::OK();
}

template <typename Scalar, typename SparseFnT>
static inline Status CsrgemmImpl(
SparseFnT op, OpKernelContext* context, hipsparseHandle_t hipsparse_handle,
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n,
int k, const hipsparseMatDescr_t descrA, int nnzA,
const Scalar* csrSortedValA, const int* csrSortedRowPtrA,
const int* csrSortedColIndA, const hipsparseMatDescr_t descrB, int nnzB,
const Scalar* csrSortedValB, const int* csrSortedRowPtrB,
const int* csrSortedColIndB, const hipsparseMatDescr_t descrC,
Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC) {
TF_RETURN_IF_GPUSPARSE_ERROR(
op(hipsparse_handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedValB,
csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC,
csrSortedRowPtrC, csrSortedColIndC));
return Status::OK();
}

#define CSRGEMM_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csrgemm<Scalar>( \
hipsparseOperation_t transA, hipsparseOperation_t transB, int m, int n, \
int k, const hipsparseMatDescr_t descrA, int nnzA, \
const Scalar* csrSortedValA, const int* csrSortedRowPtrA, \
const int* csrSortedColIndA, const hipsparseMatDescr_t descrB, int nnzB, \
const Scalar* csrSortedValB, const int* csrSortedRowPtrB, \
const int* csrSortedColIndB, const hipsparseMatDescr_t descrC, \
Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC) { \
DCHECK(initialized_); \
return CsrgemmImpl(SPARSE_FN(csrgemm, sparse_prefix), context_, \
*gpusparse_handle_, transA, transB, m, n, k, descrA, \
nnzA, csrSortedValA, csrSortedRowPtrA, \
csrSortedColIndA, descrB, nnzB, csrSortedValB, \
csrSortedRowPtrB, csrSortedColIndB, descrC, \
csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); \
}

TF_CALL_HIP_LAPACK_TYPES(CSRGEMM_INSTANCE);

template <typename Scalar, typename SparseFnT>
static inline Status Csr2cscImpl(SparseFnT op, OpKernelContext* context,
hipsparseHandle_t hipsparse_handle, int m,
int n, int nnz, const Scalar* csrVal,
const int* csrRowPtr, const int* csrColInd,
Scalar* cscVal, int* cscRowInd, int* cscColPtr,
const hipsparseAction_t copyValues) {
TF_RETURN_IF_GPUSPARSE_ERROR(
op(hipsparse_handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
cscRowInd, cscColPtr, copyValues, HIPSPARSE_INDEX_BASE_ZERO));
return Status::OK();
}

#define CSR2CSC_INSTANCE(Scalar, sparse_prefix) \
template <> \
Status GpuSparse::Csr2csc<Scalar>( \
int m, int n, int nnz, const Scalar* csrVal, const int* csrRowPtr, \
const int* csrColInd, Scalar* cscVal, int* cscRowInd, int* cscColPtr, \
const hipsparseAction_t copyValues) { \
DCHECK(initialized_); \
return Csr2cscImpl(SPARSE_FN(csr2csc, sparse_prefix), context_, \
*gpusparse_handle_, m, n, nnz, csrVal, csrRowPtr, \
csrColInd, cscVal, cscRowInd, cscColPtr, copyValues); \
}

TF_CALL_HIP_LAPACK_TYPES(CSR2CSC_INSTANCE);

} // namespace tensorflow

#endif // TENSORFLOW_USE_ROCM
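For orientation (not part of this diff), a call site in one of the CSR kernels that depend on this library would look roughly like the following under a ROCm build, with CUDA builds reaching the cuSPARSE equivalents through the same GpuSparse methods:

// Hypothetical call site, assuming float CSR data already resident on the GPU
// and a valid hipsparseMatDescr_t; mirrors the Csrmv specialization above.
Status CsrMatVec(OpKernelContext* ctx, const hipsparseMatDescr_t descrA,
                 int m, int n, int nnz, const float* csr_vals,
                 const int* csr_row_ptr, const int* csr_col_ind,
                 const float* x, float* y) {
  const float alpha = 1.0f;
  const float beta = 0.0f;
  GpuSparse gpu_sparse(ctx);                    // binds to the kernel's stream
  TF_RETURN_IF_ERROR(gpu_sparse.Initialize());  // cached per-stream handle
  return gpu_sparse.Csrmv<float>(HIPSPARSE_OPERATION_NON_TRANSPOSE, m, n, nnz,
                                 &alpha, descrA, csr_vals, csr_row_ptr,
                                 csr_col_ind, x, &beta, y);
}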
4 changes: 2 additions & 2 deletions tensorflow/core/kernels/sparse/BUILD
@@ -2,10 +2,10 @@

 load(
     "//tensorflow:tensorflow.bzl",
+    "if_cuda_or_rocm",
     "tf_cc_test",
     "tf_kernel_library",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")

 package(
     default_visibility = ["//visibility:public"],
@@ -77,7 +77,7 @@ tf_kernel_library(
"//tensorflow/core/kernels:scatter_nd_op",
"//tensorflow/core/kernels:slice_op",
"//tensorflow/core/kernels:transpose_functor",
] + if_cuda([
] + if_cuda_or_rocm([
"//tensorflow/core/kernels:cuda_solvers",
"//tensorflow/core/kernels:cuda_sparse",
]),
