diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index f83322d0a0..14d90c0f76 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -10,6 +10,7 @@ cdef extern from "gpuarray/buffer_blas.h": cb_conj_trans cdef extern from "gpuarray/blas.h": + int GpuArray_rdot(_GpuArray *X, _GpuArray *Y, _GpuArray *Z, int nocopy) int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A, _GpuArray *X, double beta, _GpuArray *Y, int nocopy) int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, @@ -18,6 +19,13 @@ cdef extern from "gpuarray/blas.h": int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A, int nocopy) +cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) except -1: + cdef int err + err = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy) + if err != GA_NO_ERROR: + raise GpuArrayException(GpuArray_error(&X.ga, err), err) + return 0 + cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, GpuArray X, double beta, GpuArray Y, bint nocopy) except -1: @@ -45,6 +53,16 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, return 0 +def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): + if Z is None: + Z = pygpu_empty(0, NULL, X.typecode, GA_ANY_ORDER, X.context, None) + overwrite_z = True + + if not overwrite_z: + Z = pygpu_copy(Z, GA_ANY_ORDER) + pygpu_blas_rdot(X, Y, Z, 0) + return Z + def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, trans_a=False, overwrite_y=False): cdef cb_transpose transA diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 119ef8e959..8ce7d7aebe 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -1,4 +1,5 @@ -import numpy +from itertools import product +import numpy from nose.plugins.skip import SkipTest from .support import (guard_devsup, gen_gpuarray, context) @@ -14,25 +15,48 @@ import pygpu.blas as gblas +def test_dot(): + bools = [True, False] + for N, dtype, offseted_i, sliced in product( + [1, 256, 1337], ['float32', 'float64'], bools, bools): + yield dot, N, dtype, offseted_i, sliced, True, False + for overwrite, init_z in product(bools, bools): + yield dot, 666, 'float32', False, False, overwrite, init_z + +@guard_devsup +def dot(N, dtype, offseted_i, sliced, overwrite, init_z): + cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + cY, gY = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + if init_z: + _, gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + else: + _, gZ = None, None + + if dtype == 'float32': + cr = fblas.sdot(cX, cY) + else: + cr = fblas.ddot(cX, cY) + gr = gblas.dot(gX, gY, gZ, overwrite_z=overwrite) + numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) + + def test_gemv(): - for shape in [(100, 128), (128, 50)]: - for order in ['f', 'c']: - for trans in [False, True]: - for offseted_i in [True, False]: - for sliced in [1, 2, -1, -2]: - yield gemv, shape, 'float32', order, trans, \ - offseted_i, sliced, True, False - for overwrite in [True, False]: - for init_y in [True, False]: - yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ - overwrite, init_y + bools = [False, True] + for shape, order, trans, offseted_i, sliced in product( + [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): + yield gemv, shape, 'float32', order, trans, \ + offseted_i, sliced, True, False + for overwrite, init_y in product(bools, bools): + yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ + overwrite, init_y yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False - for alpha in [0, 1, -1, 0.6]: - for beta in [0, 1, -1, 0.6]: - for overwite in [True, False]: - yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ - overwrite, True, alpha, beta - + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ + overwrite, True, alpha, beta @guard_devsup def gemv(shp, dtype, order, trans, offseted_i, sliced, @@ -65,28 +89,22 @@ def gemv(shp, dtype, order, trans, offseted_i, sliced, def test_gemm(): - for m, n, k in [(48, 15, 32), (15, 32, 48)]: - for order in [('f', 'f', 'f'), ('c', 'c', 'c'), - ('f', 'f', 'c'), ('f', 'c', 'f'), - ('f', 'c', 'c'), ('c', 'f', 'f'), - ('c', 'f', 'c'), ('c', 'c', 'f')]: - for trans in [(False, False), (True, True), - (False, True), (True, False)]: - for offseted_o in [False, True]: - yield gemm, m, n, k, 'float32', order, trans, \ - offseted_o, 1, False, False - for sliced in [1, 2, -1, -2]: - for overwrite in [True, False]: - for init_res in [True, False]: - yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ - (False, False), False, sliced, overwrite, init_res + bools = [False, True] + for (m, n, k), order, trans, offseted_o in product( + [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)), + list(product(bools, bools)), bools): + yield gemm, m, n, k, 'float32', order, trans, \ + offseted_o, 1, False, False + for sliced, overwrite, init_res in product( + [1, 2, -1, -2], bools, bools): + yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ + (False, False), False, sliced, overwrite, init_res yield gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), \ False, 1, False, False - for alpha in [0, 1, -1, 0.6]: - for beta in [0, 1, -1, 0.6]: - for overwrite in [True, False]: - yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ - (False, False), False, 1, overwrite, True, alpha, beta + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ + (False, False), False, 1, overwrite, True, alpha, beta @guard_devsup def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, @@ -124,19 +142,13 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, def test_ger(): - for m, n in [(4, 5)]: - for order in ['f', 'c']: - for sliced_x in [1, 2, -2, -1]: - for sliced_y in [1, 2, -2, -1]: - yield ger, m, n, 'float32', order, sliced_x, sliced_y, \ - False - + bools = [False, True] + for (m,n), order, sliced_x, sliced_y in product( + [(4,5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]): + yield ger, m, n, 'float32', order, sliced_x, sliced_y, False yield ger, 4, 5, 'float64', 'f', 1, 1, False - - for init_res in [True, False]: - for overwrite in [True, False]: - yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite - + for init_res, overwrite in product(bools, bools): + yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context) diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index d43d07b348..a8dd8096bc 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -8,6 +8,12 @@ extern "C" { #endif +// only for vector-vector dot +GPUARRAY_PUBLIC int GpuArray_rdot( GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy); +#define GpuArray_hdot GpuArray_rdot +#define GpuArray_sdot GpuArray_rdot +#define GpuArray_ddot GpuArray_rdot GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy); diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 6e36c33f37..859ede62b7 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -38,6 +38,24 @@ GPUARRAY_PUBLIC void gpublas_teardown(gpucontext *ctx); GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx); +GPUARRAY_PUBLIC int gpublas_hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); + +GPUARRAY_PUBLIC int gpublas_sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); + +GPUARRAY_PUBLIC int gpublas_ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); + GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 9fb6216054..8f9fb5919b 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -5,6 +5,91 @@ #include "gpuarray/util.h" #include "gpuarray/error.h" +int GpuArray_rdot( GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy) { + GpuArray *Xp = X; + GpuArray copyX; + GpuArray *Yp = Y; + GpuArray copyY; + GpuArray *Zp = Z; + size_t n; + void *ctx; + size_t elsize; + int err; + + if (X->typecode != GA_HALF && + X->typecode != GA_FLOAT && + X->typecode != GA_DOUBLE) + return GA_INVALID_ERROR; + + if (X->nd != 1 || Y->nd != 1 || Z->nd != 0 || + X->typecode != Y->typecode || X->typecode != Z->typecode) + return GA_VALUE_ERROR; + n = X->dimensions[0]; + if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || + !(Z->flags & GA_ALIGNED)) + return GA_UNALIGNED_ERROR; + if (X->dimensions[0] != Y->dimensions[0]) + return GA_VALUE_ERROR; + + elsize = gpuarray_get_elsize(X->typecode); + if (X->strides[0] < 0) { + if (nocopy) + return GA_COPY_ERROR; + else { + err = GpuArray_copy(©X, X, GA_ANY_ORDER); + if (err != GA_NO_ERROR) + goto cleanup; + Xp = ©X; + } + } + if (Y->strides[0] < 0) { + if (nocopy) + return GA_COPY_ERROR; + else { + err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); + if (err != GA_NO_ERROR) + goto cleanup; + Yp = ©Y; + } + } + + ctx = gpudata_context(Xp->data); + err = gpublas_setup(ctx); + if (err != GA_NO_ERROR) + goto cleanup; + + switch (Xp->typecode) { + case GA_HALF: + err = gpublas_hdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data, Zp->offset / elsize); + break; + case GA_FLOAT: + err = gpublas_sdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data, Zp->offset / elsize); + break; + case GA_DOUBLE: + err = gpublas_ddot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data, Zp->offset / elsize); + break; + } + cleanup: + if (Xp == ©X) + GpuArray_clear(©X); + if (Yp == ©Y) + GpuArray_clear(©Y); + return err; +} + int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy) { GpuArray *Ap = A; @@ -24,8 +109,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, return GA_INVALID_ERROR; if (A->nd != 2 || X->nd != 1 || Y->nd != 1 || - A->typecode != A->typecode || X->typecode != A->typecode || - Y->typecode != A->typecode) + X->typecode != A->typecode || Y->typecode != A->typecode) return GA_VALUE_ERROR; if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 03490a8c58..39cba704f5 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -786,6 +786,94 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cuda_context *ctx = X->ctx; + blas_handle *h = (blas_handle *)ctx->blas_handle; + cublasPointerMode_t pmode; + + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(Z); + + if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + + cuda_enter(ctx); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); + + // we should store dot result on device + cublasGetPointerMode(h->h, &pmode); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); + h->err = cublasSdot( + h->h, N, + ((float*)X->ptr) + offX, incX, + ((float*)Y->ptr) + offY, incY, + ((float*)Z->ptr) + offZ); + cublasSetPointerMode(h->h, pmode); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cuda_context *ctx = X->ctx; + blas_handle *h = (blas_handle *)ctx->blas_handle; + cublasPointerMode_t pmode; + + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(Z); + + if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + + cuda_enter(ctx); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); + + // we should store dot result on device + cublasGetPointerMode(h->h, &pmode); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); + h->err = cublasDdot( + h->h, N, + ((double*)X->ptr) + offX, incX, + ((double*)Y->ptr) + offY, incY, + ((double*)Z->ptr) + offZ); + cublasSetPointerMode(h->h, pmode); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, @@ -1558,6 +1646,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops cublas_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, + ddot, hgemv, /* TODO */ sgemv, dgemv, diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 837a74af9b..2041710735 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -6,6 +6,8 @@ #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" +extern const gpuarray_buffer_ops opencl_ops; + static inline clblasOrder convO(cb_order order) { switch (order) { case cb_row: @@ -194,6 +196,99 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return GA_DEVSUP_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + clblasStatus err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + gpudata *wbuf; + int alloc_err; + + wbuf = opencl_ops.buffer_alloc( + (gpucontext*)ctx, + N*sizeof(float), NULL, GA_BUFFER_READ_WRITE, &alloc_err); + if (alloc_err != GA_NO_ERROR) + return alloc_err; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + // TODO: a thread-safe static buffer or allocator? + err = clblasSdot( + N, Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + wbuf->buf, 1, &ctx->q, + num_ev, num_ev ? evl : NULL, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + opencl_ops.buffer_release(wbuf); + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + clblasStatus err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + gpudata *wbuf; + int alloc_err; + + wbuf = opencl_ops.buffer_alloc( + (gpucontext*)ctx, + N*sizeof(double), NULL, GA_BUFFER_READ_WRITE, &alloc_err); + if (alloc_err != GA_NO_ERROR) + return alloc_err; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = clblasDdot( + N, Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + wbuf->buf, 1, &ctx->q, + num_ev, num_ev ? evl : NULL, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + opencl_ops.buffer_release(wbuf); + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, @@ -400,6 +495,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, + ddot, hgemv, /* TODO */ sgemv, dgemv, diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 28b164799f..78cca10f20 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -189,6 +189,99 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return GA_DEVSUP_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastHdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastSdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastDdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, @@ -436,6 +529,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { setup, teardown, error, + hdot, + sdot, + ddot, hgemv, sgemv, dgemv, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 417027e850..c73f3c2f19 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -19,6 +19,33 @@ const char *gpublas_error(gpucontext *ctx) { return "No blas ops available, API error."; } +int gpublas_hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +} + +int gpublas_sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +} + +int gpublas_ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->ddot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +} + int gpublas_hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn index 2ab7f5b2af..f56a2a1393 100644 --- a/src/loaders/libclblas.fn +++ b/src/loaders/libclblas.fn @@ -1,8 +1,12 @@ DEF_PROC(clblasStatus, clblasSetup, (void)); DEF_PROC(void, clblasTeardown, (void)); -DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); + + +DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); \ No newline at end of file +DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); diff --git a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn index 544c164e0c..28f36ba20b 100644 --- a/src/loaders/libclblast.fn +++ b/src/loaders/libclblast.fn @@ -1,3 +1,6 @@ +DEF_PROC(StatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(StatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(StatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); DEF_PROC(StatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index 04b0290800..c0dbddf41e 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -3,9 +3,13 @@ DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); +DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t *mode)); DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); +DEF_PROC_V2(cublasSdot, (cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)); +DEF_PROC_V2(cublasDdot, (cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)); + DEF_PROC_V2(cublasSgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)); DEF_PROC_V2(cublasDgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)); diff --git a/src/private.h b/src/private.h index c0e0d7c5d1..57d919be88 100644 --- a/src/private.h +++ b/src/private.h @@ -113,6 +113,19 @@ struct _gpuarray_blas_ops { int (*setup)(gpucontext *ctx); void (*teardown)(gpucontext *ctx); const char *(*error)(gpucontext *ctx); + + int (*hdot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); + int (*sdot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); + int (*ddot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta,