From 28e1ed560e04f7e105e649bd5989775adc42f39f Mon Sep 17 00:00:00 2001 From: khaotik Date: Thu, 24 Nov 2016 09:12:52 -0500 Subject: [PATCH 01/16] API for BLAS dot --- src/gpuarray/blas.h | 6 ++++++ src/gpuarray/buffer_blas.h | 15 +++++++++++++++ src/gpuarray_buffer_blas.c | 15 +++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index d43d07b348..aa7861c4af 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -8,6 +8,12 @@ extern "C" { #endif +// only for vector-vector dot +GPUARRAY_PUBLIC int GpuArray_dot( GpuArray *A, GpuArray *B, + GpuArray *C, int nocopy); +#define GpuArray_hdot GpuArray_rdot +#define GpuArray_sdot GpuArray_rdot +#define GpuArray_ddot GpuArray_rdot GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy); diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 6e36c33f37..f29788a1d8 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -38,6 +38,21 @@ GPUARRAY_PUBLIC void gpublas_teardown(gpucontext *ctx); GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx); +GPUARRAY_PUBLIC int gpublas_hdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +GPUARRAY_PUBLIC int gpublas_sdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +GPUARRAY_PUBLIC int gpublas_ddot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 417027e850..20371093bb 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -19,6 +19,21 @@ const char *gpublas_error(gpucontext *ctx) { return "No blas ops available, API error."; } +int gpublas_hdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +int gpublas_sdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +int gpublas_ddot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + int gpublas_hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, From 136825b561bf94e5b08417bf801f9199c2bbd254 Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 04:02:47 -0500 Subject: [PATCH 02/16] Finish BLAS dot for implementation for CUDA Plus some minor changes: - did `chmod +x setup.py` - added interface for clblas --- setup.py | 0 src/gpuarray/blas.h | 4 +- src/gpuarray/buffer_blas.h | 15 +++-- src/gpuarray_array_blas.c | 91 +++++++++++++++++++++++++++++- src/gpuarray_blas_cuda_cublas.c | 94 +++++++++++++++++++++++++++++++ src/gpuarray_blas_opencl_clblas.c | 27 +++++++++ src/gpuarray_buffer_blas.c | 24 ++++++-- src/loaders/libclblas.fn | 6 +- src/loaders/libcublas.fn | 4 ++ src/private.h | 13 +++++ 10 files changed, 261 insertions(+), 17 deletions(-) mode change 100644 => 100755 setup.py diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index aa7861c4af..a8dd8096bc 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -9,8 +9,8 @@ extern "C" { #endif // only for vector-vector dot -GPUARRAY_PUBLIC int GpuArray_dot( GpuArray *A, GpuArray *B, - GpuArray *C, int nocopy); +GPUARRAY_PUBLIC int GpuArray_rdot( GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy); #define GpuArray_hdot GpuArray_rdot #define GpuArray_sdot GpuArray_rdot #define GpuArray_ddot GpuArray_rdot diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index f29788a1d8..56d1d4d2da 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -40,18 +40,21 @@ GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx); GPUARRAY_PUBLIC int gpublas_hdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); GPUARRAY_PUBLIC int gpublas_sdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); GPUARRAY_PUBLIC int gpublas_ddot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 9fb6216054..74cfa858af 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -5,6 +5,94 @@ #include "gpuarray/util.h" #include "gpuarray/error.h" +int GpuArray_rdot( GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy) { + GpuArray *Xp = X; + GpuArray copyX; + GpuArray *Yp = Y; + GpuArray copyY; + GpuArray *Zp = Z; + void *ctx; + size_t elsize; + size_t n; + int err; + + if (X->typecode != GA_HALF && + X->typecode != GA_FLOAT && + X->typecode != GA_DOUBLE) + return GA_INVALID_ERROR; + + if (X->nd != 1 || X->nd != 1 || Y->nd != 0 || + X->typecode != Y->typecode || X->typecode != Z->typecode) + return GA_VALUE_ERROR; + if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || + !(Z->flags & GA_ALIGNED)) + return GA_UNALIGNED_ERROR; + if (X->dimensions[0] != n || Y->dimensions[0] != n) + return GA_VALUE_ERROR; + + elsize = gpuarray_get_elsize(X->typecode); + if (X->strides[0] < 0) { + if (nocopy) + return GA_COPY_ERROR; + else { + err = GpuArray_copy(©X, X, GA_ANY_ORDER); + if (err != GA_NO_ERROR) + goto cleanup; + Xp = ©X; + } + } + if (Y->strides[0] < 0) { + if (nocopy) + return GA_COPY_ERROR; + else { + err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); + if (err != GA_NO_ERROR) + goto cleanup; + Yp = ©Y; + } + } + if (Z->strides[0] < 0) { + err = GA_VALUE_ERROR; + goto cleanup; + } + + ctx = gpudata_context(Xp->data); + err = gpublas_setup(ctx); + if (err != GA_NO_ERROR) + goto cleanup; + + switch (Xp->typecode) { + case GA_HALF: + err = gpublas_hdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data); + break; + case GA_FLOAT: + err = gpublas_sdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data); + break; + case GA_DOUBLE: + err = gpublas_sdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data); + break; + } + cleanup: + if (Xp == ©X) + GpuArray_clear(©X); + if (Yp == ©Y) + GpuArray_clear(©Y); + return err; +} + int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy) { GpuArray *Ap = A; @@ -24,8 +112,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, return GA_INVALID_ERROR; if (A->nd != 2 || X->nd != 1 || Y->nd != 1 || - A->typecode != A->typecode || X->typecode != A->typecode || - Y->typecode != A->typecode) + X->typecode != A->typecode || Y->typecode != A->typecode) return GA_VALUE_ERROR; if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 03490a8c58..5f4b913b63 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -786,6 +786,97 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z + ) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + cuda_context *ctx = X->ctx; + blas_handle *h = (blas_handle *)ctx->blas_handle; + cublasPointerMode_t pmode; + + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(Z); + + if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + + cuda_enter(ctx); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + + // we should store dot result on device + cublasGetPointerMode(h->h, &pmode); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + h->err = cublasSdot( + h->h, N, + ((float*)X->ptr) + offX, incX, + ((float*)Y->ptr) + offY, incY, + ((float*)Z->ptr) + ); + cublasSetPointerMode(h->h, pmode); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + cuda_context *ctx = X->ctx; + blas_handle *h = (blas_handle *)ctx->blas_handle; + cublasPointerMode_t pmode; + + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(Z); + + if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + + cuda_enter(ctx); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + + // we should store dot result on device + cublasGetPointerMode(h->h, &pmode); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + h->err = cublasDdot( + h->h, N, + ((double*)X->ptr) + offX, incX, + ((double*)Y->ptr) + offY, incY, + ((double*)Z->ptr) + ); + cublasSetPointerMode(h->h, pmode); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, @@ -1558,6 +1649,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops cublas_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, + ddot, hgemv, /* TODO */ sgemv, dgemv, diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 837a74af9b..5ecb982af6 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -194,6 +194,30 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return GA_DEVSUP_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return GA_DEVSUP_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return GA_DEVSUP_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, @@ -400,6 +424,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, + ddot, hgemv, /* TODO */ sgemv, dgemv, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 20371093bb..08ecadf984 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -21,18 +21,30 @@ const char *gpublas_error(gpucontext *ctx) { int gpublas_hdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z); +} int gpublas_sdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z); +} int gpublas_ddot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return gpudata_context(X)->blas_ops->ddot( + N, X, offX, incX, Y, offY, incY, Z); +} int gpublas_hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn index 2ab7f5b2af..6a4c0ed2ba 100644 --- a/src/loaders/libclblas.fn +++ b/src/loaders/libclblas.fn @@ -1,8 +1,12 @@ DEF_PROC(clblasStatus, clblasSetup, (void)); DEF_PROC(void, clblasTeardown, (void)); + + DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); \ No newline at end of file +DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index 04b0290800..6af6589cc9 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -3,9 +3,13 @@ DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); +DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t* mode)); DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); +DEF_PROC_V2(cublasSdot, (cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)); +DEF_PROC_V2(cublasDdot, (cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)); + DEF_PROC_V2(cublasSgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)); DEF_PROC_V2(cublasDgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)); diff --git a/src/private.h b/src/private.h index c0e0d7c5d1..3ae8a270ce 100644 --- a/src/private.h +++ b/src/private.h @@ -113,6 +113,19 @@ struct _gpuarray_blas_ops { int (*setup)(gpucontext *ctx); void (*teardown)(gpucontext *ctx); const char *(*error)(gpucontext *ctx); + + int (*hdot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); + int (*sdot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); + int (*ddot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, From 94b600c5db96dfc3d21c842464b194623e9af208 Mon Sep 17 00:00:00 2001 From: khaotik Date: Thu, 24 Nov 2016 09:12:52 -0500 Subject: [PATCH 03/16] API for BLAS dot --- src/gpuarray/buffer_blas.h | 6 +++--- src/gpuarray_buffer_blas.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 56d1d4d2da..859ede62b7 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -42,19 +42,19 @@ GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 08ecadf984..f3447dd2cd 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,18 +23,18 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z); + N, X, offX, incX, Y, offY, incY, Z, offZ); } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z); + N, X, offX, incX, Y, offY, incY, Z, offZ); } int gpublas_ddot( @@ -43,7 +43,7 @@ int gpublas_ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z) { return gpudata_context(X)->blas_ops->ddot( - N, X, offX, incX, Y, offY, incY, Z); + N, X, offX, incX, Y, offY, incY, Z, offZ); } int gpublas_hgemv(cb_order order, cb_transpose transA, From c0e666371c94719834e12a7c52ddc3d0b5166b11 Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 04:02:47 -0500 Subject: [PATCH 04/16] Finish BLAS dot for implementation for CUDA Plus some minor changes: - did `chmod +x setup.py` - added interface for clblas --- src/gpuarray/buffer_blas.h | 12 ++++++++++++ src/gpuarray_buffer_blas.c | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 859ede62b7..9774ff4d17 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -42,19 +42,31 @@ GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); +======= + gpudata *Z); +>>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); +======= + gpudata *Z); +>>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); +======= + gpudata *Z); +>>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index f3447dd2cd..383a4a365d 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,18 +23,30 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( N, X, offX, incX, Y, offY, incY, Z, offZ); +======= + gpudata *Z) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z); +>>>>>>> Finish BLAS dot for implementation for CUDA } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( N, X, offX, incX, Y, offY, incY, Z, offZ); +======= + gpudata *Z) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z); +>>>>>>> Finish BLAS dot for implementation for CUDA } int gpublas_ddot( @@ -43,7 +55,11 @@ int gpublas_ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z) { return gpudata_context(X)->blas_ops->ddot( +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 N, X, offX, incX, Y, offY, incY, Z, offZ); +======= + N, X, offX, incX, Y, offY, incY, Z); +>>>>>>> Finish BLAS dot for implementation for CUDA } int gpublas_hgemv(cb_order order, cb_transpose transA, From 81e030c3ebd8b508404953b28e538027a0c41094 Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 07:48:28 -0500 Subject: [PATCH 05/16] fix/cleanup --- pygpu/tests/test_blas.py | 4 ++++ src/gpuarray/buffer_blas.h | 12 ------------ src/gpuarray_array_blas.c | 19 ++++++++----------- src/gpuarray_blas_cuda_cublas.c | 18 ++++++++---------- src/gpuarray_buffer_blas.c | 18 +++++++++++++++++- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 119ef8e959..532e4c8fc3 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -14,6 +14,10 @@ import pygpu.blas as gblas +def test_dot(): + # TODO [WIP] + raise NotImplementedError() + def test_gemv(): for shape in [(100, 128), (128, 50)]: for order in ['f', 'c']: diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 9774ff4d17..859ede62b7 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -42,31 +42,19 @@ GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); -======= - gpudata *Z); ->>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); -======= - gpudata *Z); ->>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); -======= - gpudata *Z); ->>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 74cfa858af..8f9fb5919b 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -12,9 +12,9 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, GpuArray *Yp = Y; GpuArray copyY; GpuArray *Zp = Z; + size_t n; void *ctx; size_t elsize; - size_t n; int err; if (X->typecode != GA_HALF && @@ -22,13 +22,14 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, X->typecode != GA_DOUBLE) return GA_INVALID_ERROR; - if (X->nd != 1 || X->nd != 1 || Y->nd != 0 || + if (X->nd != 1 || Y->nd != 1 || Z->nd != 0 || X->typecode != Y->typecode || X->typecode != Z->typecode) return GA_VALUE_ERROR; + n = X->dimensions[0]; if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(Z->flags & GA_ALIGNED)) return GA_UNALIGNED_ERROR; - if (X->dimensions[0] != n || Y->dimensions[0] != n) + if (X->dimensions[0] != Y->dimensions[0]) return GA_VALUE_ERROR; elsize = gpuarray_get_elsize(X->typecode); @@ -52,10 +53,6 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, Yp = ©Y; } } - if (Z->strides[0] < 0) { - err = GA_VALUE_ERROR; - goto cleanup; - } ctx = gpudata_context(Xp->data); err = gpublas_setup(ctx); @@ -68,21 +65,21 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, - Zp->data); + Zp->data, Zp->offset / elsize); break; case GA_FLOAT: err = gpublas_sdot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, - Zp->data); + Zp->data, Zp->offset / elsize); break; case GA_DOUBLE: - err = gpublas_sdot( + err = gpublas_ddot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, - Zp->data); + Zp->data, Zp->offset / elsize); break; } cleanup: diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 5f4b913b63..1897a492dc 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -814,22 +814,21 @@ static int sdot( GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device cublasGetPointerMode(h->h, &pmode); - cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); h->err = cublasSdot( h->h, N, ((float*)X->ptr) + offX, incX, ((float*)Y->ptr) + offY, incY, - ((float*)Z->ptr) - ); + ((float*)Z->ptr) + offZ); cublasSetPointerMode(h->h, pmode); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); cuda_exit(ctx); @@ -855,22 +854,21 @@ static int ddot( GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device cublasGetPointerMode(h->h, &pmode); - cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); h->err = cublasDdot( h->h, N, ((double*)X->ptr) + offX, incX, ((double*)Y->ptr) + offY, incY, - ((double*)Z->ptr) - ); + ((double*)Z->ptr) + offZ); cublasSetPointerMode(h->h, pmode); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); cuda_exit(ctx); diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 383a4a365d..08f6f5fd1e 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,6 +23,7 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 <<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( @@ -32,12 +33,18 @@ int gpublas_hdot( return gpudata_context(X)->blas_ops->hdot( N, X, offX, incX, Y, offY, incY, Z); >>>>>>> Finish BLAS dot for implementation for CUDA +======= + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +>>>>>>> fix/cleanup } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 <<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( @@ -47,19 +54,28 @@ int gpublas_sdot( return gpudata_context(X)->blas_ops->sdot( N, X, offX, incX, Y, offY, incY, Z); >>>>>>> Finish BLAS dot for implementation for CUDA +======= + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +>>>>>>> fix/cleanup } int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->ddot( +<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 <<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 N, X, offX, incX, Y, offY, incY, Z, offZ); ======= N, X, offX, incX, Y, offY, incY, Z); >>>>>>> Finish BLAS dot for implementation for CUDA +======= + N, X, offX, incX, Y, offY, incY, Z, offZ); +>>>>>>> fix/cleanup } int gpublas_hgemv(cb_order order, cb_transpose transA, From 556ced0999c660407e4173ef4530cffe8dc06f2d Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 11:34:43 -0500 Subject: [PATCH 06/16] fixed/more pygpu interface --- pygpu/blas.pyx | 8 ++++++++ src/gpuarray_blas_cuda_cublas.c | 7 +++---- src/gpuarray_blas_opencl_clblas.c | 6 +++--- src/loaders/libclblas.fn | 8 ++++---- src/private.h | 18 +++++++++--------- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index f83322d0a0..7b27d63350 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -10,6 +10,7 @@ cdef extern from "gpuarray/buffer_blas.h": cb_conj_trans cdef extern from "gpuarray/blas.h": + int GpuArray_rdot(_GpuArray *X, _GpuArray *Y, _GpuArray *Z, int nocopy) int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A, _GpuArray *X, double beta, _GpuArray *Y, int nocopy) int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, @@ -18,6 +19,13 @@ cdef extern from "gpuarray/blas.h": int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A, int nocopy) +cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) except -1: + cdef int err + err = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy) + if err != GA_NO_ERROR: + raise GpuArrayException(Gpurray_error(&X.ga, err), err) + return 0 + cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, GpuArray X, double beta, GpuArray Y, bint nocopy) except -1: diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 1897a492dc..39cba704f5 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -790,8 +790,7 @@ static int hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z - ) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } @@ -799,7 +798,7 @@ static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; cublasPointerMode_t pmode; @@ -839,7 +838,7 @@ static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; cublasPointerMode_t pmode; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 5ecb982af6..d2f5e3302f 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -198,7 +198,7 @@ static int hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } @@ -206,7 +206,7 @@ static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } @@ -214,7 +214,7 @@ static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn index 6a4c0ed2ba..f56a2a1393 100644 --- a/src/loaders/libclblas.fn +++ b/src/loaders/libclblas.fn @@ -2,11 +2,11 @@ DEF_PROC(clblasStatus, clblasSetup, (void)); DEF_PROC(void, clblasTeardown, (void)); -DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); diff --git a/src/private.h b/src/private.h index 3ae8a270ce..57d919be88 100644 --- a/src/private.h +++ b/src/private.h @@ -115,17 +115,17 @@ struct _gpuarray_blas_ops { const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*sdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*ddot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, From 6b8518d5eac559364ab91fea5362212b7c8274c6 Mon Sep 17 00:00:00 2001 From: khaotik Date: Sat, 26 Nov 2016 05:28:40 -0500 Subject: [PATCH 07/16] get rid of conflict --- src/gpuarray_buffer_blas.c | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 08f6f5fd1e..c73f3c2f19 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,42 +23,18 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( N, X, offX, incX, Y, offY, incY, Z, offZ); -======= - gpudata *Z) { - return gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z); ->>>>>>> Finish BLAS dot for implementation for CUDA -======= - gpudata *Z, size_t offZ) { - return gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z, offZ); ->>>>>>> fix/cleanup } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( N, X, offX, incX, Y, offY, incY, Z, offZ); -======= - gpudata *Z) { - return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z); ->>>>>>> Finish BLAS dot for implementation for CUDA -======= - gpudata *Z, size_t offZ) { - return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z, offZ); ->>>>>>> fix/cleanup } int gpublas_ddot( @@ -67,15 +43,7 @@ int gpublas_ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->ddot( -<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 - N, X, offX, incX, Y, offY, incY, Z, offZ); -======= - N, X, offX, incX, Y, offY, incY, Z); ->>>>>>> Finish BLAS dot for implementation for CUDA -======= N, X, offX, incX, Y, offY, incY, Z, offZ); ->>>>>>> fix/cleanup } int gpublas_hgemv(cb_order order, cb_transpose transA, From b5b20d74cd2336263629a73f02f941d8c4ce75b3 Mon Sep 17 00:00:00 2001 From: khaotik Date: Sat, 26 Nov 2016 05:43:07 -0500 Subject: [PATCH 08/16] make all inc* arguments as type int --- src/gpuarray_blas_cuda_cublas.c | 60 +++++++++++++++--------------- src/gpuarray_blas_opencl_clblas.c | 36 +++++++++--------- src/gpuarray_blas_opencl_clblast.c | 54 +++++++++++++++++++++------ src/private.h | 36 +++++++++--------- 4 files changed, 108 insertions(+), 78 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 39cba704f5..41e6e372c0 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -39,8 +39,8 @@ typedef struct _blas_handle { static const char *code_sgemvBH_N_a1_b1_small = \ "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], size_t incx, " \ - " float *y[], size_t incy, " \ + " const float *x[], int incx, " \ + " float *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -62,8 +62,8 @@ static const char *code_sgemvBH_N_a1_b1_small = \ static const char *code_sgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], size_t incx, " \ - " float *y[], size_t incy, " \ + " const float *x[], int incx, " \ + " float *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -95,8 +95,8 @@ static const char *atomicadd_double = \ static const char *code_dgemvBH_N_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], size_t incx, " \ - " double *y[], size_t incy, " \ + " const double *x[], int incx, " \ + " double *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -118,8 +118,8 @@ static const char *code_dgemvBH_N_a1_b1_small = \ static const char *code_dgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], size_t incx, " \ - " double *y[], size_t incy, " \ + " const double *x[], int incx, " \ + " double *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -137,8 +137,8 @@ static const char *code_dgemvBH_T_a1_b1_small = \ static const char *code_sgerBH_gen_small = \ "extern \"C\" __global__ void _sgerBH_gen_small(" \ - " const float *x[], size_t incx," \ - " const float *y[], size_t incy," \ + " const float *x[], int incx," \ + " const float *y[], int incy," \ " float alpha, float *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -152,8 +152,8 @@ static const char *code_sgerBH_gen_small = \ static const char *code_dgerBH_gen_small = \ "extern \"C\" __global__ void _dgerBH_gen_small(" \ - " const double *x[], size_t incx, " \ - " const double *y[], size_t incy," \ + " const double *x[], int incx, " \ + " const double *y[], int incy," \ " double alpha, double *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -788,16 +788,16 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -836,8 +836,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -994,8 +994,8 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -1003,8 +1003,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { /* Flags is there for possible future implementations where we might not use atomics or have some alternate implemntation. */ @@ -1129,8 +1129,8 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { cuda_context *ctx; size_t t, i; @@ -1371,16 +1371,16 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; @@ -1511,8 +1511,8 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index d2f5e3302f..91f31d0728 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -146,8 +146,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -155,8 +155,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -164,31 +164,31 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; @@ -196,24 +196,24 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, static int hdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int ddot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 28b164799f..19f7fc55b7 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -141,8 +141,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -150,8 +150,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -159,36 +159,63 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *Z, size_t offZ + ) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *Z, size_t offZ + ) { + return GA_DEVSUP_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *Z, size_t offZ + ) { + return GA_DEVSUP_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, @@ -436,6 +463,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, /* TODO */ + ddot, /* TODO */ hgemv, sgemv, dgemv, diff --git a/src/private.h b/src/private.h index 57d919be88..ed8ce63293 100644 --- a/src/private.h +++ b/src/private.h @@ -115,16 +115,16 @@ struct _gpuarray_blas_ops { const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ); int (*sdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ); int (*ddot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, @@ -186,34 +186,34 @@ struct _gpuarray_blas_ops { int (*hgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags); int (*sgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags); int (*dgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags); int (*hgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*sgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*dgerBatch)(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); }; From bb845624bdee486f05bb61e3da8a5b8e4e34ff7d Mon Sep 17 00:00:00 2001 From: khaotik Date: Sat, 26 Nov 2016 17:10:58 -0500 Subject: [PATCH 09/16] tests for blas dot - Added tests for BLAS dot - Implementation for CLBlast - modified blas tests from using nested for loops to itertools.product for parametrized tests. --- pygpu/blas.pyx | 12 +++- pygpu/tests/test_blas.py | 111 ++++++++++++++++------------- src/gpuarray_blas_opencl_clblast.c | 84 +++++++++++++++++++--- src/loaders/libclblast.fn | 3 + 4 files changed, 149 insertions(+), 61 deletions(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index 7b27d63350..14d90c0f76 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -23,7 +23,7 @@ cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) ex cdef int err err = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy) if err != GA_NO_ERROR: - raise GpuArrayException(Gpurray_error(&X.ga, err), err) + raise GpuArrayException(GpuArray_error(&X.ga, err), err) return 0 cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, @@ -53,6 +53,16 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, return 0 +def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): + if Z is None: + Z = pygpu_empty(0, NULL, X.typecode, GA_ANY_ORDER, X.context, None) + overwrite_z = True + + if not overwrite_z: + Z = pygpu_copy(Z, GA_ANY_ORDER) + pygpu_blas_rdot(X, Y, Z, 0) + return Z + def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, trans_a=False, overwrite_y=False): cdef cb_transpose transA diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 532e4c8fc3..6b24ceea5d 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -1,4 +1,5 @@ -import numpy +from itertools import product +import numpy from nose.plugins.skip import SkipTest from .support import (guard_devsup, gen_gpuarray, context) @@ -15,27 +16,47 @@ import pygpu.blas as gblas def test_dot(): - # TODO [WIP] - raise NotImplementedError() + bools = [True, False] + for N, dtype, offseted_i, sliced in product( + [1, 256, 1337], ['float32', 'float64'], bools, bools): + yield dot, N, dtype, offseted_i, sliced, True, False + for overwrite, init_z in product(bools, bools): + yield dot, 666, 'float32', False, False, overwrite, init_z + +@guard_devsup +def dot(N, dtype, offseted_i, sliced, overwrite, init_z): + cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + cY, gY = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + if init_z: + _, gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + else: + _, gZ = None, None + + if dtype == 'float32': + cr = fblas.sdot(cX, cY) + else: + cr = fblas.ddot(cX, cY) + gr = gblas.dot(gX, gY, gZ, overwrite_z=overwrite) + numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) + def test_gemv(): - for shape in [(100, 128), (128, 50)]: - for order in ['f', 'c']: - for trans in [False, True]: - for offseted_i in [True, False]: - for sliced in [1, 2, -1, -2]: - yield gemv, shape, 'float32', order, trans, \ - offseted_i, sliced, True, False - for overwrite in [True, False]: - for init_y in [True, False]: - yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ - overwrite, init_y + bools = [False, True] + for shape, order, trans, offseted_i, sliced in product( + [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): + yield gemv, shape, 'float32', order, trans, \ + offseted_i, sliced, True, False + for overwrite, init_y in product(bools, bools): + yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ + overwrite, init_y yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False - for alpha in [0, 1, -1, 0.6]: - for beta in [0, 1, -1, 0.6]: - for overwite in [True, False]: - yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ - overwrite, True, alpha, beta + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ + overwrite, True, alpha, beta @guard_devsup @@ -69,28 +90,22 @@ def gemv(shp, dtype, order, trans, offseted_i, sliced, def test_gemm(): - for m, n, k in [(48, 15, 32), (15, 32, 48)]: - for order in [('f', 'f', 'f'), ('c', 'c', 'c'), - ('f', 'f', 'c'), ('f', 'c', 'f'), - ('f', 'c', 'c'), ('c', 'f', 'f'), - ('c', 'f', 'c'), ('c', 'c', 'f')]: - for trans in [(False, False), (True, True), - (False, True), (True, False)]: - for offseted_o in [False, True]: - yield gemm, m, n, k, 'float32', order, trans, \ - offseted_o, 1, False, False - for sliced in [1, 2, -1, -2]: - for overwrite in [True, False]: - for init_res in [True, False]: - yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ - (False, False), False, sliced, overwrite, init_res + bools = [False, True] + for (m, n, k), order, trans, offseted_o in product( + [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)), + list(product(bools, bools)), bools): + yield gemm, m, n, k, 'float32', order, trans, \ + offseted_o, 1, False, False + for sliced, overwrite, init_res in product( + [1, 2, -1, -2], bools, bools): + yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ + (False, False), False, sliced, overwrite, init_res yield gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), \ False, 1, False, False - for alpha in [0, 1, -1, 0.6]: - for beta in [0, 1, -1, 0.6]: - for overwrite in [True, False]: - yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ - (False, False), False, 1, overwrite, True, alpha, beta + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ + (False, False), False, 1, overwrite, True, alpha, beta @guard_devsup def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, @@ -128,19 +143,13 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, def test_ger(): - for m, n in [(4, 5)]: - for order in ['f', 'c']: - for sliced_x in [1, 2, -2, -1]: - for sliced_y in [1, 2, -2, -1]: - yield ger, m, n, 'float32', order, sliced_x, sliced_y, \ - False - + bools = [False, True] + for (m,n), order, sliced_x, sliced_y in product( + [(4,5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]): + yield ger, m, n, 'float32', order, sliced_x, sliced_y, False yield ger, 4, 5, 'float64', 'f', 1, 1, False - - for init_res in [True, False]: - for overwrite in [True, False]: - yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite - + for init_res, overwrite in product(bools, bools): + yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context) diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 19f7fc55b7..0827f936e7 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -193,27 +193,93 @@ static int hdot( size_t N, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *Z, size_t offZ - ) { - return GA_DEVSUP_ERROR; + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastHdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int sdot( size_t N, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *Z, size_t offZ - ) { - return GA_DEVSUP_ERROR; + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastSdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int ddot( size_t N, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *Z, size_t offZ - ) { - return GA_DEVSUP_ERROR; + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastDdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, diff --git a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn index 544c164e0c..28f36ba20b 100644 --- a/src/loaders/libclblast.fn +++ b/src/loaders/libclblast.fn @@ -1,3 +1,6 @@ +DEF_PROC(StatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(StatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(StatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); DEF_PROC(StatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); From bec7a725a21999f138aacfcb01b039d8ccbc07ca Mon Sep 17 00:00:00 2001 From: khaotik Date: Sun, 27 Nov 2016 04:25:16 -0500 Subject: [PATCH 10/16] finish dot for clBLAS --- src/gpuarray_blas_opencl_clblas.c | 72 ++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 91f31d0728..1028ce5e70 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -1,6 +1,7 @@ #include "private.h" #include "private_opencl.h" +#include "loaders/libopencl.h" #include "loaders/libclblas.h" #include "gpuarray/buffer_blas.h" @@ -207,7 +208,41 @@ static int sdot( gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { - return GA_DEVSUP_ERROR; + cl_ctx *ctx = X->ctx; + clblasStatus err; + cl_int cl_err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + cl_mem scratch_mem; + + scratch_mem = clCreateBuffer( + ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); + if (cl_err != CL_SUCCESS) + return GA_MEMORY_ERROR; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + // TODO: a thread-safe static buffer or allocator? + err = clblasSdot( + N, Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + scratch_mem, 1, &ctx->q, + num_ev, num_ev ? evl : NULL, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseMemObject(scratch_mem); + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int ddot( @@ -215,7 +250,40 @@ static int ddot( gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { - return GA_DEVSUP_ERROR; + cl_ctx *ctx = X->ctx; + clblasStatus err; + cl_int cl_err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + cl_mem scratch_mem; + + scratch_mem = clCreateBuffer( + ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); + if (cl_err != CL_SUCCESS) + return GA_MEMORY_ERROR; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = clblasDdot( + N, Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + scratch_mem, 1, &ctx->q, + num_ev, num_ev ? evl : NULL, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseMemObject(scratch_mem); + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, From 1956a7ba08bcee1b829afdb1d9f611c40d4d3f75 Mon Sep 17 00:00:00 2001 From: khaotik Date: Sun, 27 Nov 2016 04:46:38 -0500 Subject: [PATCH 11/16] minifixes --- src/gpuarray_blas_opencl_clblas.c | 2 +- src/loaders/libcublas.fn | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 1028ce5e70..5cb926c61b 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -259,7 +259,7 @@ static int ddot( cl_mem scratch_mem; scratch_mem = clCreateBuffer( - ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); + ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(double), NULL, &cl_err); if (cl_err != CL_SUCCESS) return GA_MEMORY_ERROR; diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index 6af6589cc9..c0dbddf41e 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -3,7 +3,7 @@ DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); -DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t* mode)); +DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t *mode)); DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); From 0b8cf5b7fcfd593290f975176a5ee6ea0816fd13 Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 06:50:34 -0500 Subject: [PATCH 12/16] fall back to size_t for strides --- src/gpuarray_blas_cuda_cublas.c | 78 +++++++++++++++--------------- src/gpuarray_blas_opencl_clblas.c | 60 +++++++++++------------ src/gpuarray_blas_opencl_clblast.c | 66 ++++++++++++------------- src/private.h | 60 +++++++++++------------ 4 files changed, 132 insertions(+), 132 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 41e6e372c0..9e81805a15 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -39,8 +39,8 @@ typedef struct _blas_handle { static const char *code_sgemvBH_N_a1_b1_small = \ "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], int incx, " \ - " float *y[], int incy, " \ + " const float *x[], size_t incx, " \ + " float *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -62,8 +62,8 @@ static const char *code_sgemvBH_N_a1_b1_small = \ static const char *code_sgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], int incx, " \ - " float *y[], int incy, " \ + " const float *x[], size_t incx, " \ + " float *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -95,8 +95,8 @@ static const char *atomicadd_double = \ static const char *code_dgemvBH_N_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], int incx, " \ - " double *y[], int incy, " \ + " const double *x[], size_t incx, " \ + " double *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -118,8 +118,8 @@ static const char *code_dgemvBH_N_a1_b1_small = \ static const char *code_dgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], int incx, " \ - " double *y[], int incy, " \ + " const double *x[], size_t incx, " \ + " double *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -137,8 +137,8 @@ static const char *code_dgemvBH_T_a1_b1_small = \ static const char *code_sgerBH_gen_small = \ "extern \"C\" __global__ void _sgerBH_gen_small(" \ - " const float *x[], int incx," \ - " const float *y[], int incy," \ + " const float *x[], size_t incx," \ + " const float *y[], size_t incy," \ " float alpha, float *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -152,8 +152,8 @@ static const char *code_sgerBH_gen_small = \ static const char *code_dgerBH_gen_small = \ "extern \"C\" __global__ void _dgerBH_gen_small(" \ - " const double *x[], int incx, " \ - " const double *y[], int incy," \ + " const double *x[], size_t incx, " \ + " const double *y[], size_t incy," \ " double alpha, double *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -788,16 +788,16 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -836,8 +836,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -876,15 +876,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - float beta, gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, + float beta, gpudata *Y, size_t offY, size_t incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - float beta, gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, + float beta, gpudata *Y, size_t offY, size_t incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -938,8 +938,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - double beta, gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, + double beta, gpudata *Y, size_t offY, size_t incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -994,8 +994,8 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -1003,8 +1003,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { /* Flags is there for possible future implementations where we might not use atomics or have some alternate implemntation. */ @@ -1129,8 +1129,8 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { cuda_context *ctx; size_t t, i; @@ -1251,13 +1251,13 @@ static int dgemvBatch(cb_order order, cb_transpose transA, static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, + size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, + size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -1314,7 +1314,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, + size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -1371,16 +1371,16 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; @@ -1511,8 +1511,8 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 5cb926c61b..228fed8d41 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -147,8 +147,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -156,8 +156,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -165,31 +165,31 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; @@ -197,16 +197,16 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, static int hdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; @@ -247,8 +247,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; @@ -288,15 +288,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -325,8 +325,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, double beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, double beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -424,15 +424,15 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; @@ -460,8 +460,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 0827f936e7..0d220ac322 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -141,8 +141,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -150,8 +150,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -159,31 +159,31 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; @@ -191,8 +191,8 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, static int hdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; StatusCode err; @@ -222,8 +222,8 @@ static int hdot( static int sdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; StatusCode err; @@ -253,8 +253,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; StatusCode err; @@ -284,8 +284,8 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -311,8 +311,8 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -338,8 +338,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, double beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, double beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -448,8 +448,8 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -474,8 +474,8 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -500,8 +500,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -529,9 +529,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { setup, teardown, error, - hdot, /* TODO */ - sdot, /* TODO */ - ddot, /* TODO */ + hdot, + sdot, + ddot, hgemv, sgemv, dgemv, diff --git a/src/private.h b/src/private.h index ed8ce63293..bdc3f7bb9d 100644 --- a/src/private.h +++ b/src/private.h @@ -115,29 +115,29 @@ struct _gpuarray_blas_ops { const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*sdot)( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*ddot)( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY); + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY); int (*sgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY); + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY); int (*dgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, double beta, - gpudata *Y, size_t offY, int incY); + gpudata *X, size_t offX, size_t incX, double beta, + gpudata *Y, size_t offY, size_t incY); int (*hgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, @@ -154,16 +154,16 @@ struct _gpuarray_blas_ops { gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); int (*hger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda); int (*sger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda); int (*dger)(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda); int (*hgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, @@ -186,34 +186,34 @@ struct _gpuarray_blas_ops { int (*hgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*sgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*dgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*hgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*sgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*dgerBatch)(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); }; From 3bf6a4144c6e788c5f47a6ca214ac95206d651fd Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 07:08:01 -0500 Subject: [PATCH 13/16] now use buffer_alloc to create working buffer --- src/gpuarray_blas_opencl_clblas.c | 36 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 228fed8d41..f3c94a7741 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -7,6 +7,8 @@ #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" +extern const gpuarray_buffer_ops opencl_ops; + static inline clblasOrder convO(cb_order order) { switch (order) { case cb_row: @@ -210,16 +212,17 @@ static int sdot( gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; - cl_int cl_err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; - cl_mem scratch_mem; + gpudata *wbuf; + int alloc_err; - scratch_mem = clCreateBuffer( - ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); - if (cl_err != CL_SUCCESS) - return GA_MEMORY_ERROR; + wbuf = opencl_ops.buffer_alloc( + (gpucontext*)ctx, + N*sizeof(float), NULL, GA_BUFFER_READ_WRITE, &alloc_err); + if (alloc_err != GA_NO_ERROR) + return alloc_err; ARRAY_INIT(X); ARRAY_INIT(Y); @@ -230,7 +233,7 @@ static int sdot( N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, - scratch_mem, 1, &ctx->q, + wbuf->buf, 1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; @@ -239,7 +242,7 @@ static int sdot( ARRAY_FINI(Y); ARRAY_FINI(Z); - clReleaseMemObject(scratch_mem); + opencl_ops.buffer_release(wbuf); clReleaseEvent(ev); return GA_NO_ERROR; @@ -252,16 +255,17 @@ static int ddot( gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; - cl_int cl_err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; - cl_mem scratch_mem; + gpudata *wbuf; + int alloc_err; - scratch_mem = clCreateBuffer( - ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(double), NULL, &cl_err); - if (cl_err != CL_SUCCESS) - return GA_MEMORY_ERROR; + wbuf = opencl_ops.buffer_alloc( + (gpucontext*)ctx, + N*sizeof(double), NULL, GA_BUFFER_READ_WRITE, &alloc_err); + if (alloc_err != GA_NO_ERROR) + return alloc_err; ARRAY_INIT(X); ARRAY_INIT(Y); @@ -271,7 +275,7 @@ static int ddot( N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, - scratch_mem, 1, &ctx->q, + wbuf->buf, 1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; @@ -280,7 +284,7 @@ static int ddot( ARRAY_FINI(Y); ARRAY_FINI(Z); - clReleaseMemObject(scratch_mem); + opencl_ops.buffer_release(wbuf); clReleaseEvent(ev); return GA_NO_ERROR; From c74e8e6041278cbef473aa675f6c8220484ddeca Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 08:06:21 -0500 Subject: [PATCH 14/16] revert old int strides --- src/gpuarray_blas_cuda_cublas.c | 18 +++++++++--------- src/gpuarray_blas_opencl_clblas.c | 24 ++++++++++++------------ src/gpuarray_blas_opencl_clblast.c | 24 ++++++++++++------------ 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 9e81805a15..39cba704f5 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -876,15 +876,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, - float beta, gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, + float beta, gpudata *Y, size_t offY, int incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, - float beta, gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, + float beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -938,8 +938,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, - double beta, gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, + double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -1251,13 +1251,13 @@ static int dgemvBatch(cb_order order, cb_transpose transA, static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -1314,7 +1314,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, - size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index f3c94a7741..d6e58ecbfd 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -292,15 +292,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -329,8 +329,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, double beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -428,15 +428,15 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; @@ -464,8 +464,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 0d220ac322..78cca10f20 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -284,8 +284,8 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -311,8 +311,8 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -338,8 +338,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, double beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -448,8 +448,8 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -474,8 +474,8 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -500,8 +500,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; From d6c69b64a9434249812194e7eeadc58546b074ec Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 08:13:31 -0500 Subject: [PATCH 15/16] revert strides in private.h --- src/private.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/private.h b/src/private.h index bdc3f7bb9d..57d919be88 100644 --- a/src/private.h +++ b/src/private.h @@ -128,16 +128,16 @@ struct _gpuarray_blas_ops { gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY); + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY); int (*sgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY); + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY); int (*dgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, double beta, - gpudata *Y, size_t offY, size_t incY); + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY); int (*hgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, @@ -154,16 +154,16 @@ struct _gpuarray_blas_ops { gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); int (*hger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*sger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*dger)(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*hgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, From cae367115ae35f540dd62580ca8418e32ddff60b Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 08:23:26 -0500 Subject: [PATCH 16/16] mini cleanup --- pygpu/tests/test_blas.py | 1 - src/gpuarray_blas_opencl_clblas.c | 1 - 2 files changed, 2 deletions(-) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 6b24ceea5d..8ce7d7aebe 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -58,7 +58,6 @@ def test_gemv(): yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ overwrite, True, alpha, beta - @guard_devsup def gemv(shp, dtype, order, trans, offseted_i, sliced, overwrite, init_y, alpha=1.0, beta=0.0): diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index d6e58ecbfd..2041710735 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -1,7 +1,6 @@ #include "private.h" #include "private_opencl.h" -#include "loaders/libopencl.h" #include "loaders/libclblas.h" #include "gpuarray/buffer_blas.h"