From 42e9cf4bf8a4a9bf62b033527695efc718260a0d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 31 Aug 2015 14:55:15 -0400 Subject: [PATCH 1/8] Don't check the current context all the time and rely on a counter. --- src/gpuarray_buffer_cuda.c | 21 ++++++++++++--------- src/private_cuda.h | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 43db2d5e62..34046715c0 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -74,6 +74,7 @@ void *cuda_make_ctx(CUcontext ctx, int flags) { res->blas_handle = NULL; res->refcnt = 1; res->flags = flags; + res->enter = 0; if (detect_arch(res->bin_id)) { free(res); return NULL; @@ -108,8 +109,10 @@ static void cuda_free_ctx(cuda_context *ctx) { ASSERT_CTX(ctx); ctx->refcnt--; if (ctx->refcnt == 0) { + assert(ctx->enter == 0 && "Context was active when freed!"); if (ctx->blas_handle != NULL) { - ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops); + ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, + &blas_ops); blas_ops->teardown(ctx); } ctx->refcnt = 2; /* Prevent recursive calls */ @@ -135,17 +138,17 @@ CUstream cuda_get_stream(void *ctx) { void cuda_enter(cuda_context *ctx) { ASSERT_CTX(ctx); - cuCtxGetCurrent(&ctx->old); - if (ctx->old != ctx->ctx) - ctx->err = cuCtxSetCurrent(ctx->ctx); - /* If no context was there in the first place, then we take over - to avoid the set dance on the thread */ - if (ctx->old == NULL) ctx->old = ctx->ctx; + if (!ctx->enter) + ctx->err = cuCtxPushCurrent(ctx->ctx); + ctx->enter++; } void cuda_exit(cuda_context *ctx) { - if (ctx->old != ctx->ctx) - cuCtxSetCurrent(ctx->old); + ASSERT_CTX(ctx); + assert(ctx->enter > 0); + ctx->enter--; + if (!ctx->enter) + cuCtxPopCurrent(NULL); } gpudata *cuda_make_buf(void *c, CUdeviceptr p, size_t sz) { diff --git a/src/private_cuda.h b/src/private_cuda.h index fcdd324498..c40c7f1125 100644 --- a/src/private_cuda.h 
+++ b/src/private_cuda.h @@ -46,7 +46,6 @@ typedef struct _cuda_context { char tag[8]; #endif CUcontext ctx; - CUcontext old; CUresult err; CUstream s; void *blas_handle; @@ -55,6 +54,7 @@ typedef struct _cuda_context { char bin_id[8]; unsigned int refcnt; int flags; + unsigned int enter; } cuda_context; GPUARRAY_LOCAL void *cuda_make_ctx(CUcontext ctx, int flags); From 20913feb2b09cee2ecea1ebd25555faba116b5d0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Sep 2015 12:56:55 -0400 Subject: [PATCH 2/8] Add cuda_wait() and cuda_mark() to make the waiting for other buffers easier. --- src/gpuarray/buffer.h | 4 +++ src/gpuarray/extension.h | 7 +++-- src/gpuarray_buffer_cuda.c | 61 ++++++++++++++++++++++++++------------ src/gpuarray_extension.c | 4 +++ src/private_cuda.h | 9 +++++- 5 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index cac703ea71..5e798542a3 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -176,6 +176,10 @@ typedef struct _gpuarray_buffer_ops { /*#define GA_BUFFER_USE_DATA 0x10*/ +/* The upper 16 bits are private flags */ +#define GA_BUFFER_MASK 0xffff + + /** * @} */ diff --git a/src/gpuarray/extension.h b/src/gpuarray/extension.h index 7ae94213ff..b26b5231e5 100644 --- a/src/gpuarray/extension.h +++ b/src/gpuarray/extension.h @@ -13,8 +13,11 @@ extern "C" { } #endif -/* Keep in sync with the flags in gpuarray_buffer_cuda.c */ -#define GPUARRAY_CUDA_CTX_NOFREE 0x1 +/* Keep in sync with the flags in private_cuda.h */ +#define GPUARRAY_CUDA_CTX_NOFREE 0x10000000 /* DONTFREE */ + +#define GPUARRAY_CUDA_WAIT_READ 0x10000 /* CUDA_WAIT_READ */ +#define GPUARRAY_CUDA_WAIT_WRITE 0x20000 /* CUDA_WAIT_WRITE */ /** * Obtain a function pointer for an extension. 
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 34046715c0..3fd6cddd2c 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -444,6 +444,29 @@ static int cuda_share(gpudata *a, gpudata *b, int *ret) { (b->ptr <= a->ptr && b->ptr + b->sz > a->ptr))); } +int cuda_wait(gpudata *a, int flags) { + ASSERT_BUF(a); + /* If others are only reads, no need to wait */ + if (flags & CUDA_WAIT_READ && !(a->flags & CUDA_WAIT_WRITE)) + return GA_NO_ERROR; + cuda_enter(a->ctx); + a->ctx->err = cuStreamWaitEvent(a->ctx->s, a->ev, 0); + if (a->ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + cuda_exit(a->ctx); + return GA_NO_ERROR; +} + +int cuda_mark(gpudata *a, int flags) { + ASSERT_BUF(a); + cuda_enter(a->ctx); + a->ctx->err = cuEventRecord(a->ev, a->ctx->s); + a->flags &= ~(CUDA_WAIT_MASK); + a->flags &= (flags & CUDA_WAIT_MASK); + cuda_exit(a->ctx); + return GA_NO_ERROR; +} + static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { cuda_context *ctx = dst->ctx; @@ -1556,22 +1579,22 @@ static const char *cuda_error(void *c) { GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops = {cuda_init, - cuda_deinit, - cuda_alloc, - cuda_retain, - cuda_free, - cuda_share, - cuda_move, - cuda_read, - cuda_write, - cuda_memset, - cuda_newkernel, - cuda_retainkernel, - cuda_freekernel, - cuda_callkernel, - cuda_kernelbin, - cuda_sync, - cuda_extcopy, - cuda_transfer, - cuda_property, - cuda_error}; + cuda_deinit, + cuda_alloc, + cuda_retain, + cuda_free, + cuda_share, + cuda_move, + cuda_read, + cuda_write, + cuda_memset, + cuda_newkernel, + cuda_retainkernel, + cuda_freekernel, + cuda_callkernel, + cuda_kernelbin, + cuda_sync, + cuda_extcopy, + cuda_transfer, + cuda_property, + cuda_error}; diff --git a/src/gpuarray_extension.c b/src/gpuarray_extension.c index 6cddf03fc5..e66a48928c 100644 --- a/src/gpuarray_extension.c +++ b/src/gpuarray_extension.c @@ -16,6 +16,8 @@ extern void *cuda_get_stream(void); extern 
void *cuda_make_buf(void); extern void *cuda_get_ptr(void); extern void *cuda_get_sz(void); +extern void *cuda_wait(void); +extern void *cuda_mark(void); extern void *cuda_set_compiler(void); #endif #ifdef WITH_OPENCL @@ -36,6 +38,8 @@ static ext ext_list[] = { {"cuda_make_buf", cuda_make_buf}, {"cuda_get_ptr", cuda_get_ptr}, {"cuda_get_sz", cuda_get_sz}, + {"cuda_wait", cuda_wait}, + {"cuda_mark", cuda_mark}, {"cuda_set_compiler", cuda_set_compiler}, #endif #ifdef WITH_OPENCL diff --git a/src/private_cuda.h b/src/private_cuda.h index c40c7f1125..71c8bb7cb8 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -38,7 +38,7 @@ #define CLEAR(o) #endif - +/* Keep in sync with the copy in gpuarray/extension.h */ #define DONTFREE 0x10000000 typedef struct _cuda_context { @@ -78,6 +78,13 @@ struct _gpudata { GPUARRAY_LOCAL gpudata *cuda_make_buf(void *c, CUdeviceptr p, size_t sz); GPUARRAY_LOCAL CUdeviceptr cuda_get_ptr(gpudata *g); GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g); +GPUARRAY_LOCAL int cuda_wait(gpudata *, int); +GPUARRAY_LOCAL int cuda_mark(gpudata *, int); + +/* private flags are in the upper 16 bits */ +#define CUDA_WAIT_READ 0x10000 +#define CUDA_WAIT_WRITE 0x20000 +#define CUDA_WAIT_MASK 0xf0000 struct _gpukernel { #ifdef DEBUG From 0ba987add4856fe028800a8620a72c859ec8e76b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Sep 2015 14:36:58 -0400 Subject: [PATCH 3/8] Stop generating the blas files and stop using the gnarly macro-filled blas file. 
--- gen_blas.cmake | 14 - gen_blas.py | 772 ------------------------------ pygpu/blas.pyx | 42 +- src/CMakeLists.txt | 1 - src/generic_blas.inc.c | 151 ------ src/gpuarray/blas.h | 19 +- src/gpuarray/buffer_blas.h | 6 +- src/gpuarray_array_blas.c | 110 +++-- src/gpuarray_blas_cuda_cublas.c | 643 +++++++++++++++++++------ src/gpuarray_blas_opencl_clblas.c | 224 +++++++-- 10 files changed, 790 insertions(+), 1192 deletions(-) delete mode 100644 gen_blas.cmake delete mode 100644 gen_blas.py delete mode 100644 src/generic_blas.inc.c diff --git a/gen_blas.cmake b/gen_blas.cmake deleted file mode 100644 index e8cc1a8b44..0000000000 --- a/gen_blas.cmake +++ /dev/null @@ -1,14 +0,0 @@ -set(GEN_BLAS_FILES - ${CMAKE_SOURCE_DIR}/src/generic_blas.inc.c - ${CMAKE_SOURCE_DIR}/src/gpuarray/buffer_blas.h - ${CMAKE_SOURCE_DIR}/src/gpuarray/blas.h - ${CMAKE_SOURCE_DIR}/src/gpuarray_array_blas.c - ${CMAKE_SOURCE_DIR}/pygpu/blas.pyx -) - -add_custom_command( - OUTPUT ${GEN_BLAS_FILES} - COMMAND python ${CMAKE_SOURCE_DIR}/gen_blas.py - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - DEPENDS ${CMAKE_SOURCE_DIR}/gen_blas.py -) diff --git a/gen_blas.py b/gen_blas.py deleted file mode 100644 index bcbd6884ce..0000000000 --- a/gen_blas.py +++ /dev/null @@ -1,772 +0,0 @@ -import sys - -from mako import exceptions -from mako.template import Template - -def make_ops(): - return [ - BlasOp('gemv', (float, double), - [trans('transA'), size('M'), size('N'), scalar('alpha'), - matrix('A'), size('lda'), vector('X'), inc('incX'), - scalar('beta', pydef='0.0'), vector('Y', output=True), - inc('incY')], - check_dims=check_dims_gemv, setup_order=setup_order_gemv, - py_decls=py_decls_gemv, py_ensure_output=py_ensure_output_gemv), - BlasOp('gemm', (float, double), - [trans('transA'), trans('transB'), size('M'), size('N'), - size('K'), scalar('alpha'), matrix('A'), size('lda'), - matrix('B'), size('ldb'), scalar('beta'), - matrix('C', output=True), size('ldc')], - check_dims=check_dims_gemm, 
setup_order=setup_order_gemm, - py_decls=py_decls_gemm, py_ensure_output=py_ensure_output_gemm), - BlasOp('ger', (float, double), - [size('M'), size('N'), scalar('alpha'), - vector('X'), inc('incX'), vector('Y'), inc('incY'), - matrix('A', output=True), size('lda')], - check_dims=check_dims_ger, setup_order=setup_order_ger, - py_decls=py_decls_ger, py_ensure_output=py_ensure_output_ger), - ] - -class Argument(object): - def __init__(self, name, const=False, pydef=None): - self.name = name - self.const = const - self.pydef = pydef - - def ismatrix(self): - return False - - def isarray(self): - return False - -class ScalarArg(Argument): - def format_as_arg(self, ctype, _=None): - const = "" - if self.const: - const += "const " - return const + self.ctype + ' ' + self.name - - format_simple_arg = format_as_arg - - def format_simple_call(self, _): - return self.name - - def format_as_call(self): - return self.tf_macro + '(' + self.name + ')' - -class scalar(ScalarArg): - tf_macro = 'SCAL' - def format_as_arg(self, ctype, _=None): - const = "" - if self.const: - const += "const " - return const + ctype + ' ' + self.name - - format_simple_arg = format_as_arg - - def format_simple_call(self, _): - return self.name - - def format_as_callarg(self, ctype): - return '(' + ctype + ')' + self.name - - def format_pyarg(self): - res = "double "+self.name - if self.pydef is not None: - res += '='+self.pydef - return res - -class size(ScalarArg): - tf_macro = 'SZ' - ctype = 'size_t' - - def format_as_callarg(self, ctype): - return self.name.lower() - -class inc(ScalarArg): - tf_macro = '' - ctype = 'int' - - def format_as_callarg(self, ctype): - return self.name[-1] + 'p->strides[0] / elsize' - -class trans(ScalarArg): - tf_macro = 'TRANS' - ctype = 'cb_transpose' - - def format_as_callarg(self, ctype): - return self.name - -class ArrayArg(Argument): - def __init__(self, name, output=False): - pydef = None - if output: - pydef = 'None' - Argument.__init__(self, name, pydef=pydef) - 
self.isoutput = output - - def isarray(self): - return True - - def format_simple_arg(self, ctype, arraytype): - return arraytype + self.name - - def format_simple_call(self, arraypat): - return arraypat % (self.name,) - - def format_as_arg(self, ctype): - return 'gpudata *' + self.name + ', size_t off' + self.name - - def format_as_call(self): - return 'ARRAY(' + self.name + ', dtype)' - - def format_as_callarg(self, ctype): - return self.name + 'p->data, ' + self.name + 'p->offset / elsize' - - def format_pyarg(self): - res = 'GpuArray '+self.name - if self.pydef is not None: - res += '='+self.pydef - return res - -class matrix(ArrayArg): - def ismatrix(self): - return True - -class vector(ArrayArg): - pass - -class BlasOp(object): - def __init__(self, name, types, arguments, check_dims, setup_order, - py_decls, py_ensure_output): - self.name = name - self.types = types - self.arguments = arguments - self.check_dims = check_dims - self.setup_order = setup_order - self.py_decls = py_decls - self.py_ensure_output = py_ensure_output - self.has_order = any(arg.ismatrix() for arg in self.arguments) - - def matrix_args(self): - return [arg for arg in self.arguments if arg.ismatrix()] - - def array_args(self): - return [arg for arg in self.arguments if arg.isarray()] - - def size_args(self): - return self.args_per_class(size) - - def simple_args(self): - return [arg for arg in self.arguments if (arg.isarray() or type(arg) is scalar or type(arg) is trans)] - - def py_args(self): - return [arg for arg in self.arguments if (arg.isarray() or type(arg) is scalar)] - - def args_per_class(self, cls): - return [arg for arg in self.arguments if type(arg) is cls] - - def format_arguments(self, ctype): - order = '' - if self.has_order: - order = 'cb_order order, ' - return order + ', '.join(arg.format_as_arg(ctype) for arg in self.arguments) - def format_blas_args(self, ctype): - return ', '.join(arg.format_as_callarg(ctype) for arg in self.arguments) - - def 
format_call_args(self): - order = '' - if self.has_order: - order = 'ORDER ' - return order + ', '.join(arg.format_as_call() for arg in self.arguments) - - def format_simple_args(self, ctype, arraytype): - return ', '.join(arg.format_simple_arg(ctype, arraytype) for arg in self.simple_args()) - - def format_simple_call(self, arraypat): - return ', '.join(arg.format_simple_call(arraypat) for arg in self.simple_args()) - - def format_pyargs(self): - l = [arg.format_pyarg() for arg in self.py_args()] - l.extend('trans_'+t.name[-1].lower()+'=False' for t in self.args_per_class(trans)) - l.extend('overwrite_'+a.name.lower()+'=False' for a in self.array_args() if a.isoutput) - return ', '.join(l) - -class Dtype(object): - def __init__(self, name, c): - self.name = name - self.c = c -float = Dtype('float', 's') -double = Dtype('double', 'd') - -check_dims_gemv = """ - if (transA == cb_no_trans) { - m = A->dimensions[0]; - n = A->dimensions[1]; - } else { - m = A->dimensions[1]; - n = A->dimensions[0]; - } - - if (Y->dimensions[0] != m || X->dimensions[0] != n) - return GA_VALUE_ERROR; - - m = A->dimensions[0]; - n = A->dimensions[1]; -""" - -setup_order_gemv = """ - if (Ap->flags & GA_F_CONTIGUOUS) { - o = cb_fortran; - lda = Ap->dimensions[0]; - } else if (Ap->flags & GA_C_CONTIGUOUS) { - o = cb_c; - lda = Ap->dimensions[1]; - } else { - /* Might be worth looking at making degenerate matrices (1xn) work here. 
*/ - err = GA_VALUE_ERROR; - goto cleanup; - } -""" - -py_decls_gemv = "cdef size_t Yshp" - -py_ensure_output_gemv = """ - if A.ga.nd != 2: - raise TypeError, "A is not a matrix" - if transA == cb_no_trans: - Yshp = A.ga.dimensions[0] - else: - Yshp = A.ga.dimensions[1] - if Y is None: - if beta != 0.0: - raise ValueError, "Y not provided and beta != 0" - Y = pygpu_empty(1, &Yshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) - overwrite_y = True -""" - -check_dims_gemm = """ - if (transA == cb_no_trans) { - m = A->dimensions[0]; - k = A->dimensions[1]; - } else { - m = A->dimensions[1]; - k = A->dimensions[0]; - } - - if (transB == cb_no_trans) { - n = B->dimensions[1]; - if (B->dimensions[0] != k) - return GA_VALUE_ERROR; - } else { - n = B->dimensions[0]; - if (B->dimensions[1] != k) - return GA_VALUE_ERROR; - } - - if (C->dimensions[0] != m || C->dimensions[1] != n) - return GA_VALUE_ERROR; -""" - -setup_order_gemm = """ - if (Cp->flags & GA_F_CONTIGUOUS) { - o = cb_fortran; - ldc = Cp->dimensions[0]; - } else if (Cp->flags & GA_C_CONTIGUOUS) { - o = cb_c; - ldc = Cp->dimensions[1]; - } else { - err = GA_VALUE_ERROR; - goto cleanup; - } - if (Ap->flags & GA_F_CONTIGUOUS) { - lda = Ap->dimensions[0]; - if (o == cb_c) { - if (transA == cb_no_trans) - transA = cb_trans; - else - transA = cb_no_trans; - } - } else if (Ap->flags & GA_C_CONTIGUOUS) { - lda = Ap->dimensions[1]; - if (o == cb_fortran) { - if (transA == cb_no_trans) - transA = cb_trans; - else - transA = cb_no_trans; - } - } else { - err = GA_VALUE_ERROR; - goto cleanup; - } - if (Bp->flags & GA_F_CONTIGUOUS) { - ldb = Bp->dimensions[0]; - if (o == cb_c) { - if (transB == cb_no_trans) - transB = cb_trans; - else - transB = cb_no_trans; - } - } else if (Bp->flags & GA_C_CONTIGUOUS) { - ldb = Bp->dimensions[1]; - if (o == cb_fortran) { - if (transB == cb_no_trans) - transB = cb_trans; - else - transB = cb_no_trans; - } - } else { - err = GA_VALUE_ERROR; - goto cleanup; - } -""" - -py_decls_gemm = "cdef 
size_t[2] Cshp" - -py_ensure_output_gemm = """ - if A.ga.nd != 2: - raise TypeError, "A is not a matrix" - if B.ga.nd != 2: - raise TypeError, "B is not a matrix" - if transA == cb_no_trans: - Cshp[0] = A.ga.dimensions[0] - else: - Cshp[0] = A.ga.dimensions[1] - if transB == cb_no_trans: - Cshp[1] = B.ga.dimensions[1] - else: - Cshp[1] = B.ga.dimensions[0] - if C is None: - if beta != 0.0: - raise ValueError, "C not provided and beta != 0" - C = pygpu_empty(2, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) - overwrite_c = True -""" - -check_dims_ger = """ - m = X->dimensions[0]; - n = Y->dimensions[0]; - if (A->dimensions[0] != m || A->dimensions[1] != n) - return GA_VALUE_ERROR; -""" - -setup_order_ger = """ - if (Ap->flags & GA_F_CONTIGUOUS) { - o = cb_fortran; - lda = Ap->dimensions[0]; - } else if (Ap->flags & GA_C_CONTIGUOUS) { - o = cb_c; - lda = Ap->dimensions[1]; - } else { - /* Might be worth looking at making degenerate matrices (1xn) work here. */ - err = GA_VALUE_ERROR; - goto cleanup; - } -""" - -py_decls_ger = "cdef size_t[2] Ashp" - -py_ensure_output_ger = """ - if A is None: - Ashp[0] = X.ga.dimensions[0]; - Ashp[1] = Y.ga.dimensions[0]; - A = pygpu_zeros(2, Ashp, X.ga.typecode, GA_ANY_ORDER, X.context, None) - overwrite_a = True -""" - -# having two (or three) layers of backslash-interpreting can be pretty -# confusing if you want to output a backslash. Add to that mako's -# parsers bugs around backslahes and the 'pass a parameter that is a -# backslash string' approach seems the most likely to work on a range -# of versions. 
-BS = '\\' - -GENERIC_TMPL = Template(""" -/* This file is generated by gen_blas.py in the root of the distribution */ -#if !defined(FETCH_CONTEXT) || !defined(PREFIX) || !defined(ARRAY) || !defined(POST_CALL) -#error "required macros not defined" -#endif - -#ifdef ORDER -% for op in ops: -#ifndef PREP_ORDER_${op.name.upper()} -#define PREP_ORDER_${op.name.upper()} -#endif -#ifndef HANDLE_ORDER_${op.name.upper()} -#define HANDLE_ORDER_${op.name.upper()} -#endif -% endfor -#else -#define ORDER -#endif - -#ifndef INIT_ARGS -#define INIT_ARGS -#endif - -#ifndef TRAIL_ARGS -#define TRAIL_ARGS -#endif - -#ifndef SZ -#define SZ(a) a -#endif - -#ifndef TRANS -#define TRANS(t) t -#endif - -#ifndef SCAL -#define SCAL(s) s -#endif - -#ifndef FUNC_INIT -#define FUNC_INIT -#endif - -#ifndef FUNC_FINI -#define FUNC_FINI -#endif - -#define __GLUE(part1, part2) __GLUE_INT(part1, part2) -#define __GLUE_INT(part1, part2) part1 ## part2 - -% for op in ops: -#define ${op.name.upper()}(dtype, typec, TYPEC) ${bs} - static int typec ## ${op.name}(${op.format_arguments('dtype')}) { ${bs} - FETCH_CONTEXT(${op.array_args()[0].name}); ${bs} - FUNC_DECLS; ${bs} - PREP_ORDER_${op.name.upper()}; ${bs} - ${bs} - HANDLE_ORDER_${op.name.upper()}; ${bs} - FUNC_INIT; ${bs} - ${bs} -% for a in op.array_args(): - ARRAY_INIT(${a.name}); ${bs} -% endfor - ${bs} - PRE_CALL __GLUE(PREFIX(typec, TYPEC), ${op.name})(INIT_ARGS ${op.format_call_args()} TRAIL_ARGS); ${bs} - POST_CALL; ${bs} - ${bs} -% for a in op.array_args(): - ARRAY_FINI(${a.name}); ${bs} -% endfor - FUNC_FINI; ${bs} - ${bs} - return GA_NO_ERROR; ${bs} - } - -% for type in op.types: -${op.name.upper()}(${type.name}, ${type.c}, ${type.c.upper()}) -% endfor -% endfor - -GPUARRAY_LOCAL gpuarray_blas_ops __GLUE(NAME, _ops) = { - setup, - teardown, -% for op in ops: - % for type in op.types: - ${type.c}${op.name}, - % endfor -% endfor - sgemmBatch, - dgemmBatch, -}; -""") - -BUFFERBLAS_TMPL = Template(""" -/* This file is generated by 
gen_blas.py in the root of the distribution */ -#ifndef GPUARRAY_BUFFER_BLAS_H -#define GPUARRAY_BUFFER_BLAS_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum _cb_order { - cb_row, - cb_column -} cb_order; - -#define cb_c cb_row -#define cb_fortran cb_column - -typedef enum _cb_side { - cb_left, - cb_right -} cb_side; - -typedef enum _cb_transpose { - cb_no_trans, - cb_trans, - cb_conj_trans -} cb_transpose; - -typedef enum _cb_uplo { - cb_upper, - cb_lower -} cb_uplo; - -typedef struct _gpuarray_blas_ops { - int (*setup)(void *ctx); - void (*teardown)(void *ctx); -% for op in ops: - % for type in op.types: - int (*${type.c}${op.name})(${op.format_arguments(type.name)}); - % endfor -% endfor - int (*sgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); - int (*dgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); -} gpuarray_blas_ops; - -#ifdef __cplusplus -} -#endif - -#endif -""") - -BLAS_TMPL = Template(""" -/* This file is generated by gen_blas.py in the root of the distribution */ -#ifndef GPUARRAY_BLAS_H -#define GPUARRAY_BLAS_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -% for op in ops: -GPUARRAY_PUBLIC int GpuArray_r${op.name}(${op.format_simple_args('double', 'GpuArray *')}, - int nocopy); - % for type in op.types: -#define GpuArray_${type.c}${op.name} GpuArray_r${op.name} - % endfor -% endfor - -#ifdef __cplusplus -} -#endif - -#endif -""") - -ARRAYBLAS_TMPL = Template(""" -/* This file is generated by gen_blas.py in the root of the distribution */ -#include "gpuarray/blas.h" -#include 
"gpuarray/buffer_blas.h" -#include "gpuarray/types.h" -#include "gpuarray/util.h" -#include "gpuarray/error.h" - -% for op in ops: -int GpuArray_r${op.name}(${op.format_simple_args('double', 'GpuArray *')}, - int nocopy) { - % for a in op.array_args(): - GpuArray *${a.name}p = ${a.name}; - % if not a.isoutput: - GpuArray copy${a.name}; - % endif - % endfor - gpuarray_blas_ops *blas; - void *ctx; - size_t elsize; - size_t ${', '.join(a.name.lower() for a in op.size_args())}; - cb_order o; - int err; -<% firsta = op.array_args()[0].name %> - - if (${firsta}->typecode != GA_FLOAT && ${firsta}->typecode != GA_DOUBLE) - return GA_INVALID_ERROR; - -<% -def ndcond(ary): - if ary.ismatrix(): - return ary.name + "->nd != 2" - else: - return ary.name + "->nd != 1" - -def typecond(first, ary): - return ary.name + "->typecode != " + first + "->typecode" - -def aligncond(a): - return "!(" + a.name + "->flags & GA_ALIGNED)" -%> - if (${'||'.join(ndcond(a) for a in op.array_args())} || - ${'||'.join(typecond(firsta, a) for a in op.array_args())}) - return GA_VALUE_ERROR; - - if (${'||'.join(aligncond(a) for a in op.array_args())}) - return GA_UNALIGNED_ERROR; - - ${op.check_dims} - - elsize = gpuarray_get_elsize(${firsta}->typecode); - -% for a in op.array_args(): - % if a.ismatrix(): - if (!GpuArray_ISONESEGMENT(${a.name})) { - % if a.isoutput: - err = GA_VALUE_ERROR; - goto cleanup; - % else: - if (nocopy) - return GA_COPY_ERROR; - else { - err = GpuArray_copy(©${a.name}, ${a.name}, GA_F_ORDER); - if (err != GA_NO_ERROR) - goto cleanup; - ${a.name}p = ©${a.name}; - } - % endif - } - % else: - if (${a.name}->strides[0] < 0) { - % if a.isoutput: - err = GA_VALUE_ERROR; - goto cleanup; - % else: - if (nocopy) - return GA_COPY_ERROR; - else { - err = GpuArray_copy(©${a.name}, ${a.name}, GA_ANY_ORDER); - if (err != GA_NO_ERROR) - goto cleanup; - ${a.name}p = ©${a.name}; - } - % endif - } - % endif -% endfor - - ${op.setup_order} - - err = ${firsta}p->ops->property(NULL, 
${firsta}p->data, NULL, GA_BUFFER_PROP_CTX, &ctx); - if (err != GA_NO_ERROR) - goto cleanup; - err = ${firsta}p->ops->property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas); - if (err != GA_NO_ERROR) - goto cleanup; - - err = blas->setup(ctx); - if (err != GA_NO_ERROR) - goto cleanup; - - if (${firsta}p->typecode == GA_FLOAT) - err = blas->s${op.name}(o, ${op.format_blas_args('float')}); - else - err = blas->d${op.name}(o, ${op.format_blas_args('double')}); - - cleanup: -% for a in op.array_args(): - % if not a.isoutput: - if (${a.name}p == ©${a.name}) - GpuArray_clear(©${a.name}); - % endif -% endfor - return err; -} -% endfor -""") - -BLAS_PYX_TMPL = Template(""" -# This file is generated by gen_blas.py in the root of the distribution -from pygpu.gpuarray import GpuArrayException -from pygpu.gpuarray cimport (_GpuArray, GpuArray, GA_NO_ERROR, GpuArray_error, - pygpu_copy, pygpu_empty, pygpu_zeros, - GA_ANY_ORDER, GA_F_ORDER, GpuArray_ISONESEGMENT) - -cdef extern from "gpuarray/buffer_blas.h": - ctypedef enum cb_transpose: - cb_no_trans, - cb_trans, - cb_conj_trans - -cdef extern from "gpuarray/blas.h": -% for op in ops: - int GpuArray_r${op.name}(${op.format_simple_args('double', '_GpuArray *')}, - int nocopy) -% endfor - -% for op in ops: -cdef api int pygpu_blas_r${op.name}(${op.format_simple_args('double', 'GpuArray ')}, - bint nocopy) except -1: - cdef int err - err = GpuArray_r${op.name}(${op.format_simple_call('&%s.ga')}, nocopy); - if err != GA_NO_ERROR: - raise GpuArrayException(GpuArray_error(&${op.array_args()[0].name}.ga, err), err) - return 0 - -% endfor - -% for op in ops: -def ${op.name}(${op.format_pyargs()}): - % for m in op.matrix_args(): - % if not m.isoutput: - cdef cb_transpose trans${m.name} - % endif - % endfor - ${op.py_decls} - - % for m in op.matrix_args(): - % if not m.isoutput: - if trans_${m.name.lower()}: - trans${m.name} = cb_trans - else: - trans${m.name} = cb_no_trans - % endif - % endfor - - ${op.py_ensure_output} - - % for a in 
op.array_args(): - % if a.isoutput: - if not overwrite_${a.name.lower()}: - ${a.name} = pygpu_copy(${a.name}, GA_ANY_ORDER) - % endif - % endfor - pygpu_blas_r${op.name}(${op.format_simple_call('%s')}, 0) -<% -outas = [] -for a in op.array_args(): - if a.isoutput: - outas.append(a.name) -assert len(outas) is not 0 -outa = ', '.join(outas) -%> - return ${outa} - -% endfor -""") - -OPS=make_ops() - -try: - generic = GENERIC_TMPL.render(ops=OPS, bs=BS) - bufferblas = BUFFERBLAS_TMPL.render(ops=OPS) - blas = BLAS_TMPL.render(ops=OPS) - arrayblas = ARRAYBLAS_TMPL.render(ops=OPS) - blas_pyx = BLAS_PYX_TMPL.render(ops=OPS) -except Exception: - print exceptions.text_error_template().render() - sys.exit(1) - -with open('src/generic_blas.inc.c', 'w') as f: - f.write(generic) - -with open('src/gpuarray/buffer_blas.h', 'w') as f: - f.write(bufferblas) - -with open('src/gpuarray/blas.h', 'w') as f: - f.write(blas) - -with open('src/gpuarray_array_blas.c', 'w') as f: - f.write(arrayblas) - -with open('pygpu/blas.pyx', 'w') as f: - f.write(blas_pyx) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index a1041ea692..f83322d0a0 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -1,5 +1,3 @@ - -# This file is generated by gen_blas.py in the root of the distribution from pygpu.gpuarray import GpuArrayException from pygpu.gpuarray cimport (_GpuArray, GpuArray, GA_NO_ERROR, GpuArray_error, pygpu_copy, pygpu_empty, pygpu_zeros, @@ -12,23 +10,26 @@ cdef extern from "gpuarray/buffer_blas.h": cb_conj_trans cdef extern from "gpuarray/blas.h": - int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A, _GpuArray *X, double beta, _GpuArray *Y, - int nocopy) - int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, _GpuArray *A, _GpuArray *B, double beta, _GpuArray *C, - int nocopy) + int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A, + _GpuArray *X, double beta, _GpuArray *Y, int nocopy) + int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, 
+ double alpha, _GpuArray *A, _GpuArray *B, + double beta, _GpuArray *C, int nocopy) int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A, - int nocopy) + int nocopy) -cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, GpuArray X, double beta, GpuArray Y, - bint nocopy) except -1: +cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, + GpuArray X, double beta, GpuArray Y, + bint nocopy) except -1: cdef int err err = GpuArray_rgemv(transA, alpha, &A.ga, &X.ga, beta, &Y.ga, nocopy); if err != GA_NO_ERROR: raise GpuArrayException(GpuArray_error(&A.ga, err), err) return 0 -cdef api int pygpu_blas_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray A, GpuArray B, double beta, GpuArray C, - bint nocopy) except -1: +cdef api int pygpu_blas_rgemm(cb_transpose transA, cb_transpose transB, + double alpha, GpuArray A, GpuArray B, + double beta, GpuArray C, bint nocopy) except -1: cdef int err err = GpuArray_rgemm(transA, transB, alpha, &A.ga, &B.ga, beta, &C.ga, nocopy); if err != GA_NO_ERROR: @@ -36,7 +37,7 @@ cdef api int pygpu_blas_rgemm(cb_transpose transA, cb_transpose transB, double a return 0 cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, - bint nocopy) except -1: + bint nocopy) except -1: cdef int err err = GpuArray_rger(alpha, &X.ga, &Y.ga, &A.ga, nocopy); if err != GA_NO_ERROR: @@ -44,7 +45,8 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, return 0 -def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, trans_a=False, overwrite_y=False): +def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, + GpuArray Y=None, trans_a=False, overwrite_y=False): cdef cb_transpose transA cdef size_t Yshp @@ -53,7 +55,6 @@ def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, else: transA = cb_no_trans - if A.ga.nd != 2: raise TypeError, "A is not a matrix" if transA == 
cb_no_trans: @@ -66,14 +67,14 @@ def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, Y = pygpu_empty(1, &Yshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_y = True - if not overwrite_y: Y = pygpu_copy(Y, GA_ANY_ORDER) pygpu_blas_rgemv(transA, alpha, A, X, beta, Y, 0) return Y -def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, trans_a=False, trans_b=False, overwrite_c=False): +def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, + trans_a=False, trans_b=False, overwrite_c=False): cdef cb_transpose transA cdef cb_transpose transB cdef size_t[2] Cshp @@ -87,7 +88,6 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, tra else: transB = cb_no_trans - if A.ga.nd != 2: raise TypeError, "A is not a matrix" if B.ga.nd != 2: @@ -106,28 +106,24 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, tra C = pygpu_empty(2, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_c = True - if not overwrite_c: C = pygpu_copy(C, GA_ANY_ORDER) pygpu_blas_rgemm(transA, transB, alpha, A, B, beta, C, 0) return C -def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None, overwrite_a=False): +def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None, + overwrite_a=False): cdef size_t[2] Ashp - - if A is None: Ashp[0] = X.ga.dimensions[0]; Ashp[1] = Y.ga.dimensions[0]; A = pygpu_zeros(2, Ashp, X.ga.typecode, GA_ANY_ORDER, X.context, None) overwrite_a = True - if not overwrite_a: A = pygpu_copy(A, GA_ANY_ORDER) pygpu_blas_rger(alpha, X, Y, A, 0) return A - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bca10bea1c..e1a9e443a6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,4 @@ include(CheckFunctionExists) -include(${CMAKE_SOURCE_DIR}/gen_blas.cmake) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DDEBUG") diff --git a/src/generic_blas.inc.c b/src/generic_blas.inc.c deleted file mode 100644 index 
60e50869d7..0000000000 --- a/src/generic_blas.inc.c +++ /dev/null @@ -1,151 +0,0 @@ - -/* This file is generated by gen_blas.py in the root of the distribution */ -#if !defined(FETCH_CONTEXT) || !defined(PREFIX) || !defined(ARRAY) || !defined(POST_CALL) -#error "required macros not defined" -#endif - -#ifdef ORDER -#ifndef PREP_ORDER_GEMV -#define PREP_ORDER_GEMV -#endif -#ifndef HANDLE_ORDER_GEMV -#define HANDLE_ORDER_GEMV -#endif -#ifndef PREP_ORDER_GEMM -#define PREP_ORDER_GEMM -#endif -#ifndef HANDLE_ORDER_GEMM -#define HANDLE_ORDER_GEMM -#endif -#ifndef PREP_ORDER_GER -#define PREP_ORDER_GER -#endif -#ifndef HANDLE_ORDER_GER -#define HANDLE_ORDER_GER -#endif -#else -#define ORDER -#endif - -#ifndef INIT_ARGS -#define INIT_ARGS -#endif - -#ifndef TRAIL_ARGS -#define TRAIL_ARGS -#endif - -#ifndef SZ -#define SZ(a) a -#endif - -#ifndef TRANS -#define TRANS(t) t -#endif - -#ifndef SCAL -#define SCAL(s) s -#endif - -#ifndef FUNC_INIT -#define FUNC_INIT -#endif - -#ifndef FUNC_FINI -#define FUNC_FINI -#endif - -#define __GLUE(part1, part2) __GLUE_INT(part1, part2) -#define __GLUE_INT(part1, part2) part1 ## part2 - -#define GEMV(dtype, typec, TYPEC) \ - static int typec ## gemv(cb_order order, cb_transpose transA, size_t M, size_t N, dtype alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, dtype beta, gpudata *Y, size_t offY, int incY) { \ - FETCH_CONTEXT(A); \ - FUNC_DECLS; \ - PREP_ORDER_GEMV; \ - \ - HANDLE_ORDER_GEMV; \ - FUNC_INIT; \ - \ - ARRAY_INIT(A); \ - ARRAY_INIT(X); \ - ARRAY_INIT(Y); \ - \ - PRE_CALL __GLUE(PREFIX(typec, TYPEC), gemv)(INIT_ARGS ORDER TRANS(transA), SZ(M), SZ(N), SCAL(alpha), ARRAY(A, dtype), SZ(lda), ARRAY(X, dtype), (incX), SCAL(beta), ARRAY(Y, dtype), (incY) TRAIL_ARGS); \ - POST_CALL; \ - \ - ARRAY_FINI(A); \ - ARRAY_FINI(X); \ - ARRAY_FINI(Y); \ - FUNC_FINI; \ - \ - return GA_NO_ERROR; \ - } - -GEMV(float, s, S) -GEMV(double, d, D) -#define GEMM(dtype, typec, TYPEC) \ - static int typec ## gemm(cb_order 
order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, dtype alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, dtype beta, gpudata *C, size_t offC, size_t ldc) { \ - FETCH_CONTEXT(A); \ - FUNC_DECLS; \ - PREP_ORDER_GEMM; \ - \ - HANDLE_ORDER_GEMM; \ - FUNC_INIT; \ - \ - ARRAY_INIT(A); \ - ARRAY_INIT(B); \ - ARRAY_INIT(C); \ - \ - PRE_CALL __GLUE(PREFIX(typec, TYPEC), gemm)(INIT_ARGS ORDER TRANS(transA), TRANS(transB), SZ(M), SZ(N), SZ(K), SCAL(alpha), ARRAY(A, dtype), SZ(lda), ARRAY(B, dtype), SZ(ldb), SCAL(beta), ARRAY(C, dtype), SZ(ldc) TRAIL_ARGS); \ - POST_CALL; \ - \ - ARRAY_FINI(A); \ - ARRAY_FINI(B); \ - ARRAY_FINI(C); \ - FUNC_FINI; \ - \ - return GA_NO_ERROR; \ - } - -GEMM(float, s, S) -GEMM(double, d, D) -#define GER(dtype, typec, TYPEC) \ - static int typec ## ger(cb_order order, size_t M, size_t N, dtype alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { \ - FETCH_CONTEXT(X); \ - FUNC_DECLS; \ - PREP_ORDER_GER; \ - \ - HANDLE_ORDER_GER; \ - FUNC_INIT; \ - \ - ARRAY_INIT(X); \ - ARRAY_INIT(Y); \ - ARRAY_INIT(A); \ - \ - PRE_CALL __GLUE(PREFIX(typec, TYPEC), ger)(INIT_ARGS ORDER SZ(M), SZ(N), SCAL(alpha), ARRAY(X, dtype), (incX), ARRAY(Y, dtype), (incY), ARRAY(A, dtype), SZ(lda) TRAIL_ARGS); \ - POST_CALL; \ - \ - ARRAY_FINI(X); \ - ARRAY_FINI(Y); \ - ARRAY_FINI(A); \ - FUNC_FINI; \ - \ - return GA_NO_ERROR; \ - } - -GER(float, s, S) -GER(double, d, D) - -GPUARRAY_LOCAL gpuarray_blas_ops __GLUE(NAME, _ops) = { - setup, - teardown, - sgemv, - dgemv, - sgemm, - dgemm, - sger, - dger, - sgemmBatch, - dgemmBatch, -}; diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index bee4832f4f..fcd49d0705 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -1,5 +1,3 @@ - -/* This file is generated by gen_blas.py in the root of the distribution */ #ifndef GPUARRAY_BLAS_H #define GPUARRAY_BLAS_H @@ -10,16 +8,21 @@ extern "C" { #endif 
-GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, - int nocopy); +GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha, + GpuArray *A, GpuArray *X, double beta, + GpuArray *Y, int nocopy); +#define GpuArray_hgemv GpuArray_rgemv #define GpuArray_sgemv GpuArray_rgemv #define GpuArray_dgemv GpuArray_rgemv -GPUARRAY_PUBLIC int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, - int nocopy); +GPUARRAY_PUBLIC int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, + double alpha, GpuArray *A, GpuArray *B, + double beta, GpuArray *C, int nocopy); +#define GpuArray_hgemm GpuArray_rgemm #define GpuArray_sgemm GpuArray_rgemm #define GpuArray_dgemm GpuArray_rgemm -GPUARRAY_PUBLIC int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, - int nocopy); +GPUARRAY_PUBLIC int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, + GpuArray *A, int nocopy); +#define GpuArray_hger GpuArray_rger #define GpuArray_sger GpuArray_rger #define GpuArray_dger GpuArray_rger diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 0a4573a314..81a53c4c88 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -1,5 +1,3 @@ - -/* This file is generated by gen_blas.py in the root of the distribution */ #ifndef GPUARRAY_BUFFER_BLAS_H #define GPUARRAY_BUFFER_BLAS_H @@ -37,12 +35,16 @@ typedef enum _cb_uplo { typedef struct _gpuarray_blas_ops { int (*setup)(void *ctx); void (*teardown)(void *ctx); + int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY); int (*sgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t 
offY, int incY); int (*dgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY); + int (*hgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc); int (*sgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc); int (*dgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); + int (*hger)(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*sger)(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*dger)(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); + int (*hgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); int (*sgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); int 
(*dgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); } gpuarray_blas_ops; diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index c522387897..3cc37e48f3 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -1,13 +1,11 @@ - -/* This file is generated by gen_blas.py in the root of the distribution */ #include "gpuarray/blas.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/types.h" #include "gpuarray/util.h" #include "gpuarray/error.h" -int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, - int nocopy) { +int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, + GpuArray *X, double beta, GpuArray *Y, int nocopy) { GpuArray *Ap = A; GpuArray copyA; GpuArray *Xp = X; @@ -20,19 +18,20 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, cb_order o; int err; - - if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) + if (A->typecode != GA_HALF && + A->typecode != GA_FLOAT && + A->typecode != GA_DOUBLE) return GA_INVALID_ERROR; - - if (A->nd != 2||X->nd != 1||Y->nd != 1 || - A->typecode != A->typecode||X->typecode != A->typecode||Y->typecode != A->typecode) + if (A->nd != 2 || X->nd != 1 || Y->nd != 1 || + A->typecode != A->typecode || X->typecode != A->typecode || + Y->typecode != A->typecode) return GA_VALUE_ERROR; - if (!(A->flags & GA_ALIGNED)||!(X->flags & GA_ALIGNED)||!(Y->flags & GA_ALIGNED)) + if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || + !(Y->flags & GA_ALIGNED)) return GA_UNALIGNED_ERROR; - if (transA == cb_no_trans) { m = A->dimensions[0]; n = A->dimensions[1]; @@ -47,7 +46,6 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, m = A->dimensions[0]; n = 
A->dimensions[1]; - elsize = gpuarray_get_elsize(A->typecode); if (!GpuArray_ISONESEGMENT(A)) { @@ -75,7 +73,6 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, goto cleanup; } - if (Ap->flags & GA_F_CONTIGUOUS) { o = cb_fortran; lda = Ap->dimensions[0]; @@ -88,7 +85,6 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, goto cleanup; } - err = Ap->ops->property(NULL, Ap->data, NULL, GA_BUFFER_PROP_CTX, &ctx); if (err != GA_NO_ERROR) goto cleanup; @@ -100,11 +96,20 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, if (err != GA_NO_ERROR) goto cleanup; - if (Ap->typecode == GA_FLOAT) + switch (Ap->typecode) { + case GA_HALF: + if (blas->hgemv == NULL) + err = GA_DEVSUP_ERROR; + else + err = blas->hgemv(o, transA, m, n, (float)alpha, Ap->data, Ap->offset / elsize, lda, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, (float)beta, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize); + break; + case GA_FLOAT: err = blas->sgemv(o, transA, m, n, (float)alpha, Ap->data, Ap->offset / elsize, lda, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, (float)beta, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize); - else + break; + case GA_DOUBLE: err = blas->dgemv(o, transA, m, n, (double)alpha, Ap->data, Ap->offset / elsize, lda, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, (double)beta, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize); - + break; + } cleanup: if (Ap == ©A) GpuArray_clear(©A); @@ -112,8 +117,10 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, GpuArray_clear(©X); return err; } -int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, - int nocopy) { + +int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, + GpuArray *A, GpuArray *B, double beta, GpuArray *C, + int nocopy) { GpuArray *Ap = A; GpuArray 
copyA; GpuArray *Bp = B; @@ -126,19 +133,19 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuAr cb_order o; int err; - - if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) + if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && + A->typecode != GA_DOUBLE) return GA_INVALID_ERROR; - - if (A->nd != 2||B->nd != 2||C->nd != 2 || - A->typecode != A->typecode||B->typecode != A->typecode||C->typecode != A->typecode) + if (A->nd != 2 || B->nd != 2 || C->nd != 2 || + A->typecode != A->typecode || B->typecode != A->typecode || + C->typecode != A->typecode) return GA_VALUE_ERROR; - if (!(A->flags & GA_ALIGNED)||!(B->flags & GA_ALIGNED)||!(C->flags & GA_ALIGNED)) + if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || + !(C->flags & GA_ALIGNED)) return GA_UNALIGNED_ERROR; - if (transA == cb_no_trans) { m = A->dimensions[0]; k = A->dimensions[1]; @@ -160,7 +167,6 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuAr if (C->dimensions[0] != m || C->dimensions[1] != n) return GA_VALUE_ERROR; - elsize = gpuarray_get_elsize(A->typecode); if (!GpuArray_ISONESEGMENT(A)) { @@ -188,7 +194,6 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuAr goto cleanup; } - if (Cp->flags & GA_F_CONTIGUOUS) { o = cb_fortran; ldc = Cp->dimensions[0]; @@ -240,7 +245,6 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuAr goto cleanup; } - err = Ap->ops->property(NULL, Ap->data, NULL, GA_BUFFER_PROP_CTX, &ctx); if (err != GA_NO_ERROR) goto cleanup; @@ -252,10 +256,20 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuAr if (err != GA_NO_ERROR) goto cleanup; - if (Ap->typecode == GA_FLOAT) + switch (Ap->typecode) { + case GA_HALF: + if (blas->hgemm == NULL) + err = GA_DEVSUP_ERROR; + else + err = blas->hgemm(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset / elsize, lda, Bp->data, Bp->offset / elsize, ldb, (float)beta, 
Cp->data, Cp->offset / elsize, ldc); + break; + case GA_FLOAT: err = blas->sgemm(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset / elsize, lda, Bp->data, Bp->offset / elsize, ldb, (float)beta, Cp->data, Cp->offset / elsize, ldc); - else + break; + case GA_DOUBLE: err = blas->dgemm(o, transA, transB, m, n, k, (double)alpha, Ap->data, Ap->offset / elsize, lda, Bp->data, Bp->offset / elsize, ldb, (double)beta, Cp->data, Cp->offset / elsize, ldc); + break; + } cleanup: if (Ap == ©A) @@ -264,8 +278,9 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuAr GpuArray_clear(©B); return err; } + int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, - int nocopy) { + int nocopy) { GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; @@ -278,25 +293,24 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, cb_order o; int err; - - if (X->typecode != GA_FLOAT && X->typecode != GA_DOUBLE) + if (X->typecode != GA_HALF && X->typecode != GA_FLOAT && + X->typecode != GA_DOUBLE) return GA_INVALID_ERROR; - - if (X->nd != 1||Y->nd != 1||A->nd != 2 || - X->typecode != X->typecode||Y->typecode != X->typecode||A->typecode != X->typecode) + if (X->nd != 1 || Y->nd != 1 || A->nd != 2 || + X->typecode != X->typecode || Y->typecode != X->typecode || + A->typecode != X->typecode) return GA_VALUE_ERROR; - if (!(X->flags & GA_ALIGNED)||!(Y->flags & GA_ALIGNED)||!(A->flags & GA_ALIGNED)) + if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || + !(A->flags & GA_ALIGNED)) return GA_UNALIGNED_ERROR; - m = X->dimensions[0]; n = Y->dimensions[0]; if (A->dimensions[0] != m || A->dimensions[1] != n) return GA_VALUE_ERROR; - elsize = gpuarray_get_elsize(X->typecode); if (X->strides[0] < 0) { @@ -324,7 +338,6 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, goto cleanup; } - if (Ap->flags & GA_F_CONTIGUOUS) { o = cb_fortran; lda = Ap->dimensions[0]; @@ -337,7 +350,6 @@ int GpuArray_rger(double 
alpha, GpuArray *X, GpuArray *Y, GpuArray *A, goto cleanup; } - err = Xp->ops->property(NULL, Xp->data, NULL, GA_BUFFER_PROP_CTX, &ctx); if (err != GA_NO_ERROR) goto cleanup; @@ -349,10 +361,20 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, if (err != GA_NO_ERROR) goto cleanup; - if (Xp->typecode == GA_FLOAT) + switch(Xp->typecode) { + case GA_HALF: + if (blas->hger == NULL) + err = GA_DEVSUP_ERROR; + else + err = blas->hger(o, m, n, (float)alpha, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Ap->data, Ap->offset / elsize, lda); + break; + case GA_FLOAT: err = blas->sger(o, m, n, (float)alpha, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Ap->data, Ap->offset / elsize, lda); - else + break; + case GA_DOUBLE: err = blas->dger(o, m, n, (double)alpha, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Ap->data, Ap->offset / elsize, lda); + break; + } cleanup: if (Xp == ©X) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 92103dfeaa..3b2ed1d9fa 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -62,108 +62,182 @@ static void teardown(void *c) { cuda_exit(ctx); } -#define NAME cublas - -#define FETCH_CONTEXT(A) cuda_context *ctx = (A)->ctx -#define FUNC_DECLS cublasStatus_t err -#define PREP_ORDER_GEMV size_t t - -#define HANDLE_ORDER_GEMV \ - if (order == cb_c) { \ - t = N; \ - N = M; \ - M = t; \ - if (transA == cb_no_trans) { \ - transA = cb_trans; \ - } else { \ - transA = cb_no_trans; \ - } \ +static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, + float beta, gpudata *C, size_t offC, size_t ldc) { + cuda_context *ctx = A->ctx; + gpudata 
*T; + size_t t; + cublasStatus_t err; + cb_transpose transT; + + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + transT = transA; + transA = transB; + transB = transT; + t = offA; + offA = offB; + offB = t; } -#define PREP_ORDER_GEMM \ - size_t lt, t; \ - gpudata *T; \ - cb_transpose transT - -#define PREP_ORDER_GEMMBATCH \ - size_t *lt, t; \ - gpudata **T; \ - cb_transpose transT - -#define HANDLE_ORDER_GEMM \ - if (order == cb_c) { \ - t = N; \ - N = M; \ - M = t; \ - T = A; \ - A = B; \ - B = T; \ - t = lda; \ - lda = ldb; \ - ldb = t; \ - transT = transA; \ - transA = transB; \ - transB = transT; \ - lt = offA; \ - offA = offB; \ - offB = lt; \ + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + cuda_wait(A, CUDA_WAIT_READ); + cuda_wait(B, CUDA_WAIT_READ); + cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + err = cublasSgemm(ctx->blas_handle, convT(transA), convT(transB), M, N, K, + &alpha, ((float *)A->ptr) + offA, lda, + ((float *)B->ptr) + offB, ldb, &beta, + ((float *)C->ptr) + offC, ldc); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; } + cuda_mark(A, CUDA_WAIT_READ); + cuda_mark(B, CUDA_WAIT_READ); + cuda_mark(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + cuda_exit(ctx); + return GA_NO_ERROR; +} + +static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, + double beta, gpudata *C, size_t offC, size_t ldc) { + cuda_context *ctx = A->ctx; + gpudata *T; + size_t t; + cublasStatus_t err; + cb_transpose transT; + + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + transT = transA; + transA = transB; + transB = transT; + t = offA; + offA 
= offB; + offB = t; + } + + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + cuda_wait(A, CUDA_WAIT_READ); + cuda_wait(B, CUDA_WAIT_READ); + cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); -#define HANDLE_ORDER_GEMMBATCH HANDLE_ORDER_GEMM - -#define PREP_ORDER_GER \ - size_t t; \ - gpudata *td - -#define HANDLE_ORDER_GER \ - if (order == cb_c) { \ - t = M; \ - M = N; \ - N = t; \ - t = offX; \ - offX = offY; \ - offY = t; \ - t = incX; \ - incX = incY; \ - incY = t; \ - td = X; \ - X = Y; \ - Y = td; \ + err = cublasDgemm(ctx->blas_handle, convT(transA), convT(transB), M, N, K, + &alpha, ((double *)A->ptr) + offA, lda, + ((double *)B->ptr) + offB, ldb, &beta, + ((double *)C->ptr) + offC, ldc); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; } + cuda_mark(A, CUDA_WAIT_READ); + cuda_mark(B, CUDA_WAIT_READ); + cuda_mark(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); -#define FUNC_INIT \ - cuda_enter(ctx); \ - if (ctx->err != CUDA_SUCCESS) \ - return GA_IMPL_ERROR + cuda_exit(ctx); + return GA_NO_ERROR; +} -#define FUNC_FINI cuda_exit(ctx) +static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, + float beta, gpudata *C, size_t offC, size_t ldc) { +#ifdef HAVE_CUBLAS_SGEMMEX + cuda_context *ctx = A->ctx; + gpudata *T; + size_t t; + cublasStatus_t err; + cb_transpose transT; + + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + transT = transA; + transA = transB; + transB = transT; + t = offA; + offA = offB; + offB = t; + } -/*#define ARRAY_INIT(A) \ - ctx->err = cuStreamWaitEvent(ctx->s, (A)->ev, 0); \ - if (ctx->err != CUDA_SUCCESS) { \ - cuda_exit(ctx); \ - return GA_IMPL_ERROR; \ - }*/ -#define ARRAY_INIT(A) + cuda_enter(ctx); + if 
(ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; -/*#define ARRAY_FINI(A) cuEventRecord((A)->ev, ctx->s)*/ -#define ARRAY_FINI(A) + cuda_wait(A, CUDA_WAIT_READ); + cuda_wait(B, CUDA_WAIT_READ); + cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); -#define PRE_CALL err = -#define PREFIX(typec, TYPEC) cublas ## TYPEC -#define INIT_ARGS ctx->blas_handle, -#define TRANS(tr) convT(tr) -#define SZ(s) s -#define SCAL(s) &s -#define ARRAY(A, dtype) ((dtype *)A->ptr) + off ## A + err = cublasSgemmEx(ctx->blas_handle, convT(transA), convT(transB), M, N, K, + &alpha, + ((uint16_t *)A->ptr) + offA, CUBLAS_DATA_HALF, lda, + ((uint16_t *)B->ptr) + offB, CUBLAS_DATA_HALF, ldb, + &beta, + ((uint16_t *)C->ptr) + offC, CUBLAS_DATA_HALF, ldc); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; + } -#define POST_CALL \ - if (err == CUBLAS_STATUS_ARCH_MISMATCH) \ - return GA_DEVSUP_ERROR; \ - if (err != CUBLAS_STATUS_SUCCESS) \ - return GA_BLAS_ERROR + cuda_mark(A, CUDA_WAIT_READ); + cuda_mark(B, CUDA_WAIT_READ); + cuda_mark(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + cuda_exit(ctx); + return GA_NO_ERROR; +#else + return GA_DEVSUP_ERROR; +#endif +} static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, @@ -171,28 +245,55 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { - FETCH_CONTEXT(A[0]); - FUNC_DECLS; - PREP_ORDER_GEMMBATCH; + cuda_context *ctx; + size_t *lt, t; + gpudata **T; + size_t i; + cb_transpose transT; + cublasStatus_t err; + + if (batchCount == 0) return GA_NO_ERROR; + + ctx = A[0]->ctx; + + /* Possibly optimize this to make multiple dispatch of sgemm for + * bigger sizes */ float **T_l = alloca(sizeof(float *) * batchCount * 3); const float **A_l = (const float **)T_l; 
const float **B_l = (const float **)T_l + batchCount; float **C_l = T_l + (batchCount * 2); CUdeviceptr Ta, Aa, Ba, Ca; - HANDLE_ORDER_GEMMBATCH; - FUNC_INIT; - - { - size_t i; - for (i = 0; i < batchCount; i++) { - ARRAY_INIT(A[i]); - A_l[i] = ((float *)A[i]->ptr) + offA[i]; - ARRAY_INIT(B[i]); - B_l[i] = ((float *)B[i]->ptr) + offB[i]; - ARRAY_INIT(C[i]); - C_l[i] = ((float *)C[i]->ptr) + offC[i]; - } + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + transT = transA; + transA = transB; + transB = transT; + lt = offA; + offA = offB; + offB = lt; + } + + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + for (i = 0; i < batchCount; i++) { + cuda_wait(A[i], CUDA_WAIT_READ); + cuda_wait(B[i], CUDA_WAIT_READ); + cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); + A_l[i] = ((float *)A[i]->ptr) + offA[i]; + B_l[i] = ((float *)B[i]->ptr) + offB[i]; + C_l[i] = ((float *)C[i]->ptr) + offC[i]; } cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3); @@ -202,22 +303,26 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3); - PRE_CALL cublasSgemmBatched(INIT_ARGS TRANS(transA), TRANS(transB), SZ(M), SZ(N), SZ(K), SCAL(alpha), (const float **)Aa, SZ(lda), (const float **)Ba, SZ(ldb), SCAL(beta), (float **)Ca, SZ(ldc), batchCount); - POST_CALL; - + err = cublasSgemmBatched(ctx->blas_handle, convT(transA), convT(transB), + M, N, K, &alpha, (const float **)Aa, lda, + (const float **)Ba, ldb, &beta, + (float **)Ca, ldc, batchCount); cuMemFree(Ta); - - { - size_t i; - for (i = 0; i < batchCount; i++) { - ARRAY_FINI(A[i]); - ARRAY_FINI(B[i]); - ARRAY_FINI(C[i]); - } + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; } - FUNC_FINI; + for (i = 0; i < batchCount; i++) { + cuda_mark(A[i], 
CUDA_WAIT_READ); + cuda_mark(B[i], CUDA_WAIT_READ); + cuda_mark(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); + } + + cuda_exit(ctx); return GA_NO_ERROR; } @@ -227,28 +332,55 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { - FETCH_CONTEXT(A[0]); - FUNC_DECLS; - PREP_ORDER_GEMMBATCH; + cuda_context *ctx; + size_t *lt, t; + gpudata **T; + size_t i; + cb_transpose transT; + cublasStatus_t err; + + if (batchCount == 0) return GA_NO_ERROR; + + ctx = A[0]->ctx; + + /* Possibly optimize this to make multiple dispatch of sgemm for + * bigger sizes */ double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **B_l = (const double **)T_l + batchCount; double **C_l = T_l + (batchCount * 2); CUdeviceptr Ta, Aa, Ba, Ca; - HANDLE_ORDER_GEMMBATCH; - FUNC_INIT; - - { - size_t i; - for (i = 0; i < batchCount; i++) { - ARRAY_INIT(A[i]); - A_l[i] = ((double *)A[i]->ptr) + offA[i]; - ARRAY_INIT(B[i]); - B_l[i] = ((double *)B[i]->ptr) + offB[i]; - ARRAY_INIT(C[i]); - C_l[i] = ((double *)C[i]->ptr) + offC[i]; - } + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + transT = transA; + transA = transB; + transB = transT; + lt = offA; + offA = offB; + offB = lt; + } + + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + for (i = 0; i < batchCount; i++) { + cuda_wait(A[i], CUDA_WAIT_READ); + cuda_wait(B[i], CUDA_WAIT_READ); + cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); + A_l[i] = ((double *)A[i]->ptr) + offA[i]; + B_l[i] = ((double *)B[i]->ptr) + offB[i]; + C_l[i] = ((double *)C[i]->ptr) + offC[i]; } cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3); @@ -258,24 +390,247 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cuMemcpyHtoD(Ta, T_l, sizeof(double 
*) * batchCount * 3); - PRE_CALL cublasDgemmBatched(INIT_ARGS TRANS(transA), TRANS(transB), SZ(M), SZ(N), SZ(K), SCAL(alpha), (const double **)Aa, SZ(lda), (const double **)Ba, SZ(ldb), SCAL(beta), (double **)Ca, SZ(ldc), batchCount); - POST_CALL; - + err = cublasDgemmBatched(ctx->blas_handle, convT(transA), convT(transB), + M, N, K, &alpha, (const double **)Aa, lda, + (const double **)Ba, ldb, &beta, + (double **)Ca, ldc, batchCount); cuMemFree(Ta); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; + } - { - size_t i; - for (i = 0; i < batchCount; i++) { - ARRAY_FINI(A[i]); - ARRAY_FINI(B[i]); - ARRAY_FINI(C[i]); + for (i = 0; i < batchCount; i++) { + cuda_mark(A[i], CUDA_WAIT_READ); + cuda_mark(B[i], CUDA_WAIT_READ); + cuda_mark(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); + } + + cuda_exit(ctx); + return GA_NO_ERROR; +} + +static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + float alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, + float beta, gpudata *Y, size_t offY, int incY) { + cuda_context *ctx = A->ctx; + cublasStatus_t err; + size_t t; + + ASSERT_BUF(A); + ASSERT_BUF(X); + ASSERT_BUF(Y); + + if (order == cb_c) { + t = N; + N = M; + M = t; + + if (transA == cb_no_trans) { + transA = cb_trans; + } else { + transA = cb_no_trans; + } + } + + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + cuda_wait(A, CUDA_WAIT_READ); + cuda_wait(X, CUDA_WAIT_READ); + cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + err = cublasSgemv(ctx->blas_handle, convT(transA), M, N, &alpha, + ((float *)A->ptr) + offA, lda, + ((float *)X->ptr) + offX, incX, + &beta, ((float *)Y->ptr) + offY, incY); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; + } + + cuda_mark(A, CUDA_WAIT_READ); + cuda_mark(X, CUDA_WAIT_READ); + 
cuda_mark(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + +static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + double alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, + double beta, gpudata *Y, size_t offY, int incY) { + cuda_context *ctx = A->ctx; + cublasStatus_t err; + size_t t; + + ASSERT_BUF(A); + ASSERT_BUF(X); + ASSERT_BUF(Y); + + if (order == cb_c) { + t = N; + N = M; + M = t; + + if (transA == cb_no_trans) { + transA = cb_trans; + } else { + transA = cb_no_trans; } } - FUNC_FINI; + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + cuda_wait(A, CUDA_WAIT_READ); + cuda_wait(X, CUDA_WAIT_READ); + cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + err = cublasDgemv(ctx->blas_handle, convT(transA), M, N, &alpha, + ((double *)A->ptr) + offA, lda, + ((double *)X->ptr) + offX, incX, + &beta, ((double *)Y->ptr) + offY, incY); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; + } + + cuda_mark(A, CUDA_WAIT_READ); + cuda_mark(X, CUDA_WAIT_READ); + cuda_mark(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + cuda_exit(ctx); return GA_NO_ERROR; } -#include "generic_blas.inc.c" +static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, + gpudata *A, size_t offA, size_t lda) { + cuda_context *ctx = X->ctx; + gpudata *td; + size_t t; + cublasStatus_t err; + + if (order == cb_c) { + t = M; + M = N; + N = t; + t = offX; + offX = offY; + offY = t; + t = incX; + incX = incY; + incY = t; + td = X; + X = Y; + Y = td; + } + + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + cuda_wait(X, CUDA_WAIT_READ); + cuda_wait(Y, CUDA_WAIT_READ); + cuda_wait(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + err = cublasSger(ctx->blas_handle, M, N, &alpha, + ((float *)X->ptr) + offX, incX, + 
((float *)Y->ptr) + offY, incY, + ((float *)A->ptr) + offA, lda); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; + } + + cuda_mark(X, CUDA_WAIT_READ); + cuda_mark(Y, CUDA_WAIT_READ); + cuda_mark(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + +static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, + gpudata *A, size_t offA, size_t lda) { + cuda_context *ctx = X->ctx; + gpudata *td; + size_t t; + cublasStatus_t err; + + if (order == cb_c) { + t = M; + M = N; + N = t; + t = offX; + offX = offY; + offY = t; + t = incX; + incX = incY; + incY = t; + td = X; + X = Y; + Y = td; + } + + cuda_enter(ctx); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + + cuda_wait(X, CUDA_WAIT_READ); + cuda_wait(Y, CUDA_WAIT_READ); + cuda_wait(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + err = cublasDger(ctx->blas_handle, M, N, &alpha, + ((double *)X->ptr) + offX, incX, + ((double *)Y->ptr) + offY, incY, + ((double *)A->ptr) + offA, lda); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + if (err == CUBLAS_STATUS_ARCH_MISMATCH) + return GA_DEVSUP_ERROR; + return GA_BLAS_ERROR; + } + + cuda_mark(X, CUDA_WAIT_READ); + cuda_mark(Y, CUDA_WAIT_READ); + cuda_mark(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} +GPUARRAY_LOCAL gpuarray_blas_ops cublas_ops = { + setup, + teardown, + NULL, /* hgemv */ + sgemv, + dgemv, + hgemm, + sgemm, + dgemm, + NULL, /* hger */ + sger, + dger, + NULL, /* hgemmBatch */ + sgemmBatch, + dgemmBatch, +}; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 3add97f397..c2cde12d37 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -58,15 +58,6 @@ static void teardown(void *c) { clblasTeardown(); } -#define NAME clblas - -#define 
FETCH_CONTEXT(A) cl_ctx *ctx = (A)->ctx -#define FUNC_DECLS \ - clblasStatus err; \ - cl_uint num_ev = 0; \ - cl_event evl[3]; \ - cl_event ev - #define ARRAY_INIT(A) \ if (A->ev != NULL) \ evl[num_ev++] = A->ev @@ -77,40 +68,33 @@ static void teardown(void *c) { A->ev = ev; \ clRetainEvent(A->ev) -#define PRE_CALL err = -#define PREFIX(typec, TYPEC) clblas ## TYPEC -#define TRANS(tr) convT(tr) -#define ARRAY(A, dtype) A->buf, off ## A -#define SCAL(s) s -#define SZ(s) s -#define INIT_ARGS -#define TRAIL_ARGS , 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev - -#define POST_CALL \ - if (err != clblasSuccess) \ - return GA_BLAS_ERROR - -#define ORDER convO(order), - static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { - FETCH_CONTEXT(A[0]); - FUNC_DECLS; + cl_ctx *ctx = A[0]->ctx; + cl_event evl[3]; + cl_event ev; size_t i; + cl_uint num_ev = 0; + clblasStatus err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - PRE_CALL clblasSgemm(INIT_ARGS ORDER TRANS(transA), TRANS(transB), SZ(M), SZ(N), SZ(K), SCAL(alpha), ARRAY(A[i], float), SZ(lda), ARRAY(B[i], float), SZ(ldb), SCAL(beta), ARRAY(C[i], float), SZ(ldc) TRAIL_ARGS); - POST_CALL; + err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); + clReleaseEvent(ev); } return GA_NO_ERROR; @@ -122,22 +106,196 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { - FETCH_CONTEXT(A[0]); - FUNC_DECLS; + cl_ctx *ctx = A[0]->ctx; + cl_event evl[3]; + cl_event ev; size_t i; + cl_uint num_ev = 0; + clblasStatus err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - PRE_CALL clblasDgemm(INIT_ARGS ORDER TRANS(transA), TRANS(transB), SZ(M), SZ(N), SZ(K), SCAL(alpha), ARRAY(A[i], double), SZ(lda), ARRAY(B[i], double), SZ(ldb), SCAL(beta), ARRAY(C[i], double), SZ(ldc) TRAIL_ARGS); - POST_CALL; + err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); + clReleaseEvent(ev); } return GA_NO_ERROR; } -#include "generic_blas.inc.c" +static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + float alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { + cl_ctx *ctx = A->ctx; + clblasStatus err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(X); + ARRAY_INIT(Y); + + err = clblasSgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, 1, &ctx->q, + num_ev, num_ev == 0 ? 
NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(X); + ARRAY_FINI(Y); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + double alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY) { + cl_ctx *ctx = A->ctx; + clblasStatus err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(X); + ARRAY_INIT(Y); + + err = clblasDgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(X); + ARRAY_FINI(Y); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, float beta, + gpudata *C, size_t offC, size_t ldc) { + cl_ctx *ctx = A->ctx; + clblasStatus err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(B); + ARRAY_INIT(C); + + err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? 
NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(B); + ARRAY_FINI(C); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, double beta, + gpudata *C, size_t offC, size_t ldc) { + cl_ctx *ctx = A->ctx; + clblasStatus err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(B); + ARRAY_INIT(C); + + err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(B); + ARRAY_FINI(C); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int sger(cb_order order, size_t M, size_t N, float alpha, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *A, size_t offA, size_t lda) { + cl_ctx *ctx = X->ctx; + cl_event evl[3]; + cl_event ev; + cl_uint num_ev = 0; + clblasStatus err; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(A); + + err = clblasSger(convO(order), M, N, alpha, X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, + num_ev, num_ev == 0 ? 
NULL : evl, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(A); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = { + setup, + teardown, + NULL, /* hgemv */ + sgemv, + dgemv, + NULL, /* hgemm */ + sgemm, + dgemm, + NULL, /* hger */ + sger, + dger, + NULL, /* hgemmBatch */ + sgemmBatch, + dgemmBatch, +}; From 919f3fac866b7706e41c9ddf2c11f11af8345534 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Sep 2015 15:21:05 -0400 Subject: [PATCH 4/8] Stop masking the error with cuda_enter(). --- src/gpuarray_blas_cuda_cublas.c | 20 -------------- src/gpuarray_buffer_cuda.c | 47 ++------------------------------- 2 files changed, 2 insertions(+), 65 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 3b2ed1d9fa..0deec34d82 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -28,8 +28,6 @@ static int setup(void *c) { return GA_NO_ERROR; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; err = cublasCreate(&handle); cuda_exit(ctx); @@ -93,8 +91,6 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); @@ -150,8 +146,6 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); @@ -208,8 +202,6 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); @@ -284,8 +276,6 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - 
return GA_IMPL_ERROR; for (i = 0; i < batchCount; i++) { cuda_wait(A[i], CUDA_WAIT_READ); @@ -371,8 +361,6 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; for (i = 0; i < batchCount; i++) { cuda_wait(A[i], CUDA_WAIT_READ); @@ -437,8 +425,6 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(A, CUDA_WAIT_READ); cuda_wait(X, CUDA_WAIT_READ); @@ -489,8 +475,6 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(A, CUDA_WAIT_READ); cuda_wait(X, CUDA_WAIT_READ); @@ -540,8 +524,6 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ); @@ -591,8 +573,6 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ); diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 3fd6cddd2c..0117282a74 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -139,7 +139,7 @@ CUstream cuda_get_stream(void *ctx) { void cuda_enter(cuda_context *ctx) { ASSERT_CTX(ctx); if (!ctx->enter) - ctx->err = cuCtxPushCurrent(ctx->ctx); + cuCtxPushCurrent(ctx->ctx); ctx->enter++; } @@ -161,10 +161,6 @@ gpudata *cuda_make_buf(void *c, CUdeviceptr p, size_t sz) { res->refcnt = 1; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) { - free(res); - return NULL; - } res->ptr = p; if (ctx->flags & GA_CTX_MULTI_THREAD) @@ -369,10 +365,6 @@ static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags, res->flags = flags & 
(GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY); cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) { - free(res); - FAIL(NULL, GA_IMPL_ERROR); - } if (ctx->flags & GA_CTX_MULTI_THREAD) fl |= CU_EVENT_BLOCKING_SYNC; @@ -481,8 +473,6 @@ static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src, return GA_VALUE_ERROR; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuMemcpyDtoDAsync(dst->ptr + dstoff, src->ptr + srcoff, sz, ctx->s); @@ -505,8 +495,6 @@ static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { return GA_VALUE_ERROR; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuEventSynchronize(src->ev); if (ctx->err != CUDA_SUCCESS) { @@ -535,8 +523,6 @@ static int cuda_write(gpudata *dst, size_t dstoff, const void *src, return GA_VALUE_ERROR; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuEventSynchronize(dst->ev); if (ctx->err != CUDA_SUCCESS) { @@ -561,8 +547,6 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) { if ((dst->sz - dstoff) == 0) return GA_NO_ERROR; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuMemsetD8Async(dst->ptr + dstoff, data, dst->sz - dstoff, ctx->s); @@ -777,8 +761,6 @@ static gpukernel *cuda_newkernel(void *c, unsigned int count, } cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - FAIL(NULL, GA_IMPL_ERROR); ctx->err = cuCtxGetDevice(&dev); if (ctx->err != CUDA_SUCCESS) { @@ -948,8 +930,6 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, ASSERT_KER(k); cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; switch (n) { case 1: @@ -991,8 +971,6 @@ static int cuda_sync(gpudata *b) { ASSERT_BUF(b); cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuEventSynchronize(b->ev); cuda_exit(ctx); if (ctx->err != CUDA_SUCCESS) @@ -1318,10 +1296,7 @@ static gpudata *cuda_transfer(gpudata *src, size_t offset, 
size_t sz, GA_BUFFER_WRITE_ONLY), NULL); if (dst == NULL) return NULL; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) { - cuda_free(dst); - return NULL; - } + ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr+offset, sz, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); @@ -1338,10 +1313,6 @@ static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz, if (dst == NULL) return NULL; cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) { - cuda_free(dst); - return NULL; - } ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr+offset, src->ctx->ctx, sz, dst_ctx->s); cuEventRecord(dst->ev, dst_ctx->s); @@ -1396,8 +1367,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_DEVNAME: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); @@ -1420,8 +1389,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_MAXLSIZE: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); @@ -1439,8 +1406,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_LMEMSIZE: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); @@ -1458,8 +1423,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_NUMPROCS: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); @@ -1478,8 +1441,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_MAXGSIZE: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { 
cuda_exit(ctx); @@ -1527,8 +1488,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_KERNEL_PROP_MAXLSIZE: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k); @@ -1540,8 +1499,6 @@ static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, case GA_KERNEL_PROP_PREFLSIZE: cuda_enter(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); From 96740abf14267eab921b2b94f7d7c06c4757854f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Sep 2015 15:56:44 -0400 Subject: [PATCH 5/8] Fix longstanding bug that would raise exceptions when there was no errors. --- src/gpuarray_buffer_cuda.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 0117282a74..d862185bd4 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -59,7 +59,7 @@ static int cuda_property(void *, gpudata *, gpukernel *, int, void *); #define val_free(v) cuda_freekernel(*v); #include "cache_extcopy.h" -static int detect_arch(char *ret); +static int detect_arch(char *ret, CUresult *err); void *cuda_make_ctx(CUcontext ctx, int flags) { int64_t v = 0; @@ -75,7 +75,7 @@ void *cuda_make_ctx(CUcontext ctx, int flags) { res->refcnt = 1; res->flags = flags; res->enter = 0; - if (detect_arch(res->bin_id)) { + if (detect_arch(res->bin_id, &err)) { free(res); return NULL; } @@ -574,15 +574,14 @@ static CUresult get_cc(CUdevice dev, int *maj, int *min) { #endif } -static int detect_arch(char *ret) { +static int detect_arch(char *ret, CUresult *err) { CUdevice dev; int major, minor; int res; - CUresult err; - err = cuCtxGetDevice(&dev); - if (err != CUDA_SUCCESS) return GA_IMPL_ERROR; - err = get_cc(dev, &major, &minor); - if (err != 
CUDA_SUCCESS) return GA_IMPL_ERROR; + *err = cuCtxGetDevice(&dev); + if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR; + *err = get_cc(dev, &major, &minor); + if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR; res = snprintf(ret, 6, "sm_%d%d", major, minor); if (res == -1 || res > 6) return GA_UNSUPPORTED_ERROR; return GA_NO_ERROR; @@ -591,12 +590,12 @@ static int detect_arch(char *ret) { static const char *TMP_VAR_NAMES[] = {"GPUARRAY_TMPDIR", "TMPDIR", "TMP", "TEMP", "USERPROFILE"}; -static void *call_compiler_impl(const char *src, size_t len, size_t *bin_len, +static void *call_compiler_impl(const char *src, size_t len, + const char *arch_arg, size_t *bin_len, int *ret) { char namebuf[PATH_MAX]; char outbuf[PATH_MAX]; char *tmpdir; - char arch_arg[6]; /* Must be at least 6, see detect_arch() */ struct stat st; ssize_t s; #ifndef _WIN32 @@ -606,10 +605,6 @@ static void *call_compiler_impl(const char *src, size_t len, size_t *bin_len, int sys_err; int fd; char *buf; - int res; - - res = detect_arch(arch_arg); - if (res != GA_NO_ERROR) FAIL(NULL, res); for (i = 0; i < sizeof(TMP_VAR_NAMES)/sizeof(TMP_VAR_NAMES[0]); i++) { tmpdir = getenv(TMP_VAR_NAMES[i]); @@ -716,9 +711,12 @@ static void *call_compiler_impl(const char *src, size_t len, size_t *bin_len, return buf; } -static void *(*call_compiler)(const char *src, size_t len, size_t *bin_len, int *ret) = call_compiler_impl; +static void *(*call_compiler)(const char *src, size_t len, + const char *arch_arg, size_t *bin_len, + int *ret) = call_compiler_impl; GPUARRAY_LOCAL void cuda_set_compiler(void *(*compiler_f)(const char *, size_t, + const char *, size_t *, int *)) { return; /* Disable custom compilers @@ -829,7 +827,7 @@ static gpukernel *cuda_newkernel(void *c, unsigned int count, if (ptx_mode) { bin = sb.s; } else { - bin = call_compiler(sb.s, sb.l, &bin_len, ret); + bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, ret); if (bin == NULL) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; @@ -1121,7 
+1119,6 @@ static inline int gen_extcopy_kernel(const cache_key_t *a, const char *in_t, *in_ld_t; const char *out_t, *out_ld_t; const char *rmod; - char arch[6]; /* Must be at least 6, see detect_arch() */ in_t = map_t(a->itype); out_t = map_t(a->otype); @@ -1137,10 +1134,8 @@ static inline int gen_extcopy_kernel(const cache_key_t *a, out_ld_t = out_t; rmod = get_rmod(a->itype, a->otype); if (in_t == NULL || out_t == NULL) return GA_DEVSUP_ERROR; - res = detect_arch(arch); - if (res != GA_NO_ERROR) return res; - strb_appendf(&sb, ELEM_HEADER_PTX, arch, bits, bits, bits, + strb_appendf(&sb, ELEM_HEADER_PTX, ctx->bin_id, bits, bits, bits, bits, in_t, out_t, bits, bits, bits, bits, bits, nEls, bits, bits); From 81d2b1da6cca2e41534e8b81611c3d454c427251 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Sep 2015 16:44:48 -0400 Subject: [PATCH 6/8] Rename cuda_mark() to cuda_record() --- src/gpuarray_blas_cuda_cublas.c | 54 ++++++++++++++++----------------- src/gpuarray_buffer_cuda.c | 2 +- src/gpuarray_extension.c | 4 +-- src/private_cuda.h | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 0deec34d82..b8c743dcdd 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -107,9 +107,9 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, return GA_BLAS_ERROR; } - cuda_mark(A, CUDA_WAIT_READ); - cuda_mark(B, CUDA_WAIT_READ); - cuda_mark(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A, CUDA_WAIT_READ); + cuda_record(B, CUDA_WAIT_READ); + cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; @@ -162,9 +162,9 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, return GA_BLAS_ERROR; } - cuda_mark(A, CUDA_WAIT_READ); - cuda_mark(B, CUDA_WAIT_READ); - cuda_mark(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A, CUDA_WAIT_READ); + cuda_record(B, CUDA_WAIT_READ); + 
cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; @@ -220,9 +220,9 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, return GA_BLAS_ERROR; } - cuda_mark(A, CUDA_WAIT_READ); - cuda_mark(B, CUDA_WAIT_READ); - cuda_mark(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A, CUDA_WAIT_READ); + cuda_record(B, CUDA_WAIT_READ); + cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; @@ -307,9 +307,9 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, for (i = 0; i < batchCount; i++) { - cuda_mark(A[i], CUDA_WAIT_READ); - cuda_mark(B[i], CUDA_WAIT_READ); - cuda_mark(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A[i], CUDA_WAIT_READ); + cuda_record(B[i], CUDA_WAIT_READ); + cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } cuda_exit(ctx); @@ -391,9 +391,9 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, } for (i = 0; i < batchCount; i++) { - cuda_mark(A[i], CUDA_WAIT_READ); - cuda_mark(B[i], CUDA_WAIT_READ); - cuda_mark(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A[i], CUDA_WAIT_READ); + cuda_record(B[i], CUDA_WAIT_READ); + cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } cuda_exit(ctx); @@ -441,9 +441,9 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, return GA_BLAS_ERROR; } - cuda_mark(A, CUDA_WAIT_READ); - cuda_mark(X, CUDA_WAIT_READ); - cuda_mark(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A, CUDA_WAIT_READ); + cuda_record(X, CUDA_WAIT_READ); + cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); @@ -491,9 +491,9 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, return GA_BLAS_ERROR; } - cuda_mark(A, CUDA_WAIT_READ); - cuda_mark(X, CUDA_WAIT_READ); - cuda_mark(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(A, CUDA_WAIT_READ); + cuda_record(X, CUDA_WAIT_READ); + cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); 
cuda_exit(ctx); @@ -540,9 +540,9 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, return GA_BLAS_ERROR; } - cuda_mark(X, CUDA_WAIT_READ); - cuda_mark(Y, CUDA_WAIT_READ); - cuda_mark(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(X, CUDA_WAIT_READ); + cuda_record(Y, CUDA_WAIT_READ); + cuda_record(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); @@ -589,9 +589,9 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, return GA_BLAS_ERROR; } - cuda_mark(X, CUDA_WAIT_READ); - cuda_mark(Y, CUDA_WAIT_READ); - cuda_mark(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); + cuda_record(X, CUDA_WAIT_READ); + cuda_record(Y, CUDA_WAIT_READ); + cuda_record(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index d862185bd4..cf0af604c9 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -449,7 +449,7 @@ int cuda_wait(gpudata *a, int flags) { return GA_NO_ERROR; } -int cuda_mark(gpudata *a, int flags) { +int cuda_record(gpudata *a, int flags) { ASSERT_BUF(a); cuda_enter(a->ctx); a->ctx->err = cuEventRecord(a->ev, a->ctx->s); diff --git a/src/gpuarray_extension.c b/src/gpuarray_extension.c index e66a48928c..7be8bf018f 100644 --- a/src/gpuarray_extension.c +++ b/src/gpuarray_extension.c @@ -17,7 +17,7 @@ extern void *cuda_make_buf(void); extern void *cuda_get_ptr(void); extern void *cuda_get_sz(void); extern void *cuda_wait(void); -extern void *cuda_mark(void); +extern void *cuda_record(void); extern void *cuda_set_compiler(void); #endif #ifdef WITH_OPENCL @@ -39,7 +39,7 @@ static ext ext_list[] = { {"cuda_get_ptr", cuda_get_ptr}, {"cuda_get_sz", cuda_get_sz}, {"cuda_wait", cuda_wait}, - {"cuda_mark", cuda_mark}, + {"cuda_record", cuda_record}, {"cuda_set_compiler", cuda_set_compiler}, #endif #ifdef WITH_OPENCL diff --git a/src/private_cuda.h b/src/private_cuda.h index 71c8bb7cb8..3ebc5a62e6 100644 --- a/src/private_cuda.h +++ 
b/src/private_cuda.h @@ -79,7 +79,7 @@ GPUARRAY_LOCAL gpudata *cuda_make_buf(void *c, CUdeviceptr p, size_t sz); GPUARRAY_LOCAL CUdeviceptr cuda_get_ptr(gpudata *g); GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g); GPUARRAY_LOCAL int cuda_wait(gpudata *, int); -GPUARRAY_LOCAL int cuda_mark(gpudata *, int); +GPUARRAY_LOCAL int cuda_record(gpudata *, int); /* private flags are in the upper 16 bits */ #define CUDA_WAIT_READ 0x10000 From 92e0acff4d7c3597f910ec4948be8693e38aaa3a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Sep 2015 17:02:21 -0400 Subject: [PATCH 7/8] Add CMake black magic to check for cublasSgemmEx --- src/CMakeLists.txt | 5 +++++ src/gpuarray_blas_cuda_cublas.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e1a9e443a6..22ccb76988 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,6 +82,11 @@ if (CUDA_FOUND) include_directories("${CUDADRV_INCLUDE}") set(GPUARRAY_SRC ${GPUARRAY_SRC} gpuarray_blas_cuda_cublas.c) add_definitions(-DWITH_CUDA_CUBLAS) + set(CMAKE_REQUIRED_LIBRARIES ${CUDA_CUBLAS_LIBRARIES}) + check_function_exists(cublasSgemmEx CUBLAS_SGEMMEX) + if (CUBLAS_SGEMMEX) + add_definitions(-DHAVE_CUBLAS_SGEMMEX) + endif() endif() if(OPENCL_FOUND) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index b8c743dcdd..d8008e69f2 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -176,6 +176,9 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { #ifdef HAVE_CUBLAS_SGEMMEX + /* This will use float32 for computation as it's the best we can + * have right now. In the future when native float16 support will be + * there we will switch to that. 
*/ cuda_context *ctx = A->ctx; gpudata *T; size_t t; From 977cd575b0dc68bc2d9af3acbcec384237b6b030 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Sep 2015 16:19:51 -0400 Subject: [PATCH 8/8] Add ASSERT_BUF() where needed. --- src/gpuarray_blas_cuda_cublas.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index d8008e69f2..53e2fae150 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -71,6 +71,10 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, cublasStatus_t err; cb_transpose transT; + ASSERT_BUF(A); + ASSERT_BUF(B); + ASSERT_BUF(C); + if (order == cb_c) { /* swap A and B */ t = N; @@ -126,6 +130,10 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, cublasStatus_t err; cb_transpose transT; + ASSERT_BUF(A); + ASSERT_BUF(B); + ASSERT_BUF(C); + if (order == cb_c) { /* swap A and B */ t = N; @@ -185,6 +193,10 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, cublasStatus_t err; cb_transpose transT; + ASSERT_BUF(A); + ASSERT_BUF(B); + ASSERT_BUF(C); + if (order == cb_c) { /* swap A and B */ t = N; @@ -249,6 +261,8 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, if (batchCount == 0) return GA_NO_ERROR; + ASSERT_BUF(A[0]); + ctx = A[0]->ctx; /* Possibly optimize this to make multiple dispatch of sgemm for @@ -281,6 +295,9 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cuda_enter(ctx); for (i = 0; i < batchCount; i++) { + ASSERT_BUF(A[i]); + ASSERT_BUF(B[i]); + ASSERT_BUF(C[i]); cuda_wait(A[i], CUDA_WAIT_READ); cuda_wait(B[i], CUDA_WAIT_READ); cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); @@ -334,6 +351,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, if (batchCount == 0) return GA_NO_ERROR; + ASSERT_BUF(A[0]); + ctx = A[0]->ctx; /* 
Possibly optimize this to make multiple dispatch of sgemm for @@ -366,6 +385,9 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cuda_enter(ctx); for (i = 0; i < batchCount; i++) { + ASSERT_BUF(A[i]); + ASSERT_BUF(B[i]); + ASSERT_BUF(C[i]); cuda_wait(A[i], CUDA_WAIT_READ); cuda_wait(B[i], CUDA_WAIT_READ); cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); @@ -511,6 +533,10 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t t; cublasStatus_t err; + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(A); + if (order == cb_c) { t = M; M = N; @@ -560,6 +586,10 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t t; cublasStatus_t err; + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(A); + if (order == cb_c) { t = M; M = N;