From 06602bedcbd29b6ae318c1290a980866c3f0cf4d Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 3 Nov 2016 19:32:07 -0400 Subject: [PATCH 1/2] Convert internally from unsigned to signed axis numbers. This avoids an infinite loop when counting ndX downto 0 with unsigned integers, as the condition `unsigned >= 0` always holds. --- src/gpuarray_reduction.c | 42 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 0c05e14397..9ca69d491e 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -27,27 +27,27 @@ struct maxandargmax_ctx{ GpuArray* dstMax; GpuArray* dstArgmax; const GpuArray* src; - unsigned reduxLen; - const unsigned* reduxList; + int reduxLen; + const int* reduxList; /* General. */ int ret; - unsigned* axisList; + int* axisList; gpucontext* gpuCtx; /* Source code Generator. */ const char* dstMaxType; const char* dstArgmaxType; - unsigned ndd; - unsigned ndr; - unsigned nds; - unsigned ndh; + int ndd; + int ndr; + int nds; + int ndh; strb s; char* sourceCode; GpuKernel kernel; /* Scheduler */ - unsigned hwAxisList[3]; + int hwAxisList[3]; size_t blockSize [3]; size_t gridSize [3]; size_t chunkSize [3]; @@ -64,8 +64,8 @@ typedef struct maxandargmax_ctx maxandargmax_ctx; /* Function prototypes */ -static int axisInSet (unsigned v, - const unsigned* set, +static int axisInSet (int v, + const int* set, size_t setLen, size_t* where); static void appendIdxes (strb* s, @@ -102,7 +102,8 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, reduxLen, reduxList}, + maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, + (int)reduxLen, (const int*)reduxList}, *ctx = &ctxSTACK; if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && @@ -127,8 +128,8 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, * 
@return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (unsigned v, - const unsigned* set, +static int axisInSet (int v, + const int* set, size_t setLen, size_t* where){ size_t i; @@ -190,7 +191,7 @@ static void appendIdxes (strb* s, */ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ - unsigned i; + int i; /** * We initialize certain parts of the context. @@ -216,13 +217,14 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ /* Insane src or reduxLen? */ if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || - ctx->reduxLen == 0 || ctx->reduxLen >= ctx->src->nd){ + ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ return ctx->ret=GA_INVALID_ERROR; } /* Insane or duplicate list entry? */ for(i=0;i<ctx->reduxLen;i++){ - if(ctx->reduxList[i] >= ctx->src->nd || + if(ctx->reduxList[i] < 0 || + ctx->reduxList[i] >= (int)ctx->src->nd || axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ return ctx->ret=GA_INVALID_ERROR; } @@ -260,8 +262,8 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ */ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ - unsigned i, j, maxI = 0; - size_t maxV; + int i, j, maxI = 0; + size_t maxV; ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; @@ -355,7 +357,7 @@ static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); } static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ - unsigned i; + int i; strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. 
*/\n"); strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); @@ -605,7 +607,7 @@ static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); } static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ - unsigned i, f=0; + int i, f=0; for(i=0;i<ctx->nds;i++){ if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ From cf702b565f4dcdbad7e3255f7b585003f687b2f6 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 12:14:49 -0400 Subject: [PATCH 2/2] Make all-dims-reduced usecase work. All-dims-reduced will be slow but does work now without errors. Added testcase to ensure this remains the case. --- src/gpuarray_reduction.c | 34 +++++++++------- tests/check_reduction.c | 88 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 15 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 9ca69d491e..0e6ba09749 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -364,24 +364,26 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - strb_appends(&ctx->s, "\tX "); - for(i=0;i<ctx->ndh;i++){ - strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->ndh-1) ? ";\n" : ", "); + if(ctx->ndh>0){ + strb_appends(&ctx->s, "\tX "); + for(i=0;i<ctx->ndh;i++){ + strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", + i, i, (i==ctx->ndh-1) ? 
";\n" : ", "); + } } strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n"); + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} + if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} + if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} + if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); @@ -725,8 +727,10 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ } } - dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; - gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); + if(ctx->ndh > 0){ + dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; + gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); + } /** * Factorization job. 
We'll steadily increase the slack in case of failure @@ -806,7 +810,7 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ ctx->dstMaxStepsGD && ctx->dstArgmaxStepsGD){ ctx->ret = GpuKernel_call(&ctx->kernel, - ctx->ndh, + ctx->ndh>0 ? ctx->ndh : 1, ctx->blockSize, ctx->gridSize, 0, diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 106f6f3fc5..5138e5c02d 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -348,6 +348,93 @@ START_TEST(test_veryhighrank){ GpuArray_clear(&gaArgmax); }END_TEST +START_TEST(test_alldimsreduced){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) ); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (i*dims[1] + j)*dims[2] + k; + } + } + } + } + + ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); +}END_TEST + Suite *get_suite(void) { Suite *s = suite_create("reduction"); TCase *tc = tcase_create("basic"); @@ -357,6 +444,7 @@ Suite *get_suite(void) { tcase_add_test(tc, test_reduction); tcase_add_test(tc, test_idxtranspose); tcase_add_test(tc, test_veryhighrank); + tcase_add_test(tc, test_alldimsreduced); suite_add_tcase(s, tc); return s;