From 47d255143e692bd46d21991cc3c87e389a84f824 Mon Sep 17 00:00:00 2001
From: davidweichiang
Date: Tue, 28 Jul 2015 11:56:01 +0800
Subject: [PATCH] round reduction size up to nearest power of two to avoid overloading cache

---
 pygpu/reduction.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pygpu/reduction.py b/pygpu/reduction.py
index 6bc7c9c0df..812af8b6c4 100644
--- a/pygpu/reduction.py
+++ b/pygpu/reduction.py
@@ -8,6 +8,12 @@ from .tools import ArrayArg, check_args, prod, lfu_cache
 from .elemwise import parse_c_args, massage_op
 
 
+def _ceil_log2(x):
+    # nearest power of 2 (going up)
+    if x != 0:
+        return int(math.ceil(math.log(x, 2)))
+    else:
+        return 0
 basic_kernel = Template("""
 ${preamble}
 
@@ -172,11 +178,7 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux,
 
     def _find_kernel_ls(self, tmpl, max_ls, *tmpl_args):
         local_size = min(self.init_local_size, max_ls)
-        # nearest power of 2 (going up)
-        if local_size != 0:
-            count_lim = int(math.ceil(math.log(local_size, 2)))
-        else:
-            count_lim = 0
+        count_lim = _ceil_log2(local_size)
         local_size = 2**count_lim
         loop_count = 0
         while loop_count <= count_lim:
@@ -248,7 +250,7 @@ def __call__(self, *args, **kwargs):
         if self.init_local_size < n:
             k, _, _, ls = self._get_basic_kernel(self.init_local_size, nd)
         else:
-            k, _, _, ls = self._get_basic_kernel(n, nd)
+            k, _, _, ls = self._get_basic_kernel(2**_ceil_log2(n), nd)
 
         kargs = [n, out]
         kargs.extend(dims)