Add support for 64-bit indexes #376

Merged · 4 commits · Dec 3, 2023
11 changes: 8 additions & 3 deletions doc/src/user_guide.rst
@@ -117,15 +117,20 @@ Parameterises the backend with
 
    ``single`` | ``double``
 
-2. ``rank-allocator`` --- MPI rank allocator:
+2. ``memory-model`` --- if to enable support for large working sets;
+   should be ``normal`` unless a memory-model error is encountered:
+
+   ``normal`` | ``large``
+
+3. ``rank-allocator`` --- MPI rank allocator:
 
    ``linear`` | ``random``
 
-3. ``collect-wait-times`` --- If to track MPI request wait times or not:
+4. ``collect-wait-times`` --- if to track MPI request wait times or not:
 
    ``True`` | ``False``
 
-4. ``collect-wait-times-len`` --- Size of the wait time history buffer:
+5. ``collect-wait-times-len`` --- size of the wait time history buffer:
 
    *int*
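Note: the large memory model is enabled from the ``[backend]`` section of a PyFR configuration file. A minimal sketch (the ``precision`` line is illustrative context only):

    [backend]
    precision = double
    memory-model = large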
28 changes: 24 additions & 4 deletions pyfr/backends/base/backend.py
@@ -40,6 +40,16 @@ def __init__(self, cfg):
         self.fpdtype_eps = np.finfo(self.fpdtype).eps
         self.fpdtype_max = np.finfo(self.fpdtype).max
 
+        # Memory model
+        match cfg.get('backend', 'memory-model', 'normal'):
+            case 'normal':
+                self.ixdtype = np.int32
+            case 'large':
+                self.ixdtype = np.int64
+            case _:
+                raise ValueError('Backend memory model must be either normal '
+                                 'or large')
+
         # Allocated matrices
         self.mats = WeakValueDictionary()
         self._mat_counter = count()
@@ -55,16 +65,19 @@ def __init__(self, cfg):
     @cached_property
     def lookup(self):
         pkg = f'pyfr.backends.{self.name}.kernels'
-        dfltargs = dict(fpdtype=self.fpdtype, fpdtype_max=self.fpdtype_max,
-                        soasz=self.soasz, csubsz=self.csubsz, math=math)
+        dfltargs = {
+            'fpdtype': self.fpdtype, 'ixdtype': self.ixdtype,
+            'fpdtype_max': self.fpdtype_max, 'csubsz': self.csubsz,
+            'soasz': self.soasz, 'math': math
+        }
 
         return DottedTemplateLookup(pkg, dfltargs)
 
     def malloc(self, obj, extent):
         # If no extent has been specified then autocommit
         if extent is None:
             # Perform the allocation
-            data = self._malloc_impl(obj.nbytes)
+            data = self._malloc_checked(obj.nbytes)
 
             # Fire the callback
             obj.onalloc(data, 0)
@@ -99,7 +112,7 @@ def commit(self):
         sz = sum(obj.nbytes - (obj.nbytes % -self.alignb) for obj in reqs)
 
         # Perform the allocation
-        data = self._malloc_impl(sz)
+        data = self._malloc_checked(sz)
 
         offset = 0
         for obj in reqs:
@@ -118,6 +131,13 @@ def commit(self):
         self._pend_aliases.clear()
         self._pend_extents.clear()
 
+    def _malloc_checked(self, nbytes):
+        if self.ixdtype == np.int32 and nbytes > 4*2**31 - 1:
+            raise RuntimeError('Allocation too large for normal backend '
+                               'memory-model')
+
+        return self._malloc_impl(nbytes)
+
     def _malloc_impl(self, nbytes):
         pass
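Note: the two new pieces above fit together as in the following standalone Python sketch; the helper names (select_ixdtype, check_alloc) are hypothetical, not PyFR API:

    import numpy as np

    def select_ixdtype(memory_model):
        # Mirrors Backend.__init__ above: 'normal' selects 32-bit
        # indexes, 'large' selects 64-bit indexes
        match memory_model:
            case 'normal':
                return np.int32
            case 'large':
                return np.int64
            case _:
                raise ValueError('memory-model must be normal or large')

    def check_alloc(ixdtype, nbytes):
        # Mirrors _malloc_checked above: with int32 indexes and a
        # minimum element size of 4 bytes, anything past 4*2**31 - 1
        # bytes cannot be indexed, so it is rejected up front
        if ixdtype == np.int32 and nbytes > 4*2**31 - 1:
            raise RuntimeError('Allocation too large for the normal '
                               'memory-model')
        return nbytes

    check_alloc(select_ixdtype('large'), 2**34)   # OK
    check_alloc(select_ixdtype('normal'), 2**34)  # raises RuntimeError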
22 changes: 12 additions & 10 deletions pyfr/backends/base/generator.py
@@ -67,10 +67,11 @@ def __init__(self, name, spec, body):
 
 
 class BaseKernelGenerator:
-    def __init__(self, name, ndim, args, body, fpdtype):
+    def __init__(self, name, ndim, args, body, fpdtype, ixdtype):
         self.name = name
         self.ndim = ndim
         self.fpdtype = fpdtype
+        self.ixdtype = ixdtype
 
         # Parse and sort our argument list
         sargs = sorted((k, Arg(k, v, body)) for k, v in args.items())
@@ -103,7 +104,7 @@ def argspec(self):
 
         # Dimensions
         argn += self._dims
-        argt += [[np.int32]]*self.ndim
+        argt += [[self.ixdtype]]*self.ndim
 
         # Scalar args (always of type fpdtype)
         argn += [sa.name for sa in self.scalargs]
@@ -114,11 +115,11 @@ def argspec(self):
             argn.append(va.name)
 
             if va.isview:
-                argt.append([np.intp]*(2 + (va.ncdim == 2)))
+                argt.append([np.uintp]*(2 + (va.ncdim == 2)))
             elif self.needs_ldim(va):
-                argt.append([np.intp, np.int32])
+                argt.append([np.uintp, self.ixdtype])
            else:
-                argt.append([np.intp])
+                argt.append([np.uintp])
 
         # Return
         return self.ndim, argn, argt
@@ -285,11 +286,12 @@ def __init__(self, *args, **kwargs):
         if self.ndim == 1:
             self.preamble += 'if (_x < _nx)'
         else:
+            blk_y, lid_y = self.block2d[1], self._lid[1]
             self.preamble += f'''
-                int _ysize = (_ny + {self.block2d[1] - 1}) / {self.block2d[1]};
-                int _ystart = {self._lid[1]}*_ysize;
-                int _yend = min(_ny, _ystart + _ysize);
-                for (int _y = _ystart; _x < _nx && _y < _yend; _y++)'''
+                ixdtype_t _ysize = (_ny + {blk_y - 1}) / {blk_y};
+                ixdtype_t _ystart = {lid_y}*_ysize;
+                ixdtype_t _yend = min(_ny, _ystart + _ysize);
+                for (ixdtype_t _y = _ystart; _x < _nx && _y < _yend; _y++)'''
 
     def ldim_size(self, name, factor=1):
         return f'ld{name}'
@@ -377,7 +379,7 @@ def render(self):
 
         return f'''{spec}
 {{
-    int _x = {self._gid};
+    ixdtype_t _x = {self._gid};
     #define X_IDX (_x)
     #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
     #define BCAST_BLK(r, c, ld) c
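Note: to see the effect on an argument specification, consider a hypothetical 2D kernel with one non-view matrix argument that needs a leading dimension; under the large memory model its type list would come out along these lines (a sketch, not actual PyFR output):

    import numpy as np

    ixdtype = np.int64  # backend.ixdtype under the 'large' model

    # Two dimension arguments (_ny, _nx), then the matrix: an
    # unsigned pointer followed by its leading dimension
    argt = [[ixdtype]]*2 + [[np.uintp, ixdtype]]

Pointers are now passed as np.uintp rather than np.intp, presumably since device addresses are unsigned.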
7 changes: 4 additions & 3 deletions pyfr/backends/base/makoutil.py
@@ -158,11 +158,12 @@ def kernel(context, name, ndim, **kwargs):
     # Capture the kernel body
     body = capture(context, context['caller'].body)
 
-    # Get the generator class and floating point data type
-    kerngen, fpdtype = context['_kernel_generator'], context['fpdtype']
+    # Get the generator class and data types
+    kerngen = context['_kernel_generator']
+    fpdtype, ixdtype = context['fpdtype'], context['ixdtype']
 
     # Instantiate
-    kern = kerngen(name, int(ndim), kwargs, body, fpdtype)
+    kern = kerngen(name, int(ndim), kwargs, body, fpdtype, ixdtype)
 
     # Save the argument/type list for later use
     context['_kernel_argspecs'][name] = kern.argspec()
13 changes: 8 additions & 5 deletions pyfr/backends/base/types.py
@@ -237,13 +237,16 @@ def __init__(self, backend, matmap, rmap, cmap, rstridemap, vshape, tags):
         if any(m.dtype != self.refdtype for m in self._mats):
             raise TypeError('Mixed data types are not supported')
 
+        # Index type
+        ixdtype = backend.ixdtype
+
         # SoA size
         k, csubsz = backend.soasz, backend.csubsz
 
         # Base offsets and leading dimensions for each point
-        offset = np.empty(self.n, dtype=np.int32)
-        leaddim = np.empty(self.n, dtype=np.int32)
-        blkdisp = np.empty(self.n, dtype=np.int32)
+        offset = np.empty(self.n, dtype=ixdtype)
+        leaddim = np.empty(self.n, dtype=ixdtype)
+        blkdisp = np.empty(self.n, dtype=ixdtype)
 
         for m in self._mats:
             ix = np.where(matmap == m.mid)
@@ -256,12 +259,12 @@ def __init__(self, backend, matmap, rmap, cmap, rstridemap, vshape, tags):
             coldisp = (cmapmod // k)*k*self.nvcol + cmapmod % k
 
         mapping = (offset + blkdisp + rowdisp + coldisp)[None, :]
-        self.mapping = backend.const_matrix(mapping, dtype=np.int32, tags=tags)
+        self.mapping = backend.const_matrix(mapping, dtype=ixdtype, tags=tags)
 
         # Row strides
         if self.nvrow > 1:
             rstrides = (rstridemap*leaddim)[None, :]
-            self.rstrides = backend.const_matrix(rstrides, dtype=np.int32,
+            self.rstrides = backend.const_matrix(rstrides, dtype=ixdtype,
                                                  tags=tags)
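Note: switching the mapping arrays to ixdtype is what actually unlocks large working sets; with np.int32 the flattened offsets silently wrap once they pass 2**31 - 1. A small numpy demonstration of the failure mode:

    import numpy as np

    # A base offset just below the int32 limit plus a modest
    # displacement wraps to a negative index under 32-bit arithmetic
    offset = np.array([2**31 - 8], dtype=np.int32)
    coldisp = np.array([16], dtype=np.int32)

    print(offset + coldisp)                   # [-2147483640] -- wrapped
    print(offset.astype(np.int64) + coldisp)  # [2147483656]  -- correct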
18 changes: 10 additions & 8 deletions pyfr/backends/cuda/blasext.py
@@ -10,7 +10,8 @@ def axnpby(self, *arr, subdims=None):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        nrow, ncol, ldim, dtype = arr[0].traits[1:]
+        ixdtype = self.backend.ixdtype
+        nrow, ncol, ldim, fpdtype = arr[0].traits[1:]
         ncola, ncolb = arr[0].ioshape[1:]
 
         # Render the kernel template
@@ -20,7 +21,7 @@ def axnpby(self, *arr, subdims=None):
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
+                                  [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv)
 
         # Determine the grid/block
         block = (128, 1, 1)
@@ -59,7 +60,8 @@ def reduction(self, *rs, method, norm, dt_mat=None):
             raise ValueError('Incompatible matrix types')
 
         cuda = self.backend.cuda
-        nrow, ncol, ldim, dtype = rs[0].traits[1:]
+        ixdtype = self.backend.ixdtype
+        nrow, ncol, ldim, fpdtype = rs[0].traits[1:]
         ncola, ncolb = rs[0].ioshape[1:]
 
         # Reduction block dimensions
@@ -72,7 +74,7 @@ def reduction(self, *rs, method, norm, dt_mat=None):
         reduced_dev = cuda.mem_alloc(ncola*grid[0]*rs[0].itemsize)
 
         # Empty result buffer on the host
-        reduced_host = cuda.pagelocked_empty((ncola, grid[0]), dtype)
+        reduced_host = cuda.pagelocked_empty((ncola, grid[0]), fpdtype)
 
         tplargs = dict(norm=norm, method=method)
 
@@ -86,11 +88,11 @@ def reduction(self, *rs, method, norm, dt_mat=None):
 
         # Argument types for reduction kernel
         if method == 'errest':
-            argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2
+            argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]*2
         elif method == 'resid' and dt_mat:
-            argt = [np.int32]*3 + [np.intp]*4 + [dtype]
+            argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]
         else:
-            argt = [np.int32]*3 + [np.intp]*3 + [dtype]
+            argt = [ixdtype]*3 + [np.uintp]*3 + [fpdtype]
 
         # Build the reduction kernel
         rkern = self._build_kernel('reduction', src, argt)
@@ -100,7 +102,7 @@ def reduction(self, *rs, method, norm, dt_mat=None):
         params.set_args(nrow, ncolb, ldim, reduced_dev, *regs)
 
         # Runtime argument offset
-        facoff = argt.index(dtype)
+        facoff = argt.index(fpdtype)
 
         # Norm type
         reducer = np.max if norm == 'uniform' else np.sum
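Note: for instance, with nv = 2 under the large memory model, the axnpby argument type list built above comes out as follows (sketch):

    import numpy as np

    ixdtype, fpdtype, nv = np.int64, np.float64, 2

    # nrow, ncolb and ldim; then nv matrix pointers; then nv scalars
    argt = [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv
    # i.e. [int64, int64, int64, uintp, uintp, float64, float64]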
14 changes: 9 additions & 5 deletions pyfr/backends/cuda/cublas.py
@@ -81,14 +81,21 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0):
         if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
             raise ValueError('Incompatible matrices for out = a*b')
 
-        # CUBLAS expects inputs to be column-major (or Fortran order in
+        # cuBLAS expects inputs to be column-major (or Fortran order in
         # numpy parlance). However as C = A*B => C^T = (A*B)^T
         # = (B^T)*(A^T) with a little trickery we can multiply our
         # row-major matrices directly.
         m, n, k = b.ncol, a.nrow, a.ncol
         A, B, C = b, a, out
 
-        # α and β factors for C = α*(A*op(B)) + β*C
+        # Cache key
+        ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim)
+
+        # Size checks
+        if any(sz > 2**31 - 1 for sz in ckey[3:]):
+            raise ValueError('Matrices too large for cuBLAS')
+
+        # α and β factors for C = α*(A*B) + β*C
         if a.dtype == np.float64:
             cublasgemm = w.cublasDgemm
             alpha_ct, beta_ct = c_double(alpha), c_double(beta)
@@ -102,9 +109,6 @@ def gemm(stream):
             cublasgemm(h, w.OP_N, w.OP_N, m, n, k, alpha_ct, A, A.leaddim,
                        B, B.leaddim, beta_ct, C, C.leaddim)
 
-        # Cache key
-        ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim)
-
         # Obtain the performance of the kernel
         try:
             dt = self._mul_timing[ckey]
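Note: the new size check exists because the classic cublas<t>gemm entry points take their dimensions and leading dimensions as 32-bit ints, so even under the large memory model each of m, n, k and the leading dimensions must fit; hoisting the cache key above the check also lets ckey[3:] double as the tuple of sizes to validate. A standalone sketch of the same guard (hypothetical helper name):

    def check_gemm_dims(m, n, k, lda, ldb, ldc):
        # cublasDgemm/cublasSgemm take int parameters, so each
        # dimension must fit in a signed 32-bit integer
        if any(sz > 2**31 - 1 for sz in (m, n, k, lda, ldb, ldc)):
            raise ValueError('Matrices too large for cuBLAS')

    check_gemm_dims(4096, 4096, 4096, 4096, 4096, 4096)  # passes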
16 changes: 9 additions & 7 deletions pyfr/backends/cuda/generator.py
@@ -5,33 +5,35 @@
 
 class CUDAKernelGenerator(BaseGPUKernelGenerator):
     _lid = ('threadIdx.x', 'threadIdx.y')
-    _gid = 'blockIdx.x*blockDim.x + threadIdx.x'
+    _gid = 'ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x'
     _shared_prfx = '__shared__'
     _shared_sync = '__syncthreads()'
 
     def _render_spec(self):
+        res = '__restrict__'
+
         # We first need the argument list; starting with the dimensions
-        kargs = [f'int {d}' for d in self._dims]
+        kargs = [f'ixdtype_t {d}' for d in self._dims]
 
         # Now add any scalar arguments
         kargs.extend(f'{sa.dtype} {sa.name}' for sa in self.scalargs)
 
         # Finally, add the vector arguments
         for va in self.vectargs:
             if va.intent == 'in':
-                kargs.append(f'const {va.dtype}* __restrict__ {va.name}_v')
+                kargs.append(f'const {va.dtype}* {res} {va.name}_v')
             else:
-                kargs.append(f'{va.dtype}* __restrict__ {va.name}_v')
+                kargs.append(f'{va.dtype}* {res} {va.name}_v')
 
             # Views
             if va.isview:
-                kargs.append(f'const int* __restrict__ {va.name}_vix')
+                kargs.append(f'const ixdtype_t* {res} {va.name}_vix')
 
                 if va.ncdim == 2:
-                    kargs.append(f'const int* __restrict__ {va.name}_vrstri')
+                    kargs.append(f'const ixdtype_t* {res} {va.name}_vrstri')
             # Arrays
             elif self.needs_ldim(va):
-                kargs.append(f'int ld{va.name}')
+                kargs.append(f'ixdtype_t ld{va.name}')
 
         # Determine the launch bounds for the kernel
         nthrds = prod(self.block1d if self.ndim == 1 else self.block2d)
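Note: the cast in _gid matters because blockIdx.x and blockDim.x are both 32-bit, so their product is evaluated modulo 2**32 before any widening; promoting one operand first preserves the full index. The same effect reproduced in numpy terms:

    import numpy as np

    block_idx = np.array([2**26], dtype=np.uint32)
    block_dim = np.uint32(128)

    # 32-bit product wraps: 2**33 mod 2**32 == 0
    print(block_idx * block_dim)                   # [0]
    # Widening first, as ixdtype_t(blockIdx.x) does, is safe
    print(block_idx.astype(np.int64) * block_dim)  # [8589934592]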
7 changes: 4 additions & 3 deletions pyfr/backends/cuda/kernels/axnpby.mako
@@ -2,14 +2,15 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 __global__ void
-axnpby(int nrow, int ncolb, int ldim, fpdtype_t* __restrict__ x0,
+axnpby(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim,
+       fpdtype_t* __restrict__ x0,
        ${', '.join(f'const fpdtype_t* __restrict__ x{i}'
                    for i in range(1, nv))},
        ${', '.join(f'fpdtype_t a{i}' for i in range(nv))})
 {
     int i = blockIdx.y*blockDim.y + threadIdx.y;
-    int j = blockIdx.x*blockDim.x + threadIdx.x;
-    int idx;
+    ixdtype_t j = ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x;
+    ixdtype_t idx;
 
     if (j < ncolb && a0 == 0.0)
     {
5 changes: 4 additions & 1 deletion pyfr/backends/cuda/kernels/base.mako
@@ -5,8 +5,11 @@
 #define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)
 
 // Typedefs
-typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
 typedef unsigned int uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
+typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t;
 
 // Atomic helpers
 __device__ void atomic_min_fpdtype(fpdtype_t* addr, fpdtype_t val)
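Note: under the two memory models ixdtype_t therefore resolves to different C types. A quick Python sketch, assuming npdtype_to_ctype maps int32 to int and int64 to long long (this mapping is an assumption, not taken from the diff):

    import numpy as np

    # Assumed mirror of pyfr.backends.base.makoutil.npdtype_to_ctype
    ctype = {np.int32: 'int', np.int64: 'long long'}

    for model, ixdtype in [('normal', np.int32), ('large', np.int64)]:
        width = np.dtype(ixdtype).itemsize * 8
        print(f'{model}: typedef {ctype[ixdtype]} ixdtype_t;  // {width}-bit')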