From cf075582d7c03dc9dc1e3a9849dc3cd27157356b Mon Sep 17 00:00:00 2001 From: Freddie Witherden Date: Mon, 18 Apr 2022 18:30:26 -0500 Subject: [PATCH 1/3] Cleanups. --- pyfr/plugins/base.py | 2 +- pyfr/regions.py | 210 ++++++++++++++++++++++--------------------- 2 files changed, 109 insertions(+), 103 deletions(-) diff --git a/pyfr/plugins/base.py b/pyfr/plugins/base.py index 031501de3..195921124 100644 --- a/pyfr/plugins/base.py +++ b/pyfr/plugins/base.py @@ -116,7 +116,7 @@ def __init__(self, intg, *args, **kwargs): # All elements if region == '*': self._prepare_region_data_all(intg) - # All elements inside some region region + # All elements inside some region else: # Geometric region if '(' in region: diff --git a/pyfr/regions.py b/pyfr/regions.py index cc757cb04..62d65edfd 100644 --- a/pyfr/regions.py +++ b/pyfr/regions.py @@ -11,26 +11,9 @@ from pyfr.util import match_paired_paren, subclasses, subclass_where -def get_region(name, *args): - return subclass_where(BaseRegion, name=name)(*args) - - class BaseRegion: - name = None - def interior_eles(self, mesh, rallocs): - eset = {} - - for shape in subclasses(BaseShape, just_leaf=True): - f = f'spt_{shape.name}_p{rallocs.prank}' - if f not in mesh: - continue - - inside = self.pts_in_region(mesh[f]) - if np.any(inside): - eset[shape.name] = np.any(inside, axis=0).nonzero()[0].tolist() - - return eset + pass def surface_faces(self, mesh, rallocs, exclbcs=[]): from mpi4py import MPI @@ -88,7 +71,102 @@ def surface_faces(self, mesh, rallocs, exclbcs=[]): return {k: sorted(v) for k, v in nsfaces.items()} -class BoxRegion(BaseRegion): +class BoundaryRegion(BaseRegion): + def __init__(self, bcname, nlayers=1): + self.bcname = bcname + self.nlayers = nlayers + + def interior_eles(self, mesh, rallocs): + from mpi4py import MPI + + bc = f'bcon_{self.bcname}_p{rallocs.prank}' + eset = defaultdict(list) + comm, rank, root = get_comm_rank_root() + + # Ensure the boundary exists + bcranks = comm.gather(bc in mesh, root=root) + if rank == root and not any(bcranks): + raise ValueError(f'Boundary {self.bcname} does not exist') + + # Determine which of our elements are directly on the boundary + if bc in mesh: + for etype, eidx in mesh[bc][['f0', 'f1']].astype('U4,i4'): + eset[etype].append(eidx) + + # Handle the case where multiple layers have been requested + if self.nlayers > 1: + # Load our internal connectivity array + con = mesh[f'con_p{rallocs.prank}'].T + con = con[['f0', 'f1']].astype('U4,i4').tolist() + + # Load our partition boundary connectivity arrays + pcon = {} + for p in rallocs.prankconn[rallocs.prank]: + pcon[p] = mesh[f'con_p{rallocs.prank}p{p}'] + pcon[p] = pcon[p][['f0', 'f1']].astype('U4,i4').tolist() + + # Tag all elements in the set as belonging to the first layer + neset = {(k, j): 0 for k, v in eset.items() for j in v} + + # Iteratively grow out the element set + for i in range(self.nlayers - 1): + reqs, bufs = [], [] + + # Exchange information about recent updates to our set + for p, pc in pcon.items(): + sb = np.array([c in neset and neset[c] == i for c in pc]) + rb = np.empty_like(sb) + + # Start the send/recv requests + reqs.append(comm.Isend(sb, rallocs.pmrankmap[p])) + reqs.append(comm.Irecv(rb, rallocs.pmrankmap[p])) + + bufs.append((pc, sb, rb)) + + # Grow our element set by considering internal connectivity + for l, r in con: + if l in neset and r not in neset and neset[l] == i: + neset[r] = i + 1 + eset[r[0]].append(r[1]) + elif r in neset and l not in neset and neset[r] == i: + neset[l] = i + 1 + eset[l[0]].append(l[1]) + + # Wait for the exchanges to finish + MPI.Request.Waitall(reqs) + + # Grow our element set by considering adjacent partitions + for pc, sb, rb in bufs: + for l, b in zip(pc, rb): + if b and l not in neset: + neset[l] = i + 1 + eset[l[0]].append(l[1]) + + return {k: sorted(v) for k, v in eset.items()} + + +class BaseGeometricRegion(BaseRegion): + name = None + + def interior_eles(self, mesh, rallocs): + eset = {} + + for shape in subclasses(BaseShape, just_leaf=True): + f = f'spt_{shape.name}_p{rallocs.prank}' + if f not in mesh: + continue + + inside = self.pts_in_region(mesh[f]) + if np.any(inside): + eset[shape.name] = np.any(inside, axis=0).nonzero()[0].tolist() + + return eset + + def pts_in_region(self, pts): + pass + + +class BoxRegion(BaseGeometricRegion): name = 'box' def __init__(self, x0, x1): @@ -105,7 +183,7 @@ def pts_in_region(self, pts): return inside -class ConicalFrustumRegion(BaseRegion): +class ConicalFrustumRegion(BaseGeometricRegion): name = 'conical_frustum' def __init__(self, x0, x1, r0, r1): @@ -148,7 +226,7 @@ def __init__(self, x0, x1, r): super().__init__(x0, x1, r, r) -class EllipsoidRegion(BaseRegion): +class EllipsoidRegion(BaseGeometricRegion): name = 'ellipsoid' def __init__(self, x0, a, b, c): @@ -159,101 +237,29 @@ def pts_in_region(self, pts): return np.sum(((pts - self.x0) / self.abc)**2, axis=-1) <= 1 -class SphereRegion(BaseRegion): +class SphereRegion(EllipsoidRegion): name = 'sphere' def __init__(self, x0, r): super().__init__(x0, r, r, r) -class BoundaryRegion(BaseRegion): - def __init__(self, bcname, nlayers=1): - self.bcname = bcname - self.nlayers = nlayers - - def interior_eles(self, mesh, rallocs): - from mpi4py import MPI - - bc = f'bcon_{self.bcname}_p{rallocs.prank}' - eset = defaultdict(list) - comm, rank, root = get_comm_rank_root() - - # Ensure the boundary exists - bcranks = comm.gather(bc in mesh, root=root) - if rank == root and not any(bcranks): - raise ValueError(f'Boundary {self.bcname} does not exist') - - # Determine which of our elements are directly on the boundary - if bc in mesh: - for etype, eidx in mesh[bc][['f0', 'f1']].astype('U4,i4'): - eset[etype].append(eidx) - - # Handle the case where multiple layers have been requested - if self.nlayers > 1: - # Load our internal connectivity array - con = mesh[f'con_p{rallocs.prank}'].T - con = con[['f0', 'f1']].astype('U4,i4').tolist() - - # Load our partition boundary connectivity arrays - pcon = {} - for p in rallocs.prankconn[rallocs.prank]: - pcon[p] = mesh[f'con_p{rallocs.prank}p{p}'] - pcon[p] = pcon[p][['f0', 'f1']].astype('U4,i4').tolist() - - # Tag all elements in the set as belonging to the first layer - neset = {(k, j): 0 for k, v in eset.items() for j in v} - - # Iteratively grow out the element set - for i in range(self.nlayers - 1): - reqs, bufs = [], [] - - # Exchange information about recent updates to our set - for p, pc in pcon.items(): - sb = np.array([c in neset and neset[c] == i for c in pc]) - rb = np.empty_like(sb) - - # Send/recv this information - reqs.append(comm.Isend(sb, rallocs.pmrankmap[p])) - reqs.append(comm.Irecv(rb, rallocs.pmrankmap[p])) - - bufs.append((pc, sb, rb)) - - # Grow our element set by considering internal connectivity - for l, r in con: - if l in neset and r not in neset and neset[l] == i: - neset[r] = i + 1 - eset[r[0]].append(r[1]) - elif r in neset and l not in neset and neset[r] == i: - neset[l] = i + 1 - eset[l[0]].append(l[1]) - - # Wait for the exchanges to finish - MPI.Request.Waitall(reqs) - - # Grow our element set through external connectivity - for pc, sb, rb in bufs: - for l, b in zip(pc, rb): - if b and l not in neset: - neset[l] = i + 1 - eset[l[0]].append(l[1]) - - return {k: sorted(v) for k, v in eset.items()} - - -class ConstructiveRegion(BaseRegion): +class ConstructiveRegion(BaseGeometricRegion): def __init__(self, expr): - regions = [] + rexprs = [] # Factor out the individual region expressions expr = re.sub( r'(\w+)\((' + match_paired_paren('()') + r')\)', - lambda m: regions.append(m.groups()) or f'r{len(regions) - 1}', + lambda m: rexprs.append(m.groups()) or f'r{len(rexprs) - 1}', expr ) - # Parse and construct these individual regions - self.regions = [get_region(name, *literal_eval(args)) - for name, args in regions] + # Parse these region expressions + self.regions = regions = [] + for name, args in rexprs: + cls = subclass_where(BaseGeometricRegion, name=name) + regions.append(cls(*literal_eval(args))) # Rewrite in terms of boolean operators self.expr = expr.replace('+', '|').replace('-', '&~') From 3350170d5dbc186179bb557040354bf62c05de07 Mon Sep 17 00:00:00 2001 From: Freddie Witherden Date: Tue, 19 Apr 2022 15:03:49 -0500 Subject: [PATCH 2/3] Cleanups. --- pyfr/partitioners/base.py | 9 +++------ pyfr/plugins/fluidforce.py | 2 +- pyfr/readers/native.py | 9 +++++++++ pyfr/regions.py | 10 +++++----- pyfr/solvers/base/system.py | 6 +++--- 5 files changed, 21 insertions(+), 15 deletions(-) diff --git a/pyfr/partitioners/base.py b/pyfr/partitioners/base.py index d527f2bac..b6a6b91f7 100644 --- a/pyfr/partitioners/base.py +++ b/pyfr/partitioners/base.py @@ -49,7 +49,7 @@ def _combine_mesh_parts(self, mesh): rnum[en].update(((i, j), (0, off + j)) for j in range(n)) def offset_con(con, pr): - con = con.copy().astype('U4,i4,i1,i2') + con = con.copy() for en, pn in pinf.items(): if pn[pr] > 0: @@ -76,18 +76,15 @@ def offset_con(con, pr): name, l = bc[1], int(bc[2]) bccon[name].append(offset_con(mesh[f], l)) - # Output data type - dtype = 'U4,i4,i1,i2' - # Concatenate these arrays to from the new mesh - newmesh = {'con_p0': np.hstack(intcon).astype(dtype)} + newmesh = {'con_p0': np.hstack(intcon)} for en in spts: newmesh[f'spt_{en}_p0'] = np.hstack(spts[en]) newmesh[f'spt_{en}_p0', 'linear'] = np.hstack(linf[en]) for k, v in bccon.items(): - newmesh[f'bcon_{k}_p0'] = np.hstack(v).astype(dtype) + newmesh[f'bcon_{k}_p0'] = np.hstack(v) return newmesh, rnum diff --git a/pyfr/plugins/fluidforce.py b/pyfr/plugins/fluidforce.py index b97ee10dd..793608564 100644 --- a/pyfr/plugins/fluidforce.py +++ b/pyfr/plugins/fluidforce.py @@ -86,7 +86,7 @@ def __init__(self, intg, cfgsect, suffix): norms = defaultdict(list) rfpts = defaultdict(list) - for etype, eidx, fidx, flags in mesh[bc].astype('U4,i4,i1,i2'): + for etype, eidx, fidx, flags in mesh[bc].tolist(): eles = elemap[etype] if (etype, fidx) not in m0: diff --git a/pyfr/readers/native.py b/pyfr/readers/native.py index c1e4ae57e..7841cb114 100644 --- a/pyfr/readers/native.py +++ b/pyfr/readers/native.py @@ -32,6 +32,15 @@ def __getitem__(self, aname): else: ret = np.array(ret) + # Handle strings in compound data types + if ret.dtype.kind == 'V': + ndtype = [] + for k, v in ret.dtype.descr: + v = v[0] if isinstance(v, tuple) else v + ndtype.append((k, v.replace('S', 'U'))) + + ret = ret.astype(ndtype) + return ret.decode() if isinstance(ret, bytes) else ret else: return self._file[aname[0]].attrs[aname[1]] diff --git a/pyfr/regions.py b/pyfr/regions.py index 62d65edfd..20e315550 100644 --- a/pyfr/regions.py +++ b/pyfr/regions.py @@ -27,14 +27,14 @@ def surface_faces(self, mesh, rallocs, exclbcs=[]): # Eliminate any faces with internal connectivity con = mesh[f'con_p{rallocs.prank}'].T - for l, r in con[['f0', 'f1', 'f2']].astype('U4,i4,i1').tolist(): + for l, r in con[['f0', 'f1', 'f2']].tolist(): if l in sfaces and r in sfaces: sfaces.difference_update([l, r]) # Eliminate faces on specified boundaries for b in exclbcs: if (f := f'bcon_{b}_p{rallocs.prank}') in mesh: - bcon = mesh[f][['f0', 'f1', 'f2']].astype('U4,i4,i1') + bcon = mesh[f][['f0', 'f1', 'f2']] sfaces.difference_update(bcon.tolist()) comm, rank, root = get_comm_rank_root() @@ -90,20 +90,20 @@ def interior_eles(self, mesh, rallocs): # Determine which of our elements are directly on the boundary if bc in mesh: - for etype, eidx in mesh[bc][['f0', 'f1']].astype('U4,i4'): + for etype, eidx in mesh[bc][['f0', 'f1']].tolist(): eset[etype].append(eidx) # Handle the case where multiple layers have been requested if self.nlayers > 1: # Load our internal connectivity array con = mesh[f'con_p{rallocs.prank}'].T - con = con[['f0', 'f1']].astype('U4,i4').tolist() + con = con[['f0', 'f1']].tolist() # Load our partition boundary connectivity arrays pcon = {} for p in rallocs.prankconn[rallocs.prank]: pcon[p] = mesh[f'con_p{rallocs.prank}p{p}'] - pcon[p] = pcon[p][['f0', 'f1']].astype('U4,i4').tolist() + pcon[p] = pcon[p][['f0', 'f1']].tolist() # Tag all elements in the set as belonging to the first layer neset = {(k, j): 0 for k, v in eset.items() for j in v} diff --git a/pyfr/solvers/base/system.py b/pyfr/solvers/base/system.py index 96f9d43c4..4bc3e06ff 100644 --- a/pyfr/solvers/base/system.py +++ b/pyfr/solvers/base/system.py @@ -125,7 +125,7 @@ def _load_eles(self, rallocs, mesh, initsoln, nregs, nonce): def _load_int_inters(self, rallocs, mesh, elemap): key = f'con_p{rallocs.prank}' - lhs, rhs = mesh[key].astype('U4,i4,i1,i2').tolist() + lhs, rhs = mesh[key].tolist() int_inters = self.intinterscls(self.backend, lhs, rhs, elemap, self.cfg) @@ -138,7 +138,7 @@ def _load_mpi_inters(self, rallocs, mesh, elemap): for rhsprank in rallocs.prankconn[lhsprank]: rhsmrank = rallocs.pmrankmap[rhsprank] interarr = mesh[f'con_p{lhsprank}p{rhsprank}'] - interarr = interarr.astype('U4,i4,i1,i2').tolist() + interarr = interarr.tolist() mpiiface = self.mpiinterscls(self.backend, interarr, rhsmrank, rallocs, elemap, self.cfg) @@ -157,7 +157,7 @@ def _load_bc_inters(self, rallocs, mesh, elemap): cfgsect = f'soln-bcs-{m[1]}' # Get the interface - interarr = mesh[f].astype('U4,i4,i1,i2').tolist() + interarr = mesh[f].tolist() # Instantiate bcclass = bcmap[self.cfg.get(cfgsect, 'type')] From c74a12dc2731934c8cc4d7ea5a378c30af1efc98 Mon Sep 17 00:00:00 2001 From: Freddie Witherden Date: Tue, 14 Nov 2023 07:10:16 -0600 Subject: [PATCH 3/3] Add support for 64-bit indexes. --- doc/src/user_guide.rst | 11 +++++--- pyfr/backends/base/backend.py | 28 +++++++++++++++++--- pyfr/backends/base/generator.py | 22 ++++++++------- pyfr/backends/base/makoutil.py | 7 ++--- pyfr/backends/base/types.py | 13 +++++---- pyfr/backends/cuda/blasext.py | 18 +++++++------ pyfr/backends/cuda/cublas.py | 14 ++++++---- pyfr/backends/cuda/generator.py | 16 ++++++----- pyfr/backends/cuda/kernels/axnpby.mako | 7 ++--- pyfr/backends/cuda/kernels/base.mako | 5 +++- pyfr/backends/cuda/kernels/pack.mako | 14 +++++----- pyfr/backends/cuda/kernels/reduction.mako | 11 ++++---- pyfr/backends/cuda/packing.py | 3 ++- pyfr/backends/hip/blasext.py | 18 +++++++------ pyfr/backends/hip/generator.py | 16 ++++++----- pyfr/backends/hip/kernels/axnpby.mako | 7 ++--- pyfr/backends/hip/kernels/base.mako | 1 + pyfr/backends/hip/kernels/pack.mako | 14 +++++----- pyfr/backends/hip/kernels/reduction.mako | 11 ++++---- pyfr/backends/hip/packing.py | 3 ++- pyfr/backends/hip/rocblas.py | 12 ++++++--- pyfr/backends/metal/blasext.py | 18 +++++++------ pyfr/backends/metal/generator.py | 8 +++--- pyfr/backends/metal/gimmik.py | 2 +- pyfr/backends/metal/kernels/axnpby.mako | 8 +++--- pyfr/backends/metal/kernels/base.mako | 1 + pyfr/backends/metal/kernels/pack.mako | 11 ++++---- pyfr/backends/metal/kernels/reduction.mako | 14 +++++----- pyfr/backends/metal/packing.py | 4 ++- pyfr/backends/metal/provider.py | 2 +- pyfr/backends/opencl/blasext.py | 18 +++++++------ pyfr/backends/opencl/driver.py | 2 +- pyfr/backends/opencl/generator.py | 16 +++++------ pyfr/backends/opencl/kernels/axnpby.mako | 7 ++--- pyfr/backends/opencl/kernels/base.mako | 2 ++ pyfr/backends/opencl/kernels/pack.mako | 10 +++---- pyfr/backends/opencl/kernels/reduction.mako | 13 ++++----- pyfr/backends/opencl/packing.py | 3 ++- pyfr/backends/openmp/blasext.py | 25 ++++++++++------- pyfr/backends/openmp/generator.py | 15 ++++++----- pyfr/backends/openmp/kernels/axnpby.mako | 15 ++++++----- pyfr/backends/openmp/kernels/base.mako | 2 ++ pyfr/backends/openmp/kernels/batch-gemm.mako | 6 ++--- pyfr/backends/openmp/kernels/pack.mako | 10 +++---- pyfr/backends/openmp/kernels/par-memcpy.mako | 9 +++---- pyfr/backends/openmp/kernels/reduction.mako | 14 +++++----- pyfr/backends/openmp/packing.py | 6 ++++- pyfr/backends/openmp/xsmm.py | 6 ++++- pyfr/nputil.py | 2 +- pyfr/partitioners/base.py | 6 ++--- pyfr/plugins/base.py | 2 +- pyfr/plugins/fluidforce.py | 2 +- pyfr/readers/base.py | 6 ++--- pyfr/regions.py | 6 ++--- pyfr/solvers/base/system.py | 6 ++--- 55 files changed, 307 insertions(+), 221 deletions(-) diff --git a/doc/src/user_guide.rst b/doc/src/user_guide.rst index 9b63aa299..d4b100006 100644 --- a/doc/src/user_guide.rst +++ b/doc/src/user_guide.rst @@ -117,15 +117,20 @@ Parameterises the backend with ``single`` | ``double`` -2. ``rank-allocator`` --- MPI rank allocator: +2. ``memory-model`` --- if to enable support for large working sets; + should be ``normal`` unless a memory-model error is encountered: + + ``normal`` | ``large`` + +3. ``rank-allocator`` --- MPI rank allocator: ``linear`` | ``random`` -3. ``collect-wait-times`` --- If to track MPI request wait times or not: +4. ``collect-wait-times`` --- if to track MPI request wait times or not: ``True`` | ``False`` -4. ``collect-wait-times-len`` --- Size of the wait time history buffer: +5. ``collect-wait-times-len`` --- size of the wait time history buffer: *int* diff --git a/pyfr/backends/base/backend.py b/pyfr/backends/base/backend.py index 19a07fce5..18e83133e 100644 --- a/pyfr/backends/base/backend.py +++ b/pyfr/backends/base/backend.py @@ -40,6 +40,16 @@ def __init__(self, cfg): self.fpdtype_eps = np.finfo(self.fpdtype).eps self.fpdtype_max = np.finfo(self.fpdtype).max + # Memory model + match cfg.get('backend', 'memory-model', 'normal'): + case 'normal': + self.ixdtype = np.int32 + case 'large': + self.ixdtype = np.int64 + case _: + raise ValueError('Backend memory model must be either normal ' + 'or large') + # Allocated matrices self.mats = WeakValueDictionary() self._mat_counter = count() @@ -55,8 +65,11 @@ def __init__(self, cfg): @cached_property def lookup(self): pkg = f'pyfr.backends.{self.name}.kernels' - dfltargs = dict(fpdtype=self.fpdtype, fpdtype_max=self.fpdtype_max, - soasz=self.soasz, csubsz=self.csubsz, math=math) + dfltargs = { + 'fpdtype': self.fpdtype, 'ixdtype': self.ixdtype, + 'fpdtype_max': self.fpdtype_max, 'csubsz': self.csubsz, + 'soasz': self.soasz, 'math': math + } return DottedTemplateLookup(pkg, dfltargs) @@ -64,7 +77,7 @@ def malloc(self, obj, extent): # If no extent has been specified then autocommit if extent is None: # Perform the allocation - data = self._malloc_impl(obj.nbytes) + data = self._malloc_checked(obj.nbytes) # Fire the callback obj.onalloc(data, 0) @@ -99,7 +112,7 @@ def commit(self): sz = sum(obj.nbytes - (obj.nbytes % -self.alignb) for obj in reqs) # Perform the allocation - data = self._malloc_impl(sz) + data = self._malloc_checked(sz) offset = 0 for obj in reqs: @@ -118,6 +131,13 @@ def commit(self): self._pend_aliases.clear() self._pend_extents.clear() + def _malloc_checked(self, nbytes): + if self.ixdtype == np.int32 and nbytes > 4*2**31 - 1: + raise RuntimeError('Allocation too large for normal backend ' + 'memory-model') + + return self._malloc_impl(nbytes) + def _malloc_impl(self, nbytes): pass diff --git a/pyfr/backends/base/generator.py b/pyfr/backends/base/generator.py index 25cd9a4b1..5d9578a1a 100644 --- a/pyfr/backends/base/generator.py +++ b/pyfr/backends/base/generator.py @@ -67,10 +67,11 @@ def __init__(self, name, spec, body): class BaseKernelGenerator: - def __init__(self, name, ndim, args, body, fpdtype): + def __init__(self, name, ndim, args, body, fpdtype, ixdtype): self.name = name self.ndim = ndim self.fpdtype = fpdtype + self.ixdtype = ixdtype # Parse and sort our argument list sargs = sorted((k, Arg(k, v, body)) for k, v in args.items()) @@ -103,7 +104,7 @@ def argspec(self): # Dimensions argn += self._dims - argt += [[np.int32]]*self.ndim + argt += [[self.ixdtype]]*self.ndim # Scalar args (always of type fpdtype) argn += [sa.name for sa in self.scalargs] @@ -114,11 +115,11 @@ def argspec(self): argn.append(va.name) if va.isview: - argt.append([np.intp]*(2 + (va.ncdim == 2))) + argt.append([np.uintp]*(2 + (va.ncdim == 2))) elif self.needs_ldim(va): - argt.append([np.intp, np.int32]) + argt.append([np.uintp, self.ixdtype]) else: - argt.append([np.intp]) + argt.append([np.uintp]) # Return return self.ndim, argn, argt @@ -285,11 +286,12 @@ def __init__(self, *args, **kwargs): if self.ndim == 1: self.preamble += 'if (_x < _nx)' else: + blk_y, lid_y = self.block2d[1], self._lid[1] self.preamble += f''' - int _ysize = (_ny + {self.block2d[1] - 1}) / {self.block2d[1]}; - int _ystart = {self._lid[1]}*_ysize; - int _yend = min(_ny, _ystart + _ysize); - for (int _y = _ystart; _x < _nx && _y < _yend; _y++)''' + ixdtype_t _ysize = (_ny + {blk_y - 1}) / {blk_y}; + ixdtype_t _ystart = {lid_y}*_ysize; + ixdtype_t _yend = min(_ny, _ystart + _ysize); + for (ixdtype_t _y = _ystart; _x < _nx && _y < _yend; _y++)''' def ldim_size(self, name, factor=1): return f'ld{name}' @@ -377,7 +379,7 @@ def render(self): return f'''{spec} {{ - int _x = {self._gid}; + ixdtype_t _x = {self._gid}; #define X_IDX (_x) #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv) #define BCAST_BLK(r, c, ld) c diff --git a/pyfr/backends/base/makoutil.py b/pyfr/backends/base/makoutil.py index 804c15fc6..7585965da 100644 --- a/pyfr/backends/base/makoutil.py +++ b/pyfr/backends/base/makoutil.py @@ -158,11 +158,12 @@ def kernel(context, name, ndim, **kwargs): # Capture the kernel body body = capture(context, context['caller'].body) - # Get the generator class and floating point data type - kerngen, fpdtype = context['_kernel_generator'], context['fpdtype'] + # Get the generator class and data types + kerngen = context['_kernel_generator'] + fpdtype, ixdtype = context['fpdtype'], context['ixdtype'] # Instantiate - kern = kerngen(name, int(ndim), kwargs, body, fpdtype) + kern = kerngen(name, int(ndim), kwargs, body, fpdtype, ixdtype) # Save the argument/type list for later use context['_kernel_argspecs'][name] = kern.argspec() diff --git a/pyfr/backends/base/types.py b/pyfr/backends/base/types.py index 02efaa88e..c77827e03 100644 --- a/pyfr/backends/base/types.py +++ b/pyfr/backends/base/types.py @@ -237,13 +237,16 @@ def __init__(self, backend, matmap, rmap, cmap, rstridemap, vshape, tags): if any(m.dtype != self.refdtype for m in self._mats): raise TypeError('Mixed data types are not supported') + # Index type + ixdtype = backend.ixdtype + # SoA size k, csubsz = backend.soasz, backend.csubsz # Base offsets and leading dimensions for each point - offset = np.empty(self.n, dtype=np.int32) - leaddim = np.empty(self.n, dtype=np.int32) - blkdisp = np.empty(self.n, dtype=np.int32) + offset = np.empty(self.n, dtype=ixdtype) + leaddim = np.empty(self.n, dtype=ixdtype) + blkdisp = np.empty(self.n, dtype=ixdtype) for m in self._mats: ix = np.where(matmap == m.mid) @@ -256,12 +259,12 @@ def __init__(self, backend, matmap, rmap, cmap, rstridemap, vshape, tags): coldisp = (cmapmod // k)*k*self.nvcol + cmapmod % k mapping = (offset + blkdisp + rowdisp + coldisp)[None, :] - self.mapping = backend.const_matrix(mapping, dtype=np.int32, tags=tags) + self.mapping = backend.const_matrix(mapping, dtype=ixdtype, tags=tags) # Row strides if self.nvrow > 1: rstrides = (rstridemap*leaddim)[None, :] - self.rstrides = backend.const_matrix(rstrides, dtype=np.int32, + self.rstrides = backend.const_matrix(rstrides, dtype=ixdtype, tags=tags) diff --git a/pyfr/backends/cuda/blasext.py b/pyfr/backends/cuda/blasext.py index 5c4087d55..2580d5beb 100644 --- a/pyfr/backends/cuda/blasext.py +++ b/pyfr/backends/cuda/blasext.py @@ -10,7 +10,8 @@ def axnpby(self, *arr, subdims=None): raise ValueError('Incompatible matrix types') nv = len(arr) - nrow, ncol, ldim, dtype = arr[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = arr[0].traits[1:] ncola, ncolb = arr[0].ioshape[1:] # Render the kernel template @@ -20,7 +21,7 @@ def axnpby(self, *arr, subdims=None): # Build the kernel kern = self._build_kernel('axnpby', src, - [np.int32]*3 + [np.intp]*nv + [dtype]*nv) + [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv) # Determine the grid/block block = (128, 1, 1) @@ -59,7 +60,8 @@ def reduction(self, *rs, method, norm, dt_mat=None): raise ValueError('Incompatible matrix types') cuda = self.backend.cuda - nrow, ncol, ldim, dtype = rs[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = rs[0].traits[1:] ncola, ncolb = rs[0].ioshape[1:] # Reduction block dimensions @@ -72,7 +74,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): reduced_dev = cuda.mem_alloc(ncola*grid[0]*rs[0].itemsize) # Empty result buffer on the host - reduced_host = cuda.pagelocked_empty((ncola, grid[0]), dtype) + reduced_host = cuda.pagelocked_empty((ncola, grid[0]), fpdtype) tplargs = dict(norm=norm, method=method) @@ -86,11 +88,11 @@ def reduction(self, *rs, method, norm, dt_mat=None): # Argument types for reduction kernel if method == 'errest': - argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2 + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]*2 elif method == 'resid' and dt_mat: - argt = [np.int32]*3 + [np.intp]*4 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype] else: - argt = [np.int32]*3 + [np.intp]*3 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*3 + [fpdtype] # Build the reduction kernel rkern = self._build_kernel('reduction', src, argt) @@ -100,7 +102,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): params.set_args(nrow, ncolb, ldim, reduced_dev, *regs) # Runtime argument offset - facoff = argt.index(dtype) + facoff = argt.index(fpdtype) # Norm type reducer = np.max if norm == 'uniform' else np.sum diff --git a/pyfr/backends/cuda/cublas.py b/pyfr/backends/cuda/cublas.py index 691cd418b..c566cc392 100644 --- a/pyfr/backends/cuda/cublas.py +++ b/pyfr/backends/cuda/cublas.py @@ -81,14 +81,21 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0): if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol: raise ValueError('Incompatible matrices for out = a*b') - # CUBLAS expects inputs to be column-major (or Fortran order in + # cuBLAS expects inputs to be column-major (or Fortran order in # numpy parlance). However as C = A*B => C^T = (A*B)^T # = (B^T)*(A^T) with a little trickery we can multiply our # row-major matrices directly. m, n, k = b.ncol, a.nrow, a.ncol A, B, C = b, a, out - # α and β factors for C = α*(A*op(B)) + β*C + # Cache key + ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim) + + # Size checks + if any(sz > 2**31 - 1 for sz in ckey[3:]): + raise ValueError('Matrices too large for cuBLAS') + + # α and β factors for C = α*(A*B) + β*C if a.dtype == np.float64: cublasgemm = w.cublasDgemm alpha_ct, beta_ct = c_double(alpha), c_double(beta) @@ -102,9 +109,6 @@ def gemm(stream): cublasgemm(h, w.OP_N, w.OP_N, m, n, k, alpha_ct, A, A.leaddim, B, B.leaddim, beta_ct, C, C.leaddim) - # Cache key - ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim) - # Obtain the performance of the kernel try: dt = self._mul_timing[ckey] diff --git a/pyfr/backends/cuda/generator.py b/pyfr/backends/cuda/generator.py index aa32113f8..b16198ea5 100644 --- a/pyfr/backends/cuda/generator.py +++ b/pyfr/backends/cuda/generator.py @@ -5,13 +5,15 @@ class CUDAKernelGenerator(BaseGPUKernelGenerator): _lid = ('threadIdx.x', 'threadIdx.y') - _gid = 'blockIdx.x*blockDim.x + threadIdx.x' + _gid = 'ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x' _shared_prfx = '__shared__' _shared_sync = '__syncthreads()' def _render_spec(self): + res = '__restrict__' + # We first need the argument list; starting with the dimensions - kargs = [f'int {d}' for d in self._dims] + kargs = [f'ixdtype_t {d}' for d in self._dims] # Now add any scalar arguments kargs.extend(f'{sa.dtype} {sa.name}' for sa in self.scalargs) @@ -19,19 +21,19 @@ def _render_spec(self): # Finally, add the vector arguments for va in self.vectargs: if va.intent == 'in': - kargs.append(f'const {va.dtype}* __restrict__ {va.name}_v') + kargs.append(f'const {va.dtype}* {res} {va.name}_v') else: - kargs.append(f'{va.dtype}* __restrict__ {va.name}_v') + kargs.append(f'{va.dtype}* {res} {va.name}_v') # Views if va.isview: - kargs.append(f'const int* __restrict__ {va.name}_vix') + kargs.append(f'const ixdtype_t* {res} {va.name}_vix') if va.ncdim == 2: - kargs.append(f'const int* __restrict__ {va.name}_vrstri') + kargs.append(f'const ixdtype_t* {res} {va.name}_vrstri') # Arrays elif self.needs_ldim(va): - kargs.append(f'int ld{va.name}') + kargs.append(f'ixdtype_t ld{va.name}') # Determine the launch bounds for the kernel nthrds = prod(self.block1d if self.ndim == 1 else self.block2d) diff --git a/pyfr/backends/cuda/kernels/axnpby.mako b/pyfr/backends/cuda/kernels/axnpby.mako index 1735cbfb9..794cf1094 100644 --- a/pyfr/backends/cuda/kernels/axnpby.mako +++ b/pyfr/backends/cuda/kernels/axnpby.mako @@ -2,14 +2,15 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> __global__ void -axnpby(int nrow, int ncolb, int ldim, fpdtype_t* __restrict__ x0, +axnpby(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim, + fpdtype_t* __restrict__ x0, ${', '.join(f'const fpdtype_t* __restrict__ x{i}' for i in range(1, nv))}, ${', '.join(f'fpdtype_t a{i}' for i in range(nv))}) { int i = blockIdx.y*blockDim.y + threadIdx.y; - int j = blockIdx.x*blockDim.x + threadIdx.x; - int idx; + ixdtype_t j = ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x; + ixdtype_t idx; if (j < ncolb && a0 == 0.0) { diff --git a/pyfr/backends/cuda/kernels/base.mako b/pyfr/backends/cuda/kernels/base.mako index 055c27670..28447d4b3 100644 --- a/pyfr/backends/cuda/kernels/base.mako +++ b/pyfr/backends/cuda/kernels/base.mako @@ -5,8 +5,11 @@ #define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ) // Typedefs -typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t; typedef unsigned int uint32_t; +typedef long long int64_t; +typedef unsigned long long uint64_t; +typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t; +typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t; // Atomic helpers __device__ void atomic_min_fpdtype(fpdtype_t* addr, fpdtype_t val) diff --git a/pyfr/backends/cuda/kernels/pack.mako b/pyfr/backends/cuda/kernels/pack.mako index e5ddb9882..08e87ae92 100644 --- a/pyfr/backends/cuda/kernels/pack.mako +++ b/pyfr/backends/cuda/kernels/pack.mako @@ -1,21 +1,21 @@ <%inherit file='base'/> __global__ void -pack_view(int n, int nrv, int ncv, +pack_view(ixdtype_t n, ixdtype_t nrv, ixdtype_t ncv, const fpdtype_t* __restrict__ v, - const int* __restrict__ vix, - const int* __restrict__ vrstri, + const ixdtype_t* __restrict__ vix, + const ixdtype_t* __restrict__ vrstri, fpdtype_t* __restrict__ pmat) { - int i = blockIdx.x*blockDim.x + threadIdx.x; + ixdtype_t i = ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x; if (i < n && ncv == 1) pmat[i] = v[vix[i]]; else if (i < n && nrv == 1) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[c*n + i] = v[vix[i] + SOA_SZ*c]; else if (i < n) - for (int r = 0; r < nrv; ++r) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t r = 0; r < nrv; ++r) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c]; } diff --git a/pyfr/backends/cuda/kernels/reduction.mako b/pyfr/backends/cuda/kernels/reduction.mako index 3a407fa55..c4988c1f2 100644 --- a/pyfr/backends/cuda/kernels/reduction.mako +++ b/pyfr/backends/cuda/kernels/reduction.mako @@ -2,7 +2,8 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> __global__ void -reduction(int nrow, int ncolb, int ldim, fpdtype_t *__restrict__ reduced, +reduction(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim, + fpdtype_t *__restrict__ reduced, fpdtype_t *__restrict__ rcurr, fpdtype_t *__restrict__ rold, % if method == 'errest': fpdtype_t *__restrict__ rerr, fpdtype_t atol, fpdtype_t rtol) @@ -13,16 +14,16 @@ reduction(int nrow, int ncolb, int ldim, fpdtype_t *__restrict__ reduced, % endif { int tid = threadIdx.x; - int i = blockIdx.x*blockDim.x + tid; + ixdtype_t i = ixdtype_t(blockIdx.x)*blockDim.x + tid; __shared__ fpdtype_t sdata[32]; fpdtype_t r, acc = 0; if (i < ncolb) { - for (int j = 0; j < nrow; j++) + for (ixdtype_t j = 0; j < nrow; j++) { - int idx = j*ldim + SOA_IX(i, blockIdx.y, gridDim.y); + ixdtype_t idx = j*ldim + SOA_IX(i, blockIdx.y, gridDim.y); % if method == 'errest': r = rerr[idx]/(atol + rtol*max(fabs(rcurr[idx]), fabs(rold[idx]))); % elif method == 'resid': @@ -64,6 +65,6 @@ reduction(int nrow, int ncolb, int ldim, fpdtype_t *__restrict__ reduced, % endif if (tid == 0) - reduced[blockIdx.y*gridDim.x + blockIdx.x] = acc; + reduced[ixdtype_t(blockIdx.y)*gridDim.x + blockIdx.x] = acc; } } diff --git a/pyfr/backends/cuda/packing.py b/pyfr/backends/cuda/packing.py index 146cedc7c..fce84b439 100644 --- a/pyfr/backends/cuda/packing.py +++ b/pyfr/backends/cuda/packing.py @@ -6,6 +6,7 @@ class CUDAPackingKernels(CUDAKernelProvider): def pack(self, mv): cuda = self.backend.cuda + ixdtype = self.backend.ixdtype # An exchange view is simply a regular view plus an exchange matrix m, v = mv.xchgmat, mv.view @@ -14,7 +15,7 @@ def pack(self, mv): src = self.backend.lookup.get_template('pack').render() # Build - kern = self._build_kernel('pack_view', src, 'iiiPPPP') + kern = self._build_kernel('pack_view', src, [ixdtype]*3 + [np.uintp]*4) # Compute the grid and thread-block size block = (128, 1, 1) diff --git a/pyfr/backends/hip/blasext.py b/pyfr/backends/hip/blasext.py index 0b384c104..425f4fcfe 100644 --- a/pyfr/backends/hip/blasext.py +++ b/pyfr/backends/hip/blasext.py @@ -10,7 +10,8 @@ def axnpby(self, *arr, subdims=None): raise ValueError('Incompatible matrix types') nv = len(arr) - nrow, ncol, ldim, dtype = arr[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = arr[0].traits[1:] ncola, ncolb = arr[0].ioshape[1:] # Determine the grid/block @@ -24,7 +25,7 @@ def axnpby(self, *arr, subdims=None): # Build the kernel kern = self._build_kernel('axnpby', src, - [np.int32]*3 + [np.intp]*nv + [dtype]*nv) + [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv) # Set the parameters params = kern.make_params(grid, block) @@ -59,7 +60,8 @@ def reduction(self, *rs, method, norm, dt_mat=None): raise ValueError('Incompatible matrix types') hip = self.backend.hip - nrow, ncol, ldim, dtype = rs[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = rs[0].traits[1:] ncola, ncolb = rs[0].ioshape[1:] # Reduction block dimensions @@ -72,7 +74,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): reduced_dev = hip.mem_alloc(ncola*grid[0]*rs[0].itemsize) # Empty result buffer on the host - reduced_host = hip.pagelocked_empty((ncola, grid[0]), dtype) + reduced_host = hip.pagelocked_empty((ncola, grid[0]), fpdtype) tplargs = dict(norm=norm, blocksz=block[0], method=method) @@ -86,11 +88,11 @@ def reduction(self, *rs, method, norm, dt_mat=None): # Argument types for reduction kernel if method == 'errest': - argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2 + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]*2 elif method == 'resid' and dt_mat: - argt = [np.int32]*3 + [np.intp]*4 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype] else: - argt = [np.int32]*3 + [np.intp]*3 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*3 + [fpdtype] # Build the reduction kernel rkern = self._build_kernel('reduction', src, argt) @@ -100,7 +102,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): params.set_args(nrow, ncolb, ldim, reduced_dev, *regs) # Runtime argument offset - facoff = argt.index(dtype) + facoff = argt.index(fpdtype) # Norm type reducer = np.max if norm == 'uniform' else np.sum diff --git a/pyfr/backends/hip/generator.py b/pyfr/backends/hip/generator.py index ab0374d26..1ffd40f84 100644 --- a/pyfr/backends/hip/generator.py +++ b/pyfr/backends/hip/generator.py @@ -5,13 +5,15 @@ class HIPKernelGenerator(BaseGPUKernelGenerator): _lid = ('threadIdx.x', 'threadIdx.y') - _gid = 'blockIdx.x*blockDim.x + threadIdx.x' + _gid = 'ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x' _shared_prfx = '__shared__' _shared_sync = '__syncthreads()' def _render_spec(self): + res = '__restrict__' + # We first need the argument list; starting with the dimensions - kargs = [f'int {d}' for d in self._dims] + kargs = [f'ixdtype_t {d}' for d in self._dims] # Now add any scalar arguments kargs.extend(f'{sa.dtype} {sa.name}' for sa in self.scalargs) @@ -19,19 +21,19 @@ def _render_spec(self): # Finally, add the vector arguments for va in self.vectargs: if va.intent == 'in': - kargs.append(f'const {va.dtype}* __restrict__ {va.name}_v') + kargs.append(f'const {va.dtype}* {res} {va.name}_v') else: - kargs.append(f'{va.dtype}* __restrict__ {va.name}_v') + kargs.append(f'{va.dtype}* {res} {va.name}_v') # Views if va.isview: - kargs.append(f'const int* __restrict__ {va.name}_vix') + kargs.append(f'const ixdtype_t* {res} {va.name}_vix') if va.ncdim == 2: - kargs.append(f'const int* __restrict__ {va.name}_vrstri') + kargs.append(f'const ixdtype_t* {res} {va.name}_vrstri') # Arrays elif self.needs_ldim(va): - kargs.append(f'int ld{va.name}') + kargs.append(f'ixdtype_t ld{va.name}') # Determine the launch bounds for the kernel nthrds = prod(self.block1d if self.ndim == 1 else self.block2d) diff --git a/pyfr/backends/hip/kernels/axnpby.mako b/pyfr/backends/hip/kernels/axnpby.mako index 5170f1c06..0815481c5 100644 --- a/pyfr/backends/hip/kernels/axnpby.mako +++ b/pyfr/backends/hip/kernels/axnpby.mako @@ -2,14 +2,15 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> __global__ __launch_bounds__(${block[0]*block[1]}) void -axnpby(int nrow, int ncolb, int ldim, fpdtype_t* __restrict__ x0, +axnpby(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim, + fpdtype_t* __restrict__ x0, ${', '.join(f'const fpdtype_t* __restrict__ x{i}' for i in range(1, nv))}, ${', '.join(f'fpdtype_t a{i}' for i in range(nv))}) { int i = blockIdx.y*blockDim.y + threadIdx.y; - int j = blockIdx.x*blockDim.x + threadIdx.x; - int idx; + ixdtype_t j = ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x; + ixdtype_t idx; if (j < ncolb && a0 == 0.0) { diff --git a/pyfr/backends/hip/kernels/base.mako b/pyfr/backends/hip/kernels/base.mako index c8f56dd5f..01cbecd3d 100644 --- a/pyfr/backends/hip/kernels/base.mako +++ b/pyfr/backends/hip/kernels/base.mako @@ -6,6 +6,7 @@ // Typedefs typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t; +typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t; // Atomic helpers #define atomic_min_fpdtype(addr, val) atomicMin(addr, val) diff --git a/pyfr/backends/hip/kernels/pack.mako b/pyfr/backends/hip/kernels/pack.mako index 4f4f20d3c..fb64844c3 100644 --- a/pyfr/backends/hip/kernels/pack.mako +++ b/pyfr/backends/hip/kernels/pack.mako @@ -1,21 +1,21 @@ <%inherit file='base'/> __global__ __launch_bounds__(${blocksz}) void -pack_view(int n, int nrv, int ncv, +pack_view(ixdtype_t n, ixdtype_t nrv, ixdtype_t ncv, const fpdtype_t* __restrict__ v, - const int* __restrict__ vix, - const int* __restrict__ vrstri, + const ixdtype_t* __restrict__ vix, + const ixdtype_t* __restrict__ vrstri, fpdtype_t* __restrict__ pmat) { - int i = blockIdx.x*blockDim.x + threadIdx.x; + ixdtype_t i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n && ncv == 1) pmat[i] = v[vix[i]]; else if (i < n && nrv == 1) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[c*n + i] = v[vix[i] + SOA_SZ*c]; else if (i < n) - for (int r = 0; r < nrv; ++r) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t r = 0; r < nrv; ++r) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c]; } diff --git a/pyfr/backends/hip/kernels/reduction.mako b/pyfr/backends/hip/kernels/reduction.mako index 9b8fad455..ecf9f192a 100644 --- a/pyfr/backends/hip/kernels/reduction.mako +++ b/pyfr/backends/hip/kernels/reduction.mako @@ -2,7 +2,8 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> __global__ __launch_bounds__(${blocksz}) void -reduction(int nrow, int ncolb, int ldim, fpdtype_t *__restrict__ reduced, +reduction(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim, + fpdtype_t *__restrict__ reduced, fpdtype_t *__restrict__ rcurr, fpdtype_t *__restrict__ rold, % if method == 'errest': fpdtype_t *__restrict__ rerr, fpdtype_t atol, fpdtype_t rtol) @@ -13,16 +14,16 @@ reduction(int nrow, int ncolb, int ldim, fpdtype_t *__restrict__ reduced, % endif { int tid = threadIdx.x; - int i = blockIdx.x*blockDim.x + tid; + ixdtype_t i = ixdtype_t(blockIdx.x)*blockDim.x + tid; __shared__ fpdtype_t sdata[32]; fpdtype_t r, acc = 0; if (i < ncolb) { - for (int j = 0; j < nrow; j++) + for (ixdtype_t j = 0; j < nrow; j++) { - int idx = j*ldim + SOA_IX(i, blockIdx.y, gridDim.y); + ixdtype_t idx = j*ldim + SOA_IX(i, blockIdx.y, gridDim.y); % if method == 'errest': r = rerr[idx]/(atol + rtol*max(fabs(rcurr[idx]), fabs(rold[idx]))); % elif method == 'resid': @@ -64,6 +65,6 @@ reduction(int nrow, int ncolb, int ldim, fpdtype_t *__restrict__ reduced, % endif if (tid == 0) - reduced[blockIdx.y*gridDim.x + blockIdx.x] = acc; + reduced[ixdtype_t(blockIdx.y)*gridDim.x + blockIdx.x] = acc; } } diff --git a/pyfr/backends/hip/packing.py b/pyfr/backends/hip/packing.py index 386e9c7b8..947f60f6f 100644 --- a/pyfr/backends/hip/packing.py +++ b/pyfr/backends/hip/packing.py @@ -6,6 +6,7 @@ class HIPPackingKernels(HIPKernelProvider): def pack(self, mv): hip = self.backend.hip + ixdtype = self.backend.ixdtype # An exchange view is simply a regular view plus an exchange matrix m, v = mv.xchgmat, mv.view @@ -18,7 +19,7 @@ def pack(self, mv): src = self.backend.lookup.get_template('pack').render(blocksz=block[0]) # Build - kern = self._build_kernel('pack_view', src, 'iiiPPPP') + kern = self._build_kernel('pack_view', src, [ixdtype]*3 + [np.uintp]*4) # Set the arguments params = kern.make_params(grid, block) diff --git a/pyfr/backends/hip/rocblas.py b/pyfr/backends/hip/rocblas.py index 7646a19b9..d7ca11390 100644 --- a/pyfr/backends/hip/rocblas.py +++ b/pyfr/backends/hip/rocblas.py @@ -80,10 +80,17 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0): m, n, k = b.ncol, a.nrow, a.ncol A, B, C = b, a, out + # Cache key + ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim) + + # Size checks + if any(sz > 2**31 - 1 for sz in ckey[3:]): + raise ValueError('Matrices too large for rocBLAS') + # Do not transpose either A or B opA = opB = w.OPERATION_NONE - # α and β factors for C = α*(A*op(B)) + β*C + # α and β factors for C = α*(A*B) + β*C if a.dtype == np.float64: rocblas_gemm = w.rocblas_dgemm alpha_ct, beta_ct = c_double(alpha), c_double(beta) @@ -97,9 +104,6 @@ def gemm(stream): rocblas_gemm(h, opA, opB, m, n, k, alpha_ct, A, A.leaddim, B, B.leaddim, beta_ct, C, C.leaddim) - # Cache key - ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim) - # Obtain the performance of the kernel try: dt = self._mul_timing[ckey] diff --git a/pyfr/backends/metal/blasext.py b/pyfr/backends/metal/blasext.py index 24d3c8509..78046da52 100644 --- a/pyfr/backends/metal/blasext.py +++ b/pyfr/backends/metal/blasext.py @@ -10,7 +10,8 @@ def axnpby(self, *arr, subdims=None): raise ValueError('Incompatible matrix types') nv = len(arr) - nrow, ncol, ldim, dtype = arr[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = arr[0].traits[1:] ncola, ncolb = arr[0].ioshape[1:] # Render the kernel template @@ -20,7 +21,7 @@ def axnpby(self, *arr, subdims=None): # Build the kernel kern = self._build_kernel('axnpby', src, - [np.int32]*3 + [np.intp]*nv + [dtype]*nv) + [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv) grid, tgrp = (ncolb, nrow, 1), (128, 1, 1) kargs = [nrow, ncolb, ldim] + [a.data for a in arr] + [1.0]*nv @@ -53,7 +54,8 @@ def reduction(self, *rs, method, norm, dt_mat=None): if any(r.traits != rs[0].traits for r in rs[1:]): raise ValueError('Incompatible matrix types') - nrow, ncol, ldim, dtype = rs[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = rs[0].traits[1:] ncola, ncolb = rs[0].ioshape[1:] itemsize = rs[0].itemsize @@ -68,7 +70,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): reduced_dev = call_(self.backend.dev, 'newBufferWith', length=bufsz, options=MTLResourceStorageModeManaged) reduced_host = reduced_dev.contents().as_buffer(bufsz) - reduced_host = np.frombuffer(reduced_host, dtype=dtype) + reduced_host = np.frombuffer(reduced_host, dtype=fpdtype) reduced_host = reduced_host.reshape(ncola, -1) # Template arguments @@ -84,17 +86,17 @@ def reduction(self, *rs, method, norm, dt_mat=None): # Argument types for the reduction kernel if method == 'errest': - argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2 + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]*2 elif method == 'resid' and dt_mat: - argt = [np.int32]*3 + [np.intp]*4 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype] else: - argt = [np.int32]*3 + [np.intp]*3 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*3 + [fpdtype] # Build the reduction kernel rkern = self._build_kernel('reduction', src, argt) # Runtime argument offset - facoff = argt.index(dtype) + facoff = argt.index(fpdtype) nfacs = 2 if method == 'errest' else 1 # Kernel arguments diff --git a/pyfr/backends/metal/generator.py b/pyfr/backends/metal/generator.py index dc574e308..e61ca1c94 100644 --- a/pyfr/backends/metal/generator.py +++ b/pyfr/backends/metal/generator.py @@ -11,7 +11,7 @@ class MetalKernelGenerator(BaseGPUKernelGenerator): def _render_spec(self): # We first need the argument list; starting with the dimensions - kargs = [f'constant int& {d}' for d in self._dims] + kargs = [f'constant ixdtype_t& {d}' for d in self._dims] # Now add any scalar arguments kargs.extend(f'constant {sa.dtype}& {sa.name}' for sa in self.scalargs) @@ -25,13 +25,13 @@ def _render_spec(self): # Views if va.isview: - kargs.append(f'device const int* {va.name}_vix') + kargs.append(f'device const ixdtype_t* {va.name}_vix') if va.ncdim == 2: - kargs.append(f'device const int* {va.name}_vrstri') + kargs.append(f'device const ixdtype_t* {va.name}_vrstri') # Arrays elif self.needs_ldim(va): - kargs.append(f'device const int& ld{va.name}') + kargs.append(f'device const ixdtype_t& ld{va.name}') # Finally, the attribute arguments kargs.append('uint2 _tpig [[thread_position_in_grid]]') diff --git a/pyfr/backends/metal/gimmik.py b/pyfr/backends/metal/gimmik.py index a5d275855..af33cdb0d 100644 --- a/pyfr/backends/metal/gimmik.py +++ b/pyfr/backends/metal/gimmik.py @@ -71,7 +71,7 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0): src, meta = kgen.send(sdata) grid, tgrp = meta['grid'], meta['threadgroup'] - kern = self._build_kernel(kname, src, [np.intp]*2) + kern = self._build_kernel(kname, src, [np.uintp]*2) # Obtain the runtime dt = self._benchmark( diff --git a/pyfr/backends/metal/kernels/axnpby.mako b/pyfr/backends/metal/kernels/axnpby.mako index dab541baa..c9913a589 100644 --- a/pyfr/backends/metal/kernels/axnpby.mako +++ b/pyfr/backends/metal/kernels/axnpby.mako @@ -2,14 +2,14 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> kernel void -axnpby(constant int& nrow, constant int& ncolb, constant int& ldim, - device fpdtype_t* x0, +axnpby(constant ixdtype_t& nrow, constant ixdtype_t& ncolb, + constant ixdtype_t& ldim, device fpdtype_t* x0, ${', '.join(f'device const fpdtype_t* x{i}' for i in range(1, nv))}, ${', '.join(f'constant fpdtype_t& a{i}' for i in range(nv))}, uint2 ji [[thread_position_in_grid]]) { - int j = ji.x, i = ji.y; - int idx; + ixdtype_t j = ji.x, i = ji.y; + ixdtype_t idx; if (j < ncolb && a0 == 0) { diff --git a/pyfr/backends/metal/kernels/base.mako b/pyfr/backends/metal/kernels/base.mako index 52ab2b609..8d724aa5f 100644 --- a/pyfr/backends/metal/kernels/base.mako +++ b/pyfr/backends/metal/kernels/base.mako @@ -10,6 +10,7 @@ using namespace metal; // Typedefs typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t; +typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t; // Atomic helpers inline void atomic_min_fpdtype(device fpdtype_t* addr, fpdtype_t val) diff --git a/pyfr/backends/metal/kernels/pack.mako b/pyfr/backends/metal/kernels/pack.mako index 27906ed8c..25f5eb118 100644 --- a/pyfr/backends/metal/kernels/pack.mako +++ b/pyfr/backends/metal/kernels/pack.mako @@ -1,20 +1,21 @@ <%inherit file='base'/> kernel void -pack_view(constant int& n, constant int& nrv, constant int& ncv, +pack_view(constant ixdtype_t& n, constant ixdtype_t& nrv, + constant ixdtype_t& ncv, device const fpdtype_t* v, - device const int* vix, - device const int* vrstri, + device const ixdtype_t* vix, + device const ixdtype_t* vrstri, device fpdtype_t* pmat, uint i [[thread_position_in_grid]]) { if (i < n && ncv == 1) pmat[i] = v[vix[i]]; else if (i < n && nrv == 1) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[c*n + i] = v[vix[i] + SOA_SZ*c]; else if (i < n) - for (int r = 0; r < nrv; ++r) + for (ixdtype_t r = 0; r < nrv; ++r) for (int c = 0; c < ncv; ++c) pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c]; } diff --git a/pyfr/backends/metal/kernels/reduction.mako b/pyfr/backends/metal/kernels/reduction.mako index 2814f299b..73e613613 100644 --- a/pyfr/backends/metal/kernels/reduction.mako +++ b/pyfr/backends/metal/kernels/reduction.mako @@ -2,8 +2,8 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> kernel void -reduction(constant int& nrow, constant int& ncolb, - device const int& ldim, device fpdtype_t* reduced, +reduction(constant ixdtype_t& nrow, constant ixdtype_t& ncolb, + device const ixdtype_t& ldim, device fpdtype_t* reduced, device const fpdtype_t* rcurr, device const fpdtype_t* rold, % if method == 'errest': device const fpdtype_t* rerr, @@ -20,17 +20,17 @@ reduction(constant int& nrow, constant int& ncolb, uint2 blockIdx [[threadgroup_position_in_grid]]) { int tid = threadIdx.x; - int i = blockIdx.x*${blocksz} + tid; - int nblocks = (ncolb + ${blocksz} - 1) / ${blocksz}; + ixdtype_t i = ixdtype_t(blockIdx.x)*${blocksz} + tid; + ixdtype_t nblocks = (ncolb + ${blocksz} - 1) / ${blocksz}; fpdtype_t r, acc = 0; threadgroup fpdtype_t sdata[${blocksz // 8 - 1}]; if (i < ncolb) { - for (int j = 0; j < nrow; j++) + for (ixdtype_t j = 0; j < nrow; j++) { - int idx = j*ldim + SOA_IX(i, blockIdx.y, ${ncola}); + ixdtype_t idx = j*ldim + SOA_IX(i, blockIdx.y, ${ncola}); % if method == 'errest': r = rerr[idx]/(atol + rtol*max(fabs(rcurr[idx]), fabs(rold[idx]))); % elif method == 'resid': @@ -70,6 +70,6 @@ reduction(constant int& nrow, constant int& ncolb, } // Copy to global memory - reduced[blockIdx.y*nblocks + blockIdx.x] = acc; + reduced[ixdtype_t(blockIdx.y)*nblocks + blockIdx.x] = acc; } } diff --git a/pyfr/backends/metal/packing.py b/pyfr/backends/metal/packing.py index 5316f35b4..3dfecc5f7 100644 --- a/pyfr/backends/metal/packing.py +++ b/pyfr/backends/metal/packing.py @@ -5,6 +5,8 @@ class MetalPackingKernels(MetalKernelProvider): def pack(self, mv): + ixdtype = self.backend.ixdtype + # An exchange view is simply a regular view plus an exchange matrix m, v = mv.xchgmat, mv.view @@ -12,7 +14,7 @@ def pack(self, mv): src = self.backend.lookup.get_template('pack').render() # Build - kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*4) + kern = self._build_kernel('pack_view', src, [ixdtype]*3 + [np.uintp]*4) grid, tgrp = (v.n, 1, 1), (128, 1, 1) # Arguments diff --git a/pyfr/backends/metal/provider.py b/pyfr/backends/metal/provider.py index f58cdffba..c46d57ce5 100644 --- a/pyfr/backends/metal/provider.py +++ b/pyfr/backends/metal/provider.py @@ -91,7 +91,7 @@ def _build_kernel(self, name, src, argtypes): # Classify the arguments as either pointers or scalars pargs, sargs = [], [] for i, argt in enumerate(argtypes): - if argt == np.intp: + if argt == np.uintp: pargs.append(i) else: ctype = npdtype_to_ctypestype(argt) diff --git a/pyfr/backends/opencl/blasext.py b/pyfr/backends/opencl/blasext.py index 662d5250e..234b92d34 100644 --- a/pyfr/backends/opencl/blasext.py +++ b/pyfr/backends/opencl/blasext.py @@ -9,7 +9,8 @@ def axnpby(self, *arr, subdims=None): raise ValueError('Incompatible matrix types') nv = len(arr) - nrow, ncol, ldim, dtype = arr[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = arr[0].traits[1:] ncola, ncolb = arr[0].ioshape[1:] # Render the kernel template @@ -19,7 +20,7 @@ def axnpby(self, *arr, subdims=None): # Build the kernel kern = self._build_kernel('axnpby', src, - [np.int32]*3 + [np.intp]*nv + [dtype]*nv) + [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv) kern.set_dims((ncolb, nrow)) kern.set_args(nrow, ncolb, ldim, *arr) @@ -50,7 +51,8 @@ def reduction(self, *rs, method, norm, dt_mat=None): raise ValueError('Incompatible matrix types') cl = self.backend.cl - nrow, ncol, ldim, dtype = rs[0].traits[1:] + ixdtype = self.backend.ixdtype + nrow, ncol, ldim, fpdtype = rs[0].traits[1:] ncola, ncolb = rs[0].ioshape[1:] # Reduction workgroup dimensions @@ -58,7 +60,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): gs = (ncolb - ncolb % -ls[0], ncola) # Empty result buffer on host with (nvars, ngroups) - reduced_host = np.empty((ncola, gs[0] // ls[0]), dtype) + reduced_host = np.empty((ncola, gs[0] // ls[0]), fpdtype) # Corresponding device memory allocation reduced_dev = cl.mem_alloc(reduced_host.nbytes) @@ -75,11 +77,11 @@ def reduction(self, *rs, method, norm, dt_mat=None): # Argument types for reduction kernel if method == 'errest': - argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2 + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]*2 elif method == 'resid' and dt_mat: - argt = [np.int32]*3 + [np.intp]*4 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype] else: - argt = [np.int32]*3 + [np.intp]*3 + [dtype] + argt = [ixdtype]*3 + [np.uintp]*3 + [fpdtype] # Build the reduction kernel rkern = self._build_kernel('reduction', src, argt) @@ -90,7 +92,7 @@ def reduction(self, *rs, method, norm, dt_mat=None): reducer = np.max if norm == 'uniform' else np.sum # Runtime argument offset - facoff = argt.index(dtype) + facoff = argt.index(fpdtype) class ReductionKernel(OpenCLKernel): @property diff --git a/pyfr/backends/opencl/driver.py b/pyfr/backends/opencl/driver.py index a471a17d7..7044494b9 100644 --- a/pyfr/backends/opencl/driver.py +++ b/pyfr/backends/opencl/driver.py @@ -425,7 +425,7 @@ def get_kernel(self, name, argtypes): class OpenCLKernel(_OpenCLWaitFor, _OpenCLBase): _destroyfn = 'clReleaseKernel' - typemap = [c_double, c_float, c_int32, c_int64, c_ulong] + typemap = [c_double, c_float, c_int32, c_int64, c_uint64] typemap = {k: (k(), sizeof(k)) for k in typemap} def __init__(self, lib, ptr, argtypes): diff --git a/pyfr/backends/opencl/generator.py b/pyfr/backends/opencl/generator.py index 7b84e1ed7..e16c80ebf 100644 --- a/pyfr/backends/opencl/generator.py +++ b/pyfr/backends/opencl/generator.py @@ -8,8 +8,10 @@ class OpenCLKernelGenerator(BaseGPUKernelGenerator): _shared_sync = 'work_group_barrier(CLK_GLOBAL_MEM_FENCE)' def _render_spec(self): + g, c, r = '__global', 'const', 'restrict' + # We first need the argument list; starting with the dimensions - kargs = [f'int {d}' for d in self._dims] + kargs = [f'ixdtype_t {d}' for d in self._dims] # Now add any scalar arguments kargs.extend(f'{sa.dtype} {sa.name}' for sa in self.scalargs) @@ -17,21 +19,19 @@ def _render_spec(self): # Finally, add the vector arguments for va in self.vectargs: if va.intent == 'in': - kargs.append(f'__global const {va.dtype}* restrict ' - f'{va.name}_v') + kargs.append(f'{g} {c} {va.dtype}* {r} {va.name}_v') else: - kargs.append(f'__global {va.dtype}* restrict {va.name}_v') + kargs.append(f'{g} {va.dtype}* {r} {va.name}_v') # Views if va.isview: - kargs.append(f'__global const int* restrict {va.name}_vix') + kargs.append(f'{g} {c} ixdtype_t* {r} {va.name}_vix') if va.ncdim == 2: - kargs.append('__global const int* restrict ' - f'{va.name}_vrstri') + kargs.append(f'{g} {c} ixdtype_t* {r} {va.name}_vrstri') # Arrays elif self.needs_ldim(va): - kargs.append(f'int ld{va.name}') + kargs.append(f'ixdtype_t ld{va.name}') # Determine the work group size for the kernel wgs = (*self.block1d, 1, 1) if self.ndim == 1 else (*self.block2d, 1) diff --git a/pyfr/backends/opencl/kernels/axnpby.mako b/pyfr/backends/opencl/kernels/axnpby.mako index 098852f3c..72225287e 100644 --- a/pyfr/backends/opencl/kernels/axnpby.mako +++ b/pyfr/backends/opencl/kernels/axnpby.mako @@ -2,13 +2,14 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> __kernel void -axnpby(int nrow, int ncolb, int ldim, __global fpdtype_t* restrict x0, +axnpby(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim, + __global fpdtype_t* restrict x0, ${', '.join(f'__global const fpdtype_t* restrict x{i}' for i in range(1, nv))}, ${', '.join(f'fpdtype_t a{i}' for i in range(nv))}) { - int i = get_global_id(1), j = get_global_id(0); - int idx; + ixdtype_t i = get_global_id(1), j = get_global_id(0); + ixdtype_t idx; if (j < ncolb && a0 == 0) { diff --git a/pyfr/backends/opencl/kernels/base.mako b/pyfr/backends/opencl/kernels/base.mako index ad3f9bfd3..433039fd4 100644 --- a/pyfr/backends/opencl/kernels/base.mako +++ b/pyfr/backends/opencl/kernels/base.mako @@ -5,7 +5,9 @@ #define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ) // Typedefs +typedef long int64_t; typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t; +typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t; // Atomic helpers % if pyfr.npdtype_to_ctype(fpdtype) == 'float': diff --git a/pyfr/backends/opencl/kernels/pack.mako b/pyfr/backends/opencl/kernels/pack.mako index d75d37d8e..780d4c97a 100644 --- a/pyfr/backends/opencl/kernels/pack.mako +++ b/pyfr/backends/opencl/kernels/pack.mako @@ -1,21 +1,21 @@ <%inherit file='base'/> __kernel void -pack_view(int n, int nrv, int ncv, +pack_view(ixdtype_t n, ixdtype_t nrv, ixdtype_t ncv, __global const fpdtype_t* restrict v, __global const int* restrict vix, __global const int* restrict vrstri, __global fpdtype_t* restrict pmat) { - int i = get_global_id(0); + ixdtype_t i = get_global_id(0); if (i < n && ncv == 1) pmat[i] = v[vix[i]]; else if (i < n && nrv == 1) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[c*n + i] = v[vix[i] + SOA_SZ*c]; else if (i < n) - for (int r = 0; r < nrv; ++r) - for (int c = 0; c < ncv; ++c) + for (ixdtype_t r = 0; r < nrv; ++r) + for (ixdtype_t c = 0; c < ncv; ++c) pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c]; } diff --git a/pyfr/backends/opencl/kernels/reduction.mako b/pyfr/backends/opencl/kernels/reduction.mako index d33ceda13..91ec88c11 100644 --- a/pyfr/backends/opencl/kernels/reduction.mako +++ b/pyfr/backends/opencl/kernels/reduction.mako @@ -2,7 +2,8 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> __kernel void -reduction(int nrow, int ncolb, int ldim, __global fpdtype_t* restrict reduced, +reduction(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim, + __global fpdtype_t* restrict reduced, __global const fpdtype_t* restrict rcurr, __global const fpdtype_t* restrict rold, % if method == 'errest': @@ -13,17 +14,17 @@ reduction(int nrow, int ncolb, int ldim, __global fpdtype_t* restrict reduced, fpdtype_t dt_fac) % endif { - int i = get_global_id(0), tid = get_local_id(0); - int gdim = get_num_groups(0), bid = get_group_id(0); - int ncola = get_num_groups(1), k = get_group_id(1); + ixdtype_t i = get_global_id(0), tid = get_local_id(0); + ixdtype_t gdim = get_num_groups(0), bid = get_group_id(0); + ixdtype_t ncola = get_num_groups(1), k = get_group_id(1); fpdtype_t r, acc = 0; if (i < ncolb) { - for (int j = 0; j < nrow; j++) + for (ixdtype_t j = 0; j < nrow; j++) { - int idx = j*ldim + SOA_IX(i, k, ncola); + ixdtype_t idx = j*ldim + SOA_IX(i, k, ncola); % if method == 'errest': r = rerr[idx]/(atol + rtol*max(fabs(rcurr[idx]), fabs(rold[idx]))); % elif method == 'resid': diff --git a/pyfr/backends/opencl/packing.py b/pyfr/backends/opencl/packing.py index af440a613..c00280879 100644 --- a/pyfr/backends/opencl/packing.py +++ b/pyfr/backends/opencl/packing.py @@ -6,6 +6,7 @@ class OpenCLPackingKernels(OpenCLKernelProvider): def pack(self, mv): cl = self.backend.cl + ixdtype = self.backend.ixdtype # An exchange view is simply a regular view plus an exchange matrix m, v = mv.xchgmat, mv.view @@ -14,7 +15,7 @@ def pack(self, mv): src = self.backend.lookup.get_template('pack').render() # Build - kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*4) + kern = self._build_kernel('pack_view', src, [ixdtype]*3 + [np.uintp]*4) kern.set_dims((v.n,)) kern.set_args(v.n, v.nvrow, v.nvcol, v.basedata, v.mapping, v.rstrides or 0, m) diff --git a/pyfr/backends/openmp/blasext.py b/pyfr/backends/openmp/blasext.py index d2e6d4630..00a04d077 100644 --- a/pyfr/backends/openmp/blasext.py +++ b/pyfr/backends/openmp/blasext.py @@ -9,7 +9,8 @@ def axnpby(self, *arr, subdims=None): raise ValueError('Incompatible matrix types') nv = len(arr) - nblocks, nrow, *_, dtype = arr[0].traits + ixdtype = self.backend.ixdtype + nblocks, nrow, *_, fpdtype = arr[0].traits ncola = arr[0].ioshape[-2] # Render the kernel template @@ -19,7 +20,7 @@ def axnpby(self, *arr, subdims=None): # Build the kernel kern = self._build_kernel('axnpby', src, - [np.int32] + [np.intp]*nv + [dtype]*nv) + [ixdtype] + [np.uintp]*nv + [fpdtype]*nv) # Set the static arguments kern.set_nblocks(nblocks) @@ -32,6 +33,8 @@ def bind(self, *consts): return AxnpbyKernel(mats=arr, kernel=kern) def copy(self, dst, src): + ixdtype = self.backend.ixdtype + if dst.traits != src.traits: raise ValueError('Incompatible matrix types') @@ -43,8 +46,9 @@ def copy(self, dst, src): nblocks = src.nblocks # Build the kernel - kern = self._build_kernel('par_memcpy', ksrc, 'PiPiii') - kern.set_args(dst, dbbytes, src, sbbytes, bnbytes, nblocks) + kern = self._build_kernel('par_memcpy', ksrc, + [np.uintp]*2 + [ixdtype]*4) + kern.set_args(dst, src, dbbytes, sbbytes, bnbytes, nblocks) return OpenMPKernel(mats=[dst, src], kernel=kern) @@ -52,7 +56,8 @@ def reduction(self, *rs, method, norm, dt_mat=None): if any(r.traits != rs[0].traits for r in rs[1:]): raise ValueError('Incompatible matrix types') - nblocks, nrow, *_, dtype = rs[0].traits + ixdtype = self.backend.ixdtype + nblocks, nrow, *_, fpdtype = rs[0].traits ncola = rs[0].ioshape[-2] tplargs = dict(norm=norm, ncola=ncola, method=method) @@ -64,24 +69,24 @@ def reduction(self, *rs, method, norm, dt_mat=None): src = self.backend.lookup.get_template('reduction').render(**tplargs) # Array for the reduced data - reduced = np.zeros(ncola, dtype=dtype) + reduced = np.zeros(ncola, dtype=fpdtype) regs = list(rs) + [dt_mat] if dt_mat else rs # Argument types for reduction kernel if method == 'errest': - argt = [np.int32]*2 + [np.intp]*4 + [dtype]*2 + argt = [ixdtype]*2 + [np.uintp]*4 + [fpdtype]*2 elif method == 'resid' and dt_mat: - argt = [np.int32]*2 + [np.intp]*4 + [dtype] + argt = [ixdtype]*2 + [np.uintp]*4 + [fpdtype] else: - argt = [np.int32]*2 + [np.intp]*3 + [dtype] + argt = [ixdtype]*2 + [np.uintp]*3 + [fpdtype] # Build rkern = self._build_kernel('reduction', src, argt) rkern.set_args(nrow, nblocks, reduced.ctypes.data, *regs) # Runtime argument offset - facoff = argt.index(dtype) + facoff = argt.index(fpdtype) class ReductionKernel(OpenMPKernel): @property diff --git a/pyfr/backends/openmp/generator.py b/pyfr/backends/openmp/generator.py index 01dc135a2..7cf95f8b4 100644 --- a/pyfr/backends/openmp/generator.py +++ b/pyfr/backends/openmp/generator.py @@ -26,7 +26,7 @@ def render(self): }}''' else: core = f''' - for (int _y = 0; _y < _ny; _y++) + for (ixdtype_t _y = 0; _y < _ny; _y++) {{ for (int _xi = 0; _xi < BLK_SZ; _xi += SOA_SZ) {{ @@ -38,7 +38,7 @@ def render(self): }} }}''' clean = f''' - for (int _y = 0, _xi = 0; _y < _ny; _y++) + for (ixdtype_t _y = 0, _xi = 0; _y < _ny; _y++) {{ #pragma omp simd for (int _xj = 0; _xj < _nx % BLK_SZ; _xj++) @@ -49,7 +49,8 @@ def render(self): return f''' struct {self.name}_kargs {{ {kargdefn}; }}; - void {self.name}(int _ib, const struct {self.name}_kargs *args, + void {self.name}(ixdtype_t _ib, + const struct {self.name}_kargs *args, int _disp_mask) {{ {kargassn}; @@ -102,7 +103,7 @@ def _displace_arg(self, arg): def _render_args(self, argn): # We first need the argument list; starting with the dimensions - kargs = [('int ', d, None, None) for d in self._dims] + kargs = [('ixdtype_t', d, None, None) for d in self._dims] # Now add any scalar arguments kargs.extend((sa.dtype, sa.name, None, None) for sa in self.scalargs) @@ -119,11 +120,11 @@ def _render_args(self, argn): # Views if va.isview: - kargs.append(('const int*', f'{va.name}_vix', '_ib*BLK_SZ', - None)) + kargs.append(('const ixdtype_t*', f'{va.name}_vix', + '_ib*BLK_SZ', None)) if va.ncdim == 2: - kargs.append(('const int*', f'{va.name}_vrstri', + kargs.append(('const ixdtype_t*', f'{va.name}_vrstri', '_ib*BLK_SZ', None)) # Argument definitions and assignments diff --git a/pyfr/backends/openmp/kernels/axnpby.mako b/pyfr/backends/openmp/kernels/axnpby.mako index 8361e4e86..a78b7f909 100644 --- a/pyfr/backends/openmp/kernels/axnpby.mako +++ b/pyfr/backends/openmp/kernels/axnpby.mako @@ -3,32 +3,33 @@ struct kargs { - int nrow; + ixdtype_t nrow; fpdtype_t ${','.join(f'*x{i}' for i in range(nv))}; fpdtype_t ${','.join(f'a{i}' for i in range(nv))}; }; void axnpby(int ib, const struct kargs *restrict args, int _disp_mask) { - int nrow = args->nrow; + ixdtype_t nrow = args->nrow; + % for i in range(nv): fpdtype_t *x${i} = args->x${i}, a${i} = args->a${i}; % endfor % if sorted(subdims) == list(range(ncola)): #pragma omp simd - for (int i = ib*nrow*BLK_SZ*${ncola}; i < (ib + 1)*nrow*BLK_SZ*${ncola}; i++) + for (ixdtype_t i = ib*nrow*BLK_SZ*${ncola}; i < (ib + 1)*nrow*BLK_SZ*${ncola}; i++) x0[i] = ${pyfr.dot('a{l}', 'x{l}[i]', l=nv)}; % else: #define X_IDX_AOSOA(v, nv) ((_xi/SOA_SZ*(nv) + (v))*SOA_SZ + _xj) - for (int _y = 0; _y < nrow; _y++) + for (ixdtype_t _y = 0; _y < nrow; _y++) { - for (int _xi = 0; _xi < BLK_SZ; _xi += SOA_SZ) + for (ixdtype_t _xi = 0; _xi < BLK_SZ; _xi += SOA_SZ) { #pragma omp simd - for (int _xj = 0; _xj < SOA_SZ; _xj++) + for (ixdtype_t _xj = 0; _xj < SOA_SZ; _xj++) { - int i; + ixdtype_t i; % for k in subdims: i = _y*BLK_SZ*${ncola} + ib*BLK_SZ*${ncola}*nrow + X_IDX_AOSOA(${k}, ${ncola}); diff --git a/pyfr/backends/openmp/kernels/base.mako b/pyfr/backends/openmp/kernels/base.mako index 017d9f8f6..e461176b0 100644 --- a/pyfr/backends/openmp/kernels/base.mako +++ b/pyfr/backends/openmp/kernels/base.mako @@ -1,6 +1,7 @@ <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/> #include +#include #include #include @@ -12,6 +13,7 @@ // Typedefs typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t; +typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t; // Atomic helpers #define atomic_min_fpdtype(addr, val) _Pragma("omp atomic compare") if ((val) < *(addr)) { *(addr) = (val); } diff --git a/pyfr/backends/openmp/kernels/batch-gemm.mako b/pyfr/backends/openmp/kernels/batch-gemm.mako index 0fa7c73d7..ec6a29f29 100644 --- a/pyfr/backends/openmp/kernels/batch-gemm.mako +++ b/pyfr/backends/openmp/kernels/batch-gemm.mako @@ -5,12 +5,12 @@ struct kargs void (*exec)(void *, const fpdtype_t *, fpdtype_t *); void *blockk; const fpdtype_t *b; - int bblocksz; + ixdtype_t bblocksz; fpdtype_t *c; - int cblocksz; + ixdtype_t cblocksz; }; -void batch_gemm(int ib, const struct kargs *args, int _disp_mask) +void batch_gemm(ixdtype_t ib, const struct kargs *args, int _disp_mask) { args->exec(args->blockk, args->b + ((_disp_mask & 1) ? 0 : ib*args->bblocksz), diff --git a/pyfr/backends/openmp/kernels/pack.mako b/pyfr/backends/openmp/kernels/pack.mako index 34f6651e9..ba75afc1e 100644 --- a/pyfr/backends/openmp/kernels/pack.mako +++ b/pyfr/backends/openmp/kernels/pack.mako @@ -3,20 +3,20 @@ struct kargs { - int n; + ixdtype_t n; fpdtype_t *v; - int *vix, *vrstri; + ixdtype_t *vix, *vrstri; fpdtype_t *pmat; }; void pack_view(const struct kargs *restrict args) { - int n = args->n; - int *vix = args->vix, *vrstri = args->vrstri; + ixdtype_t n = args->n; + ixdtype_t *vix = args->vix, *vrstri = args->vrstri; fpdtype_t *v = args->v, *pmat = args->pmat; #pragma omp simd - for (int i = 0; i < n; i++) + for (ixdtype_t i = 0; i < n; i++) { % if nrv == 1: % for c in range(ncv): diff --git a/pyfr/backends/openmp/kernels/par-memcpy.mako b/pyfr/backends/openmp/kernels/par-memcpy.mako index 7c700ad90..2555c4cf5 100644 --- a/pyfr/backends/openmp/kernels/par-memcpy.mako +++ b/pyfr/backends/openmp/kernels/par-memcpy.mako @@ -6,15 +6,14 @@ struct kargs { char *dst; - int dbbytes; const char *src; - int sbbytes, bnbytes, nblocks; + ixdtype_t dbbytes, sbbytes, bnbytes, nblocks; }; void par_memcpy(const struct kargs *restrict args) { #pragma omp parallel for ${schedule} - for (int ib = 0; ib < args->nblocks; ib++) - memcpy(args->dst + ((size_t) args->dbbytes)*ib, - args->src + ((size_t) args->sbbytes)*ib, args->bnbytes); + for (ixdtype_t ib = 0; ib < args->nblocks; ib++) + memcpy(args->dst + ((ssize_t) args->dbbytes)*ib, + args->src + ((ssize_t) args->sbbytes)*ib, args->bnbytes); } diff --git a/pyfr/backends/openmp/kernels/reduction.mako b/pyfr/backends/openmp/kernels/reduction.mako index 44d9f7097..204d50e2c 100644 --- a/pyfr/backends/openmp/kernels/reduction.mako +++ b/pyfr/backends/openmp/kernels/reduction.mako @@ -3,7 +3,7 @@ struct kargs { - int nrow, nblocks; + ixdtype_t nrow, nblocks; fpdtype_t *reduced, *rcurr, *rold; % if method == 'errest': fpdtype_t *rerr, atol, rtol; @@ -16,7 +16,7 @@ struct kargs void reduction(const struct kargs *restrict args) { - int nrow = args->nrow, nblocks = args->nblocks; + ixdtype_t nrow = args->nrow, nblocks = args->nblocks; fpdtype_t *reduced = args->reduced, *rcurr = args->rcurr, *rold = args->rold; % if method == 'errest': fpdtype_t *rerr = args->rerr, atol = args->atol, rtol = args->rtol; @@ -36,16 +36,16 @@ void reduction(const struct kargs *restrict args) % else: #pragma omp parallel for ${schedule} reduction(+ : ${','.join(f'red{i}' for i in range(ncola))}) % endif - for (int ib = 0; ib < nblocks; ib++) + for (ixdtype_t ib = 0; ib < nblocks; ib++) { - for (int _y = 0; _y < nrow; _y++) + for (ixdtype_t _y = 0; _y < nrow; _y++) { - for (int _xi = 0; _xi < BLK_SZ; _xi += SOA_SZ) + for (ixdtype_t _xi = 0; _xi < BLK_SZ; _xi += SOA_SZ) { #pragma omp simd - for (int _xj = 0; _xj < SOA_SZ; _xj++) + for (ixdtype_t _xj = 0; _xj < SOA_SZ; _xj++) { - int idx; + ixdtype_t idx; fpdtype_t temp; % for i in range(ncola): diff --git a/pyfr/backends/openmp/packing.py b/pyfr/backends/openmp/packing.py index b8e8a7a7c..104c5ae10 100644 --- a/pyfr/backends/openmp/packing.py +++ b/pyfr/backends/openmp/packing.py @@ -1,9 +1,13 @@ +import numpy as np + from pyfr.backends.base import NullKernel from pyfr.backends.openmp.provider import OpenMPKernel, OpenMPKernelProvider class OpenMPPackingKernels(OpenMPKernelProvider): def pack(self, mv): + ixdtype = self.backend.ixdtype + # An exchange view is simply a regular view plus an exchange matrix m, v = mv.xchgmat, mv.view @@ -12,7 +16,7 @@ def pack(self, mv): ncv=v.nvcol) # Build - kern = self._build_kernel('pack_view', src, 'iPPPP') + kern = self._build_kernel('pack_view', src, [ixdtype] + [np.uintp]*4) kern.set_args(v.n, v.basedata, v.mapping, v.rstrides or 0, m) return OpenMPKernel(mats=[mv], kernel=kern) diff --git a/pyfr/backends/openmp/xsmm.py b/pyfr/backends/openmp/xsmm.py index 77066f5ed..a86a714de 100644 --- a/pyfr/backends/openmp/xsmm.py +++ b/pyfr/backends/openmp/xsmm.py @@ -62,6 +62,9 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0): if beta != 0.0 and beta != 1.0: raise NotSuitableError('libxsmm requires β = 0 or β = 1') + # Index type + ixdtype = self.backend.ixdtype + # Dimensions ldb, ldc = b.leaddim, out.leaddim @@ -103,7 +106,8 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0): src = self.backend.lookup.get_template('batch-gemm').render() # Build - batch_gemm = self._build_kernel('batch_gemm', src, 'PPPiPi') + batch_gemm = self._build_kernel('batch_gemm', src, + [np.uintp]*2 + [np.uintp, ixdtype]*2) batch_gemm.set_args(self._exec_ptr, blkptr, b, b.blocksz, out, out.blocksz) batch_gemm.set_nblocks(b.nblocks) diff --git a/pyfr/nputil.py b/pyfr/nputil.py index 740e1c1cc..02a5c382b 100644 --- a/pyfr/nputil.py +++ b/pyfr/nputil.py @@ -102,7 +102,7 @@ def fuzzysort(arr, idx, dim=0, tol=1e-6): _ctype_map = { np.int32: 'int', np.uint32: 'unsigned int', - np.int64: 'long long', np.uint64: 'unsigned long long', + np.int64: 'int64_t', np.uint64: 'uint64_t', np.float32: 'float', np.float64: 'double' } diff --git a/pyfr/partitioners/base.py b/pyfr/partitioners/base.py index 62419e32f..20f4555dd 100644 --- a/pyfr/partitioners/base.py +++ b/pyfr/partitioners/base.py @@ -53,7 +53,7 @@ def _combine_mesh_parts(self, mesh): rnum[en].update(((i, j), (0, off + j)) for j in range(n)) def offset_con(con, pr): - con = con.copy().astype('U4,i4,i1,i2') + con = con.copy().astype('U4,i8,i1,i2') for en, pn in pinf.items(): if pn[pr] > 0: @@ -88,7 +88,7 @@ def offset_con(con, pr): bccon[name].append(offset_con(mesh[f], l)) # Output data type - dtype = 'U4,i4,i1,i2' + dtype = 'U4,i8,i1,i2' # Concatenate these arrays to from the new mesh newmesh = {'con_p0': np.hstack(intcon).astype(dtype)} @@ -345,7 +345,7 @@ def _partition_con(self, mesh, vetimap, vparts, pnames): bcon_px[m[1], lpart].append(conl) # Output data type - dtype = 'S4,i4,i1,i2' + dtype = 'S4,i8,i1,i2' # Output con = {} diff --git a/pyfr/plugins/base.py b/pyfr/plugins/base.py index 78384bd42..e7da4fcf0 100644 --- a/pyfr/plugins/base.py +++ b/pyfr/plugins/base.py @@ -61,7 +61,7 @@ def region_data(cfg, cfgsect, mesh, rallocs): if not comm.reduce(bool(eset), op=mpi.LOR, root=root) and rank == root: raise ValueError(f'Empty region {region}') - return {etype: np.unique(eidxs).astype(np.int32) + return {etype: np.unique(eidxs) for etype, eidxs in sorted(eset.items())} diff --git a/pyfr/plugins/fluidforce.py b/pyfr/plugins/fluidforce.py index c3b97059f..ebd46671c 100644 --- a/pyfr/plugins/fluidforce.py +++ b/pyfr/plugins/fluidforce.py @@ -85,7 +85,7 @@ def __init__(self, intg, cfgsect, suffix): norms = defaultdict(list) rfpts = defaultdict(list) - for etype, eidx, fidx, flags in mesh[bc].astype('U4,i4,i1,i2'): + for etype, eidx, fidx, flags in mesh[bc].astype('U4,i8,i1,i2'): eles = elemap[etype] itype, proj, norm = eles.basis.faces[fidx] diff --git a/pyfr/readers/base.py b/pyfr/readers/base.py index 3875b75bb..068d5618f 100644 --- a/pyfr/readers/base.py +++ b/pyfr/readers/base.py @@ -216,13 +216,13 @@ def get_connectivity(self, spinner=NullProgressSpinner()): spinner() # Output - ret = {'con_p0': np.array(con, dtype='S4,i4,i1,i2').T} + ret = {'con_p0': np.array(con, dtype='S4,i8,i1,i2').T} for k, v in con_pnames.items(): - ret['con_p0', f'periodic_{k}'] = np.array(v, dtype=np.int32) + ret['con_p0', f'periodic_{k}'] = np.array(v, dtype=np.int64) for k, v in bcon.items(): - ret[f'bcon_{k}_p0'] = np.array(v, dtype='S4,i4,i1,i2') + ret[f'bcon_{k}_p0'] = np.array(v, dtype='S4,i8,i1,i2') return ret diff --git a/pyfr/regions.py b/pyfr/regions.py index d66f37f2f..1e75d2a5d 100644 --- a/pyfr/regions.py +++ b/pyfr/regions.py @@ -33,14 +33,14 @@ def surface_faces(self, mesh, rallocs, exclbcs=[]): # Eliminate any faces with internal connectivity con = mesh[f'con_p{rallocs.prank}'].T - for l, r in con[['f0', 'f1', 'f2']].astype('U4,i4,i1').tolist(): + for l, r in con[['f0', 'f1', 'f2']].astype('U4,i8,i1').tolist(): if l in sfaces and r in sfaces: sfaces.difference_update([l, r]) # Eliminate faces on specified boundaries for b in exclbcs: if (f := f'bcon_{b}_p{rallocs.prank}') in mesh: - bcon = mesh[f][['f0', 'f1', 'f2']].astype('U4,i4,i1') + bcon = mesh[f][['f0', 'f1', 'f2']].astype('U4,i8,i1') sfaces.difference_update(bcon.tolist()) comm, rank, root = get_comm_rank_root() @@ -49,7 +49,7 @@ def surface_faces(self, mesh, rallocs, exclbcs=[]): # Next, consider faces on partition boundaries for p in rallocs.prankconn[rallocs.prank]: con = mesh[f'con_p{rallocs.prank}p{p}'] - con = con[['f0', 'f1', 'f2']].astype('U4,i4,i1').tolist() + con = con[['f0', 'f1', 'f2']].astype('U4,i8,i1').tolist() # See which of these faces are on the surface boundary sb = np.array([c in sfaces for c in con]) diff --git a/pyfr/solvers/base/system.py b/pyfr/solvers/base/system.py index 13b0c8c61..3a090036d 100644 --- a/pyfr/solvers/base/system.py +++ b/pyfr/solvers/base/system.py @@ -130,7 +130,7 @@ def _load_eles(self, rallocs, mesh, initsoln, nregs, nonce): def _load_int_inters(self, rallocs, mesh, elemap): key = f'con_p{rallocs.prank}' - lhs, rhs = mesh[key].astype('U4,i4,i1,i2').tolist() + lhs, rhs = mesh[key].astype('U4,i8,i1,i2').tolist() int_inters = self.intinterscls(self.backend, lhs, rhs, elemap, self.cfg) @@ -143,7 +143,7 @@ def _load_mpi_inters(self, rallocs, mesh, elemap): for rhsprank in rallocs.prankconn[lhsprank]: rhsmrank = rallocs.pmrankmap[rhsprank] interarr = mesh[f'con_p{lhsprank}p{rhsprank}'] - interarr = interarr.astype('U4,i4,i1,i2').tolist() + interarr = interarr.astype('U4,i8,i1,i2').tolist() mpiiface = self.mpiinterscls(self.backend, interarr, rhsmrank, rallocs, elemap, self.cfg) @@ -162,7 +162,7 @@ def _load_bc_inters(self, rallocs, mesh, elemap): cfgsect = f'soln-bcs-{m[1]}' # Get the interface - interarr = mesh[f].astype('U4,i4,i1,i2').tolist() + interarr = mesh[f].astype('U4,i8,i1,i2').tolist() # Instantiate bcclass = bcmap[self.cfg.get(cfgsect, 'type')]