Add support for 64-bit indexes #376

Merged · 4 commits · Dec 3, 2023
11 changes: 8 additions & 3 deletions doc/src/user_guide.rst
@@ -117,15 +117,20 @@ Parameterises the backend with
 
    ``single`` | ``double``
 
-2. ``rank-allocator`` --- MPI rank allocator:
+2. ``memory-model`` --- if to enable support for large working sets;
+   should be ``normal`` unless a memory-model error is encountered:
+
+   ``normal`` | ``large``
+
+3. ``rank-allocator`` --- MPI rank allocator:
 
    ``linear`` | ``random``
 
-3. ``collect-wait-times`` --- If to track MPI request wait times or not:
+4. ``collect-wait-times`` --- if to track MPI request wait times or not:
 
    ``True`` | ``False``
 
-4. ``collect-wait-times-len`` --- Size of the wait time history buffer:
+5. ``collect-wait-times-len`` --- size of the wait time history buffer:
 
    *int*
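Note: the large memory model is enabled from the ``[backend]`` section of a PyFR configuration file. A minimal sketch (the ``precision`` line is illustrative context only):

    [backend]
    precision = double
    memory-model = large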
28 changes: 24 additions & 4 deletions pyfr/backends/base/backend.py
@@ -40,6 +40,16 @@ def __init__(self, cfg):
         self.fpdtype_eps = np.finfo(self.fpdtype).eps
         self.fpdtype_max = np.finfo(self.fpdtype).max
 
+        # Memory model
+        match cfg.get('backend', 'memory-model', 'normal'):
+            case 'normal':
+                self.ixdtype = np.int32
+            case 'large':
+                self.ixdtype = np.int64
+            case _:
+                raise ValueError('Backend memory model must be either normal '
+                                 'or large')
+
         # Allocated matrices
         self.mats = WeakValueDictionary()
         self._mat_counter = count()
@@ -55,16 +65,19 @@ def __init__(self, cfg):
     @cached_property
     def lookup(self):
         pkg = f'pyfr.backends.{self.name}.kernels'
-        dfltargs = dict(fpdtype=self.fpdtype, fpdtype_max=self.fpdtype_max,
-                        soasz=self.soasz, csubsz=self.csubsz, math=math)
+        dfltargs = {
+            'fpdtype': self.fpdtype, 'ixdtype': self.ixdtype,
+            'fpdtype_max': self.fpdtype_max, 'csubsz': self.csubsz,
+            'soasz': self.soasz, 'math': math
+        }
 
         return DottedTemplateLookup(pkg, dfltargs)
 
     def malloc(self, obj, extent):
         # If no extent has been specified then autocommit
         if extent is None:
             # Perform the allocation
-            data = self._malloc_impl(obj.nbytes)
+            data = self._malloc_checked(obj.nbytes)
 
             # Fire the callback
             obj.onalloc(data, 0)
@@ -99,7 +112,7 @@ def commit(self):
         sz = sum(obj.nbytes - (obj.nbytes % -self.alignb) for obj in reqs)
 
         # Perform the allocation
-        data = self._malloc_impl(sz)
+        data = self._malloc_checked(sz)
 
         offset = 0
         for obj in reqs:
@@ -118,6 +131,13 @@ def commit(self):
         self._pend_aliases.clear()
         self._pend_extents.clear()
 
+    def _malloc_checked(self, nbytes):
+        if self.ixdtype == np.int32 and nbytes > 4*2**31 - 1:
+            raise RuntimeError('Allocation too large for normal backend '
+                               'memory-model')
+
+        return self._malloc_impl(nbytes)
+
     def _malloc_impl(self, nbytes):
         pass
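Note: the two new pieces above fit together as in the following standalone Python sketch; the helper names (select_ixdtype, check_alloc) are hypothetical, not PyFR API:

    import numpy as np

    def select_ixdtype(memory_model):
        # Mirrors Backend.__init__ above: 'normal' selects 32-bit
        # indexes, 'large' selects 64-bit indexes
        match memory_model:
            case 'normal':
                return np.int32
            case 'large':
                return np.int64
            case _:
                raise ValueError('memory-model must be normal or large')

    def check_alloc(ixdtype, nbytes):
        # Mirrors _malloc_checked above: with int32 indexes and a
        # minimum element size of 4 bytes, anything past 4*2**31 - 1
        # bytes cannot be indexed, so it is rejected up front
        if ixdtype == np.int32 and nbytes > 4*2**31 - 1:
            raise RuntimeError('Allocation too large for the normal '
                               'memory-model')
        return nbytes

    check_alloc(select_ixdtype('large'), 2**34)   # OK
    check_alloc(select_ixdtype('normal'), 2**34)  # raises RuntimeError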
22 changes: 12 additions & 10 deletions pyfr/backends/base/generator.py
@@ -67,10 +67,11 @@ def __init__(self, name, spec, body):
 
 
 class BaseKernelGenerator:
-    def __init__(self, name, ndim, args, body, fpdtype):
+    def __init__(self, name, ndim, args, body, fpdtype, ixdtype):
         self.name = name
         self.ndim = ndim
         self.fpdtype = fpdtype
+        self.ixdtype = ixdtype
 
         # Parse and sort our argument list
         sargs = sorted((k, Arg(k, v, body)) for k, v in args.items())
@@ -103,7 +104,7 @@ def argspec(self):
 
         # Dimensions
         argn += self._dims
-        argt += [[np.int32]]*self.ndim
+        argt += [[self.ixdtype]]*self.ndim
 
         # Scalar args (always of type fpdtype)
         argn += [sa.name for sa in self.scalargs]
@@ -114,11 +115,11 @@ def argspec(self):
             argn.append(va.name)
 
             if va.isview:
-                argt.append([np.intp]*(2 + (va.ncdim == 2)))
+                argt.append([np.uintp]*(2 + (va.ncdim == 2)))
             elif self.needs_ldim(va):
-                argt.append([np.intp, np.int32])
+                argt.append([np.uintp, self.ixdtype])
            else:
-                argt.append([np.intp])
+                argt.append([np.uintp])
 
         # Return
         return self.ndim, argn, argt
@@ -285,11 +286,12 @@ def __init__(self, *args, **kwargs):
         if self.ndim == 1:
             self.preamble += 'if (_x < _nx)'
         else:
+            blk_y, lid_y = self.block2d[1], self._lid[1]
             self.preamble += f'''
-                int _ysize = (_ny + {self.block2d[1] - 1}) / {self.block2d[1]};
-                int _ystart = {self._lid[1]}*_ysize;
-                int _yend = min(_ny, _ystart + _ysize);
-                for (int _y = _ystart; _x < _nx && _y < _yend; _y++)'''
+                ixdtype_t _ysize = (_ny + {blk_y - 1}) / {blk_y};
+                ixdtype_t _ystart = {lid_y}*_ysize;
+                ixdtype_t _yend = min(_ny, _ystart + _ysize);
+                for (ixdtype_t _y = _ystart; _x < _nx && _y < _yend; _y++)'''
 
     def ldim_size(self, name, factor=1):
         return f'ld{name}'
@@ -377,7 +379,7 @@ def render(self):
 
         return f'''{spec}
 {{
-    int _x = {self._gid};
+    ixdtype_t _x = {self._gid};
     #define X_IDX (_x)
     #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
     #define BCAST_BLK(r, c, ld) c
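Note: to see the effect on an argument specification, consider a hypothetical 2D kernel with one non-view matrix argument that needs a leading dimension; under the large memory model its type list would come out along these lines (a sketch, not actual PyFR output):

    import numpy as np

    ixdtype = np.int64  # backend.ixdtype under the 'large' model

    # Two dimension arguments (_ny, _nx), then the matrix: an
    # unsigned pointer followed by its leading dimension
    argt = [[ixdtype]]*2 + [[np.uintp, ixdtype]]

Pointers are now passed as np.uintp rather than np.intp, presumably since device addresses are unsigned.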
7 changes: 4 additions & 3 deletions pyfr/backends/base/makoutil.py
@@ -158,11 +158,12 @@ def kernel(context, name, ndim, **kwargs):
     # Capture the kernel body
     body = capture(context, context['caller'].body)
 
-    # Get the generator class and floating point data type
-    kerngen, fpdtype = context['_kernel_generator'], context['fpdtype']
+    # Get the generator class and data types
+    kerngen = context['_kernel_generator']
+    fpdtype, ixdtype = context['fpdtype'], context['ixdtype']
 
     # Instantiate
-    kern = kerngen(name, int(ndim), kwargs, body, fpdtype)
+    kern = kerngen(name, int(ndim), kwargs, body, fpdtype, ixdtype)
 
     # Save the argument/type list for later use
     context['_kernel_argspecs'][name] = kern.argspec()
13 changes: 8 additions & 5 deletions pyfr/backends/base/types.py
@@ -237,13 +237,16 @@ def __init__(self, backend, matmap, rmap, cmap, rstridemap, vshape, tags):
         if any(m.dtype != self.refdtype for m in self._mats):
             raise TypeError('Mixed data types are not supported')
 
+        # Index type
+        ixdtype = backend.ixdtype
+
         # SoA size
         k, csubsz = backend.soasz, backend.csubsz
 
         # Base offsets and leading dimensions for each point
-        offset = np.empty(self.n, dtype=np.int32)
-        leaddim = np.empty(self.n, dtype=np.int32)
-        blkdisp = np.empty(self.n, dtype=np.int32)
+        offset = np.empty(self.n, dtype=ixdtype)
+        leaddim = np.empty(self.n, dtype=ixdtype)
+        blkdisp = np.empty(self.n, dtype=ixdtype)
 
         for m in self._mats:
             ix = np.where(matmap == m.mid)
@@ -256,12 +259,12 @@ def __init__(self, backend, matmap, rmap, cmap, rstridemap, vshape, tags):
             coldisp = (cmapmod // k)*k*self.nvcol + cmapmod % k
 
         mapping = (offset + blkdisp + rowdisp + coldisp)[None, :]
-        self.mapping = backend.const_matrix(mapping, dtype=np.int32, tags=tags)
+        self.mapping = backend.const_matrix(mapping, dtype=ixdtype, tags=tags)
 
         # Row strides
         if self.nvrow > 1:
             rstrides = (rstridemap*leaddim)[None, :]
-            self.rstrides = backend.const_matrix(rstrides, dtype=np.int32,
+            self.rstrides = backend.const_matrix(rstrides, dtype=ixdtype,
                                                  tags=tags)
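Note: switching the mapping arrays to ixdtype is what actually unlocks large working sets; with np.int32 the flattened offsets silently wrap once they pass 2**31 - 1. A small numpy demonstration of the failure mode:

    import numpy as np

    # A base offset just below the int32 limit plus a modest
    # displacement wraps to a negative index under 32-bit arithmetic
    offset = np.array([2**31 - 8], dtype=np.int32)
    coldisp = np.array([16], dtype=np.int32)

    print(offset + coldisp)                   # [-2147483640] -- wrapped
    print(offset.astype(np.int64) + coldisp)  # [2147483656]  -- correct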
18 changes: 10 additions & 8 deletions pyfr/backends/cuda/blasext.py
@@ -10,7 +10,8 @@ def axnpby(self, *arr, subdims=None):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        nrow, ncol, ldim, dtype = arr[0].traits[1:]
+        ixdtype = self.backend.ixdtype
+        nrow, ncol, ldim, fpdtype = arr[0].traits[1:]
         ncola, ncolb = arr[0].ioshape[1:]
 
         # Render the kernel template
@@ -20,7 +21,7 @@ def axnpby(self, *arr, subdims=None):
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
+                                  [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv)
 
         # Determine the grid/block
         block = (128, 1, 1)
@@ -59,7 +60,8 @@ def reduction(self, *rs, method, norm, dt_mat=None):
             raise ValueError('Incompatible matrix types')
 
         cuda = self.backend.cuda
-        nrow, ncol, ldim, dtype = rs[0].traits[1:]
+        ixdtype = self.backend.ixdtype
+        nrow, ncol, ldim, fpdtype = rs[0].traits[1:]
         ncola, ncolb = rs[0].ioshape[1:]
 
         # Reduction block dimensions
@@ -72,7 +74,7 @@ def reduction(self, *rs, method, norm, dt_mat=None):
         reduced_dev = cuda.mem_alloc(ncola*grid[0]*rs[0].itemsize)
 
         # Empty result buffer on the host
-        reduced_host = cuda.pagelocked_empty((ncola, grid[0]), dtype)
+        reduced_host = cuda.pagelocked_empty((ncola, grid[0]), fpdtype)
 
         tplargs = dict(norm=norm, method=method)
 
@@ -86,11 +88,11 @@ def reduction(self, *rs, method, norm, dt_mat=None):
 
         # Argument types for reduction kernel
         if method == 'errest':
-            argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2
+            argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]*2
         elif method == 'resid' and dt_mat:
-            argt = [np.int32]*3 + [np.intp]*4 + [dtype]
+            argt = [ixdtype]*3 + [np.uintp]*4 + [fpdtype]
         else:
-            argt = [np.int32]*3 + [np.intp]*3 + [dtype]
+            argt = [ixdtype]*3 + [np.uintp]*3 + [fpdtype]
 
         # Build the reduction kernel
         rkern = self._build_kernel('reduction', src, argt)
@@ -100,7 +102,7 @@ def reduction(self, *rs, method, norm, dt_mat=None):
         params.set_args(nrow, ncolb, ldim, reduced_dev, *regs)
 
         # Runtime argument offset
-        facoff = argt.index(dtype)
+        facoff = argt.index(fpdtype)
 
         # Norm type
         reducer = np.max if norm == 'uniform' else np.sum
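Note: for instance, with nv = 2 under the large memory model, the axnpby argument type list built above comes out as follows (sketch):

    import numpy as np

    ixdtype, fpdtype, nv = np.int64, np.float64, 2

    # nrow, ncolb and ldim; then nv matrix pointers; then nv scalars
    argt = [ixdtype]*3 + [np.uintp]*nv + [fpdtype]*nv
    # i.e. [int64, int64, int64, uintp, uintp, float64, float64]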
14 changes: 9 additions & 5 deletions pyfr/backends/cuda/cublas.py
@@ -81,14 +81,21 @@ def mul(self, a, b, out, alpha=1.0, beta=0.0):
         if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
             raise ValueError('Incompatible matrices for out = a*b')
 
-        # CUBLAS expects inputs to be column-major (or Fortran order in
+        # cuBLAS expects inputs to be column-major (or Fortran order in
         # numpy parlance). However as C = A*B => C^T = (A*B)^T
         # = (B^T)*(A^T) with a little trickery we can multiply our
         # row-major matrices directly.
         m, n, k = b.ncol, a.nrow, a.ncol
         A, B, C = b, a, out
 
-        # α and β factors for C = α*(A*op(B)) + β*C
+        # Cache key
+        ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim)
+
+        # Size checks
+        if any(sz > 2**31 - 1 for sz in ckey[3:]):
+            raise ValueError('Matrices too large for cuBLAS')
+
+        # α and β factors for C = α*(A*B) + β*C
         if a.dtype == np.float64:
             cublasgemm = w.cublasDgemm
             alpha_ct, beta_ct = c_double(alpha), c_double(beta)
@@ -102,9 +109,6 @@ def gemm(stream):
             cublasgemm(h, w.OP_N, w.OP_N, m, n, k, alpha_ct, A, A.leaddim,
                        B, B.leaddim, beta_ct, C, C.leaddim)
 
-        # Cache key
-        ckey = (A.dtype, alpha, beta, m, n, k, A.leaddim, B.leaddim, C.leaddim)
-
         # Obtain the performance of the kernel
         try:
             dt = self._mul_timing[ckey]
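Note: the new size check exists because the classic cublas<t>gemm entry points take their dimensions and leading dimensions as 32-bit ints, so even under the large memory model each of m, n, k and the leading dimensions must fit; hoisting the cache key above the check also lets ckey[3:] double as the tuple of sizes to validate. A standalone sketch of the same guard (hypothetical helper name):

    def check_gemm_dims(m, n, k, lda, ldb, ldc):
        # cublasDgemm/cublasSgemm take int parameters, so each
        # dimension must fit in a signed 32-bit integer
        if any(sz > 2**31 - 1 for sz in (m, n, k, lda, ldb, ldc)):
            raise ValueError('Matrices too large for cuBLAS')

    check_gemm_dims(4096, 4096, 4096, 4096, 4096, 4096)  # passes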
16 changes: 9 additions & 7 deletions pyfr/backends/cuda/generator.py
@@ -5,33 +5,35 @@
 
 class CUDAKernelGenerator(BaseGPUKernelGenerator):
     _lid = ('threadIdx.x', 'threadIdx.y')
-    _gid = 'blockIdx.x*blockDim.x + threadIdx.x'
+    _gid = 'ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x'
     _shared_prfx = '__shared__'
     _shared_sync = '__syncthreads()'
 
     def _render_spec(self):
+        res = '__restrict__'
+
         # We first need the argument list; starting with the dimensions
-        kargs = [f'int {d}' for d in self._dims]
+        kargs = [f'ixdtype_t {d}' for d in self._dims]
 
         # Now add any scalar arguments
         kargs.extend(f'{sa.dtype} {sa.name}' for sa in self.scalargs)
 
         # Finally, add the vector arguments
         for va in self.vectargs:
             if va.intent == 'in':
-                kargs.append(f'const {va.dtype}* __restrict__ {va.name}_v')
+                kargs.append(f'const {va.dtype}* {res} {va.name}_v')
             else:
-                kargs.append(f'{va.dtype}* __restrict__ {va.name}_v')
+                kargs.append(f'{va.dtype}* {res} {va.name}_v')
 
             # Views
             if va.isview:
-                kargs.append(f'const int* __restrict__ {va.name}_vix')
+                kargs.append(f'const ixdtype_t* {res} {va.name}_vix')
 
                 if va.ncdim == 2:
-                    kargs.append(f'const int* __restrict__ {va.name}_vrstri')
+                    kargs.append(f'const ixdtype_t* {res} {va.name}_vrstri')
             # Arrays
             elif self.needs_ldim(va):
-                kargs.append(f'int ld{va.name}')
+                kargs.append(f'ixdtype_t ld{va.name}')
 
         # Determine the launch bounds for the kernel
         nthrds = prod(self.block1d if self.ndim == 1 else self.block2d)
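Note: the cast in _gid matters because blockIdx.x and blockDim.x are both 32-bit, so their product is evaluated modulo 2**32 before any widening; promoting one operand first preserves the full index. The same effect reproduced in numpy terms:

    import numpy as np

    block_idx = np.array([2**26], dtype=np.uint32)
    block_dim = np.uint32(128)

    # 32-bit product wraps: 2**33 mod 2**32 == 0
    print(block_idx * block_dim)                   # [0]
    # Widening first, as ixdtype_t(blockIdx.x) does, is safe
    print(block_idx.astype(np.int64) * block_dim)  # [8589934592]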
7 changes: 4 additions & 3 deletions pyfr/backends/cuda/kernels/axnpby.mako
@@ -2,14 +2,15 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 __global__ void
-axnpby(int nrow, int ncolb, int ldim, fpdtype_t* __restrict__ x0,
+axnpby(ixdtype_t nrow, ixdtype_t ncolb, ixdtype_t ldim,
+       fpdtype_t* __restrict__ x0,
        ${', '.join(f'const fpdtype_t* __restrict__ x{i}'
                    for i in range(1, nv))},
        ${', '.join(f'fpdtype_t a{i}' for i in range(nv))})
 {
     int i = blockIdx.y*blockDim.y + threadIdx.y;
-    int j = blockIdx.x*blockDim.x + threadIdx.x;
-    int idx;
+    ixdtype_t j = ixdtype_t(blockIdx.x)*blockDim.x + threadIdx.x;
+    ixdtype_t idx;
 
     if (j < ncolb && a0 == 0.0)
     {
5 changes: 4 additions & 1 deletion pyfr/backends/cuda/kernels/base.mako
@@ -5,8 +5,11 @@
 #define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)
 
 // Typedefs
-typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
 typedef unsigned int uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
+typedef ${pyfr.npdtype_to_ctype(ixdtype)} ixdtype_t;
 
 // Atomic helpers
 __device__ void atomic_min_fpdtype(fpdtype_t* addr, fpdtype_t val)
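Note: under the two memory models ixdtype_t therefore resolves to different C types. A quick Python sketch, assuming npdtype_to_ctype maps int32 to int and int64 to long long (this mapping is an assumption, not taken from the diff):

    import numpy as np

    # Assumed mirror of pyfr.backends.base.makoutil.npdtype_to_ctype
    ctype = {np.int32: 'int', np.int64: 'long long'}

    for model, ixdtype in [('normal', np.int32), ('large', np.int64)]:
        width = np.dtype(ixdtype).itemsize * 8
        print(f'{model}: typedef {ctype[ixdtype]} ixdtype_t;  // {width}-bit')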