diff --git a/pySDC/helpers/fieldsIO.py b/pySDC/helpers/fieldsIO.py
index a57fa77f2e..77ae205288 100644
--- a/pySDC/helpers/fieldsIO.py
+++ b/pySDC/helpers/fieldsIO.py
@@ -154,7 +154,8 @@ def fromFile(cls, fileName):
         fieldsIO : :class:`FieldsIO`
             The specialized `FieldsIO` adapted to the file.
         """
-        assert os.path.isfile(fileName), f"not a file ({fileName})"
+        if not os.path.isfile(fileName):
+            raise FileNotFoundError(f"not a file ({fileName})")
         with open(fileName, "rb") as f:
             STRUCT, DTYPE = np.fromfile(f, dtype=H_DTYPE, count=2)
             fieldsIO: FieldsIO = cls.STRUCTS[STRUCT](DTYPES[DTYPE], fileName)
diff --git a/pySDC/helpers/plot_helper.py b/pySDC/helpers/plot_helper.py
index 608f367c49..e76ca5beaa 100644
--- a/pySDC/helpers/plot_helper.py
+++ b/pySDC/helpers/plot_helper.py
@@ -45,6 +45,7 @@ def figsize_by_journal(journal, scale, ratio):  # pragma: no cover
         'Springer_proceedings': 347.12354,
         'JSC_thesis': 434.26027,
         'TUHH_thesis': 426.79135,
+        'Nature_CS': 372.0,
     }
     # store text height in points here, get this from LaTeX using \the\textheight
     textheights = {
@@ -52,6 +53,7 @@ def figsize_by_journal(journal, scale, ratio):  # pragma: no cover
         'JSC_thesis': 635.5,
         'TUHH_thesis': 631.65118,
         'Springer_proceedings': 549.13828,
+        'Nature_CS': 552.69478,
     }
     assert (
         journal in textwidths.keys()
diff --git a/pySDC/helpers/spectral_helper.py b/pySDC/helpers/spectral_helper.py
index 469023c296..235fc3780b 100644
--- a/pySDC/helpers/spectral_helper.py
+++ b/pySDC/helpers/spectral_helper.py
@@ -97,6 +97,7 @@ def __init__(self, N, x0=None, x1=None, useGPU=False, useFFTW=False):
 
         if useGPU:
             self.setup_GPU()
+            self.logger.debug('Set up for GPU')
         else:
             self.setup_CPU(useFFTW=useFFTW)
 
@@ -1026,7 +1027,6 @@ def __init__(self, comm=None, useGPU=False, debug=False):
         self.BCs = None
 
         self.fft_cache = {}
-        self.fft_dealias_shape_cache = {}
 
         self.logger = logging.getLogger(name='Spectral Discretization')
         if debug:
@@ -1313,11 +1313,12 @@ def setup_BCs(self):
 
         diags = self.xp.ones(self.BCs.shape[0])
         diags[self.BC_zero_index] = 0
-        self.BC_line_zero_matrix = sp.diags(diags)
+        self.BC_line_zero_matrix = sp.diags(diags).tocsc()
 
         # prepare BCs in spectral space to easily add to the RHS
         rhs_BCs = self.put_BCs_in_rhs(self.u_init)
-        self.rhs_BCs_hat = self.transform(rhs_BCs)
+        self.rhs_BCs_hat = self.transform(rhs_BCs).view(self.xp.ndarray)
+        del self.BC_rhs_mask
 
     def check_BCs(self, u):
         """
@@ -1370,7 +1371,7 @@ def put_BCs_in_rhs_hat(self, rhs_hat):
             Generate a mask where we need to set values in the rhs in spectral space to zero, such that can replace them
             by the boundary conditions. The mask is then cached.
             """
-            self._rhs_hat_zero_mask = self.newDistArray().astype(bool)
+            self._rhs_hat_zero_mask = self.newDistArray(forward_output=True).astype(bool).view(self.xp.ndarray)
 
             for axis in range(self.ndim):
                 for bc in self.full_BCs:
@@ -1518,7 +1519,7 @@ def get_indices(self, forward_output=True):
     def get_pfft(self, axes=None, padding=None, grid=None):
         if self.ndim == 1 or self.comm is None:
             return None
-        from mpi4py_fft import PFFT
+        from mpi4py_fft import PFFT, newDistArray
 
         axes = tuple(i for i in range(self.ndim)) if axes is None else axes
         padding = list(padding if padding else [1.0 for _ in range(self.ndim)])
@@ -1550,6 +1551,10 @@ def no_transform(u, *args, **kwargs):
             transforms=transforms,
             grid=grid,
         )
+
+        # do a transform to do the planning
+        _u = newDistArray(pfft, forward_output=False)
+        pfft.backward(pfft.forward(_u))
         return pfft
 
     def get_fft(self, axes=None, direction='object', padding=None, shape=None):
@@ -1905,6 +1910,7 @@ def get_differentiation_matrix(self, axes, **kwargs):
             _D = self.axes[axis].get_differentiation_matrix(**kwargs)
             D = D @ self.expand_matrix_ND(_D, axis)
 
+        self.logger.debug(f'Set up differentiation matrix along axes {axes} with kwargs {kwargs}')
         return D
 
     def get_integration_matrix(self, axes):
@@ -1968,4 +1974,5 @@ def get_basis_change_matrix(self, axes=None, **kwargs):
             _C = self.axes[axis].get_basis_change_matrix(**kwargs)
             C = C @ self.expand_matrix_ND(_C, axis)
 
+        self.logger.debug(f'Set up basis change matrix along axes {axes} with kwargs {kwargs}')
         return C
diff --git a/pySDC/implementations/problem_classes/RayleighBenard3D.py b/pySDC/implementations/problem_classes/RayleighBenard3D.py
index cf67e03f79..5b1da41d7b 100644
--- a/pySDC/implementations/problem_classes/RayleighBenard3D.py
+++ b/pySDC/implementations/problem_classes/RayleighBenard3D.py
@@ -141,6 +141,10 @@ def __init__(
         U01 = self.get_basis_change_matrix(p_in=0, p_out=1)
         U12 = self.get_basis_change_matrix(p_in=1, p_out=2)
         U02 = self.get_basis_change_matrix(p_in=0, p_out=2)
+        self.eliminate_zeros(S1)
+        self.eliminate_zeros(S2)
+        self.eliminate_zeros(Dz)
+        self.eliminate_zeros(Dzz)
 
         self.Dx = Dx
         self.Dxx = Dxx
@@ -158,13 +162,12 @@ def __init__(
 
         # construct operators
         _D = U02 @ (Dxx + Dyy) + Dzz
-        L_lhs = {
-            'p': {'u': U01 @ Dx, 'v': U01 @ Dy, 'w': Dz},  # divergence free constraint
-            'u': {'p': U02 @ Dx, 'u': -self.nu * _D},
-            'v': {'p': U02 @ Dy, 'v': -self.nu * _D},
-            'w': {'p': U12 @ Dz, 'w': -self.nu * _D, 'T': -U02 @ Id},
-            'T': {'T': -self.kappa * _D},
-        }
+        L_lhs = {}
+        L_lhs['p'] = {'u': U01 @ Dx, 'v': U01 @ Dy, 'w': Dz}  # divergence free constraint
+        L_lhs['u'] = {'p': U02 @ Dx, 'u': -self.nu * _D}
+        L_lhs['v'] = {'p': U02 @ Dy, 'v': -self.nu * _D}
+        L_lhs['w'] = {'p': U12 @ Dz, 'w': -self.nu * _D, 'T': -U02 @ Id}
+        L_lhs['T'] = {'T': -self.kappa * _D}
         self.setup_L(L_lhs)
 
         # mass matrix
@@ -308,9 +311,10 @@ def compute_Nusselt_numbers(self, u):
             u: The solution you want to compute the Nusselt numbers of
 
         Returns:
-            dict: Nusselt number averaged over the entire volume and horizontally averaged at the top and bottom.
+            dict: Nusselt number averaged over the entire volume and horizontally averaged at the top and bottom as well
+                  as computed from thermal and kinetic dissipation.
         """
-        iw, iT = self.index(['w', 'T'])
+        iu, iv, iw, iT = self.index(['u', 'v', 'w', 'T'])
         zAxis = self.spectral.axes[-1]
 
         if self.spectral_space:
@@ -318,6 +322,19 @@ def compute_Nusselt_numbers(self, u):
         else:
             u_hat = self.transform(u)
 
+        # evaluate derivatives in x, y, and z for u, v, w, and T
+        derivatives = []
+        derivative_indices = [iu, iv, iw, iT]
+        u_hat_flat = [u_hat[i].flatten() for i in derivative_indices]
+        _D_u_hat = self.u_init_forward
+        for D in [self.Dx, self.Dy, self.Dz]:
+            _D_u_hat *= 0
+            for i in derivative_indices:
+                self.xp.copyto(_D_u_hat[i], (D @ u_hat_flat[i]).reshape(_D_u_hat[i].shape))
+            derivatives.append(
+                self.itransform(_D_u_hat).real
+            )  # derivatives[0] contains x derivatives, [2] is y and [3] is z
+
         DzT_hat = (self.Dz @ u_hat[iT].flatten()).reshape(u_hat[iT].shape)
 
         # compute wT with dealiasing
@@ -327,31 +344,56 @@ def compute_Nusselt_numbers(self, u):
         _me[0] = u_pad[iw] * u_pad[iT]
         wT_hat = self.transform(_me, padding=padding)[0]
 
+        # compute Nusselt number
         nusselt_hat = (wT_hat / self.kappa - DzT_hat) * self.axes[-1].L
 
-        if not hasattr(self, '_zInt'):
-            self._zInt = zAxis.get_integration_matrix()
+        # compute thermal dissipation
+        thermal_dissipation = self.u_init_physical
+        thermal_dissipation[0, ...] = (
+            self.kappa * (derivatives[0][iT].real + derivatives[1][iT].real + derivatives[2][iT].real) ** 2
+        )
+        thermal_dissipation_hat = self.transform(thermal_dissipation)[0]
+
+        # compute kinetic energy dissipation
+        kinetic_energy_dissipation = self.u_init_physical
+        for i in [iu, iv, iw]:
+            kinetic_energy_dissipation[0, ...] += (
+                self.nu * (derivatives[0][i].real + derivatives[1][i].real + derivatives[2][i].real) ** 2
+            )
+        kinetic_energy_dissipation_hat = self.transform(kinetic_energy_dissipation)[0]
 
-        # get coefficients for evaluation on the boundary
+        # get coefficients for evaluation on the boundary and vertical integration
+        zInt = zAxis.get_integration_weights()
         top = zAxis.get_BC(kind='Dirichlet', x=1)
         bot = zAxis.get_BC(kind='Dirichlet', x=-1)
 
+        # compute volume averages
         integral_V = 0
+        integral_V_th = 0
+        integral_V_kin = 0
         if self.comm.rank == 0:
 
-            integral_z = (self._zInt @ nusselt_hat[0, 0]).real
-            integral_z[0] = zAxis.get_integration_constant(integral_z, axis=-1)
-            integral_V = ((top - bot) * integral_z).sum() * self.axes[0].L * self.axes[1].L / self.nx / self.ny
+            integral_V = (zInt @ nusselt_hat[0, 0]).real * self.axes[0].L * self.axes[1].L / self.nx / self.ny
+            integral_V_th = (
+                (zInt @ thermal_dissipation_hat[0, 0]).real * self.axes[0].L * self.axes[1].L / self.nx / self.ny
+            )
+            integral_V_kin = (
+                (zInt @ kinetic_energy_dissipation_hat[0, 0]).real * self.axes[0].L * self.axes[1].L / self.nx / self.ny
+            )
 
+        # communicate result across all tasks
         Nusselt_V = self.comm.bcast(integral_V / self.spectral.V, root=0)
-
         Nusselt_t = self.comm.bcast(self.xp.sum(nusselt_hat.real[0, 0, :] * top, axis=-1) / self.nx / self.ny, root=0)
         Nusselt_b = self.comm.bcast(self.xp.sum(nusselt_hat.real[0, 0] * bot / self.nx / self.ny, axis=-1), root=0)
+        Nusselt_thermal = self.comm.bcast(1 / self.kappa * integral_V_th / self.spectral.V, root=0)
+        Nusselt_kinetic = self.comm.bcast(1 + 1 / self.kappa * integral_V_kin / self.spectral.V, root=0)
 
         return {
             'V': Nusselt_V,
             't': Nusselt_t,
             'b': Nusselt_b,
+            'thermal': Nusselt_thermal,
+            'kinetic': Nusselt_kinetic,
         }
 
     def get_frequency_spectrum(self, u):
diff --git a/pySDC/implementations/problem_classes/generic_spectral.py b/pySDC/implementations/problem_classes/generic_spectral.py
index a8cff43611..e7aba58da2 100644
--- a/pySDC/implementations/problem_classes/generic_spectral.py
+++ b/pySDC/implementations/problem_classes/generic_spectral.py
@@ -132,21 +132,29 @@ def __init__(
         if self.heterogeneous:
             self.__heterogeneous_setup = False
 
+        self.logger.debug('Finished GenericSpectralLinear __init__')
+
     def heterogeneous_setup(self):
         if self.heterogeneous and not self.__heterogeneous_setup:
+
             CPU_only = ['BC_line_zero_matrix', 'BCs']
             both = ['Pl', 'Pr', 'L', 'M']
 
+            self.logger.debug(f'Starting heterogeneous setup. Moving {CPU_only} and copying {both} to CPU')
+
             if self.useGPU:
                 for key in CPU_only:
+                    self.logger.debug(f'Moving {key} to CPU')
                     setattr(self.spectral, key, getattr(self.spectral, key).get())
 
                 for key in both:
+                    self.logger.debug(f'Copying {key} to CPU')
                     setattr(self, f'{key}_CPU', getattr(self, key).get())
             else:
                 for key in both:
                     setattr(self, f'{key}_CPU', getattr(self, key))
 
+            self.logger.debug('Done with heterogeneous setup')
         self.__heterogeneous_setup = True
 
     def __getattr__(self, name):
@@ -195,6 +203,7 @@ def setup_L(self, LHS):
             LHS (dict): Dictionary containing the equations.
         """
         self.L = self._setup_operator(LHS)
+        self.logger.debug('Set up L matrix')
 
     def setup_M(self, LHS):
         '''
@@ -203,6 +212,7 @@ def setup_M(self, LHS):
         diff_index = list(LHS.keys())
         self.diff_mask = [me in diff_index for me in self.components]
         self.M = self._setup_operator(LHS)
+        self.logger.debug('Set up M matrix')
 
     def setup_preconditioner(self, Dirichlet_recombination=True, left_preconditioner=True):
         """
@@ -213,8 +223,14 @@ def setup_preconditioner(self, Dirichlet_recombination=True, left_preconditioner
             left_preconditioner (bool): If True, it will interleave the variables and reverse the Kronecker product
         """
         N = np.prod(self.init[0][1:])
+        if self.useGPU:
+            from cupy_backends.cuda.libs.cusparse import CuSparseError as MemError
+        else:
+            MemError = MemoryError
 
         if left_preconditioner:
+            self.logger.debug(f'Setting up left preconditioner with {N} local degrees of freedom')
+
             # reverse Kronecker product
             if self.spectral.useGPU:
                 import scipy.sparse as sp
@@ -228,18 +244,27 @@ def setup_preconditioner(self, Dirichlet_recombination=True, left_preconditioner
                     R[i * self.ncomponents + j, j * N + i] = 1
 
             self.Pl = self.spectral.sparse_lib.csc_matrix(R, dtype=complex)
+
+            self.logger.debug('Finished setup of left preconditioner')
         else:
             Id = self.spectral.sparse_lib.eye(N)
             Pl_lhs = {comp: {comp: Id} for comp in self.components}
             self.Pl = self._setup_operator(Pl_lhs)
 
         if Dirichlet_recombination and type(self.axes[-1]).__name__ in ['ChebychevHelper', 'UltrasphericalHelper']:
+            self.logger.debug('Using Dirichlet recombination as right preconditioner')
             _Pr = self.spectral.get_Dirichlet_recombination_matrix(axis=-1)
         else:
             _Pr = self.spectral.sparse_lib.eye(N)
 
         Pr_lhs = {comp: {comp: _Pr} for comp in self.components}
-        self.Pr = self._setup_operator(Pr_lhs) @ self.Pl.T
+        operator = self._setup_operator(Pr_lhs)
+
+        try:
+            self.Pr = operator @ self.Pl.T
+        except MemError:
+            self.logger.debug('Setting up right preconditioner on CPU due to memory error')
+            self.Pr = self.spectral.sparse_lib.csc_matrix(operator.get() @ self.Pl.T.get())
 
     def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs):
         """
@@ -330,6 +355,7 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs)
                     import scipy.sparse as sp
 
                     cpu_decomp = sp.linalg.splu(A)
+
                     if self.useGPU:
                         from cupyx.scipy.sparse.linalg import SuperLU
 
diff --git a/pySDC/tests/test_problems/test_RayleighBenard3D.py b/pySDC/tests/test_problems/test_RayleighBenard3D.py
index 89811f6995..47d52c2cfe 100644
--- a/pySDC/tests/test_problems/test_RayleighBenard3D.py
+++ b/pySDC/tests/test_problems/test_RayleighBenard3D.py
@@ -274,10 +274,10 @@ def test_banded_matrix(preconditioning):
 
 
 @pytest.mark.cupy
-def test_heterogeneous_implementation():
+def test_heterogeneous_implementation(N=8, useGPU=True):
     from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
 
-    params = {'nx': 2, 'ny': 2, 'nz': 2, 'useGPU': True}
+    params = {'nx': N, 'ny': N, 'nz': N, 'useGPU': useGPU}
     gpu = RayleighBenard3D(**params)
     het = RayleighBenard3D(**params, heterogeneous=True)
 
@@ -293,52 +293,63 @@ def test_heterogeneous_implementation():
 
 
 @pytest.mark.mpi4py
-@pytest.mark.parametrize('w', [0, 1, 3.14])
-def test_Nusselt_number_computation(w, N=6):
+@pytest.mark.parametrize('c', [0, 1, 3.14])
+def test_Nusselt_number_computation(c, N=6):
     from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
 
     prob = RayleighBenard3D(nx=N, ny=N, nz=N, dealiasing=1.0, spectral_space=False, Rayleigh=1.0, Prandtl=1.0)
     xp = prob.xp
-    iw, iT = prob.index(['w', 'T'])
+    iu, iw, iT = prob.index(['u', 'w', 'T'])
 
-    # constant temperature and perturbed velocity
+    # temperature gradient and perturbed velocity
     u = prob.u_init
     u[iT, ...] = 3 * prob.Z**2 + 1
-    u[iw] = w * (1 + xp.sin(prob.Y / prob.axes[1].L * 2 * xp.pi))
+    u[iw] = c * (1 + xp.sin(prob.Y / prob.axes[1].L * 2 * xp.pi))
     Nu = prob.compute_Nusselt_numbers(u)
 
-    for key, expect in zip(['t', 'b', 'V'], [prob.Lz * (3 + 1) * w - 6, w, w * (1 + 1) - 3]):
+    for key, expect in zip(['t', 'b', 'V', 'thermal'], [prob.Lz * (3 + 1) * c - 6, c, c * (1 + 1) - 3, 12]):
         assert xp.isclose(Nu[key], expect), f'Expected Nu_{key}={expect}, but got {Nu[key]}'
 
     # zero
     u = prob.u_init
     Nu = prob.compute_Nusselt_numbers(u)
-    assert xp.allclose(list(Nu.values()), 0)
+    for key in ['t', 'b', 'V', 'thermal']:
+        assert xp.isclose(Nu[key], 0), f'Unexpected non-zero Nusselt number in {key} in constant zero profile'
+    assert xp.isclose(Nu['kinetic'], 1), 'Unexpected non-one kinetic Nusselt number in constant zero profile'
 
     # constant gradient
     u = prob.u_init
     u[iT] = prob.Z**2 + 1
+    u[iu] = c * xp.sqrt(5) / 3 * prob.Z**3 + c
     Nu = prob.compute_Nusselt_numbers(u)
 
-    for key, expect in zip(['t', 'b', 'V'], [-prob.Lz * 2, 0, -1]):
+    for key, expect in zip(['t', 'b', 'V', 'thermal', 'kinetic'], [-prob.Lz * 2, 0, -1, 4 / 3, 1 + c**2]):
         assert xp.isclose(Nu[key], expect), f'Expected Nu_{key}={expect}, but got {Nu[key]} with T=z**2!'
 
     # gradient plus fluctuations
     u = prob.u_init
-    u[iT] = prob.Z * 1 + (xp.sin(prob.X / prob.axes[0].L * 2 * xp.pi) * xp.sin(prob.Y / prob.axes[1].L * 2 * xp.pi))
+    u[iT] = prob.Z * (1 + xp.sin(prob.X / prob.axes[0].L * 2 * xp.pi) * xp.sin(prob.Y / prob.axes[1].L * 2 * xp.pi))
     Nu = prob.compute_Nusselt_numbers(u)
 
-    for key in ['t', 'b', 'V']:
+    for key in [
+        't',
+        'b',
+        'V',
+    ]:
         assert xp.isclose(Nu[key], -1), f'Expected Nu_{key}=-1, but got {Nu[key]} with T=z*(1+sin(x)+sin(y))!'
+    assert xp.isclose(Nu['kinetic'], 1), f'Expected Nu_kinetic=1, but got {Nu["kinetic"]} with T=z*(1+sin(x)+sin(y))!'
 
     # constant temperature and perturbed velocity
     u = prob.u_init
     u[iT, ...] = 1
-    u[iw] = w * (1 + xp.sin(prob.Y / prob.axes[1].L * 2 * xp.pi))
+    u[iw] = c * (1 + xp.sin(prob.Y / prob.axes[1].L * 2 * xp.pi))
     Nu = prob.compute_Nusselt_numbers(u)
 
     for key in ['t', 'b', 'V']:
-        assert xp.isclose(Nu[key], w), f'Expected Nu_{key}={w}, but got {Nu[key]} with constant T and perturbed w!'
+        assert xp.isclose(Nu[key], c), f'Expected Nu_{key}={c}, but got {Nu[key]} with constant T and perturbed w!'
+    assert xp.isclose(
+        Nu['thermal'], 0
+    ), f'Expected Nu_thermal=0, but got {Nu["thermal"]} with constant T and perturbed w!'
 
 
 @pytest.mark.mpi4py
@@ -388,11 +399,12 @@ def test_spectrum_computation(mpi_ranks):
 
 
 if __name__ == '__main__':
-    test_eval_f(2**2, 2**1, 'x', False)
+    # test_eval_f(2**2, 2**1, 'x', False)
     # test_libraries()
     # test_Poisson_problems(4, 'u')
     # test_Poisson_problem_w()
     # test_solver_convergence('bicgstab+ilu', 32, False, True)
     # test_banded_matrix(False)
     # test_heterogeneous_implementation()
-    # test_Nusselt_number_computation(N=4, w=3.14)
+    test_Nusselt_number_computation(N=6, c=3)
+    # test_spectrum_computation(None)