diff --git a/doc/releases/changelog-dev.md b/doc/releases/changelog-dev.md index aa4aeeaa7b8..073c577e3ee 100644 --- a/doc/releases/changelog-dev.md +++ b/doc/releases/changelog-dev.md @@ -111,6 +111,7 @@ `qml.beta.QNode`, and `@qml.beta.qnode`. [(#1642)](https://github.com/PennyLaneAI/pennylane/pull/1642) [(#1646)](https://github.com/PennyLaneAI/pennylane/pull/1646) + [(#1651)](https://github.com/PennyLaneAI/pennylane/pull/1651) It differs from the standard QNode in several ways: @@ -136,6 +137,14 @@ significant performance improvement when executing the QNode on remote quantum hardware. + - When decomposing the circuit, the default decomposition strategy will prioritize + decompositions that result in the smallest number of parametrized operations + required to satisfy the differentiation method. Additional decompositions required + to satisfy the native gate set of the quantum device will be performed later, by the + device at execution time. While this may lead to a slight increase in classical processing, + it significantly reduces the number of circuit evaluations needed to compute + gradients of complex unitaries (a short sketch appears below). + In an upcoming release, this QNode will replace the existing one. If you come across any bugs while using this QNode, please let us know via a [bug report](https://github.com/PennyLaneAI/pennylane/issues/new?assignees=&labels=bug+%3Abug%3A&template=bug_report.yml&title=%5BBUG%5D) @@ -143,7 +152,6 @@ Currently, this beta QNode does not support the following features: - - Circuit decompositions - Non-mutability via the `mutable` keyword argument - Viewing specifications with `qml.specs` - The `reversible` QNode differentiation method @@ -151,6 +159,30 @@ It is also not tested with the `qml.qnn` module. +* Two new methods were added to the Device API, allowing PennyLane devices + increased control over circuit decompositions. + [(#1651)](https://github.com/PennyLaneAI/pennylane/pull/1651) + + - `Device.expand_fn(tape) -> tape`: expands a tape such that it is supported by the device. By + default, performs the standard device-specific gate set decomposition done in the default + QNode. Devices may overwrite this method in order to define their own decomposition logic. + + Note that the numerical result after applying this method should remain unchanged; PennyLane + will assume that the expanded tape returns exactly the same value as the original tape when + executed. + + - `Device.batch_transform(tape) -> (tapes, processing_fn)`: preprocesses the tape in the case + where the device needs to generate multiple executable circuits from the input circuit. The + requirement of a post-processing function makes this distinct from the `expand_fn` method above. + + By default, this method applies the transform + + .. math:: \left\langle \sum_i c_i h_i\right\rangle \rightarrow \sum_i c_i \left\langle h_i \right\rangle + + if `expval(H)` is present and the device either does not natively support Hamiltonian + observables or is executed with a finite number of shots; see the sketch below. + +
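+  As a minimal illustrative sketch (the toy circuit and two-term Hamiltonian here
+  are arbitrary choices, not part of this PR), `batch_transform` on a finite-shot
+  device splits a Hamiltonian expectation into one executable tape per term, and
+  returns the function that recombines the term results:
+
+  ```pycon
+  >>> dev = qml.device("default.qubit", wires=2, shots=1000)
+  >>> H = qml.Hamiltonian([0.5, 0.2], [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1)])
+  >>> with qml.tape.QuantumTape() as tape:
+  ...     qml.RX(0.4, wires=0)
+  ...     qml.expval(H)
+  >>> tapes, processing_fn = dev.batch_transform(tape)
+  >>> len(tapes)
+  2
+  >>> res = processing_fn(dev.batch_execute(tapes))
+  ```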
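+  Likewise, a short sketch of the beta QNode's new decomposition strategy mentioned
+  above, mirroring the behaviour exercised by the new tests (the circuit is
+  arbitrary; `expansion_strategy` defaults to `"gradient"`):
+
+  ```pycon
+  >>> dev = qml.device("default.qubit", wires=2)
+  >>> @qml.beta.qnode(dev, diff_method="parameter-shift", expansion_strategy="device")
+  ... def circuit(x):
+  ...     qml.SingleExcitation(x, wires=[0, 1])
+  ...     return qml.expval(qml.PauliX(0))
+  >>> circuit.expansion_strategy
+  'device'
+  ```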

Improvements

* The tests for qubit operations are split into multiple files. diff --git a/pennylane/_device.py b/pennylane/_device.py index 3824d2a5168..e6c3a3af4e2 100644 --- a/pennylane/_device.py +++ b/pennylane/_device.py @@ -561,6 +561,95 @@ def gradients(self, circuits, method="jacobian", **kwargs): gradient_method = getattr(self, method) return [gradient_method(circuit, **kwargs) for circuit in circuits] + def expand_fn(self, circuit, max_expansion=10): + """Method for expanding or decomposing an input circuit. + This method should be overwritten if custom expansion logic is + required. + + By default, this method expands the tape if: + + - nested tapes are present, + - any operations are not supported on the device, or + - multiple observables are measured on the same wire. + + Args: + circuit (.QuantumTape): the circuit to expand. + max_expansion (int): The number of times the circuit should be + expanded. Expansion occurs when an operation or measurement is not + supported, and results in a gate decomposition. If any operations + in the decomposition remain unsupported by the device, another + expansion occurs. + + Returns: + .QuantumTape: The expanded/decomposed circuit, such that the device + will natively support all operations. + """ + obs_on_same_wire = len( + circuit._obs_sharing_wires # pylint: disable=protected-access + ) > 0 and not self.supports_observable("Hamiltonian") + + ops_not_supported = any( + isinstance(op, qml.tape.QuantumTape) # nested tapes must be expanded + or not self.supports_operation(op.name) # unsupported ops must be expanded + for op in circuit.operations + ) + + if ops_not_supported or obs_on_same_wire: + circuit = circuit.expand( + depth=max_expansion, + stop_at=lambda obj: not isinstance(obj, qml.tape.QuantumTape) + and self.supports_operation(obj.name), + ) + + return circuit + + def batch_transform(self, circuit): + """Apply a differentiable batch transform for preprocessing a circuit + prior to execution. This method is called directly by the QNode, and + should be overwritten if the device requires a transform that + generates multiple circuits prior to execution. + + By default, this method contains logic for generating multiple + circuits, one per term, of a circuit that terminates in ``expval(H)``, + if the underlying device does not support Hamiltonian expectation values, + or if the device requires finite shots. + + .. warning:: + + This method will be tracked by autodifferentiation libraries, + such as Autograd, JAX, TensorFlow, and Torch. Please make sure + to use ``qml.math`` for autodiff-agnostic tensor processing + if required. + + Args: + circuit (.QuantumTape): the circuit to preprocess + + Returns: + tuple[Sequence[.QuantumTape], callable]: Returns a tuple containing + the sequence of circuits to be executed, and a post-processing function + to be applied to the list of evaluated circuit results. + """ + + # If the observable contains a Hamiltonian and the device does not + # support Hamiltonians, or if the simulation uses finite shots, + # split tape into multiple tapes of diagonalizable known observables. 
+ supports_hamiltonian = self.supports_observable("Hamiltonian") + finite_shots = self.shots is not None + + hamiltonian_in_obs = "Hamiltonian" in [obs.name for obs in circuit.observables] + + if hamiltonian_in_obs and (not supports_hamiltonian or finite_shots): + try: + return qml.transforms.hamiltonian_expand(circuit, group=False) + + except ValueError as e: + raise ValueError( + "Can only return the expectation of a single Hamiltonian observable" + ) from e + + # otherwise, return an identity transform + return [circuit], lambda res: res[0] + @property def op_queue(self): """The operation queue to be applied. diff --git a/pennylane/beta/qnode.py b/pennylane/beta/qnode.py index 43cec0195a7..c85cab28327 100644 --- a/pennylane/beta/qnode.py +++ b/pennylane/beta/qnode.py @@ -51,9 +51,10 @@ class QNode: Currently, this beta QNode does not support the following features: - - Circuit decompositions - Non-mutability via the ``mutable`` keyword argument - Viewing specifications with ``qml.specs`` + - The ``reversible`` QNode differentiation method + - The ability to specify a ``dtype`` when using PyTorch and TensorFlow. It is also not tested with the :mod:`~.qnn` module. @@ -123,6 +124,19 @@ class QNode: * ``None``: QNode cannot be differentiated. Works the same as ``interface=None``. + expansion_strategy (str): The strategy to use when circuit expansions or decompositions + are required. + + - ``gradient``: The QNode will attempt to decompose + the internal circuit such that all circuit operations are supported by the gradient + method. Further decompositions required for device execution are performed by the + device prior to circuit execution. + + - ``device``: The QNode will attempt to decompose the internal circuit + such that all circuit operations are natively supported by the device. + + The ``gradient`` strategy typically results in a reduction in quantum device evaluations + required during optimization, at the expense of an increase in classical preprocessing. max_expansion (int): The number of times the internal circuit should be expanded when executed on a device. Expansion occurs when an operation or measurement is not supported, and results in a gate decomposition. If any operations in the decomposition @@ -174,6 +188,7 @@ def __init__( device, interface="autograd", diff_method="best", + expansion_strategy="gradient", max_expansion=10, mode="best", cache=True, @@ -208,6 +223,7 @@ def __init__( self.device = device self._interface = interface self.diff_method = diff_method + self.expansion_strategy = expansion_strategy self.max_expansion = max_expansion # execution keyword arguments @@ -216,8 +232,12 @@ def __init__( "cache": cache, "cachesize": cachesize, "max_diff": max_diff, + "max_expansion": max_expansion, } + if self.expansion_strategy == "device": + self.execute_kwargs["expand_fn"] = None + # internal data attributes self._tape = None self._qfunc_output = None @@ -518,6 +538,14 @@ def construct(self, args, kwargs): "Operator {} must act on all wires".format(obj.name) ) + if self.expansion_strategy == "device": + self._tape = self.device.expand_fn(self.tape, max_expansion=self.max_expansion) + + # If the gradient function is a transform, expand the tape so that + # all operations are supported by the transform. 
+ if isinstance(self.gradient_fn, qml.gradients.gradient_transform): + self._tape = self.gradient_fn.expand_fn(self._tape) + def __call__(self, *args, **kwargs): override_shots = False @@ -540,15 +568,20 @@ def __call__(self, *args, **kwargs): # construct the tape self.construct(args, kwargs) + # preprocess the tapes by applying any device-specific transforms + tapes, processing_fn = self.device.batch_transform(self.tape) + res = qml.execute( - [self.tape], + tapes, device=self.device, gradient_fn=self.gradient_fn, interface=self.interface, gradient_kwargs=self.gradient_kwargs, override_shots=override_shots, **self.execute_kwargs, - )[0] + ) + + res = processing_fn(res) if override_shots is not False: # restore the initialization gradient function diff --git a/pennylane/gradients/gradient_transform.py b/pennylane/gradients/gradient_transform.py index 1846be0f5da..b4a194fb85e 100644 --- a/pennylane/gradients/gradient_transform.py +++ b/pennylane/gradients/gradient_transform.py @@ -48,7 +48,10 @@ def gradient_expand(tape, depth=10): and ((supported_op(obj) and trainable_op(obj)) or not trainable_op(obj)) ) - return tape.expand(depth=depth, stop_at=stop_cond) + new_tape = tape.expand(depth=depth, stop_at=stop_cond) + params = new_tape.get_parameters(trainable_only=False) + new_tape.trainable_params = qml.math.get_trainable_indices(params) + return new_tape return tape diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 6f4f1adb610..ee9953655df 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -69,7 +69,7 @@ def set_shots(device, shots): device.shots = original_shots -def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): +def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True, expand_fn=None): """Decorator that adds caching to a function that executes multiple tapes on a device. @@ -106,6 +106,12 @@ def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): function: a wrapped version of the execution function ``fn`` with caching support """ + if expand_fn is not None: + original_fn = fn + + def fn(tapes, **kwargs): # pylint: disable=function-redefined + tapes = [expand_fn(tape) for tape in tapes] + return original_fn(tapes, **kwargs) @wraps(fn) def wrapper(tapes, **kwargs): @@ -189,6 +195,8 @@ def execute( cachesize=10000, max_diff=2, override_shots=False, + expand_fn="device", + max_expansion=10, ): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. @@ -217,6 +225,14 @@ def execute( the maximum number of derivatives to support. Increasing this value allows for higher order derivatives to be extracted, at the cost of additional (classical) computational overhead during the backwards pass. + expand_fn (function): Tape expansion function to be called prior to device execution. + Must have signature of the form ``expand_fn(tape)``, and return a single + :class:`~.QuantumTape`. If not provided, by default :meth:`Device.expand_fn` is + called with the given ``max_expansion``. + max_expansion (int): The number of times the internal circuit should be expanded when + executed on a device. Expansion occurs when an operation or measurement is not + supported, and results in a gate decomposition. If any operations in the decomposition + remain unsupported by the device, another expansion occurs. Returns: list[list[float]]: A nested list of tape results.
Each element in @@ -284,21 +300,33 @@ def cost_fn(params, x): batch_execute = set_shots(device, override_shots)(device.batch_execute) + if expand_fn == "device": + expand_fn = lambda tape: device.expand_fn(tape, max_expansion=max_expansion) + if gradient_fn is None: with qml.tape.Unwrap(*tapes): - res = cache_execute(batch_execute, cache, return_tuple=False)(tapes) + res = cache_execute(batch_execute, cache, return_tuple=False, expand_fn=expand_fn)( + tapes + ) return res if gradient_fn == "backprop" or interface is None: - return cache_execute(batch_execute, cache, return_tuple=False)(tapes) + return cache_execute(batch_execute, cache, return_tuple=False, expand_fn=expand_fn)(tapes) # the default execution function is batch_execute - execute_fn = cache_execute(batch_execute, cache) + execute_fn = cache_execute(batch_execute, cache, expand_fn=expand_fn) if gradient_fn == "device": # gradient function is a device method + # Expand all tapes as per the device's expand function here. + # We must do this now, prior to the interface, to ensure that + # decompositions with parameter processing are tracked by the + # autodiff frameworks. + for i, tape in enumerate(tapes): + tapes[i] = expand_fn(tape) + if mode in ("forward", "best"): # replace the forward execution function to return # both results and gradients diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 97a56bb13e8..75d9dfa6676 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -174,7 +174,7 @@ def grad_fn(dy): # Generate and execute the required gradient tapes if _n == max_diff: - with qml.tape.Unwrap(*tapes): + with qml.tape.Unwrap(*tapes, set_trainable=False): vjp_tapes, processing_fn = qml.gradients.batch_vjp( tapes, dy, diff --git a/pennylane/interfaces/batch/torch.py b/pennylane/interfaces/batch/torch.py index b81d6d16c80..9d263d2ad5d 100644 --- a/pennylane/interfaces/batch/torch.py +++ b/pennylane/interfaces/batch/torch.py @@ -160,7 +160,7 @@ def backward(ctx, *dy): # The derivative order is at the maximum. Compute the VJP # in a non-differentiable manner to reduce overhead.
- with qml.tape.Unwrap(*ctx.tapes): + with qml.tape.Unwrap(*ctx.tapes, set_trainable=False): vjp_tapes, processing_fn = qml.gradients.batch_vjp( ctx.tapes, dy, diff --git a/pennylane/templates/embeddings/amplitude.py b/pennylane/templates/embeddings/amplitude.py index 307b4f55b4a..a332a06ca6c 100644 --- a/pennylane/templates/embeddings/amplitude.py +++ b/pennylane/templates/embeddings/amplitude.py @@ -124,6 +124,7 @@ def circuit(f=None): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__( self, features, wires, pad_with=None, normalize=False, pad=None, do_queue=True, id=None diff --git a/pennylane/templates/embeddings/angle.py b/pennylane/templates/embeddings/angle.py index dd52043849e..abbcef74116 100644 --- a/pennylane/templates/embeddings/angle.py +++ b/pennylane/templates/embeddings/angle.py @@ -49,6 +49,7 @@ class AngleEmbedding(Operation): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, features, wires, rotation="X", do_queue=True, id=None): diff --git a/pennylane/templates/layers/basic_entangler.py b/pennylane/templates/layers/basic_entangler.py index 6440b7f2c78..214f18b1e07 100644 --- a/pennylane/templates/layers/basic_entangler.py +++ b/pennylane/templates/layers/basic_entangler.py @@ -124,6 +124,7 @@ def circuit(weights): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, weights, wires=None, rotation=None, do_queue=True, id=None): diff --git a/pennylane/templates/layers/cv_neural_net.py b/pennylane/templates/layers/cv_neural_net.py index cfb067a474a..80f59559157 100644 --- a/pennylane/templates/layers/cv_neural_net.py +++ b/pennylane/templates/layers/cv_neural_net.py @@ -83,6 +83,7 @@ def circuit(): num_params = 11 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__( self, diff --git a/pennylane/templates/layers/particle_conserving_u1.py b/pennylane/templates/layers/particle_conserving_u1.py index 087ac5e8fc0..2d0632a4069 100644 --- a/pennylane/templates/layers/particle_conserving_u1.py +++ b/pennylane/templates/layers/particle_conserving_u1.py @@ -228,6 +228,7 @@ class ParticleConservingU1(Operation): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, weights, wires, init_state=None, do_queue=True, id=None): diff --git a/pennylane/templates/layers/particle_conserving_u2.py b/pennylane/templates/layers/particle_conserving_u2.py index dbf959760b7..36a41a00d08 100644 --- a/pennylane/templates/layers/particle_conserving_u2.py +++ b/pennylane/templates/layers/particle_conserving_u2.py @@ -150,6 +150,7 @@ class ParticleConservingU2(Operation): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, weights, wires, init_state=None, do_queue=True, id=None): diff --git a/pennylane/templates/layers/random.py b/pennylane/templates/layers/random.py index 4d0da0dbd18..e61c08bea9f 100644 --- a/pennylane/templates/layers/random.py +++ b/pennylane/templates/layers/random.py @@ -178,6 +178,7 @@ def circuit_rnd(weights): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__( self, diff --git a/pennylane/templates/layers/simplified_two_design.py b/pennylane/templates/layers/simplified_two_design.py index 01558e652dd..92474b0e661 100644 --- a/pennylane/templates/layers/simplified_two_design.py +++ b/pennylane/templates/layers/simplified_two_design.py @@ -101,6 +101,7 @@ def circuit(init_weights, weights): num_params = 2 num_wires = AnyWires 
par_domain = "A" + grad_method = None def __init__(self, initial_layer_weights, weights, wires, do_queue=True, id=None): diff --git a/pennylane/templates/layers/strongly_entangling.py b/pennylane/templates/layers/strongly_entangling.py index 699f3f9d2e0..97225279b9c 100644 --- a/pennylane/templates/layers/strongly_entangling.py +++ b/pennylane/templates/layers/strongly_entangling.py @@ -67,6 +67,7 @@ class StronglyEntanglingLayers(Operation): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, weights, wires, ranges=None, imprimitive=None, do_queue=True, id=None): diff --git a/pennylane/templates/state_preparations/arbitrary_state_preparation.py b/pennylane/templates/state_preparations/arbitrary_state_preparation.py index 1c0222eea65..534f28ec59c 100644 --- a/pennylane/templates/state_preparations/arbitrary_state_preparation.py +++ b/pennylane/templates/state_preparations/arbitrary_state_preparation.py @@ -82,6 +82,7 @@ def vqe(weights): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, weights, wires, do_queue=True, id=None): diff --git a/pennylane/templates/state_preparations/basis.py b/pennylane/templates/state_preparations/basis.py index ba4426a0a92..1e2c402ac12 100644 --- a/pennylane/templates/state_preparations/basis.py +++ b/pennylane/templates/state_preparations/basis.py @@ -51,6 +51,7 @@ def circuit(basis_state): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, basis_state, wires, do_queue=True, id=None): diff --git a/pennylane/templates/state_preparations/mottonen.py b/pennylane/templates/state_preparations/mottonen.py index ac320f50493..b65a4023006 100644 --- a/pennylane/templates/state_preparations/mottonen.py +++ b/pennylane/templates/state_preparations/mottonen.py @@ -248,6 +248,7 @@ class MottonenStatePreparation(Operation): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, state_vector, wires, do_queue=True, id=None): diff --git a/pennylane/templates/subroutines/all_singles_doubles.py b/pennylane/templates/subroutines/all_singles_doubles.py index 35f8173f04e..99b725a141b 100644 --- a/pennylane/templates/subroutines/all_singles_doubles.py +++ b/pennylane/templates/subroutines/all_singles_doubles.py @@ -114,6 +114,7 @@ def circuit(weights, hf_state, singles, doubles): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__( self, weights, wires, hf_state, singles=None, doubles=None, do_queue=True, id=None diff --git a/pennylane/templates/subroutines/approx_time_evolution.py b/pennylane/templates/subroutines/approx_time_evolution.py index bfde18f723f..7b338d1701b 100644 --- a/pennylane/templates/subroutines/approx_time_evolution.py +++ b/pennylane/templates/subroutines/approx_time_evolution.py @@ -100,6 +100,7 @@ def circuit(time): num_params = 3 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, hamiltonian, time, n, do_queue=True, id=None): diff --git a/pennylane/templates/subroutines/arbitrary_unitary.py b/pennylane/templates/subroutines/arbitrary_unitary.py index e6acb4fde9c..f25b5902d2b 100644 --- a/pennylane/templates/subroutines/arbitrary_unitary.py +++ b/pennylane/templates/subroutines/arbitrary_unitary.py @@ -95,6 +95,7 @@ def arbitrary_nearest_neighbour_interaction(weights, wires): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, weights, wires, do_queue=True, id=None): diff --git 
a/pennylane/templates/subroutines/double_excitation_unitary.py b/pennylane/templates/subroutines/double_excitation_unitary.py index ba0fa88bd60..24026e804cb 100644 --- a/pennylane/templates/subroutines/double_excitation_unitary.py +++ b/pennylane/templates/subroutines/double_excitation_unitary.py @@ -15,12 +15,23 @@ Contains the DoubleExcitationUnitary template. """ # pylint: disable-msg=too-many-branches,too-many-arguments,protected-access +import math + import numpy as np import pennylane as qml from pennylane.operation import Operation, AnyWires from pennylane.ops import RZ, RX, CNOT, Hadamard +# Four term gradient recipe for controlled rotations +INV_SQRT2 = 1 / math.sqrt(2) +c1 = INV_SQRT2 * (np.sqrt(2) + 1) / 4 +c2 = INV_SQRT2 * (np.sqrt(2) - 1) / 4 +a = np.pi / 2 +b = 3 * np.pi / 2 +four_term_grad_recipe = ([[c1, 1, a], [-c1, 1, -a], [-c2, 1, b], [c2, 1, -b]],) + + def _layer1(weight, s, r, q, p, set_cnot_wires): r"""Implement the first layer of the circuit to exponentiate the double-excitation operator entering the UCCSD ansatz. @@ -475,7 +486,9 @@ def circuit(weight, wires1=None, wires2=None): num_params = 1 num_wires = AnyWires - par_domain = "A" + par_domain = "R" + grad_method = "A" + grad_recipe = four_term_grad_recipe def __init__(self, weight, wires1=None, wires2=None, do_queue=True, id=None): diff --git a/pennylane/templates/subroutines/grover.py b/pennylane/templates/subroutines/grover.py index ff5975259b1..46f2d83ab8c 100644 --- a/pennylane/templates/subroutines/grover.py +++ b/pennylane/templates/subroutines/grover.py @@ -102,6 +102,7 @@ def GroverSearch(num_iterations=1): num_params = 0 num_wires = AnyWires par_domain = None + grad_method = None def __init__(self, wires=None, work_wires=None, do_queue=True, id=None): if (not hasattr(wires, "__len__")) or (len(wires) < 2): diff --git a/pennylane/templates/subroutines/permute.py b/pennylane/templates/subroutines/permute.py index c9f05448a28..a6e514b1fac 100644 --- a/pennylane/templates/subroutines/permute.py +++ b/pennylane/templates/subroutines/permute.py @@ -143,6 +143,7 @@ def circuit() num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, permutation, wires, do_queue=True, id=None): diff --git a/pennylane/templates/subroutines/qmc.py b/pennylane/templates/subroutines/qmc.py index 09e362bf3e5..0b328a2e57b 100644 --- a/pennylane/templates/subroutines/qmc.py +++ b/pennylane/templates/subroutines/qmc.py @@ -330,6 +330,7 @@ def circuit(): num_params = 3 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, probs, func, target_wires, estimation_wires, do_queue=True, id=None): if isinstance(probs, np.ndarray) and probs.ndim != 1: diff --git a/pennylane/templates/subroutines/qpe.py b/pennylane/templates/subroutines/qpe.py index 34c05ea5e76..8dc6d6b5b45 100644 --- a/pennylane/templates/subroutines/qpe.py +++ b/pennylane/templates/subroutines/qpe.py @@ -107,6 +107,7 @@ def circuit(): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__(self, unitary, target_wires, estimation_wires, do_queue=True, id=None): self.target_wires = list(target_wires) diff --git a/pennylane/templates/subroutines/single_excitation_unitary.py b/pennylane/templates/subroutines/single_excitation_unitary.py index 8b6a8c2aed8..97ebdece520 100644 --- a/pennylane/templates/subroutines/single_excitation_unitary.py +++ b/pennylane/templates/subroutines/single_excitation_unitary.py @@ -15,12 +15,23 @@ Contains the SingleExcitationUnitary template. 
""" # pylint: disable-msg=too-many-branches,too-many-arguments,protected-access +import math + import pennylane as qml from pennylane import numpy as np from pennylane.operation import Operation, AnyWires from pennylane.ops import RZ, RX, CNOT, Hadamard +# Four term gradient recipe for controlled rotations +INV_SQRT2 = 1 / math.sqrt(2) +c1 = INV_SQRT2 * (np.sqrt(2) + 1) / 4 +c2 = INV_SQRT2 * (np.sqrt(2) - 1) / 4 +a = np.pi / 2 +b = 3 * np.pi / 2 +four_term_grad_recipe = ([[c1, 1, a], [-c1, 1, -a], [-c2, 1, b], [c2, 1, -b]],) + + class SingleExcitationUnitary(Operation): r"""Circuit to exponentiate the tensor product of Pauli matrices representing the single-excitation operator entering the Unitary Coupled-Cluster Singles @@ -114,7 +125,9 @@ def circuit(weight, wires=None): num_params = 1 num_wires = AnyWires - par_domain = "A" + par_domain = "R" + grad_method = "A" + grad_recipe = four_term_grad_recipe def __init__(self, weight, wires=None, do_queue=True, id=None): if len(wires) < 2: diff --git a/pennylane/templates/subroutines/uccsd.py b/pennylane/templates/subroutines/uccsd.py index b6dc04b4fb7..a430882e945 100644 --- a/pennylane/templates/subroutines/uccsd.py +++ b/pennylane/templates/subroutines/uccsd.py @@ -143,6 +143,7 @@ class UCCSD(Operation): num_params = 1 num_wires = AnyWires par_domain = "A" + grad_method = None def __init__( self, weights, wires, s_wires=None, d_wires=None, init_state=None, do_queue=True, id=None diff --git a/tests/beta/test_beta_qnode.py b/tests/beta/test_beta_qnode.py index a2d1952dead..1791b2a8d77 100644 --- a/tests/beta/test_beta_qnode.py +++ b/tests/beta/test_beta_qnode.py @@ -898,3 +898,229 @@ def circuit(x, y): assert info["num_trainable_params"] == 4 else: assert info["device_name"] == "default.qubit.autograd" + + +class TestTapeExpansion: + """Test that tape expansion within the QNode works correctly""" + + @pytest.mark.parametrize( + "diff_method,mode", + [("parameter-shift", "backward"), ("adjoint", "forward"), ("adjoint", "backward")], + ) + def test_device_expansion(self, diff_method, mode, mocker): + """Test expansion of an unsupported operation on the device""" + dev = qml.device("default.qubit", wires=1) + + class UnsupportedOp(qml.operation.Operation): + num_wires = 1 + num_params = 1 + par_domain = "R" + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RX(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method=diff_method, mode=mode) + def circuit(x): + UnsupportedOp(x, wires=0) + return qml.expval(qml.PauliZ(0)) + + if diff_method == "adjoint" and mode == "forward": + spy = mocker.spy(circuit.device, "execute_and_gradients") + else: + spy = mocker.spy(circuit.device, "batch_execute") + + x = np.array(0.5) + circuit(x) + + tape = spy.call_args[0][0][0] + assert len(tape.operations) == 1 + assert tape.operations[0].name == "RX" + assert np.allclose(tape.operations[0].parameters, 3 * x) + + def test_no_gradient_expansion(self, mocker): + """Test that an unsupported operation with defined gradient recipe is + not expanded""" + dev = qml.device("default.qubit", wires=1) + + class UnsupportedOp(qml.operation.Operation): + num_wires = 1 + num_params = 1 + par_domain = "R" + + grad_method = "A" + grad_recipe = ([[3 / 2, 1, np.pi / 6], [-3 / 2, 1, -np.pi / 6]],) + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RX(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method="parameter-shift", max_diff=2) + def circuit(x): + UnsupportedOp(x, wires=0) + return 
qml.expval(qml.PauliZ(0)) + + x = np.array(0.5) + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + qml.grad(circuit)(x) + + # check that the gradient recipe was applied *prior* to + # device expansion + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 1 + assert input_tape.operations[0].name == "UnsupportedOp" + assert input_tape.operations[0].data[0] == x + + shifted_tape1, shifted_tape2 = spy.spy_return[0] + + assert len(shifted_tape1.operations) == 1 + assert shifted_tape1.operations[0].name == "UnsupportedOp" + + assert len(shifted_tape2.operations) == 1 + assert shifted_tape2.operations[0].name == "UnsupportedOp" + + # check second derivative + assert np.allclose(qml.grad(qml.grad(circuit))(x), -9 * np.cos(3 * x)) + + def test_gradient_expansion(self, mocker): + """Test that a *supported* operation with no gradient recipe is + expanded when applying the gradient transform, but not for execution.""" + dev = qml.device("default.qubit", wires=1) + + class PhaseShift(qml.PhaseShift): + grad_method = None + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RY(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method="parameter-shift", max_diff=2) + def circuit(x): + qml.Hadamard(wires=0) + PhaseShift(x, wires=0) + return qml.expval(qml.PauliX(0)) + + spy = mocker.spy(circuit.device, "batch_execute") + x = np.array(0.5) + circuit(x) + + tape = spy.call_args[0][0][0] + + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + res = qml.grad(circuit)(x) + + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 2 + assert input_tape.operations[1].name == "RY" + assert input_tape.operations[1].data[0] == 3 * x + + shifted_tape1, shifted_tape2 = spy.spy_return[0] + + assert len(shifted_tape1.operations) == 2 + assert shifted_tape1.operations[1].name == "RY" + + assert len(shifted_tape2.operations) == 2 + assert shifted_tape2.operations[1].name == "RY" + + assert np.allclose(res, -3 * np.sin(3 * x)) + + # test second order derivatives + res = qml.grad(qml.grad(circuit))(x) + assert np.allclose(res, -9 * np.cos(3 * x)) + + def test_hamiltonian_expansion_analytic(self, mocker): + """Test that the Hamiltonian is not expanded if there + are non-commuting groups and the number of shots is None""" + dev = qml.device("default.qubit", wires=3, shots=None) + + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + c = np.array([-0.6543, 0.24, 0.54]) + H = qml.Hamiltonian(c, obs) + H.compute_grouping() + + assert len(H.grouping_indices) == 2 + + @qnode(dev) + def circuit(): + return qml.expval(H) + + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + res = circuit() + assert np.allclose(res, c[2], atol=0.1) + + spy.assert_not_called() + + def test_hamiltonian_expansion_finite_shots(self, mocker): + """Test that the Hamiltonian is expanded if there + are non-commuting groups and the number of shots is finite""" + dev = qml.device("default.qubit", wires=3, shots=50000) + + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + c = np.array([-0.6543, 0.24, 0.54]) + H = qml.Hamiltonian(c, obs) + H.compute_grouping() + + assert len(H.grouping_indices) == 2 + + @qnode(dev) + def circuit(): + return qml.expval(H) + + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + res = circuit() + assert np.allclose(res, c[2], atol=0.1) + + spy.assert_called() + tapes, fn = spy.spy_return + + assert len(tapes) == 2 + + def 
test_invalid_hamiltonian_expansion_finite_shots(self, mocker): + """Test that an error is raised if multiple expectations are requested + when using finite shots""" + dev = qml.device("default.qubit", wires=3, shots=50000) + + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + c = np.array([-0.6543, 0.24, 0.54]) + H = qml.Hamiltonian(c, obs) + H.compute_grouping() + + assert len(H.grouping_indices) == 2 + + @qnode(dev) + def circuit(): + return qml.expval(H), qml.expval(H) + + with pytest.raises( + ValueError, match="Can only return the expectation of a single Hamiltonian" + ): + circuit() + + def test_device_expansion_strategy(self, mocker): + """Test that the device expansion strategy performs the device + decomposition at construction time, and not at execution time""" + dev = qml.device("default.qubit", wires=2) + x = np.array(0.5) + + @qnode(dev, diff_method="parameter-shift", expansion_strategy="device") + def circuit(x): + qml.SingleExcitation(x, wires=[0, 1]) + return qml.expval(qml.PauliX(0)) + + assert circuit.expansion_strategy == "device" + assert circuit.execute_kwargs["expand_fn"] is None + + spy_expand = mocker.spy(circuit.device, "expand_fn") + + circuit.construct([x], {}) + assert len(circuit.tape.operations) > 0 + spy_expand.assert_called_once() + + circuit(x) + assert len(spy_expand.call_args_list) == 2 + + qml.grad(circuit)(x) + assert len(spy_expand.call_args_list) == 3 diff --git a/tests/gradients/test_gradient_transform.py b/tests/gradients/test_gradient_transform.py index 5b84db167d1..88670944e07 100644 --- a/tests/gradients/test_gradient_transform.py +++ b/tests/gradients/test_gradient_transform.py @@ -81,6 +81,10 @@ class NonDiffPhaseShift(qml.PhaseShift): qml.CNOT(wires=[0, 1]) qml.expval(qml.PauliZ(0)) + params = tape.get_parameters(trainable_only=False) + tape.trainable_params = qml.math.get_trainable_indices(params) + assert tape.trainable_params == {1} + spy = mocker.spy(tape, "expand") new_tape = gradient_expand(tape) diff --git a/tests/interfaces/test_batch_autograd_qnode.py b/tests/interfaces/test_batch_autograd_qnode.py index 7d73fb77073..847f4326a28 100644 --- a/tests/interfaces/test_batch_autograd_qnode.py +++ b/tests/interfaces/test_batch_autograd_qnode.py @@ -377,7 +377,6 @@ def circuit(data1): with pytest.raises(qml.numpy.NonDifferentiableError, match="is non-differentiable"): grad_fn(data1) - @pytest.mark.xfail def test_differentiable_expand(self, dev_name, diff_method, mode, tol): """Test that operation and nested tape expansion is differentiable""" @@ -404,23 +403,6 @@ def circuit(a, p): return qml.expval(qml.PauliX(0)) res = circuit(a, p) - - if diff_method == "finite-diff": - assert circuit.qtape.trainable_params == {1, 2, 3, 4} - elif diff_method == "backprop": - # For a backprop device, no interface wrapping is performed, and JacobianTape.jacobian() - # is never called. As a result, JacobianTape.trainable_params is never set --- the ML - # framework uses its own backprop logic and its own bookkeeping re: trainable parameters. - assert circuit.qtape.trainable_params == {0, 1, 2, 3, 4} - - assert [i.name for i in circuit.qtape.operations] == ["RX", "Rot", "PhaseShift"] - - if diff_method == "finite-diff": - assert np.all(circuit.qtape.get_parameters() == [p[2], p[0], -p[2], p[1] + p[2]]) - elif diff_method == "backprop": - # In backprop mode, all parameters are returned. 
- assert np.all(circuit.qtape.get_parameters() == [a, p[2], p[0], -p[2], p[1] + p[2]]) - expected = np.cos(a) * np.cos(p[1]) * np.sin(p[0]) + np.sin(a) * ( np.cos(p[2]) * np.sin(p[1]) + np.cos(p[0]) * np.cos(p[1]) * np.sin(p[2]) ) @@ -685,20 +667,25 @@ def circuit(): assert res.shape == (2, 10) assert isinstance(res, np.ndarray) - @pytest.mark.xfail def test_chained_qnodes(self, dev_name, diff_method, mode): """Test that the gradient of chained QNodes works without error""" dev = qml.device(dev_name, wires=2) + class Template(qml.templates.StronglyEntanglingLayers): + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.templates.StronglyEntanglingLayers(*self.parameters, self.wires) + return tape + @qnode(dev, interface="autograd", diff_method=diff_method) def circuit1(weights): - qml.templates.StronglyEntanglingLayers(weights, wires=[0, 1]) + Template(weights, wires=[0, 1]) return qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliZ(1)) @qnode(dev, interface="autograd", diff_method=diff_method) def circuit2(data, weights): qml.templates.AngleEmbedding(data, wires=[0, 1]) - qml.templates.StronglyEntanglingLayers(weights, wires=[0, 1]) + Template(weights, wires=[0, 1]) return qml.expval(qml.PauliX(0)) def cost(weights): @@ -707,10 +694,13 @@ def cost(weights): c2 = circuit2(c1, w2) return np.sum(c2) ** 2 - w1 = qml.init.strong_ent_layers_normal(n_wires=2, n_layers=3) - w2 = qml.init.strong_ent_layers_normal(n_wires=2, n_layers=4) + w1 = qml.templates.StronglyEntanglingLayers.shape(n_wires=2, n_layers=3) + w2 = qml.templates.StronglyEntanglingLayers.shape(n_wires=2, n_layers=4) - weights = [w1, w2] + weights = [ + np.random.random(w1), + np.random.random(w2), + ] grad_fn = qml.grad(cost) res = grad_fn(weights) @@ -1236,3 +1226,158 @@ def circ(x): assert circ.device.num_executions == 1 spy.assert_called_with(mocker.ANY, use_device_state=True) + + +@pytest.mark.parametrize("dev_name,diff_method,mode", qubit_device_and_diff_method) +class TestTapeExpansion: + """Test that tape expansion within the QNode integrates correctly + with the Autograd interface""" + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_gradient_expansion_trainable_only(self, dev_name, diff_method, mode, max_diff, mocker): + """Test that a *supported* operation with no gradient recipe is only + expanded for parameter-shift and finite-differences when it is trainable.""" + if diff_method not in ("parameter-shift", "finite-diff"): + pytest.skip("Only supports gradient transforms") + + dev = qml.device(dev_name, wires=1) + + class PhaseShift(qml.PhaseShift): + grad_method = None + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RY(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff) + def circuit(x, y): + qml.Hadamard(wires=0) + PhaseShift(x, wires=0) + PhaseShift(2 * y, wires=0) + return qml.expval(qml.PauliX(0)) + + spy = mocker.spy(circuit.device, "batch_execute") + x = np.array(0.5, requires_grad=True) + y = np.array(0.7, requires_grad=False) + circuit(x, y) + + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + res = qml.grad(circuit)(x, y) + + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 3 + assert input_tape.operations[1].name == "RY" + assert input_tape.operations[1].data[0] == 3 * x + assert input_tape.operations[2].name == "PhaseShift" + assert input_tape.operations[2].grad_method is None + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_hamiltonian_expansion_analytic(self, 
dev_name, diff_method, mode, max_diff, mocker): + """Test that the Hamiltonian is not expanded if there + are non-commuting groups and the number of shots is None + and the first and second order gradients are correctly evaluated""" + if diff_method == "adjoint": + pytest.skip("The adjoint method does not yet support Hamiltonians") + + dev = qml.device(dev_name, wires=3, shots=None) + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff) + def circuit(data, weights, coeffs): + weights = weights.reshape(1, -1) + qml.templates.AngleEmbedding(data, wires=[0, 1]) + qml.templates.BasicEntanglerLayers(weights, wires=[0, 1]) + return qml.expval(qml.Hamiltonian(coeffs, obs)) + + d = np.array([0.1, 0.2], requires_grad=False) + w = np.array([0.654, -0.734], requires_grad=True) + c = np.array([-0.6543, 0.24, 0.54], requires_grad=True) + + # test output + res = circuit(d, w, c) + expected = c[2] * np.cos(d[1] + w[1]) - c[1] * np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]) + assert np.allclose(res, expected) + spy.assert_not_called() + + # test gradients + grad = qml.grad(circuit)(d, w, c) + expected_w = [ + -c[1] * np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), + -c[1] * np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]) - c[2] * np.sin(d[1] + w[1]), + ] + expected_c = [0, -np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]), np.cos(d[1] + w[1])] + assert np.allclose(grad[0], expected_w) + assert np.allclose(grad[1], expected_c) + + # test second-order derivatives + if diff_method in ("parameter-shift", "backprop") and max_diff == 2: + + grad2_c = qml.jacobian(qml.grad(circuit, argnum=2), argnum=2)(d, w, c) + assert np.allclose(grad2_c, 0) + + grad2_w_c = qml.jacobian(qml.grad(circuit, argnum=1), argnum=2)(d, w, c) + expected = [0, -np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), 0], [ + 0, + -np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]), + -np.sin(d[1] + w[1]), + ] + assert np.allclose(grad2_w_c, expected) + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_hamiltonian_expansion_finite_shots( + self, dev_name, diff_method, mode, max_diff, mocker + ): + """Test that the Hamiltonian is expanded if there + are non-commuting groups and the number of shots is finite + and the first and second order gradients are correctly evaluated""" + if diff_method in ("adjoint", "backprop", "finite-diff"): + pytest.skip("The adjoint and backprop methods do not yet support sampling") + + dev = qml.device(dev_name, wires=3, shots=50000) + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff) + def circuit(data, weights, coeffs): + weights = weights.reshape(1, -1) + qml.templates.AngleEmbedding(data, wires=[0, 1]) + qml.templates.BasicEntanglerLayers(weights, wires=[0, 1]) + H = qml.Hamiltonian(coeffs, obs) + H.compute_grouping() + return qml.expval(H) + + d = np.array([0.1, 0.2], requires_grad=False) + w = np.array([0.654, -0.734], requires_grad=True) + c = np.array([-0.6543, 0.24, 0.54], requires_grad=True) + + # test output + res = circuit(d, w, c) + expected = c[2] * np.cos(d[1] + w[1]) - c[1] * np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]) + assert np.allclose(res, expected, atol=0.1) + spy.assert_called() + + # test gradients + grad = qml.grad(circuit)(d, w, c) + expected_w = [ + -c[1] * np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), + 
-c[1] * np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]) - c[2] * np.sin(d[1] + w[1]), + ] + expected_c = [0, -np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]), np.cos(d[1] + w[1])] + assert np.allclose(grad[0], expected_w, atol=0.1) + assert np.allclose(grad[1], expected_c, atol=0.1) + + # test second-order derivatives + if diff_method == "parameter-shift" and max_diff == 2: + + grad2_c = qml.jacobian(qml.grad(circuit, argnum=2), argnum=2)(d, w, c) + assert np.allclose(grad2_c, 0, atol=0.1) + + grad2_w_c = qml.jacobian(qml.grad(circuit, argnum=1), argnum=2)(d, w, c) + expected = [0, -np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), 0], [ + 0, + -np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]), + -np.sin(d[1] + w[1]), + ] + assert np.allclose(grad2_w_c, expected, atol=0.1) diff --git a/tests/interfaces/test_batch_tensorflow_qnode.py b/tests/interfaces/test_batch_tensorflow_qnode.py index 62db71219a1..41252fb1493 100644 --- a/tests/interfaces/test_batch_tensorflow_qnode.py +++ b/tests/interfaces/test_batch_tensorflow_qnode.py @@ -363,7 +363,6 @@ def circuit(U, a): res = tape.jacobian(res, a) assert np.allclose(res, tf.sin(a), atol=tol, rtol=0) - @pytest.mark.xfail def test_differentiable_expand(self, dev_name, diff_method, mode, tol): """Test that operation and nested tapes expansion is differentiable""" @@ -392,20 +391,7 @@ def circuit(a, p): with tf.GradientTape() as tape: res = circuit(a, p) - if diff_method == "finite-diff": - assert circuit.qtape.trainable_params == {1, 2, 3, 4} - elif diff_method == "backprop": - # For a backprop device, no interface wrapping is performed, and JacobianTape.jacobian() - # is never called. As a result, JacobianTape.trainable_params is never set --- the ML - # framework uses its own backprop logic and its own bookkeeping re: trainable parameters. 
- assert circuit.qtape.trainable_params == {0, 1, 2, 3, 4} - - assert [i.name for i in circuit.qtape.operations] == ["RX", "Rot", "PhaseShift"] - - if diff_method == "finite-diff": - assert np.all(circuit.qtape.get_parameters() == [p[2], p[0], -p[2], p[1] + p[2]]) - elif diff_method == "backprop": - assert np.all(circuit.qtape.get_parameters() == [a, p[2], p[0], -p[2], p[1] + p[2]]) + assert circuit.qtape.trainable_params == {1, 2, 3} expected = tf.cos(a) * tf.cos(p[1]) * tf.sin(p[0]) + tf.sin(a) * ( tf.cos(p[2]) * tf.sin(p[1]) + tf.cos(p[0]) * tf.cos(p[1]) * tf.sin(p[2]) @@ -1055,3 +1041,221 @@ def circuit(n, a): grad = tape.gradient(res, [n, a]) expected = [2 * a ** 2 + 2 * n + 1, 2 * a * (2 * n + 1)] assert np.allclose(grad, expected, atol=tol, rtol=0) + + +@pytest.mark.parametrize("dev_name,diff_method,mode", qubit_device_and_diff_method) +class TestTapeExpansion: + """Test that tape expansion within the QNode integrates correctly + with the TF interface""" + + def test_gradient_expansion(self, dev_name, diff_method, mode, mocker): + """Test that a *supported* operation with no gradient recipe is + expanded for both parameter-shift and finite-differences, but not for execution.""" + if diff_method not in ("parameter-shift", "finite-diff"): + pytest.skip("Only supports gradient transforms") + + dev = qml.device(dev_name, wires=1) + + class PhaseShift(qml.PhaseShift): + grad_method = None + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RY(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=2, interface="tf") + def circuit(x): + qml.Hadamard(wires=0) + PhaseShift(x, wires=0) + return qml.expval(qml.PauliX(0)) + + spy = mocker.spy(circuit.device, "batch_execute") + x = tf.Variable(0.5, dtype=tf.float64) + + with tf.GradientTape() as t2: + with tf.GradientTape() as t1: + loss = circuit(x) + + tape = spy.call_args[0][0][0] + + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + res = t1.gradient(loss, x) + + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 2 + assert input_tape.operations[1].name == "RY" + assert input_tape.operations[1].data[0] == 3 * x + + shifted_tape1, shifted_tape2 = spy.spy_return[0] + + assert len(shifted_tape1.operations) == 2 + assert shifted_tape1.operations[1].name == "RY" + + assert len(shifted_tape2.operations) == 2 + assert shifted_tape2.operations[1].name == "RY" + + assert np.allclose(res, -3 * np.sin(3 * x)) + + if diff_method == "parameter-shift": + # test second order derivatives + res = t2.gradient(res, x) + assert np.allclose(res, -9 * np.cos(3 * x)) + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_gradient_expansion_trainable_only(self, dev_name, diff_method, mode, max_diff, mocker): + """Test that a *supported* operation with no gradient recipe is only + expanded for parameter-shift and finite-differences when it is trainable.""" + if diff_method not in ("parameter-shift", "finite-diff"): + pytest.skip("Only supports gradient transforms") + + dev = qml.device(dev_name, wires=1) + + class PhaseShift(qml.PhaseShift): + grad_method = None + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RY(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff, interface="tf") + def circuit(x, y): + qml.Hadamard(wires=0) + PhaseShift(x, wires=0) + PhaseShift(2 * y, wires=0) + return qml.expval(qml.PauliX(0)) + + spy = mocker.spy(circuit.device, "batch_execute") + x = 
tf.Variable(0.5, dtype=tf.float64) + y = tf.constant(0.7, dtype=tf.float64) + + with tf.GradientTape() as t: + res = circuit(x, y) + + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + res = t.gradient(res, [x, y]) + + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 3 + assert input_tape.operations[1].name == "RY" + assert input_tape.operations[1].data[0] == 3 * x + assert input_tape.operations[2].name == "PhaseShift" + assert input_tape.operations[2].grad_method is None + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_hamiltonian_expansion_analytic(self, dev_name, diff_method, mode, max_diff, mocker): + """Test that the Hamiltonian is not expanded if there + are non-commuting groups and the number of shots is None + and the first and second order gradients are correctly evaluated""" + if diff_method == "adjoint": + pytest.skip("The adjoint method does not yet support Hamiltonians") + + dev = qml.device(dev_name, wires=3, shots=None) + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff, interface="tf") + def circuit(data, weights, coeffs): + weights = tf.reshape(weights, [1, -1]) + qml.templates.AngleEmbedding(data, wires=[0, 1]) + qml.templates.BasicEntanglerLayers(weights, wires=[0, 1]) + return qml.expval(qml.Hamiltonian(coeffs, obs)) + + d = tf.constant([0.1, 0.2], dtype=tf.float64) + w = tf.Variable([0.654, -0.734], dtype=tf.float64) + c = tf.Variable([-0.6543, 0.24, 0.54], dtype=tf.float64) + + # test output + with tf.GradientTape(persistent=True) as t2: + with tf.GradientTape() as t1: + res = circuit(d, w, c) + + expected = c[2] * np.cos(d[1] + w[1]) - c[1] * np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]) + assert np.allclose(res, expected) + spy.assert_not_called() + + # test gradients + grad = t1.gradient(res, [d, w, c]) + + expected_w = [ + -c[1] * np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), + -c[1] * np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]) - c[2] * np.sin(d[1] + w[1]), + ] + expected_c = [0, -np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]), np.cos(d[1] + w[1])] + assert np.allclose(grad[1], expected_w) + assert np.allclose(grad[2], expected_c) + + # test second-order derivatives + if diff_method in ("parameter-shift", "backprop") and max_diff == 2: + + grad2_c = t2.jacobian(grad[2], c) + assert grad2_c is None or np.allclose(grad2_c, 0) + + grad2_w_c = t2.jacobian(grad[1], c) + expected = [0, -np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), 0], [ + 0, + -np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]), + -np.sin(d[1] + w[1]), + ] + assert np.allclose(grad2_w_c, expected) + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_hamiltonian_expansion_finite_shots( + self, dev_name, diff_method, mode, max_diff, mocker + ): + """Test that the Hamiltonian is expanded if there + are non-commuting groups and the number of shots is finite + and the first and second order gradients are correctly evaluated""" + if diff_method in ("adjoint", "backprop", "finite-diff"): + pytest.skip("The adjoint and backprop methods do not yet support sampling") + + dev = qml.device(dev_name, wires=3, shots=50000) + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff, interface="tf") + def circuit(data, weights, coeffs): + weights = tf.reshape(weights, [1, -1]) + 
qml.templates.AngleEmbedding(data, wires=[0, 1]) + qml.templates.BasicEntanglerLayers(weights, wires=[0, 1]) + H = qml.Hamiltonian(coeffs, obs) + H.compute_grouping() + return qml.expval(H) + + d = tf.constant([0.1, 0.2], dtype=tf.float64) + w = tf.Variable([0.654, -0.734], dtype=tf.float64) + c = tf.Variable([-0.6543, 0.24, 0.54], dtype=tf.float64) + + # test output + with tf.GradientTape(persistent=True) as t2: + with tf.GradientTape() as t1: + res = circuit(d, w, c) + + expected = c[2] * np.cos(d[1] + w[1]) - c[1] * np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]) + assert np.allclose(res, expected, atol=0.1) + spy.assert_called() + + # test gradients + grad = t1.gradient(res, [d, w, c]) + + expected_w = [ + -c[1] * np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), + -c[1] * np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]) - c[2] * np.sin(d[1] + w[1]), + ] + expected_c = [0, -np.sin(d[0] + w[0]) * np.sin(d[1] + w[1]), np.cos(d[1] + w[1])] + assert np.allclose(grad[1], expected_w, atol=0.1) + assert np.allclose(grad[2], expected_c, atol=0.1) + + # test second-order derivatives + if diff_method == "parameter-shift" and max_diff == 2: + grad2_c = t2.jacobian(grad[2], c) + assert grad2_c is None + + grad2_w_c = t2.jacobian(grad[1], c) + expected = [0, -np.cos(d[0] + w[0]) * np.sin(d[1] + w[1]), 0], [ + 0, + -np.cos(d[1] + w[1]) * np.sin(d[0] + w[0]), + -np.sin(d[1] + w[1]), + ] + assert np.allclose(grad2_w_c, expected, atol=0.1) diff --git a/tests/interfaces/test_batch_torch_qnode.py b/tests/interfaces/test_batch_torch_qnode.py index 2b2a1cd1ca1..9901afac62c 100644 --- a/tests/interfaces/test_batch_torch_qnode.py +++ b/tests/interfaces/test_batch_torch_qnode.py @@ -379,7 +379,6 @@ def circuit(U, a): res.backward() assert np.allclose(a.grad, np.sin(a_val), atol=tol, rtol=0) - @pytest.mark.xfail def test_differentiable_expand(self, dev_name, diff_method, mode, tol): """Test that operation and nested tapes expansion is differentiable""" @@ -408,20 +407,7 @@ def circuit(a, p): res = circuit(a, p) - if diff_method == "finite-diff": - assert circuit.qtape.trainable_params == {1, 2, 3, 4} - elif diff_method == "backprop": - # For a backprop device, no interface wrapping is performed, and JacobianTape.jacobian() - # is never called. As a result, JacobianTape.trainable_params is never set --- the ML - # framework uses its own backprop logic and its own bookkeeping re: trainable parameters. 
- assert circuit.qtape.trainable_params == {0, 1, 2, 3, 4} - - assert [i.name for i in circuit.qtape.operations] == ["RX", "Rot", "PhaseShift"] - - if diff_method == "finite-diff": - assert np.all(circuit.qtape.get_parameters() == [p[2], p[0], -p[2], p[1] + p[2]]) - elif diff_method == "backprop": - assert np.all(circuit.qtape.get_parameters() == [a, p[2], p[0], -p[2], p[1] + p[2]]) + assert circuit.qtape.trainable_params == {1, 2, 3} expected = np.cos(a) * np.cos(p_val[1]) * np.sin(p_val[0]) + np.sin(a) * ( np.cos(p_val[2]) * np.sin(p_val[1]) @@ -757,7 +743,6 @@ def circuit(): assert isinstance(res[0], torch.Tensor) assert isinstance(res[1], torch.Tensor) - @pytest.mark.xfail def test_chained_qnodes(self, dev_name, diff_method, mode): """Test that the gradient of chained QNodes works without error""" dev = qml.device(dev_name, wires=2) @@ -777,7 +762,7 @@ def cost(weights): w1, w2 = weights c1 = circuit1(w1) c2 = circuit2(c1, w2) - return np.sum(c2) ** 2 + return torch.sum(c2) ** 2 w1 = qml.init.strong_ent_layers_normal(n_wires=2, n_layers=3) w2 = qml.init.strong_ent_layers_normal(n_wires=2, n_layers=4) @@ -1098,3 +1083,244 @@ def circuit(n, a): res = torch.tensor([n.grad, a.grad]) expected = torch.tensor([[2 * a ** 2 + 2 * n + 1, 2 * a * (2 * n + 1)]]) assert torch.allclose(res, expected, atol=tol, rtol=0) + + +@pytest.mark.parametrize("dev_name,diff_method,mode", qubit_device_and_diff_method) +class TestTapeExpansion: + """Test that tape expansion within the QNode integrates correctly + with the Torch interface""" + + def test_gradient_expansion(self, dev_name, diff_method, mode, mocker): + """Test that a *supported* operation with no gradient recipe is + expanded for both parameter-shift and finite-differences, but not for execution.""" + if diff_method not in ("parameter-shift", "finite-diff"): + pytest.skip("Only supports gradient transforms") + + dev = qml.device(dev_name, wires=1) + + class PhaseShift(qml.PhaseShift): + grad_method = None + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RY(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=2, interface="torch") + def circuit(x): + qml.Hadamard(wires=0) + PhaseShift(x, wires=0) + return qml.expval(qml.PauliX(0)) + + spy = mocker.spy(circuit.device, "batch_execute") + x = torch.tensor(0.5, requires_grad=True) + + loss = circuit(x) + + tape = spy.call_args[0][0][0] + + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + loss.backward() + res = x.grad + + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 2 + assert input_tape.operations[1].name == "RY" + assert input_tape.operations[1].data[0] == 3 * x + + shifted_tape1, shifted_tape2 = spy.spy_return[0] + + assert len(shifted_tape1.operations) == 2 + assert shifted_tape1.operations[1].name == "RY" + + assert len(shifted_tape2.operations) == 2 + assert shifted_tape2.operations[1].name == "RY" + + assert torch.allclose(res, -3 * torch.sin(3 * x)) + + if diff_method == "parameter-shift": + # test second order derivatives + res = torch.autograd.functional.hessian(circuit, x) + assert torch.allclose(res, -9 * torch.cos(3 * x)) + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_gradient_expansion_trainable_only(self, dev_name, diff_method, mode, max_diff, mocker): + """Test that a *supported* operation with no gradient recipe is only + expanded for parameter-shift and finite-differences when it is trainable.""" + if diff_method not in ("parameter-shift", "finite-diff"): + 
pytest.skip("Only supports gradient transforms") + + dev = qml.device(dev_name, wires=1) + + class PhaseShift(qml.PhaseShift): + grad_method = None + + def expand(self): + with qml.tape.QuantumTape() as tape: + qml.RY(3 * self.data[0], wires=self.wires) + return tape + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff, interface="torch") + def circuit(x, y): + qml.Hadamard(wires=0) + PhaseShift(x, wires=0) + PhaseShift(2 * y, wires=0) + return qml.expval(qml.PauliX(0)) + + spy = mocker.spy(circuit.device, "batch_execute") + x = torch.tensor(0.5, requires_grad=True) + y = torch.tensor(0.7, requires_grad=False) + + loss = circuit(x, y) + + spy = mocker.spy(circuit.gradient_fn, "transform_fn") + loss.backward() + + input_tape = spy.call_args[0][0] + assert len(input_tape.operations) == 3 + assert input_tape.operations[1].name == "RY" + assert input_tape.operations[1].data[0] == 3 * x + assert input_tape.operations[2].name == "PhaseShift" + assert input_tape.operations[2].grad_method is None + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_hamiltonian_expansion_analytic(self, dev_name, diff_method, mode, max_diff, mocker): + """Test that the Hamiltonian is not expanded if there + are non-commuting groups and the number of shots is None + and the first and second order gradients are correctly evaluated""" + if diff_method == "adjoint": + pytest.skip("The adjoint method does not yet support Hamiltonians") + + dev = qml.device(dev_name, wires=3, shots=None) + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff, interface="torch") + def circuit(data, weights, coeffs): + weights = torch.reshape(weights, [1, -1]) + qml.templates.AngleEmbedding(data, wires=[0, 1]) + qml.templates.BasicEntanglerLayers(weights, wires=[0, 1]) + return qml.expval(qml.Hamiltonian(coeffs, obs)) + + d = torch.tensor([0.1, 0.2], requires_grad=False, dtype=torch.float64) + w = torch.tensor([0.654, -0.734], requires_grad=True, dtype=torch.float64) + c = torch.tensor([-0.6543, 0.24, 0.54], requires_grad=True, dtype=torch.float64) + + # test output + res = circuit(d, w, c) + + expected = c[2] * torch.cos(d[1] + w[1]) - c[1] * torch.sin(d[0] + w[0]) * torch.sin( + d[1] + w[1] + ) + assert torch.allclose(res, expected) + spy.assert_not_called() + + # test gradients + res.backward() + grad = (w.grad, c.grad) + + expected_w = torch.tensor( + [ + -c[1] * torch.cos(d[0] + w[0]) * torch.sin(d[1] + w[1]), + -c[1] * torch.cos(d[1] + w[1]) * torch.sin(d[0] + w[0]) + - c[2] * torch.sin(d[1] + w[1]), + ] + ) + expected_c = torch.tensor( + [0, -torch.sin(d[0] + w[0]) * torch.sin(d[1] + w[1]), torch.cos(d[1] + w[1])] + ) + assert torch.allclose(grad[0], expected_w) + assert torch.allclose(grad[1], expected_c) + + # test second-order derivatives + if diff_method in ("parameter-shift", "backprop") and max_diff == 2: + hessians = torch.autograd.functional.hessian(circuit, (d, w, c)) + + grad2_c = hessians[2][2] + assert torch.allclose(grad2_c, torch.zeros([3, 3], dtype=torch.float64)) + + grad2_w_c = hessians[1][2] + expected = torch.tensor( + [ + [0, -torch.cos(d[0] + w[0]) * torch.sin(d[1] + w[1]), 0], + [ + 0, + -torch.cos(d[1] + w[1]) * torch.sin(d[0] + w[0]), + -torch.sin(d[1] + w[1]), + ], + ] + ) + assert torch.allclose(grad2_w_c, expected) + + @pytest.mark.parametrize("max_diff", [1, 2]) + def test_hamiltonian_expansion_finite_shots( + self, dev_name, 
diff_method, mode, max_diff, mocker + ): + """Test that the Hamiltonian is expanded if there + are non-commuting groups and the number of shots is finite + and the first and second order gradients are correctly evaluated""" + if diff_method in ("adjoint", "backprop", "finite-diff"): + pytest.skip("The adjoint and backprop methods do not yet support sampling") + + dev = qml.device(dev_name, wires=3, shots=50000) + spy = mocker.spy(qml.transforms, "hamiltonian_expand") + obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)] + + @qnode(dev, diff_method=diff_method, mode=mode, max_diff=max_diff, interface="torch") + def circuit(data, weights, coeffs): + weights = torch.reshape(weights, [1, -1]) + qml.templates.AngleEmbedding(data, wires=[0, 1]) + qml.templates.BasicEntanglerLayers(weights, wires=[0, 1]) + H = qml.Hamiltonian(coeffs, obs) + H.compute_grouping() + return qml.expval(H) + + d = torch.tensor([0.1, 0.2], requires_grad=False, dtype=torch.float64) + w = torch.tensor([0.654, -0.734], requires_grad=True, dtype=torch.float64) + c = torch.tensor([-0.6543, 0.24, 0.54], requires_grad=True, dtype=torch.float64) + + # test output + res = circuit(d, w, c) + + expected = c[2] * torch.cos(d[1] + w[1]) - c[1] * torch.sin(d[0] + w[0]) * torch.sin( + d[1] + w[1] + ) + assert torch.allclose(res, expected, atol=0.1) + spy.assert_called() + + # test gradients + res.backward() + grad = (w.grad, c.grad) + + expected_w = torch.tensor( + [ + -c[1] * torch.cos(d[0] + w[0]) * torch.sin(d[1] + w[1]), + -c[1] * torch.cos(d[1] + w[1]) * torch.sin(d[0] + w[0]) + - c[2] * torch.sin(d[1] + w[1]), + ] + ) + expected_c = torch.tensor( + [0, -torch.sin(d[0] + w[0]) * torch.sin(d[1] + w[1]), torch.cos(d[1] + w[1])] + ) + assert torch.allclose(grad[0], expected_w, atol=0.1) + assert torch.allclose(grad[1], expected_c, atol=0.1) + + # test second-order derivatives + if diff_method == "parameter-shift" and max_diff == 2: + hessians = torch.autograd.functional.hessian(circuit, (d, w, c)) + + grad2_c = hessians[2][2] + assert torch.allclose(grad2_c, torch.zeros([3, 3], dtype=torch.float64), atol=0.1) + + grad2_w_c = hessians[1][2] + expected = torch.tensor( + [ + [0, -torch.cos(d[0] + w[0]) * torch.sin(d[1] + w[1]), 0], + [ + 0, + -torch.cos(d[1] + w[1]) * torch.sin(d[0] + w[0]), + -torch.sin(d[1] + w[1]), + ], + ] + ) + assert torch.allclose(grad2_w_c, expected, atol=0.1)