diff --git a/pennylane/gradients/__init__.py b/pennylane/gradients/__init__.py index 3cbfaf03468..105201a1711 100644 --- a/pennylane/gradients/__init__.py +++ b/pennylane/gradients/__init__.py @@ -15,5 +15,7 @@ import pennylane as qml from . import finite_difference +from . import parameter_shift from .finite_difference import finite_diff, finite_diff_coeffs, generate_shifted_tapes +from .parameter_shift import param_shift diff --git a/pennylane/gradients/finite_difference.py b/pennylane/gradients/finite_difference.py index 557ed7035cc..cc91ba7fa80 100644 --- a/pennylane/gradients/finite_difference.py +++ b/pennylane/gradients/finite_difference.py @@ -16,12 +16,15 @@ of a quantum tape. """ # pylint: disable=protected-access,too-many-arguments +import functools + import numpy as np from scipy.special import factorial import pennylane as qml +@functools.lru_cache(maxsize=None) def finite_diff_coeffs(n, approx_order, strategy): r"""Generate the finite difference shift values and corresponding term coefficients for a given derivative order, approximation accuracy, diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py new file mode 100644 index 00000000000..38005ddc334 --- /dev/null +++ b/pennylane/gradients/parameter_shift.py @@ -0,0 +1,516 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains functions for computing the parameter-shift gradient +of a qubit-based quantum tape. +""" +# pylint: disable=protected-access,too-many-arguments +import numpy as np + +import pennylane as qml + +from .finite_difference import finite_diff, generate_shifted_tapes + + +NONINVOLUTORY_OBS = { + "Hermitian": lambda obs: obs.__class__(obs.matrix @ obs.matrix, wires=obs.wires), + "SparseHamiltonian": lambda obs: obs.__class__(obs.matrix @ obs.matrix, wires=obs.wires), + "Projector": lambda obs: obs, +} +"""Dict[str, callable]: mapping from a non-involutory observable name +to a callable that accepts an observable object, and returns the square +of that observable. +""" + + +def _square_observable(obs): + """Returns the square of an observable.""" + + if isinstance(obs, qml.operation.Tensor): + # Observable is a tensor, we must consider its + # component observables independently. Note that + # we assume all component observables are on distinct wires. + + components_squared = [] + + for comp in obs.obs: + + try: + components_squared.append(NONINVOLUTORY_OBS[comp.name](comp)) + except KeyError: + # component is involutory + pass + + return qml.operation.Tensor(*components_squared) + + return NONINVOLUTORY_OBS[obs.name](obs) + + +def _get_operation_recipe(tape, t_idx, shift=np.pi / 2): + """Utility function to return the parameter-shift rule + of the operation corresponding to trainable parameter + t_idx on tape. + + If the corresponding operation has grad_recipe=None, then + the default two-term parameter-shift rule is assumed. 
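+
+    As an illustrative sketch (an assumption based on the default two-term
+    rule documented in ``param_shift`` below): for an operation with two
+    distinct eigenvalues and the default ``shift=np.pi / 2``, the returned
+    recipe is expected to take the form
+    ``[[0.5, 1.0, np.pi / 2], [-0.5, 1.0, -np.pi / 2]]``,
+    i.e. one ``[coefficient, multiplier, shift]`` triple per term.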
+ """ + # get the index of the parameter in the tape + parameter_idx = list(tape.trainable_params)[t_idx] + + # get the corresponding operation + op = tape._par_info[parameter_idx]["op"] + + # get the corresponding operation parameter index + # (that is, index of the parameter within the operation) + op_p_idx = tape._par_info[parameter_idx]["p_idx"] + + # return the parameter-shift gradient for that + # operation parameter. + return op.get_parameter_shift(op_p_idx, shift=shift) + + +def _process_gradient_recipe(gradient_recipe, tol=1e-10): + """Utility function to process gradient recipes.""" + gradient_recipe = np.array(gradient_recipe).T + # remove all small coefficients, shifts, and multipliers + gradient_recipe[np.abs(gradient_recipe) < tol] = 0 + # remove columns where the coefficients are 0 + gradient_recipe = gradient_recipe[:, ~(gradient_recipe[0] == 0)] + # sort columns according to abs(shift) + return gradient_recipe[:, np.argsort(np.abs(gradient_recipe)[-1])] + + +def _gradient_analysis(tape, use_graph=True): + """Update the parameter information dictionary of the tape with + gradient information of each parameter.""" + + if getattr(tape, "_gradient_fn", None) is param_shift: + # gradient analysis has already been performed on this tape + return + + tape._gradient_fn = param_shift + + for idx, info in tape._par_info.items(): + + if idx not in tape.trainable_params: + info["grad_method"] = None + else: + op = tape._par_info[idx]["op"] + + if op.grad_method == "F": + info["grad_method"] = "F" + else: + info["grad_method"] = tape._grad_method( + idx, use_graph=use_graph, default_method="A" + ) + + +def expval_param_shift(tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, f0=None): + r"""Generate the parameter-shift tapes and postprocessing methods required + to compute the gradient of a gate parameter with respect to an + expectation value. + + Args: + tape (.QuantumTape): quantum tape to differentiate + argnum (int or list[int] or None): Trainable parameter indices to differentiate + with respect to. If not provided, the derivatives with respect to all + trainable indices are returned. + shift (float): The shift value to use for the two-term parameter-shift formula. + Only valid if the operation in question supports the two-term parameter-shift + rule (that is, it has two distinct eigenvalues) and ``gradient_recipes`` + is ``None``. + gradient_recipes (tuple(list[list[float]] or None)): List of gradient recipes + for the parameter-shift method. One gradient recipe must be provided + per trainable parameter. + f0 (tensor_like[float] or None): Output of the evaluated input tape. If provided, + and the gradient recipe contains an unshifted term, this value is used, + saving a quantum evaluation. + + Returns: + tuple[list[QuantumTape], function]: A tuple containing a + list of generated tapes, in addition to a post-processing + function to be applied to the results of the evaluated tapes. 
+ """ + argnum = argnum or tape.trainable_params + + gradient_tapes = [] + gradient_coeffs = [] + shapes = [] + unshifted_coeffs = [] + + for idx, _ in enumerate(tape.trainable_params): + + if idx not in argnum: + # parameter has zero gradient + shapes.append(0) + gradient_coeffs.append([]) + continue + + # get the gradient recipe for the trainable parameter + recipe = gradient_recipes[argnum.index(idx)] + recipe = recipe or _get_operation_recipe(tape, idx, shift=shift) + recipe = _process_gradient_recipe(recipe) + coeffs, multipliers, shifts = recipe + + if shifts[0] == 0 and multipliers[0] == 1: + # Gradient recipe includes a term with zero shift. + + if not unshifted_coeffs and f0 is None: + # Ensure that the unshifted tape is appended + # to the gradient tapes, if not already. + gradient_tapes.append(tape) + + # Store the unshifted coefficient. We know that + # it will always be the first coefficient due to processing. + unshifted_coeffs.append(coeffs[0]) + coeffs, multipliers, shifts = recipe[:, 1:] + + # generate the gradient tapes + gradient_coeffs.append(coeffs) + g_tapes = generate_shifted_tapes(tape, idx, shifts, multipliers) + + gradient_tapes.extend(g_tapes) + shapes.append(len(g_tapes)) + + def processing_fn(results): + grads = [] + start = 1 if unshifted_coeffs and f0 is None else 0 + r0 = f0 or results[0] + + for i, s in enumerate(shapes): + + if s == 0: + # parameter has zero gradient + g = qml.math.zeros_like(results[0]) + grads.append(g) + continue + + res = results[start : start + s] + start = start + s + + # compute the linear combination of results and coefficients + res = qml.math.stack(res) + g = sum([c * r for c, r in zip(gradient_coeffs[i], res)]) + + if unshifted_coeffs: + # add on the unshifted term + g = g + unshifted_coeffs[i] * r0 + + grads.append(g) + + # The following is for backwards compatibility; currently, + # the device stacks multiple measurement arrays, even if not the same + # size, resulting in a ragged array. + # In the future, we might want to change this so that only tuples + # of arrays are returned. + for i, g in enumerate(grads): + g = qml.math.convert_like(g, res[0]) + if hasattr(g, "dtype") and g.dtype is np.dtype("object"): + grads[i] = qml.math.hstack(g) + + return qml.math.T(qml.math.stack(grads)) + + return gradient_tapes, processing_fn + + +def var_param_shift(tape, argnum, shift=np.pi / 2, gradient_recipes=None, f0=None): + r"""Generate the parameter-shift tapes and postprocessing methods required + to compute the gradient of a gate parameter with respect to a + variance value. + + Args: + tape (.QuantumTape): quantum tape to differentiate + argnum (int or list[int] or None): Trainable parameter indices to differentiate + with respect to. If not provided, the derivative with respect to all + trainable indices are returned. + shift (float): The shift value to use for the two-term parameter-shift formula. + Only valid if the operation in question supports the two-term parameter-shift + rule (that is, it has two distinct eigenvalues) and ``gradient_recipes`` + is ``None``. + gradient_recipes (tuple(list[list[float]] or None)): List of gradient recipes + for the parameter-shift method. One gradient recipe must be provided + per trainable parameter. + f0 (tensor_like[float] or None): Output of the evaluated input tape. If provided, + and the gradient recipe contains an unshifted term, this value is used, + saving a quantum evaluation. 
+ + Returns: + tuple[list[QuantumTape], function]: A tuple containing a + list of generated tapes, in addition to a post-processing + function to be applied to the results of the evaluated tapes. + """ + argnum = argnum or tape.trainable_params + + # Determine the locations of any variance measurements in the measurement queue. + var_mask = [m.return_type is qml.operation.Variance for m in tape.measurements] + var_idx = np.where(var_mask)[0] + + # Get <A>, the expectation value of the tape with unshifted parameters. + expval_tape = tape.copy(copy_operations=True) + + # Convert all variance measurements on the tape into expectation values + for i in var_idx: + obs = expval_tape._measurements[i].obs + expval_tape._measurements[i] = qml.measure.MeasurementProcess( + qml.operation.Expectation, obs=obs + ) + + gradient_tapes = [expval_tape] + + # evaluate the analytic derivative of <A> + pdA_tapes, pdA_fn = expval_param_shift(expval_tape, argnum, shift, gradient_recipes, f0) + gradient_tapes.extend(pdA_tapes) + + # Store the number of first derivative tapes, so that we know + # the number of results to post-process later. + tape_boundary = len(pdA_tapes) + 1 + + # If there are non-involutory observables A present, we must compute d<A^2>/dp. + # Get the indices in the measurement queue of all non-involutory + # observables. + non_involutory = [] + + for i in var_idx: + obs_name = tape.observables[i].name + + if isinstance(obs_name, list): + # Observable is a tensor product, we must investigate all constituent observables. + if any(name in NONINVOLUTORY_OBS for name in obs_name): + non_involutory.append(i) + + elif obs_name in NONINVOLUTORY_OBS: + non_involutory.append(i) + + # For involutory observables (A^2 = I) we have d<A^2>/dp = 0. + involutory = set(var_idx) - set(non_involutory) + + if non_involutory: + expval_sq_tape = tape.copy(copy_operations=True) + + for i in non_involutory: + # We need to calculate d<A^2>/dp; to do so, we replace the + # non-involutory observables A in the queue with A^2. + obs = _square_observable(expval_sq_tape._measurements[i].obs) + expval_sq_tape._measurements[i] = qml.measure.MeasurementProcess( + qml.operation.Expectation, obs=obs + ) + + # Non-involutory observables are present; the partial derivative of + # <A^2> may be non-zero. Here, we calculate the analytic derivatives of the + # squared observables. + pdA2_tapes, pdA2_fn = expval_param_shift( + expval_sq_tape, argnum, shift, gradient_recipes, f0 + ) + gradient_tapes.extend(pdA2_tapes) + + def processing_fn(results): + # We need to expand the dimensions of the variance mask, + # and convert it to be the same type as the results. + mask = qml.math.convert_like(qml.math.reshape(var_mask, [-1, 1]), results[0]) + f0 = qml.math.expand_dims(results[0], -1) + + pdA = pdA_fn(results[1:tape_boundary]) + pdA2 = 0 + + if non_involutory: + # compute d<A^2>/dp for the non-involutory observables + pdA2 = pdA2_fn(results[tape_boundary:]) + + if involutory: + # if involutory observables are present, ensure they have zero gradient. + # + # For the pdA2_tapes, we have replaced non-involutory + # observables with their square (A -> A^2). However, + # involutory observables have been left as-is (A), and have + # not been replaced by their square (A^2 = I). As a result, + # components of the gradient vector will not be correct. We + # need to replace the gradient value with 0 (the known, + # correct gradient for involutory variables).
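+                #
+                # Concrete sketch: for measurements
+                # [var(PauliZ(0)), var(Hermitian(A, 1))], the mask below is
+                # m = [True, False], so the PauliZ row of pdA2 is zeroed out
+                # while the Hermitian row is kept.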
+ + m = [tape.observables[i].name not in NONINVOLUTORY_OBS for i in var_idx] + m = qml.math.convert_like(m, pdA2) + pdA2 = qml.math.where(qml.math.reshape(m, [-1, 1]), 0, pdA2) + + # return d(var(A))/dp = d<A^2>/dp - 2 * <A> * d<A>/dp for the variances (mask==True) + # and d<A>/dp for plain expectations (mask==False) + return qml.math.where(mask, pdA2 - 2 * f0 * pdA, pdA) + + return gradient_tapes, processing_fn + + +def param_shift( + tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None +): + r"""Generate the parameter-shift tapes and postprocessing methods required + to compute the gradient of an expectation value with respect to the + gate parameters. + + Args: + tape (.QuantumTape): quantum tape to differentiate + argnum (int or list[int] or None): Trainable parameter indices to differentiate + with respect to. If not provided, the derivatives with respect to all + trainable indices are returned. + shift (float): The shift value to use for the two-term parameter-shift formula. + Only valid if the operation in question supports the two-term parameter-shift + rule (that is, it has two distinct eigenvalues) and ``gradient_recipes`` + is ``None``. + gradient_recipes (tuple(list[list[float]] or None)): List of gradient recipes + for the parameter-shift method. One gradient recipe must be provided + per trainable parameter. + + This is a tuple with one nested list per parameter. For + parameter :math:`\phi_k`, the nested list contains elements of the form + :math:`[c_i, a_i, s_i]` where :math:`i` is the index of the + term, resulting in a gradient recipe of + + .. math:: \frac{\partial}{\partial\phi_k}f = \sum_{i} c_i f(a_i \phi_k + s_i). + + If ``None``, the default gradient recipe containing the two terms + :math:`[c_0, a_0, s_0]=[1/2, 1, \pi/2]` and :math:`[c_1, a_1, + s_1]=[-1/2, 1, -\pi/2]` is assumed for every parameter. + fallback_fn (None or Callable): a fallback gradient function to use for + any parameters that do not support the parameter-shift rule. + f0 (tensor_like[float] or None): Output of the evaluated input tape. If provided, + and the gradient recipe contains an unshifted term, this value is used, + saving a quantum evaluation. + + Returns: + tuple[list[QuantumTape], function]: A tuple containing a + list of generated tapes, in addition to a post-processing + function to be applied to the results of the evaluated tapes. + + For a variational evolution :math:`U(\mathbf{p}) \vert 0\rangle` with + :math:`N` parameters :math:`\mathbf{p}`, + consider the expectation value of an observable :math:`O`: + + .. math:: + + f(\mathbf{p}) = \langle \hat{O} \rangle(\mathbf{p}) = \langle 0 \vert + U(\mathbf{p})^\dagger \hat{O} U(\mathbf{p}) \vert 0\rangle. + + + The gradient of this expectation value can be calculated using :math:`2N` expectation + values via the parameter-shift rule: + + .. math:: + + \frac{\partial f}{\partial \mathbf{p}} = \frac{1}{2\sin s} \left[ f(\mathbf{p} + s) - + f(\mathbf{p} - s) \right]. + + **Gradients of variances** + + For a variational evolution :math:`U(\mathbf{p}) \vert 0\rangle` with + :math:`N` parameters :math:`\mathbf{p}`, + consider the variance of an observable :math:`O`: + + .. math:: + + g(\mathbf{p})=\langle \hat{O}^2 \rangle (\mathbf{p}) - [\langle \hat{O} + \rangle(\mathbf{p})]^2. + + We can relate this directly to the parameter-shift rule by noting that + + .. math::
+ + \frac{\partial g}{\partial \mathbf{p}} = \frac{\partial}{\partial + \mathbf{p}} \langle \hat{O}^2 \rangle (\mathbf{p}) + - 2 f(\mathbf{p}) \frac{\partial f}{\partial \mathbf{p}}. + + This results in :math:`4N + 1` evaluations. + + In the case where :math:`O` is involutory (:math:`\hat{O}^2 = I`), the first term in the above + expression vanishes, and we are simply left with + + .. math:: + + \frac{\partial g}{\partial \mathbf{p}} = - 2 f(\mathbf{p}) + \frac{\partial f}{\partial \mathbf{p}}, + + allowing us to compute the gradient using :math:`2N + 1` evaluations. + + **Example** + + >>> dev = qml.device("default.qubit", wires=1) + >>> params = np.array([0.1, 0.2, 0.3]) + >>> with qml.tape.JacobianTape() as tape: + ... qml.RX(params[0], wires=0) + ... qml.RY(params[1], wires=0) + ... qml.RX(params[2], wires=0) + ... qml.expval(qml.PauliZ(0)) + ... qml.var(qml.PauliZ(0)) + >>> tape.trainable_params = {0, 1, 2} + >>> gradient_tapes, fn = qml.gradients.param_shift(tape) + >>> res = dev.batch_execute(gradient_tapes) + >>> fn(res) + [[-0.38751721 -0.18884787 -0.38355704] + [ 0.69916862 0.34072424 0.69202359]] + """ + + # perform gradient method validation + if any(m.return_type is qml.operation.State for m in tape.measurements): + raise ValueError( + "Computing the gradient of circuits that return the state is not supported." + ) + + _gradient_analysis(tape) + gradient_tapes = [] + + # TODO: replace the JacobianTape._grad_method_validation + # functionality before deprecation. + method = "analytic" if fallback_fn is None else "best" + diff_methods = tape._grad_method_validation(method) + all_params_grad_method_zero = all(g == "0" for g in diff_methods) + if not tape.trainable_params or all_params_grad_method_zero: + return gradient_tapes, lambda _: np.zeros([tape.output_dim, len(tape.trainable_params)]) + + # TODO: replace the JacobianTape._choose_params_with_methods + # functionality before deprecation. + method_map = dict(tape._choose_params_with_methods(diff_methods, argnum)) + + # If there are unsupported operations, call the fallback gradient function + unsupported_params = {idx for idx, g in method_map.items() if g == "F"} + + if unsupported_params: + g_tapes, fallback_proc_fn = fallback_fn(tape, argnum=unsupported_params) + gradient_tapes.extend(g_tapes) + fallback_len = len(g_tapes) + + # remove finite difference parameters from the method map + method_map = {t_idx: dm for t_idx, dm in method_map.items() if dm != "F"} + + # Generate parameter-shift gradient tapes + argnum = [i for i, dm in method_map.items() if dm == "A"] + + if gradient_recipes is None: + gradient_recipes = [None] * len(argnum) + + if any(m.return_type is qml.operation.Variance for m in tape.measurements): + g_tapes, fn = var_param_shift(tape, argnum, shift, gradient_recipes, f0) + else: + g_tapes, fn = expval_param_shift(tape, argnum, shift, gradient_recipes, f0) + + gradient_tapes.extend(g_tapes) + + if unsupported_params: + # If there are unsupported parameters, we must process + # the quantum results separately, once for the fallback + # function and once for the parameter-shift rule, and recombine.
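+        # Sketch of why simple addition suffices below: each method returns a
+        # full-width Jacobian that is zero in the columns of the parameters it
+        # did not differentiate, so the elementwise sum recombines the two
+        # disjoint Jacobians into one.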
+ + def processing_fn(results): + unsupported_grads = fallback_proc_fn(results[:fallback_len]) + supported_grads = fn(results[fallback_len:]) + return unsupported_grads + supported_grads + + return gradient_tapes, processing_fn + + return gradient_tapes, fn diff --git a/tests/gradients/test_parameter_shift.py b/tests/gradients/test_parameter_shift.py new file mode 100644 index 00000000000..ad2cc2bd009 --- /dev/null +++ b/tests/gradients/test_parameter_shift.py @@ -0,0 +1,916 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for the gradients.parameter_shift module.""" +import pytest + +import pennylane as qml +from pennylane import numpy as np +from pennylane.gradients import param_shift +from pennylane.gradients.parameter_shift import _gradient_analysis + + +class TestGradAnalysis: + """Tests for parameter gradient methods""" + + def test_non_differentiable(self): + """Test that a non-differentiable parameter is + correctly marked""" + psi = np.array([1, 0, 1, 0]) / np.sqrt(2) + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(psi, wires=[0, 1]) + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=[0, 1]) + + _gradient_analysis(tape) + + assert tape._par_info[0]["grad_method"] is None + assert tape._par_info[1]["grad_method"] == "A" + assert tape._par_info[2]["grad_method"] == "A" + + def test_analysis_caching(self, mocker): + """Test that the gradient analysis is only executed once per tape""" + psi = np.array([1, 0, 1, 0]) / np.sqrt(2) + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(psi, wires=[0, 1]) + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=[0, 1]) + + spy = mocker.spy(tape, "_grad_method") + _gradient_analysis(tape) + spy.assert_called() + + assert tape._par_info[0]["grad_method"] is None + assert tape._par_info[1]["grad_method"] == "A" + assert tape._par_info[2]["grad_method"] == "A" + + spy = mocker.spy(tape, "_grad_method") + _gradient_analysis(tape) + spy.assert_not_called() + + def test_independent(self): + """Test that an independent variable is properly marked + as having a zero gradient""" + + with qml.tape.JacobianTape() as tape: + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.expval(qml.PauliY(0)) + + _gradient_analysis(tape) + + assert tape._par_info[0]["grad_method"] == "A" + assert tape._par_info[1]["grad_method"] == "0" + + def test_independent_no_graph_mode(self): + """In non-graph mode, it is impossible to determine + if a parameter is independent or not""" + + with qml.tape.JacobianTape() as tape: + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.expval(qml.PauliY(0)) + + _gradient_analysis(tape, use_graph=False) + + assert tape._par_info[0]["grad_method"] == "A" + assert tape._par_info[1]["grad_method"] == "A" + + def test_finite_diff(self, monkeypatch): + """If an op has grad_method=F, this should be respected""" + monkeypatch.setattr(qml.RX, 
"grad_method", "F") + + psi = np.array([1, 0, 1, 0]) / np.sqrt(2) + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(psi, wires=[0, 1]) + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=[0, 1]) + + _gradient_analysis(tape) + + assert tape._par_info[0]["grad_method"] is None + assert tape._par_info[1]["grad_method"] == "F" + assert tape._par_info[2]["grad_method"] == "A" + + +def grad_fn(tape, dev, fn=qml.gradients.param_shift, **kwargs): + """Utility function to automate execution and processing of gradient tapes""" + tapes, fn = fn(tape, **kwargs) + return fn(dev.batch_execute(tapes)) + + +class TestShiftedTapes: + """Tests for the generation of shifted tapes""" + + def test_behaviour(self): + """Test that the function behaves as expected""" + + with qml.tape.JacobianTape() as tape: + qml.PauliZ(0) + qml.RX(1.0, wires=0) + qml.CNOT(wires=[0, 2]) + qml.Rot(2.0, 3.0, 4.0, wires=0) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {0, 2} + gradient_recipes = [[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], [[1, 1, 1], [2, 2, 2], [3, 3, 3]]] + tapes, _ = qml.gradients.param_shift(tape, gradient_recipes=gradient_recipes) + + assert len(tapes) == 5 + assert tapes[0].get_parameters(trainable_only=False) == [0.2 * 1.0 + 0.3, 2.0, 3.0, 4.0] + assert tapes[1].get_parameters(trainable_only=False) == [0.5 * 1.0 + 0.6, 2.0, 3.0, 4.0] + assert tapes[2].get_parameters(trainable_only=False) == [1.0, 2.0, 1 * 3.0 + 1, 4.0] + assert tapes[3].get_parameters(trainable_only=False) == [1.0, 2.0, 2 * 3.0 + 2, 4.0] + assert tapes[4].get_parameters(trainable_only=False) == [1.0, 2.0, 3 * 3.0 + 3, 4.0] + + +class TestParamShift: + """Unit tests for the param_shift function""" + + def test_state_non_differentiable_error(self): + """Test error raised if attempting to differentiate with + respect to a state""" + psi = np.array([1, 0, 1, 0]) / np.sqrt(2) + + with qml.tape.JacobianTape() as tape: + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.state() + + with pytest.raises(ValueError, match=r"return the state is not supported"): + qml.gradients.param_shift(tape) + + def test_independent_parameter(self, mocker): + """Test that an independent parameter is skipped + during the Jacobian computation.""" + spy = mocker.spy(qml.gradients.parameter_shift, "expval_param_shift") + + with qml.tape.JacobianTape() as tape: + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.expval(qml.PauliZ(0)) + + dev = qml.device("default.qubit", wires=2) + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 2 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (1, 2) + + # only called for parameter 0 + assert spy.call_args[0][0:2] == (tape, [0]) + + def test_no_trainable_parameters(self): + """Test that if the tape has no trainable parameters, no + subroutines are called and the returned Jacobian is empty""" + with qml.tape.JacobianTape() as tape: + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[1]) + qml.expval(qml.PauliZ(0)) + + dev = qml.device("default.qubit", wires=2) + tape.trainable_params = {} + + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 0 + + res = fn(dev.batch_execute(tapes)) + assert res.size == 0 + assert np.all(res == np.array([[]])) + + def test_y0(self, mocker): + """Test that if the gradient recipe has a zero-shift component, then + the tape is executed only once using the current parameter + values.""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.JacobianTape() as tape: + 
qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[0]) + qml.expval(qml.PauliZ(0)) + + gradient_recipes = [[[-1e7, 1, 0], [1e7, 1, 1e7]], [[-1e7, 1, 0], [1e7, 1, 1e7]]] + tapes, fn = qml.gradients.param_shift(tape, gradient_recipes=gradient_recipes) + + # one tape per parameter, plus one global call + assert len(tapes) == tape.num_params + 1 + + def test_y0_provided(self, mocker): + """Test that if the original tape output is provided, then + the tape is executed only once using the current parameter + values.""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.JacobianTape() as tape: + qml.RX(0.543, wires=[0]) + qml.RY(-0.654, wires=[0]) + qml.expval(qml.PauliZ(0)) + + gradient_recipes = [[[-1e7, 1, 0], [1e7, 1, 1e7]], [[-1e7, 1, 0], [1e7, 1, 1e7]]] + f0 = dev.execute(tape) + tapes, fn = qml.gradients.param_shift(tape, gradient_recipes=gradient_recipes, f0=f0) + + # one tape per parameter; no extra unshifted tape, since f0 is provided + assert len(tapes) == tape.num_params + + fn(dev.batch_execute(tapes)) + + def test_independent_parameters_analytic(self): + """Test the case where expectation values are independent of some parameters. For those + parameters, the gradient should be evaluated to zero without executing the device.""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.JacobianTape() as tape1: + qml.RX(1, wires=[0]) + qml.RX(1, wires=[1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(1, wires=[0]) + qml.RX(1, wires=[1]) + qml.expval(qml.PauliZ(1)) + + tapes, fn = qml.gradients.param_shift(tape1) + j1 = fn(dev.batch_execute(tapes)) + + # We should only be executing the device to differentiate 1 parameter (2 executions) + assert dev.num_executions == 2 + + tapes, fn = qml.gradients.param_shift(tape2) + j2 = fn(dev.batch_execute(tapes)) + + exp = -np.sin(1) + + assert np.allclose(j1, [exp, 0]) + assert np.allclose(j2, [0, exp]) + + +class TestParameterShiftRule: + """Tests for the parameter shift implementation""" + + @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7)) + @pytest.mark.parametrize("shift", [np.pi / 2, 0.3, np.sqrt(2)]) + @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ, qml.PhaseShift]) + def test_pauli_rotation_gradient(self, mocker, G, theta, shift, tol): + """Tests that the automatic gradients of Pauli rotations are correct.""" + spy = mocker.spy(qml.gradients.parameter_shift, "_get_operation_recipe") + dev = qml.device("default.qubit", wires=1) + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0) + G(theta, wires=[0]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {1} + + tapes, fn = qml.gradients.param_shift(tape, shift=shift) + assert len(tapes) == 2 + + autograd_val = fn(dev.batch_execute(tapes)) + manualgrad_val = ( + tape.execute(dev, params=[theta + np.pi / 2]) + - tape.execute(dev, params=[theta - np.pi / 2]) + ) / 2 + assert np.allclose(autograd_val, manualgrad_val, atol=tol, rtol=0) + + assert spy.call_args[1]["shift"] == shift + + # compare to finite differences + tapes, fn = qml.gradients.finite_diff(tape) + numeric_val = fn(dev.batch_execute(tapes)) + assert np.allclose(autograd_val, numeric_val, atol=tol, rtol=0) + + @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7)) + @pytest.mark.parametrize("shift", [np.pi / 2, 0.3, np.sqrt(2)]) + def test_Rot_gradient(self, mocker, theta, shift, tol): + """Tests that the automatic gradient of an arbitrary Euler-angle-parameterized gate is correct.""" + spy =
mocker.spy(qml.gradients.parameter_shift, "_get_operation_recipe") + dev = qml.device("default.qubit", wires=1) + params = np.array([theta, theta ** 3, np.sqrt(2) * theta]) + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0) + qml.Rot(*params, wires=[0]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {1, 2, 3} + + tapes, fn = qml.gradients.param_shift(tape, shift=shift) + assert len(tapes) == 2 * len(tape.trainable_params) + + autograd_val = fn(dev.batch_execute(tapes)) + manualgrad_val = np.zeros_like(autograd_val) + + for idx in list(np.ndindex(*params.shape)): + s = np.zeros_like(params) + s[idx] += np.pi / 2 + + forward = tape.execute(dev, params=params + s) + backward = tape.execute(dev, params=params - s) + + manualgrad_val[0, idx] = (forward - backward) / 2 + + assert np.allclose(autograd_val, manualgrad_val, atol=tol, rtol=0) + assert spy.call_args[1]["shift"] == shift + + # compare to finite differences + tapes, fn = qml.gradients.finite_diff(tape) + numeric_val = fn(dev.batch_execute(tapes)) + assert np.allclose(autograd_val, numeric_val, atol=tol, rtol=0) + + @pytest.mark.parametrize("G", [qml.CRX, qml.CRY, qml.CRZ]) + def test_controlled_rotation_gradient(self, G, tol): + """Test gradient of controlled rotation gates""" + dev = qml.device("default.qubit", wires=2) + b = 0.123 + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0) + G(b, wires=[0, 1]) + qml.expval(qml.PauliX(0)) + + tape.trainable_params = {1} + + res = tape.execute(dev) + assert np.allclose(res, -np.cos(b / 2), atol=tol, rtol=0) + + tapes, fn = qml.gradients.param_shift(tape) + grad = fn(dev.batch_execute(tapes)) + expected = np.sin(b / 2) / 2 + assert np.allclose(grad, expected, atol=tol, rtol=0) + + # compare to finite differences + tapes, fn = qml.gradients.finite_diff(tape) + numeric_val = fn(dev.batch_execute(tapes)) + assert np.allclose(grad, numeric_val, atol=tol, rtol=0) + + @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, np.pi, 7)) + def test_CRot_gradient(self, theta, tol): + """Tests that the automatic gradient of an arbitrary controlled Euler-angle-parameterized + gate is correct.""" + dev = qml.device("default.qubit", wires=2) + a, b, c = np.array([theta, theta ** 3, np.sqrt(2) * theta]) + + with qml.tape.JacobianTape() as tape: + qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0) + qml.CRot(a, b, c, wires=[0, 1]) + qml.expval(qml.PauliX(0)) + + tape.trainable_params = {1, 2, 3} + + res = tape.execute(dev) + expected = -np.cos(b / 2) * np.cos(0.5 * (a + c)) + assert np.allclose(res, expected, atol=tol, rtol=0) + + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 4 * len(tape.trainable_params) + + grad = fn(dev.batch_execute(tapes)) + expected = np.array( + [ + [ + 0.5 * np.cos(b / 2) * np.sin(0.5 * (a + c)), + 0.5 * np.sin(b / 2) * np.cos(0.5 * (a + c)), + 0.5 * np.cos(b / 2) * np.sin(0.5 * (a + c)), + ] + ] + ) + assert np.allclose(grad, expected, atol=tol, rtol=0) + + # compare to finite differences + tapes, fn = qml.gradients.finite_diff(tape) + numeric_val = fn(dev.batch_execute(tapes)) + assert np.allclose(grad, numeric_val, atol=tol, rtol=0) + + def test_gradients_agree_finite_differences(self, tol): + """Tests that the parameter-shift rule agrees with the first and second + order finite differences""" + params = np.array([0.1, -1.6, np.pi / 5]) + + with qml.tape.JacobianTape() as tape: + qml.RX(params[0], wires=[0]) + 
qml.CNOT(wires=[0, 1]) + qml.RY(-1.6, wires=[0]) + qml.RY(params[1], wires=[1]) + qml.CNOT(wires=[1, 0]) + qml.RX(params[2], wires=[0]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {0, 2, 3} + dev = qml.device("default.qubit", wires=2) + + grad_F1 = grad_fn(tape, dev, fn=qml.gradients.finite_diff, approx_order=1) + grad_F2 = grad_fn( + tape, dev, fn=qml.gradients.finite_diff, approx_order=2, strategy="center" + ) + grad_A = grad_fn(tape, dev) + + # gradients computed with different methods must agree + assert np.allclose(grad_A, grad_F1, atol=tol, rtol=0) + assert np.allclose(grad_A, grad_F2, atol=tol, rtol=0) + + def test_variance_gradients_agree_finite_differences(self, tol): + """Tests that the variance parameter-shift rule agrees with the first and second + order finite differences""" + params = np.array([0.1, -1.6, np.pi / 5]) + + with qml.tape.JacobianTape() as tape: + qml.RX(params[0], wires=[0]) + qml.CNOT(wires=[0, 1]) + qml.RY(-1.6, wires=[0]) + qml.RY(params[1], wires=[1]) + qml.CNOT(wires=[1, 0]) + qml.RX(params[2], wires=[0]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)), qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + tape.trainable_params = {0, 2, 3} + dev = qml.device("default.qubit", wires=2) + + grad_F1 = grad_fn(tape, dev, fn=qml.gradients.finite_diff, approx_order=1) + grad_F2 = grad_fn( + tape, dev, fn=qml.gradients.finite_diff, approx_order=2, strategy="center" + ) + grad_A = grad_fn(tape, dev) + + # gradients computed with different methods must agree + assert np.allclose(grad_A, grad_F1, atol=tol, rtol=0) + assert np.allclose(grad_A, grad_F2, atol=tol, rtol=0) + + def test_fallback(self, mocker, tol): + """Test that fallback gradient functions are correctly used""" + spy = mocker.spy(qml.gradients, "finite_diff") + dev = qml.device("default.qubit.autograd", wires=2) + x = 0.543 + y = -0.654 + + params = np.array([x, y], requires_grad=True) + + class RY(qml.RY): + grad_method = "F" + + def cost_fn(params): + with qml.tape.JacobianTape() as tape: + qml.RX(params[0], wires=[0]) + RY(params[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.var(qml.PauliX(1)) + + tapes, fn = param_shift(tape, fallback_fn=qml.gradients.finite_diff) + assert len(tapes) == 5 + + # check that the fallback method was called for the specified argnums + spy.assert_called() + assert spy.call_args[1]["argnum"] == {1} + + return fn(dev.batch_execute(tapes)) + + res = cost_fn(params) + assert res.shape == (2, 2) + + expected = np.array([[-np.sin(x), 0], [0, -2 * np.cos(y) * np.sin(y)]]) + assert np.allclose(res, expected, atol=tol, rtol=0) + + # double check the derivative + jac = qml.jacobian(cost_fn)(params) + assert np.allclose(jac[0, 0, 0], -np.cos(x), atol=tol, rtol=0) + assert np.allclose(jac[1, 1, 1], -2 * np.cos(2 * y), atol=tol, rtol=0) + + def test_single_expectation_value(self, tol): + """Tests correct output shape and evaluation for a tape + with a single expval output""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 4 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (1, 2) + + expected = np.array([[-np.sin(y) * np.sin(x), np.cos(y) * np.cos(x)]]) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_multiple_expectation_values(self, tol): + """Tests 
correct output shape and evaluation for a tape + with multiple expval outputs""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.expval(qml.PauliX(1)) + + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 4 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (2, 2) + + expected = np.array([[-np.sin(x), 0], [0, np.cos(y)]]) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_var_expectation_values(self, tol): + """Tests correct output shape and evaluation for a tape + with expval and var outputs""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.var(qml.PauliX(1)) + + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 5 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (2, 2) + + expected = np.array([[-np.sin(x), 0], [0, -2 * np.cos(y) * np.sin(y)]]) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_prob_expectation_values(self, tol): + """Tests correct output shape and evaluation for a tape + with prob and expval outputs""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.probs(wires=[0, 1]) + + tapes, fn = qml.gradients.param_shift(tape) + assert len(tapes) == 4 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (5, 2) + + expected = ( + np.array( + [ + [-2 * np.sin(x), 0], + [ + -(np.cos(y / 2) ** 2 * np.sin(x)), + -(np.cos(x / 2) ** 2 * np.sin(y)), + ], + [ + -(np.sin(x) * np.sin(y / 2) ** 2), + (np.cos(x / 2) ** 2 * np.sin(y)), + ], + [ + (np.sin(x) * np.sin(y / 2) ** 2), + (np.sin(x / 2) ** 2 * np.sin(y)), + ], + [ + (np.cos(y / 2) ** 2 * np.sin(x)), + -(np.sin(x / 2) ** 2 * np.sin(y)), + ], + ] + ) + / 2 + ) + + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_involutory_variance(self, tol): + """Tests qubit observables that are involutory""" + dev = qml.device("default.qubit", wires=1) + a = 0.54 + + with qml.tape.JacobianTape() as tape: + qml.RX(a, wires=0) + qml.var(qml.PauliZ(0)) + + res = tape.execute(dev) + expected = 1 - np.cos(a) ** 2 + assert np.allclose(res, expected, atol=tol, rtol=0) + + # circuit jacobians + tapes, fn = qml.gradients.param_shift(tape) + gradA = fn(dev.batch_execute(tapes)) + assert len(tapes) == 1 + 2 * 1 + + tapes, fn = qml.gradients.finite_diff(tape) + gradF = fn(dev.batch_execute(tapes)) + assert len(tapes) == 2 + + expected = 2 * np.sin(a) * np.cos(a) + + assert gradF == pytest.approx(expected, abs=tol) + assert gradA == pytest.approx(expected, abs=tol) + + def test_non_involutory_variance(self, tol): + """Tests a qubit Hermitian observable that is not involutory""" + dev = qml.device("default.qubit", wires=1) + A = np.array([[4, -1 + 6j], [-1 - 6j, 2]]) + a = 0.54 + + with qml.tape.JacobianTape() as tape: + qml.RX(a, wires=0) + qml.var(qml.Hermitian(A, 0)) + + tape.trainable_params = {0} + + res = tape.execute(dev) + expected = (39 / 2) - 6 * np.sin(2 * a) + (35 / 2) * np.cos(2 * a) + assert np.allclose(res, expected, atol=tol, rtol=0) + + # circuit jacobians + tapes, fn = qml.gradients.param_shift(tape) + gradA = fn(dev.batch_execute(tapes)) + 
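+        # Expected tape count: 1 unshifted evaluation, plus 2 shifted tapes
+        # for d<A>/dp and 2 more for d<A^2>/dp (Hermitian is non-involutory).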
assert len(tapes) == 1 + 4 * 1 + + tapes, fn = qml.gradients.finite_diff(tape) + gradF = fn(dev.batch_execute(tapes)) + assert len(tapes) == 2 + + expected = -35 * np.sin(2 * a) - 12 * np.cos(2 * a) + assert gradA == pytest.approx(expected, abs=tol) + assert gradF == pytest.approx(expected, abs=tol) + + def test_involutory_and_noninvolutory_variance(self, tol): + """Tests a qubit Hermitian observable that is not involutory alongside + an involutory observable.""" + dev = qml.device("default.qubit", wires=2) + A = np.array([[4, -1 + 6j], [-1 - 6j, 2]]) + a = 0.54 + + with qml.tape.JacobianTape() as tape: + qml.RX(a, wires=0) + qml.RX(a, wires=1) + qml.var(qml.PauliZ(0)) + qml.var(qml.Hermitian(A, 1)) + + tape.trainable_params = {0, 1} + + res = tape.execute(dev) + expected = [1 - np.cos(a) ** 2, (39 / 2) - 6 * np.sin(2 * a) + (35 / 2) * np.cos(2 * a)] + assert np.allclose(res, expected, atol=tol, rtol=0) + + # circuit jacobians + tapes, fn = qml.gradients.param_shift(tape) + gradA = fn(dev.batch_execute(tapes)) + assert len(tapes) == 1 + 2 * 4 + + tapes, fn = qml.gradients.finite_diff(tape) + gradF = fn(dev.batch_execute(tapes)) + assert len(tapes) == 1 + 2 + + expected = [2 * np.sin(a) * np.cos(a), -35 * np.sin(2 * a) - 12 * np.cos(2 * a)] + assert np.diag(gradA) == pytest.approx(expected, abs=tol) + assert np.diag(gradF) == pytest.approx(expected, abs=tol) + + def test_expval_and_variance(self, tol): + """Test that the qnode works for a combination of expectation + values and variances""" + dev = qml.device("default.qubit", wires=3) + + a = 0.54 + b = -0.423 + c = 0.123 + + with qml.tape.JacobianTape() as tape: + qml.RX(a, wires=0) + qml.RY(b, wires=1) + qml.CNOT(wires=[1, 2]) + qml.RX(c, wires=2) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0)) + qml.expval(qml.PauliZ(1)) + qml.var(qml.PauliZ(2)) + + res = tape.execute(dev) + expected = np.array( + [ + np.sin(a) ** 2, + np.cos(a) * np.cos(b), + 0.25 * (3 - 2 * np.cos(b) ** 2 * np.cos(2 * c) - np.cos(2 * b)), + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + # # circuit jacobians + tapes, fn = qml.gradients.param_shift(tape) + gradA = fn(dev.batch_execute(tapes)) + + tapes, fn = qml.gradients.finite_diff(tape) + gradF = fn(dev.batch_execute(tapes)) + + expected = np.array( + [ + [2 * np.cos(a) * np.sin(a), -np.cos(b) * np.sin(a), 0], + [ + 0, + -np.cos(a) * np.sin(b), + 0.5 * (2 * np.cos(b) * np.cos(2 * c) * np.sin(b) + np.sin(2 * b)), + ], + [0, 0, np.cos(b) ** 2 * np.sin(2 * c)], + ] + ).T + assert gradA == pytest.approx(expected, abs=tol) + assert gradF == pytest.approx(expected, abs=tol) + + def test_projector_variance(self, tol): + """Test that the variance of a projector is correctly returned""" + dev = qml.device("default.qubit", wires=2) + P = np.array([1]) + x, y = 0.765, -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=0) + qml.RY(y, wires=1) + qml.CNOT(wires=[0, 1]) + qml.var(qml.Projector(P, wires=0) @ qml.PauliX(1)) + + tape.trainable_params = {0, 1} + + res = tape.execute(dev) + expected = 0.25 * np.sin(x / 2) ** 2 * (3 + np.cos(2 * y) + 2 * np.cos(x) * np.sin(y) ** 2) + assert np.allclose(res, expected, atol=tol, rtol=0) + + # # circuit jacobians + tapes, fn = qml.gradients.param_shift(tape) + gradA = fn(dev.batch_execute(tapes)) + + tapes, fn = qml.gradients.finite_diff(tape) + gradF = fn(dev.batch_execute(tapes)) + + expected = np.array( + [ + [ + 0.5 * np.sin(x) * (np.cos(x / 2) ** 2 + np.cos(2 * y) * np.sin(x / 2) ** 2), + -2 * np.cos(y) * np.sin(x / 2) ** 4 * np.sin(y), + ] + ] + ) + 
assert gradA == pytest.approx(expected, abs=tol) + assert gradF == pytest.approx(expected, abs=tol) + + +class TestParamShiftGradients: + """Test that the transform is differentiable""" + + def test_autograd(self, tol): + """Tests that the output of the parameter-shift transform + can be differentiated using autograd, yielding second derivatives.""" + dev = qml.device("default.qubit.autograd", wires=2) + params = np.array([0.543, -0.654], requires_grad=True) + + def cost_fn(x): + with qml.tape.JacobianTape() as tape: + qml.RX(x[0], wires=[0]) + qml.RY(x[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.param_shift(tape) + jac = fn(dev.batch_execute(tapes)) + return jac + + res = qml.jacobian(cost_fn)(params) + x, y = params + expected = np.array( + [ + [2 * np.cos(2 * x) * np.sin(y) ** 2, np.sin(2 * x) * np.sin(2 * y)], + [np.sin(2 * x) * np.sin(2 * y), -2 * np.cos(x) ** 2 * np.cos(2 * y)], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_tf(self, tol): + """Tests that the output of the finite-difference transform + can be differentiated using TF, yielding second derivatives.""" + tf = pytest.importorskip("tensorflow") + + dev = qml.device("default.qubit.tf", wires=2) + params = tf.Variable([0.543, -0.654], dtype=tf.float64) + + with tf.GradientTape() as t: + with qml.tape.JacobianTape() as tape: + qml.RX(params[0], wires=[0]) + qml.RY(params[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.param_shift(tape) + jac = fn(dev.batch_execute(tapes)) + + x, y = 1.0 * params + + expected = np.array([np.sin(2 * x) * np.sin(y) ** 2, -np.cos(x) ** 2 * np.sin(2 * y)]) + assert np.allclose(jac, expected, atol=tol, rtol=0) + + res = t.jacobian(jac, params) + expected = np.array( + [ + [2 * np.cos(2 * x) * np.sin(y) ** 2, np.sin(2 * x) * np.sin(2 * y)], + [np.sin(2 * x) * np.sin(2 * y), -2 * np.cos(x) ** 2 * np.cos(2 * y)], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_torch(self, tol): + """Tests that the output of the finite-difference transform + can be differentiated using Torch, yielding second derivatives.""" + torch = pytest.importorskip("torch") + from pennylane.interfaces.torch import TorchInterface + + dev = qml.device("default.qubit", wires=2) + params = torch.tensor([0.543, -0.654], dtype=torch.float64, requires_grad=True) + + with TorchInterface.apply(qml.tape.QubitParamShiftTape()) as tape: + qml.RX(params[0], wires=[0]) + qml.RY(params[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + tapes, fn = qml.gradients.param_shift(tape) + jac = fn([t.execute(dev) for t in tapes]) + cost = jac[0, 0] + cost.backward() + hess = params.grad + + x, y = params.detach().numpy() + + expected = np.array([np.sin(2 * x) * np.sin(y) ** 2, -np.cos(x) ** 2 * np.sin(2 * y)]) + assert np.allclose(jac.detach().numpy(), expected, atol=tol, rtol=0) + + expected = np.array([2 * np.cos(2 * x) * np.sin(y) ** 2, np.sin(2 * x) * np.sin(2 * y)]) + assert np.allclose(hess.detach().numpy(), expected, atol=0.1, rtol=0) + + def test_jax(self, tol): + """Tests that the output of the finite-difference transform + can be differentiated using JAX, yielding second derivatives.""" + jax = pytest.importorskip("jax") + from jax import numpy as jnp + from pennylane.interfaces.jax import JAXInterface + from jax.config import config + + 
config.update("jax_enable_x64", True) + + dev = qml.device("default.qubit", wires=2) + params = jnp.array([0.543, -0.654]) + + def cost_fn(x): + with JAXInterface.apply(qml.tape.QubitParamShiftTape()) as tape: + qml.RX(x[0], wires=[0]) + qml.RY(x[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.param_shift(tape) + jac = fn([t.execute(dev) for t in tapes]) + return jac + + res = jax.jacobian(cost_fn)(params) + x, y = params + expected = np.array( + [ + [2 * np.cos(2 * x) * np.sin(y) ** 2, np.sin(2 * x) * np.sin(2 * y)], + [np.sin(2 * x) * np.sin(2 * y), -2 * np.cos(x) ** 2 * np.cos(2 * y)], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0)
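+
+
+# A minimal usage sketch appended for illustration (not part of the original
+# suite); it relies only on the public ``qml.gradients.param_shift`` API
+# exercised above, and checks the textbook two-term rule result for RX.
+def test_param_shift_rx_usage_sketch():
+    """Sketch: d<Z>/dx for RX(x) should equal -sin(x)."""
+    dev = qml.device("default.qubit", wires=1)
+    x = 0.654
+
+    with qml.tape.JacobianTape() as tape:
+        qml.RX(x, wires=0)
+        qml.expval(qml.PauliZ(0))
+
+    tapes, fn = qml.gradients.param_shift(tape)
+    assert len(tapes) == 2  # two-term rule: one tape per shift
+
+    res = fn(dev.batch_execute(tapes))
+    assert np.allclose(res, -np.sin(x))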