From 0c579193e9be915844b5485c738236a44b83131b Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 4 Aug 2021 22:38:44 +0800 Subject: [PATCH 01/45] Added differentiable VJP transform --- pennylane/gradients/__init__.py | 1 + pennylane/gradients/vjp.py | 234 ++++++++++++++++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 pennylane/gradients/vjp.py diff --git a/pennylane/gradients/__init__.py b/pennylane/gradients/__init__.py index 105201a1711..1863ff8d784 100644 --- a/pennylane/gradients/__init__.py +++ b/pennylane/gradients/__init__.py @@ -19,3 +19,4 @@ from .finite_difference import finite_diff, finite_diff_coeffs, generate_shifted_tapes from .parameter_shift import param_shift +from .vjp import batch_vjp, vjp diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py new file mode 100644 index 00000000000..f5157fce6a8 --- /dev/null +++ b/pennylane/gradients/vjp.py @@ -0,0 +1,234 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains functions for computing the vector-Jacobian product +of a batch of tapes. +""" +from pennylane import math + + +def _vector_jacobian_product(dy, jac): + """Compute the vector-Jacobian product for a given + vector of gradient outputs dy and a Jacobian Jac""" + dy_row = math.reshape(dy, [-1]) + jac = math.transpose(math.stack(jac)) + num_params = len(jac) + jac = math.reshape(jac, [-1, num_params]) + return math.tensordot(jac, dy_row, [[0], [0]]) + + +def vjp(tape, dy, gradient_fn): + """Generate the gradient tapes and processing function required to compute + the vector-Jacobian products of a tape. + + Args: + tape (.QuantumTape): quantum tape to differentiate + dy (tensor_like): Gradient-output vector`. Must have shape + matching the output shape of the corresponding tape. + gradient_fn (callable): the gradient transform to use to differentiate + the tape + + Returns: + tensor_like or None: Vector-Jacobian product. Returns None if the tape + has no trainable parameters. + + **Example** + + Consider the following Torch-compatible quantum tape: + + .. code-block:: python + + import torch + from pennylane.interfaces.torch import TorchInterface + + x = torch.tensor([[0.1, 0.2, 0.3], + [0.4, 0.5, 0.6]], requires_grad=True, dtype=torch.float64) + + with TorchInterface.apply(qml.tape.JacobianTape()) as tape: + qml.RX(x[0, 0], wires=0) + qml.RY(x[0, 1], wires=1) + qml.RZ(x[0, 2], wires=0) + qml.CNOT(wires=[0, 1]) + qml.RX(x[1, 0], wires=1) + qml.RY(x[1, 1], wires=0) + qml.RZ(x[1, 2], wires=1) + qml.expval(qml.PauliZ(0)) + qml.probs(wires=1) + + We can use the ``vjp`` function to compute the vector-Jacobian product, + given a gradient-output vector ``dy``: + + >>> dy = torch.tensor([1., 1., 1.], dtype=torch.float64) + >>> vjp_tapes, fn = qml.gradients.vjp(tape, dy, qml.gradients.param_shift) + + Note that ``dy`` has shape ``(3,)``, matching the output dimension of the tape + (1 expectation and 2 probability values). 
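    Under the hood, the processing function returned here reduces to a single
    tensor contraction of ``dy`` against the output axis of the tape Jacobian,
    giving one entry per trainable parameter. A minimal NumPy sketch of that
    contraction, with illustrative shapes (3 outputs, 6 trainable parameters):

    .. code-block:: python

        import numpy as np

        jac = np.random.rand(3, 6)   # Jacobian: d(outputs) / d(parameters)
        dy = np.ones(3)              # gradient-output vector

        # Contract over the output axis; equivalent to dy @ jac (i.e. J^T dy)
        vjp_value = np.tensordot(jac, dy, axes=[[0], [0]])  # shape (6,)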
+ + Executing the VJP tapes, and applying the processing function: + + >>> dev = qml.device("default.qubit", wires=2) + >>> vjp = fn([t.execute(dev) for t in vjp_tapes]) + >>> vjp + tensor([-0.6069, -0.0451, 0.0451, -0.0139, -0.2809, 0.2809], + dtype=torch.float64, grad_fn=) + + The output VJP is also differentiable with respect to the tape parameters: + + >>> cost = torch.sum(vjp) + >>> cost.backward() + >>> x.grad + tensor([[-1.1025e+00, -2.0554e-01, -1.4917e-01], + [-1.9429e-09, -9.1580e-01, 1.3878e-09]], dtype=torch.float64) + """ + # t._par_info = {} + # t._update() + num_params = len(tape.trainable_params) + + if num_params == 0: + # The tape has no trainable parameters; the VJP + # is simply none. + return [], lambda _: None + + if math.allclose(dy, 0): + # If the dy vector is zero, then the + # corresponding element of the VJP will be zero, + # and we can avoid a quantum computation. + return [], lambda _: math.convert_like(np.zeros([num_params]), dy) + + gradient_tapes, fn = gradient_fn(tape) + + def processing_fn(results): + # postprocess results to compute the Jacobian + jac = fn(results) + return _vector_jacobian_product(dy, jac) + + return gradient_tapes, processing_fn + + +def batch_vjp(tapes, dys, gradient_fn, reduction="append"): + """Generate the gradient tapes and processing function required to compute + the vector-Jacobian products of a batch of tapes. + + Args: + tapes (Sequence[.QuantumTape]): sequence of quantum tapes to differentiate + dys (Sequence[tensor_like]): Sequence of gradient-output vectors ``dy``. Must be the + same length as ``tapes``. Each ``dy`` tensor should have shape + matching the output shape of the corresponding tape. + gradient_fn (callable): the gradient transform to use to differentiate + the tapes + reduction (str): Determines how the vector-Jacobian products are returned. + If ``append``, then the output of the function will be of the form + ``List[tensor_like]``, with each element corresponding to the VJP of each + input tape. If ``extend``, then the output VJPs will be concatenated. + + Returns: + List[tensor_like or None]: list of vector-Jacobian products. ``None`` elements corresponds + to tapes with no trainable parameters. + + **Example** + + Consider the following Torch-compatible quantum tapes: + + .. code-block:: python + + x = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True, dtype=torch.float64) + + def ansatz(x): + qml.RX(x[0, 0], wires=0) + qml.RY(x[0, 1], wires=1) + qml.RZ(x[0, 2], wires=0) + qml.CNOT(wires=[0, 1]) + qml.RX(x[1, 0], wires=1) + qml.RY(x[1, 1], wires=0) + qml.RZ(x[1, 2], wires=1) + + with TorchInterface.apply(qml.tape.JacobianTape()) as tape1: + ansatz(x) + qml.expval(qml.PauliZ(0)) + qml.probs(wires=1) + + with TorchInterface.apply(qml.tape.JacobianTape()) as tape2: + ansatz(x) + qml.expval(qml.PauliZ(0) @ qml.PauliZ(1)) + + tapes = [tape1, tape2] + + Both tapes share the same circuit ansatz, but have different measurement outputs. + + We can use the ``batch_vjp`` function to compute the vector-Jacobian product, + given a list of gradient-output vectors ``dys`` per tape: + + >>> dys = [torch.tensor([1., 1., 1.], dtype=torch.float64), + ... torch.tensor([1.], dtype=torch.float64)] + >>> vjp_tapes, fn = qml.gradients.batch_vjp(tapes, dys, qml.gradients.param_shift) + + Note that each ``dy`` has shape matching the output dimension of the tape + (``tape1`` has 1 expectation and 2 probability values --- 3 outputs --- and ``tape2`` + has 1 expectation value). 
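    Internally, ``batch_vjp`` records how many gradient tapes each input tape
    generated, so that the flat list of executed results can be split back into
    one chunk per tape before each tape's own processing function is applied.
    A rough sketch of that bookkeeping (the helper name is illustrative only,
    but it mirrors the slicing performed by the returned processing function):

    .. code-block:: python

        def split_results(results, reshape_info):
            """Split a flat result list into one chunk per original tape."""
            chunks, start = [], 0
            for res_len in reshape_info:
                chunks.append(results[start : start + res_len])
                start += res_len
            return chunks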
+ + Executing the VJP tapes, and applying the processing function: + + >>> dev = qml.device("default.qubit", wires=2) + >>> vjps = fn([t.execute(dev) for t in vjp_tapes]) + >>> vjps + [tensor([-0.6069, -0.0451, 0.0451, -0.0139, -0.2809, 0.2809], + dtype=torch.float64, grad_fn=), + tensor([ 0.1739, -0.1641, -0.0054, -0.2937, -0.4008, 0.0000], + dtype=torch.float64, grad_fn=)] + + We have two VJPs; one per tape. Each one corresponds to the number of parameters + on the tapes (6). + + The output VJPs are also differentiable with respect to the tape parameters: + + >>> cost = torch.sum(vjps[0] + vjps[1]) + >>> cost.backward() + >>> x.grad + tensor([[-4.7924e-01, -9.0857e-01, -2.4198e-01], + [-9.2973e-02, -1.0772e+00, 4.7184e-09]], dtype=torch.float64) + """ + reshape_info = [] + gradient_tapes = [] + processing_fns = [] + + # Loop through the tapes and dys vector + for tape, dy in zip(tapes, dys): + g_tapes, fn = vjp(tape, dy, gradient_fn) + + reshape_info.append(len(g_tapes)) + processing_fns.append(fn) + gradient_tapes.extend(g_tapes) + + def processing_fn(results): + vjps = [] + start = 0 + + for t_idx, dy in zip(range(len(tapes)), dys): + # extract the correct results from the flat list + res_len = reshape_info[t_idx] + res_t = results[start : start + res_len] + start += res_len + + # postprocess results to compute the VJP + vjp = processing_fns[t_idx](res_t) + + if vjp is None: + vjps.append(None) + continue + + getattr(vjps, reduction)(vjp) + + return vjps + + return gradient_tapes, processing_fn From 674604b32212f9668b38cbaf715e7cd2d4de521c Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 4 Aug 2021 22:53:31 +0800 Subject: [PATCH 02/45] linting --- pennylane/gradients/vjp.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index f5157fce6a8..e0b14b3c93a 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -15,6 +15,8 @@ This module contains functions for computing the vector-Jacobian product of a batch of tapes. """ +import numpy as np + from pennylane import math @@ -214,20 +216,20 @@ def processing_fn(results): vjps = [] start = 0 - for t_idx, dy in zip(range(len(tapes)), dys): + for t_idx in range(len(tapes)): # extract the correct results from the flat list res_len = reshape_info[t_idx] res_t = results[start : start + res_len] start += res_len # postprocess results to compute the VJP - vjp = processing_fns[t_idx](res_t) + vjp_ = processing_fns[t_idx](res_t) - if vjp is None: + if vjp_ is None: vjps.append(None) continue - getattr(vjps, reduction)(vjp) + getattr(vjps, reduction)(vjp_) return vjps From 688f4a227e2c567f333ee1092b7e520902a78853 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 5 Aug 2021 02:42:18 +0800 Subject: [PATCH 03/45] more tests --- pennylane/gradients/vjp.py | 5 +---- pennylane/math/utils.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index e0b14b3c93a..d0fa4d5058b 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -13,7 +13,7 @@ # limitations under the License. """ This module contains functions for computing the vector-Jacobian product -of a batch of tapes. +of tapes. 
""" import numpy as np @@ -24,9 +24,6 @@ def _vector_jacobian_product(dy, jac): """Compute the vector-Jacobian product for a given vector of gradient outputs dy and a Jacobian Jac""" dy_row = math.reshape(dy, [-1]) - jac = math.transpose(math.stack(jac)) - num_params = len(jac) - jac = math.reshape(jac, [-1, num_params]) return math.tensordot(jac, dy_row, [[0], [0]]) diff --git a/pennylane/math/utils.py b/pennylane/math/utils.py index 10d23cde51d..2ebb80d1df4 100644 --- a/pennylane/math/utils.py +++ b/pennylane/math/utils.py @@ -18,6 +18,8 @@ from autoray import numpy as np import numpy as _np +import pennylane as qml + from . import single_dispatch # pylint:disable=unused-import @@ -55,9 +57,14 @@ def allequal(tensor1, tensor2, **kwargs): def allclose(a, b, rtol=1e-05, atol=1e-08, **kwargs): """Wrapper around np.allclose, allowing tensors ``a`` and ``b`` to differ in type""" - t1 = ar.to_numpy(a) - t2 = ar.to_numpy(b) - return np.allclose(t1, t2, rtol=rtol, atol=atol, **kwargs) + try: + res = np.allclose(a, b, rtol=rtol, atol=atol, **kwargs) + except (TypeError, AttributeError): + t1 = ar.to_numpy(a) + t2 = ar.to_numpy(b) + res = np.allclose(t1, t2, rtol=rtol, atol=atol, **kwargs) + + return res allclose.__doc__ = _np.allclose.__doc__ From 9a8476bc4a8b0802403b38044e3efcb83eb64ebc Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 5 Aug 2021 13:37:17 +0800 Subject: [PATCH 04/45] linting --- pennylane/math/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pennylane/math/utils.py b/pennylane/math/utils.py index 2ebb80d1df4..d42175d956e 100644 --- a/pennylane/math/utils.py +++ b/pennylane/math/utils.py @@ -18,8 +18,6 @@ from autoray import numpy as np import numpy as _np -import pennylane as qml - from . import single_dispatch # pylint:disable=unused-import From 04133075cb25c7168995d563470b264f5097e12e Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 5 Aug 2021 14:20:22 +0800 Subject: [PATCH 05/45] add tests --- tests/gradients/test_vjp.py | 426 ++++++++++++++++++++++++++++++++++++ 1 file changed, 426 insertions(+) create mode 100644 tests/gradients/test_vjp.py diff --git a/tests/gradients/test_vjp.py b/tests/gradients/test_vjp.py new file mode 100644 index 00000000000..107485899df --- /dev/null +++ b/tests/gradients/test_vjp.py @@ -0,0 +1,426 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for the gradients.vjp module.""" +from functools import partial + +import pytest + +import pennylane as qml +from pennylane import numpy as np +from pennylane.gradients import param_shift + + +class TestVJP: + """Tests for the vjp function""" + + def test_no_trainable_parameters(self): + """A tape with no trainable parameters will simply return None""" + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {} + dy = np.array([1.0]) + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + + assert not tapes + assert fn(tapes) is None + + def test_zero_dy(self): + """A zero dy vector will return no tapes and a zero matrix""" + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=0) + qml.RX(0.6, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {0, 1} + dy = np.array([0.0]) + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + + assert not tapes + assert np.all(fn(tapes) == np.zeros([len(tape.trainable_params)])) + + def test_single_expectation_value(self, tol): + """Tests correct output shape and evaluation for a tape + with a single expval output""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + tape.trainable_params = {0, 1} + dy = np.array([1.0]) + + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + assert len(tapes) == 4 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (2,) + + expected = np.array([-np.sin(y) * np.sin(x), np.cos(y) * np.cos(x)]) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_multiple_expectation_values(self, tol): + """Tests correct output shape and evaluation for a tape + with multiple expval outputs""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.expval(qml.PauliX(1)) + + tape.trainable_params = {0, 1} + dy = np.array([1.0, 2.0]) + + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + assert len(tapes) == 4 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (2,) + + expected = np.array([-np.sin(x), 2 * np.cos(y)]) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_prob_expectation_values(self, tol): + """Tests correct output shape and evaluation for a tape + with prob and expval outputs""" + dev = qml.device("default.qubit", wires=2) + x = 0.543 + y = -0.654 + + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.probs(wires=[0, 1]) + + tape.trainable_params = {0, 1} + dy = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + assert len(tapes) == 4 + + res = fn(dev.batch_execute(tapes)) + assert res.shape == (2,) + + expected = ( + np.array( + [ + [-2 * np.sin(x), 0], + [ + -(np.cos(y / 2) ** 2 * np.sin(x)), + -(np.cos(x / 2) ** 2 * np.sin(y)), + ], + [ + -(np.sin(x) * np.sin(y / 2) ** 2), + (np.cos(x / 2) ** 2 * np.sin(y)), + ], + [ + (np.sin(x) * np.sin(y / 2) ** 2), + (np.sin(x / 2) ** 2 * np.sin(y)), + ], + [ + (np.cos(y / 2) ** 2 * np.sin(x)), + -(np.sin(x / 2) ** 2 * np.sin(y)), + ], + ] + ) + / 2 + ) + + assert np.allclose(res, dy @ expected, atol=tol, rtol=0) + + +def 
expected(params): + x, y = 1.0 * params + return ( + np.array( + [ + (np.cos(y / 2) ** 2 * np.sin(x)) + (np.cos(y / 2) ** 2 * np.sin(x)), + (np.cos(x / 2) ** 2 * np.sin(y)) - (np.sin(x / 2) ** 2 * np.sin(y)), + ] + ) + / 2 + ) + + +def ansatz(x, y): + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=[0, 1]) + + +class TestVJPGradients: + """Gradient tests for the vjp function""" + + def test_autograd(self, tol): + """Tests that the output of the VJP transform + can be differentiated using autograd.""" + dev = qml.device("default.qubit.autograd", wires=2) + params = np.array([0.543, -0.654], requires_grad=True) + + def cost_fn(x, dy): + with qml.tape.JacobianTape() as tape: + ansatz(x[0], x[1]) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + vjp = fn(dev.batch_execute(tapes)) + return vjp + + dy = np.array([-1.0, 0.0, 0.0, 1.0], requires_grad=False) + res = cost_fn(params, dy) + assert np.allclose(res, expected(params), atol=tol, rtol=0) + + res = qml.jacobian(cost_fn)(params, dy) + assert np.allclose(res, qml.jacobian(expected)(params), atol=tol, rtol=0) + + def test_torch(self, tol): + """Tests that the output of the VJP transform + can be differentiated using Torch.""" + torch = pytest.importorskip("torch") + from pennylane.interfaces.torch import TorchInterface + + dev = qml.device("default.qubit.tf", wires=2) + + params = torch.tensor([0.543, -0.654], requires_grad=True, dtype=torch.float64) + dy = torch.tensor([-1.0, 0.0, 0.0, 1.0], dtype=torch.float64) + + with TorchInterface.apply(qml.tape.QubitParamShiftTape()) as tape: + ansatz(params[0], params[1]) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + vjp = fn([t.execute(dev) for t in tapes]) + + assert np.allclose(vjp.detach(), expected(params.detach()), atol=tol, rtol=0) + + cost = vjp[0, 0] + cost.backward() + + exp = qml.jacobian(lambda x: expected(x)[0])(params.detach().numpy()) + assert np.allclose(params.grad, exp, atol=tol, rtol=0) + + def test_tf(self, tol): + """Tests that the output of the VJP transform + can be differentiated using TF.""" + tf = pytest.importorskip("tensorflow") + + dev = qml.device("default.qubit.tf", wires=2) + + params = tf.Variable([0.543, -0.654], dtype=tf.float64) + dy = tf.constant([-1.0, 0.0, 0.0, 1.0], dtype=tf.float64) + + with tf.GradientTape() as t: + with qml.tape.JacobianTape() as tape: + ansatz(params[0], params[1]) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + vjp = fn(dev.batch_execute(tapes)) + + assert np.allclose(vjp, expected(params), atol=tol, rtol=0) + + res = t.jacobian(vjp, params) + assert np.allclose(res, qml.jacobian(expected)(params.numpy()), atol=tol, rtol=0) + + def test_jax(self, tol): + """Tests that the output of the VJP transform + can be differentiated using JAX.""" + jax = pytest.importorskip("jax") + from jax import numpy as jnp + + dev = qml.device("default.qubit.jax", wires=2) + params = jnp.array([0.543, -0.654]) + + @partial(jax.jit, static_argnums=1) + def cost_fn(x, dy): + with qml.tape.JacobianTape() as tape: + ansatz(x[0], x[1]) + + tape.trainable_params = {0, 1} + tapes, fn = qml.gradients.vjp(tape, dy, param_shift) + vjp = fn(dev.batch_execute(tapes)) + return vjp + + dy = (-1.0, 0.0, 0.0, 1.0) + res = cost_fn(params, dy) + assert np.allclose(res, expected(params), atol=tol, rtol=0) + + res = jax.jacobian(cost_fn, argnums=0)(params, dy) + exp = qml.jacobian(expected)(np.array(params)) + 
assert np.allclose(res, exp, atol=tol, rtol=0) + + +class TestBatchVJP: + """Tests for the batch VJP function""" + + def test_one_tape_no_trainable_parameters(self): + """A tape with no trainable parameters will simply return None""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.QuantumTape() as tape1: + qml.RX(0.4, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(0.4, wires=0) + qml.RX(0.6, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape1.trainable_params = {} + tape2.trainable_params = {0, 1} + + tapes = [tape1, tape2] + dys = [np.array([1.0]), np.array([1.0])] + + v_tapes, fn = qml.gradients.batch_vjp(tapes, dys, param_shift) + assert len(v_tapes) == 4 + + # Even though there are 3 parameters, only two contribute + # to the VJP, so only 2*2=4 quantum evals + res = fn(dev.batch_execute(v_tapes)) + assert res[0] is None + assert res[1] is not None + + def test_all_tapes_no_trainable_parameters(self): + """A tape with no trainable parameters will simply return None""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.QuantumTape() as tape1: + qml.RX(0.4, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(0.4, wires=0) + qml.RX(0.6, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape1.trainable_params = set() + tape2.trainable_params = set() + + tapes = [tape1, tape2] + dys = [np.array([1.0]), np.array([1.0])] + + v_tapes, fn = qml.gradients.batch_vjp(tapes, dys, param_shift) + + assert v_tapes == [] + assert fn([]) == [None, None] + + def test_zero_dy(self): + """ "A zero dy vector will return no tapes and a zero matrix""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.QuantumTape() as tape1: + qml.RX(0.4, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(0.4, wires=0) + qml.RX(0.6, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape1.trainable_params = {0} + tape2.trainable_params = {0, 1} + + tapes = [tape1, tape2] + dys = [np.array([0.0]), np.array([1.0])] + + v_tapes, fn = qml.gradients.batch_vjp(tapes, dys, param_shift) + res = fn(dev.batch_execute(v_tapes)) + + # Even though there are 3 parameters, only two contribute + # to the VJP, so only 2*2=4 quantum evals + assert len(v_tapes) == 4 + assert np.allclose(res[0], 0) + + def test_reduction_append(self): + """ "Test the 'append' reduction strategy""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.JacobianTape() as tape1: + qml.RX(0.4, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(0.4, wires=0) + qml.RX(0.6, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape1.trainable_params = {0} + tape2.trainable_params = {0, 1} + + tapes = [tape1, tape2] + dys = [np.array([1.0]), np.array([1.0])] + + v_tapes, fn = qml.gradients.batch_vjp(tapes, dys, param_shift, reduction="append") + res = fn(dev.batch_execute(v_tapes)) + + # Returned VJPs will be appended to a list, one vjp per tape + assert len(res) == 2 + assert all(isinstance(r, np.ndarray) for r in res) + assert all(len(r) == len(t.trainable_params) for t, r in zip(tapes, res)) + + def test_reduction_extend(self): + """ "Test the 'extend' reduction strategy""" + dev = qml.device("default.qubit", wires=2) + + with qml.tape.JacobianTape() as tape1: + qml.RX(0.4, wires=0) + qml.CNOT(wires=[0, 1]) + 
qml.expval(qml.PauliZ(0)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(0.4, wires=0) + qml.RX(0.6, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + tape1.trainable_params = {0} + tape2.trainable_params = {0, 1} + + tapes = [tape1, tape2] + dys = [np.array([1.0]), np.array([1.0])] + + v_tapes, fn = qml.gradients.batch_vjp(tapes, dys, param_shift, reduction="extend") + res = fn(dev.batch_execute(v_tapes)) + + # Returned VJPs will be extended into a list. Each element of the returned + # list will correspond to a single input parameter of the combined + # tapes. + assert len(res) == sum(len(t.trainable_params) for t in tapes) From 6b44284ba5c0efe7d00f9fdb40a5a016266a0ce1 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 5 Aug 2021 14:26:05 +0800 Subject: [PATCH 06/45] add comment --- pennylane/math/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pennylane/math/utils.py b/pennylane/math/utils.py index d42175d956e..985c92d781a 100644 --- a/pennylane/math/utils.py +++ b/pennylane/math/utils.py @@ -56,8 +56,17 @@ def allclose(a, b, rtol=1e-05, atol=1e-08, **kwargs): """Wrapper around np.allclose, allowing tensors ``a`` and ``b`` to differ in type""" try: + # Some frameworks may provide their own allclose implementation. + # Try and use it if available. res = np.allclose(a, b, rtol=rtol, atol=atol, **kwargs) except (TypeError, AttributeError): + # Otherwise, convert the input to NumPy arrays. + # + # TODO: replace this with a bespoke, framework agnostic + # low-level implementation to avoid the NumPy conversion: + # + # np.abs(a - b) <= atol + rtol * np.abs(b) + # t1 = ar.to_numpy(a) t2 = ar.to_numpy(b) res = np.allclose(t1, t2, rtol=rtol, atol=atol, **kwargs) From 35e1848ba0731c5d2ccc68c6aa604b580964aa06 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 5 Aug 2021 14:38:30 +0800 Subject: [PATCH 07/45] fix --- .github/CHANGELOG.md | 5 ++++- tests/gradients/test_vjp.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 27bdd37fcd1..a864d123053 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -75,15 +75,18 @@ * A new gradients module `qml.gradients` has been added, which provides differentiable quantum gradient transforms. [(#1476)](https://github.com/PennyLaneAI/pennylane/pull/1476) + [(#1494)](https://github.com/PennyLaneAI/pennylane/pull/1494) Available quantum gradient transforms include: - `qml.gradients.finite_diff` + - `qml.gradients.vjp` + - `qml.gradients.batch_vjp` For example, ```pycon - >>> with qml.tape.QuantumTape() as tape: + >>> with qml.tape.JacobianTape() as tape: ... qml.RX(params[0], wires=0) ... qml.RY(params[1], wires=0) ... 
qml.RX(params[2], wires=0) diff --git a/tests/gradients/test_vjp.py b/tests/gradients/test_vjp.py index 107485899df..fc59f8870b2 100644 --- a/tests/gradients/test_vjp.py +++ b/tests/gradients/test_vjp.py @@ -208,7 +208,7 @@ def test_torch(self, tol): torch = pytest.importorskip("torch") from pennylane.interfaces.torch import TorchInterface - dev = qml.device("default.qubit.tf", wires=2) + dev = qml.device("default.qubit", wires=2) params = torch.tensor([0.543, -0.654], requires_grad=True, dtype=torch.float64) dy = torch.tensor([-1.0, 0.0, 0.0, 1.0], dtype=torch.float64) From 67e216a0229e3cecf7cb18351c50b5965e082099 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Fri, 6 Aug 2021 01:55:09 +0800 Subject: [PATCH 08/45] more --- pennylane/_qubit_device.py | 15 +++ pennylane/gradients/__init__.py | 2 +- pennylane/gradients/vjp.py | 28 ++++ pennylane/interfaces/batch/__init__.py | 123 ++++++++++++++++++ pennylane/interfaces/batch/autograd.py | 170 +++++++++++++++++++++++++ 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 pennylane/interfaces/batch/__init__.py create mode 100644 pennylane/interfaces/batch/autograd.py diff --git a/pennylane/_qubit_device.py b/pennylane/_qubit_device.py index 5561d9cca49..6a799409f9c 100644 --- a/pennylane/_qubit_device.py +++ b/pennylane/_qubit_device.py @@ -826,6 +826,21 @@ def sample(self, observable, shot_range=None, bin_size=None): return samples.reshape((bin_size, -1)) + def execute_and_gradients(self, circuits): + res = [] + jacs = [] + + for circuit in circuits: + # Evaluations and gradients are paired, so that + # we can re-use the device state for the adjoint method + res.append(circuit.execute(self)) + jacs.append(self.adjoint_jacobian(circuit, use_device_state=True)) + + return res, jacs + + def gradients(self, circuits): + return [self.adjoint_jacobian(circuit) for circuit in circuits] + def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): """Implements the adjoint method outlined in `Jones and Gacon `__ to differentiate an input tape. diff --git a/pennylane/gradients/__init__.py b/pennylane/gradients/__init__.py index 1863ff8d784..50f153552d7 100644 --- a/pennylane/gradients/__init__.py +++ b/pennylane/gradients/__init__.py @@ -19,4 +19,4 @@ from .finite_difference import finite_diff, finite_diff_coeffs, generate_shifted_tapes from .parameter_shift import param_shift -from .vjp import batch_vjp, vjp +from .vjp import batch_vjp, vjp, _vector_jacobian_product, _vector_jacobian_products diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index d0fa4d5058b..680531d84f3 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -24,9 +24,37 @@ def _vector_jacobian_product(dy, jac): """Compute the vector-Jacobian product for a given vector of gradient outputs dy and a Jacobian Jac""" dy_row = math.reshape(dy, [-1]) + jac = math.reshape(jac, [dy_row.shape[0], -1]) return math.tensordot(jac, dy_row, [[0], [0]]) +def _vector_jacobian_products(dys, jacs, reduction="append"): + """Compute the vector-Jacobian product for a given + vector of gradient outputs dys and Jacobians jacs""" + vjps = [] + + for dy, jac in zip(dys, jacs): + + if jacs is None: + # The tape has no trainable parameters; the VJP + # is simply none. + vjps.append(None) + continue + + if math.allclose(dy, 0): + # If the dy vector is zero, then the + # corresponding element of the VJP will be zero, + # and we can avoid a quantum computation. 
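+            # (Note: in this helper the Jacobians have already been evaluated,
+            # so the saving here is the classical contraction only, not a
+            # circuit execution.)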
+ num_params = math.reshape(jac, [-1, dy_row.shape[0]]).shape[0] + vjp = math.convert_like(np.zeros([num_params]), dy) + else: + vjp = _vector_jacobian_product(dy, jac) + + getattr(vjps, reduction)(vjp) + + return vjps + + def vjp(tape, dy, gradient_fn): """Generate the gradient tapes and processing function required to compute the vector-Jacobian products of a tape. diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py new file mode 100644 index 00000000000..c326cce43aa --- /dev/null +++ b/pennylane/interfaces/batch/__init__.py @@ -0,0 +1,123 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This subpackage defines functions for interfacing devices with batch execution +capabilities with different machine learning libraries. +""" +# pylint: disable=import-outside-toplevel) +from functools import partial + +import pennylane as qml + +from .autograd import execute as execute_autograd + + +def execute(tapes, device, gradient_fn, interface="autograd", accumulation="forward"): + """Execute a batch of tapes with Autograd parameters on a device. + + Args: + tapes (Sequence[.QuantumTape]): batch of tapes to execute + device (.Device): Device to use to execute the batch of tapes. + If the device does not provide a ``batch_execute`` method, + by default the tapes will be executed in serial. + gradient_fn (None or callable): The gradient transform function to use + for backward passes. If "device", the device will be queried directly + for the gradient (if supported). + interface (str): The interface that will be used for classical backpropagation. + This affects the types of parameters that can exist on the input tapes. + Available options include ``autograd``, ``torch``, ``tf``, and ``jax``. + accumulation (str): Whether the gradients should be computed on the forward + pass (``forward``) or the backward pass (``backward``). Only applies + if the device is queried for the gradient; gradient transform + functions available in ``qml.gradients`` are only supported on the backward + pass. + + + Returns: + list[list[float]]: A nested list of tape results. Each element in + the returned list corresponds in order to the provided tapes. + + **Example** + + Consider the following cost function: + + .. code-block:: python + + def cost_fn(params, x, dev): + with qml.tape.QuantumTape() as tape1: + qml.RX(params[0], wires=0) + qml.RY(params[1], wires=0) + qml.expval(qml.PauliZ(0)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(params[2], wires=0) + qml.RY(x[0], wires=1) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=0) + + tapes = [tape1, tape2] + + # execute both tapes in a batch on the given device + res = execute(tapes, dev) + + return res[0][0] + res[1][0, 0] - res[1][0, 1] + + In this cost function, two **independent** quantum tapes are being + constructed; one returning an expectation value, the other probabilities. + We then batch execute the two tapes, and reduce the results to obtain + a scalar. 
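    The same batch can instead be differentiated using a device-provided
    gradient by passing ``gradient_fn="device"``. A hedged sketch, assuming the
    target device implements ``execute_and_gradients`` (added for ``QubitDevice``
    here via the adjoint method):

    .. code-block:: python

        # Jacobians are then computed eagerly during the forward pass
        res = execute(tapes, dev, gradient_fn="device", accumulation="forward")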
+ + Let's execute this cost function while tracking the gradient: + + >>> dev = qml.device("lightning.qubit", wires=2) + >>> params = np.array([0.1, 0.2, 0.3], requires_grad=True) + >>> x = np.array([0.5], requires_grad=True) + >>> cost_fn(params, x) + 1.9305068163274222 + + Since the ``execute`` function is differentiable, we can + also compute the gradient: + + >>> qml.grad(cost_fn)(params, x) + (array([-0.0978434 , -0.19767681, -0.29552021]), array([5.37764278e-17])) + + Finally, we can also compute any nth-order derivative. Let's compute the Jacobian + of the gradient (that is, the Hessian): + + >>> x.requires_grad = False + >>> qml.jacobian(qml.grad(cost_fn))(params, x) + array([[-0.97517033, 0.01983384, 0. ], + [ 0.01983384, -0.97517033, 0. ], + [ 0. , 0. , -0.95533649]]) + """ + # Default execution function; simply call device.batch execute + # and return no Jacobians. + execute_fn = lambda tapes: (device.batch_execute(tapes), []) + + if gradient_fn == "device": + # gradient function is a device method + + if accumulation == "forward": + # replace the forward execution function to return + # both results and gradients + execute_fn = device.execute_and_gradients + + elif accumulation == "backward": + # replace the backward gradient computation + gradient_fn = device.gradients + + if interface == "autograd": + return execute_autograd(tapes, device, execute_fn, gradient_fn) + + raise ValueError(f"Unknown interface {interface}") diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py new file mode 100644 index 00000000000..00657e8fb20 --- /dev/null +++ b/pennylane/interfaces/batch/autograd.py @@ -0,0 +1,170 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains functions for adding the Autograd interface +to a PennyLane Device class. +""" +import inspect + +import autograd +from autograd.numpy.numpy_boxes import ArrayBox + +import pennylane as qml +from pennylane import numpy as np + + +def execute(tapes, device, execute_fn, gradient_fn, _n=1): + """Execute a batch of tapes with Autograd parameters on a device. + + Args: + tapes (Sequence[.QuantumTape]): batch of tapes to execute + device (.Device): Device to use to execute the batch of tapes. + If the device does not provide a ``batch_execute`` method, + by default the tapes will be executed in serial. + execute_fn (callable): The execution function used to execute the tapes + during the forward pass. This function must return a tuple ``(results, jacobians)``. + If ``jacobians`` is an empty list, then ``gradient_fn`` is used to + compute the gradients during the backwards pass. + gradient_fn (callable): the gradient function to use to compute quantum gradients + _n (int): a positive integer used to track nesting of derivatives, for example + if the nth-order derivative is requested. + + Returns: + list[list[float]]: A nested list of tape results. Each element in + the returned list corresponds in order to the provided tapes. 
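    A hedged usage sketch (assuming ``tapes`` and ``dev`` already exist): the
    default forward execution function simply wraps ``Device.batch_execute``
    and returns no Jacobians, so quantum gradients are computed on the backward
    pass via the gradient transform. This mirrors how the interface-agnostic
    ``execute`` wrapper calls this function.

    .. code-block:: python

        execute_fn = lambda tapes: (dev.batch_execute(tapes), [])
        results = execute(tapes, dev, execute_fn, qml.gradients.param_shift)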
+ """ + + parameters = autograd.builtins.tuple( + [autograd.builtins.list(t.get_parameters()) for t in tapes] + ) + + return _execute( + parameters, + tapes=tapes, + device=device, + execute_fn=execute_fn, + gradient_fn=gradient_fn, + _n=_n, + )[0] + + +@autograd.extend.primitive +def _execute( + parameters, tapes=None, device=None, execute_fn=None, gradient_fn=None, _n=1 +): # pylint: disable=dangerous-default-value,unused-argument + """Autodifferentiable wrapper around ``Device.batch_execute``. + + The signature of this function is designed to workaround Autograd restrictions. + Note that the ``parameters`` argument is dependent on the ``tapes`` argument; + this function should always be called as follows: + + >>> parameters = [autograd.builtins.list(t.get_parameters()) for t in tapes]) + >>> parameters = autograd.builtins.tuple(parameters) + >>> _batch_execute(parameters, tapes=tapes, device=device) + + In particular: + + - ``parameters`` is dependent on the provided tapes: always extract them as above + - ``tapes`` is a *required* argument + - ``device`` is a *required* argument + + The private argument ``_n`` is used to track nesting of derivatives, for example + if the nth-order derivative is requested. Do not set this argument unless you + understand the consequences! + """ + with qml.tape.Unwrap(*tapes): + res, jacs = execute_fn(tapes) + + return [np.tensor(r) for r in res], jacs + + +def vjp( + ans, parameters, tapes=None, device=None, execute_fn=None, gradient_fn=None, _n=1 +): # pylint: disable=dangerous-default-value,unused-argument + """Returns the vector-Jacobian product operator for a batch of quantum tapes. + + Args: + ans (array): the result of the batch tape execution + parameters (list[list[Any]]): Nested list of the quantum tape parameters. + This argument should be generated from the provided list of tapes. + tapes (Sequence[.QuantumTape]): batch of tapes to execute + device (.Device): Device to use to execute the batch of tapes. + If the device does not provide a ``batch_execute`` method, + by default the tapes will be executed in serial. + execute_fn (callable): The execution function used to execute the tapes + during the forward pass. This function must return a tuple ``(results, jacobians)``. + If ``jacobians`` is an empty list, then ``gradient_fn`` is used to + compute the gradients during the backwards pass. + gradient_fn (callable): the gradient function to use to compute quantum gradients + _n (int): a positive integer used to track nesting of derivatives, for example + if the nth-order derivative is requested. + + Returns: + function: this function accepts the backpropagation + gradient output vector, and computes the vector-Jacobian product + """ + + def grad_fn(dy): + """Returns the vector-Jacobian product with given + parameter values p and output gradient dy""" + + dy = dy[0] + jacs = ans[1] + + if jacs: + # Jacobians were computed on the forward pass (accumulation="forward") + # Simply compute the vjps classically here. + vjps = qml.gradients._vector_jacobian_products(dy, jacs, reduction="append") + + else: + # Need to compute the Jacobians on the backward pass (accumulation="backward") + + # Temporary: check if the gradient function is a differentiable transform. + # For the moment, simply check if it is part of the `qml.gradients` package. 
+ # Longer term, we should have a way of checking this directly + # (e.g., isinstance(gradient_fn, GradientTransform)) + + if "pennylane.gradients" in inspect.getmodule(gradient_fn).__name__: + + # Generate and execute the required gradient tapes + vjp_tapes, fn = qml.gradients.batch_vjp(tapes, dy, gradient_fn, reduction="append") + + # This is where the magic happens. Note that we call ``execute``. + # This recursion, coupled with the fact that the gradient transforms + # are differentiable, allows for arbitrary order differentiation. + vjps = fn(execute(vjp_tapes, device, execute_fn, gradient_fn, _n=_n + 1)) + + elif inspect.ismethod(gradient_fn) and gradient_fn.__self__ is device: + # Gradient function is a device method. + # Note that unlike the previous branch: + # + # - there is no recursion here + # - gradient_fn is not differentiable + # + # so we cannot support higher-order derivatives. + + with qml.tape.Unwrap(*tapes): + jacs = gradient_fn(tapes) + + vjps = qml.gradients._vector_jacobian_products(dy, jacs, reduction="append") + + else: + raise ValueError("Unknown gradient function!!!") + + return [qml.math.to_numpy(v, max_depth=_n) if isinstance(v, ArrayBox) else v for v in vjps] + + return grad_fn + + +autograd.extend.defvjp(_execute, vjp, argnums=[0]) From d0e40f8c160d32531d6c8ae2c27f07cd5ec1b7ba Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Fri, 6 Aug 2021 02:17:13 +0800 Subject: [PATCH 09/45] typos --- pennylane/gradients/vjp.py | 3 +-- pennylane/interfaces/batch/__init__.py | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index 680531d84f3..f306b3d2c7f 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -43,8 +43,7 @@ def _vector_jacobian_products(dys, jacs, reduction="append"): if math.allclose(dy, 0): # If the dy vector is zero, then the - # corresponding element of the VJP will be zero, - # and we can avoid a quantum computation. + # corresponding element of the VJP will be zero. num_params = math.reshape(jac, [-1, dy_row.shape[0]]).shape[0] vjp = math.convert_like(np.zeros([num_params]), dy) else: diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index c326cce43aa..17c2f417dd8 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -54,13 +54,15 @@ def execute(tapes, device, gradient_fn, interface="autograd", accumulation="forw .. 
code-block:: python - def cost_fn(params, x, dev): - with qml.tape.QuantumTape() as tape1: + dev = qml.device("lightning.qubit", wires=2) + + def cost_fn(params, x): + with qml.tape.JacobianTape() as tape1: qml.RX(params[0], wires=0) qml.RY(params[1], wires=0) qml.expval(qml.PauliZ(0)) - with qml.tape.QuantumTape() as tape2: + with qml.tape.JacobianTape() as tape2: qml.RX(params[2], wires=0) qml.RY(x[0], wires=1) qml.CNOT(wires=[0, 1]) @@ -69,7 +71,7 @@ def cost_fn(params, x, dev): tapes = [tape1, tape2] # execute both tapes in a batch on the given device - res = execute(tapes, dev) + res = execute(tapes, dev, qml.gradients.param_shift) return res[0][0] + res[1][0, 0] - res[1][0, 1] @@ -80,7 +82,6 @@ def cost_fn(params, x, dev): Let's execute this cost function while tracking the gradient: - >>> dev = qml.device("lightning.qubit", wires=2) >>> params = np.array([0.1, 0.2, 0.3], requires_grad=True) >>> x = np.array([0.5], requires_grad=True) >>> cost_fn(params, x) From 89bdd8d0d022f6edc5729acb461c60532d1eaf7f Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Fri, 6 Aug 2021 22:04:12 +0800 Subject: [PATCH 10/45] Apply suggestions from code review Co-authored-by: Nathan Killoran --- pennylane/gradients/vjp.py | 2 +- pennylane/interfaces/batch/__init__.py | 8 ++++---- pennylane/interfaces/batch/autograd.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index f306b3d2c7f..562e57523da 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -22,7 +22,7 @@ def _vector_jacobian_product(dy, jac): """Compute the vector-Jacobian product for a given - vector of gradient outputs dy and a Jacobian Jac""" + vector of gradient outputs dy and a Jacobian jac""" dy_row = math.reshape(dy, [-1]) jac = math.reshape(jac, [dy_row.shape[0], -1]) return math.tensordot(jac, dy_row, [[0], [0]]) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 17c2f417dd8..62a461792e2 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This subpackage defines functions for interfacing devices with batch execution +This subpackage defines functions for interfacing devices' batch execution capabilities with different machine learning libraries. """ # pylint: disable=import-outside-toplevel) @@ -24,7 +24,7 @@ def execute(tapes, device, gradient_fn, interface="autograd", accumulation="forward"): - """Execute a batch of tapes with Autograd parameters on a device. + """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. Args: tapes (Sequence[.QuantumTape]): batch of tapes to execute @@ -34,7 +34,7 @@ def execute(tapes, device, gradient_fn, interface="autograd", accumulation="forw gradient_fn (None or callable): The gradient transform function to use for backward passes. If "device", the device will be queried directly for the gradient (if supported). - interface (str): The interface that will be used for classical backpropagation. + interface (str): The interface that will be used for classical autodifferentiation. This affects the types of parameters that can exist on the input tapes. Available options include ``autograd``, ``torch``, ``tf``, and ``jax``. 
accumulation (str): Whether the gradients should be computed on the forward @@ -102,7 +102,7 @@ def cost_fn(params, x): [ 0.01983384, -0.97517033, 0. ], [ 0. , 0. , -0.95533649]]) """ - # Default execution function; simply call device.batch execute + # Default execution function; simply call device.batch_execute # and return no Jacobians. execute_fn = lambda tapes: (device.batch_execute(tapes), []) diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 00657e8fb20..3ea6df1c6bb 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -65,7 +65,7 @@ def _execute( ): # pylint: disable=dangerous-default-value,unused-argument """Autodifferentiable wrapper around ``Device.batch_execute``. - The signature of this function is designed to workaround Autograd restrictions. + The signature of this function is designed to work around Autograd restrictions. Note that the ``parameters`` argument is dependent on the ``tapes`` argument; this function should always be called as follows: @@ -117,7 +117,7 @@ def vjp( def grad_fn(dy): """Returns the vector-Jacobian product with given - parameter values p and output gradient dy""" + parameter values and output gradient dy""" dy = dy[0] jacs = ans[1] @@ -160,7 +160,7 @@ def grad_fn(dy): vjps = qml.gradients._vector_jacobian_products(dy, jacs, reduction="append") else: - raise ValueError("Unknown gradient function!!!") + raise ValueError("Unknown gradient function.") return [qml.math.to_numpy(v, max_depth=_n) if isinstance(v, ArrayBox) else v for v in vjps] From f415d9f10626765f77670d6124ed71319001a64c Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 9 Aug 2021 15:51:53 +0800 Subject: [PATCH 11/45] fixes --- pennylane/gradients/__init__.py | 2 +- pennylane/gradients/vjp.py | 29 +++++++++++++++++++++++++---- tests/gradients/test_vjp.py | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/pennylane/gradients/__init__.py b/pennylane/gradients/__init__.py index 1863ff8d784..444e9fddb12 100644 --- a/pennylane/gradients/__init__.py +++ b/pennylane/gradients/__init__.py @@ -19,4 +19,4 @@ from .finite_difference import finite_diff, finite_diff_coeffs, generate_shifted_tapes from .parameter_shift import param_shift -from .vjp import batch_vjp, vjp +from .vjp import compute_vjp, batch_vjp, vjp diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index d0fa4d5058b..c7c1021d854 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -20,10 +20,31 @@ from pennylane import math -def _vector_jacobian_product(dy, jac): - """Compute the vector-Jacobian product for a given - vector of gradient outputs dy and a Jacobian Jac""" +def compute_vjp(dy, jac): + """Convenience function to compute the vector-Jacobian product for a given + vector of gradient outputs and a Jacobian. + + Args: + dy (tensor_like): vector of gradient outputs + jac (tensor_like): Jacobian matrix. For an n-dimensional ``dy`` + vector, the first n-dimensions of ``jac`` should match + the shape of ``dy``. + + Returns: + tensor_like: the vector-Jacobian product + """ + if jac is None: + return None + dy_row = math.reshape(dy, [-1]) + jac = math.reshape(jac, [dy_row.shape[0], -1]) + + if math.allclose(dy, 0): + # If the dy vector is zero, then the + # corresponding element of the VJP will be zero. 
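+        # Return a zero vector with one entry per trainable parameter,
+        # converted to the same tensor framework as ``dy``.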
+ num_params = jac.shape[1] + return math.convert_like(np.zeros([num_params]), dy) + return math.tensordot(jac, dy_row, [[0], [0]]) @@ -110,7 +131,7 @@ def vjp(tape, dy, gradient_fn): def processing_fn(results): # postprocess results to compute the Jacobian jac = fn(results) - return _vector_jacobian_product(dy, jac) + return compute_vjp(dy, jac) return gradient_tapes, processing_fn diff --git a/tests/gradients/test_vjp.py b/tests/gradients/test_vjp.py index fc59f8870b2..3be53760e59 100644 --- a/tests/gradients/test_vjp.py +++ b/tests/gradients/test_vjp.py @@ -21,6 +21,37 @@ from pennylane.gradients import param_shift +class TestComputeVJP: + """Tests for the numeric computation of VJPs""" + + def test_computation(self): + """Test that the correct VJP is returned""" + dy = np.array([[1.0, 2.0], [3.0, 4.0]]) + jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]) + + vjp = qml.gradients.compute_vjp(dy, jac) + + assert vjp.shape == (3,) + assert np.all(vjp == np.tensordot(dy, jac, axes=[[0, 1], [0, 1]])) + + def test_jacobian_is_none(self): + """A None Jacobian returns a None VJP""" + + dy = np.array([[1.0, 2.0], [3.0, 4.0]]) + jac = None + + vjp = qml.gradients.compute_vjp(dy, jac) + assert vjp is None + + def test_zero_dy(self): + """A zero dy vector will return a zero matrix""" + dy = np.zeros([2, 2]) + jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]) + + vjp = qml.gradients.compute_vjp(dy, jac) + assert np.all(vjp == np.zeros([3])) + + class TestVJP: """Tests for the vjp function""" @@ -222,7 +253,7 @@ def test_torch(self, tol): assert np.allclose(vjp.detach(), expected(params.detach()), atol=tol, rtol=0) - cost = vjp[0, 0] + cost = vjp[0] cost.backward() exp = qml.jacobian(lambda x: expected(x)[0])(params.detach().numpy()) From 6c5dc72e1e3aa0423b3f7b805aa156476aee7163 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 9 Aug 2021 19:57:02 +0800 Subject: [PATCH 12/45] add tests --- pennylane/_device.py | 59 +++ pennylane/_qubit_device.py | 15 - pennylane/gradients/vjp.py | 15 +- pennylane/interfaces/batch/__init__.py | 20 +- pennylane/interfaces/batch/autograd.py | 55 +- tests/interfaces/batch/test_tape_autograd.py | 529 +++++++++++++++++++ 6 files changed, 656 insertions(+), 37 deletions(-) create mode 100644 tests/interfaces/batch/test_tape_autograd.py diff --git a/pennylane/_device.py b/pennylane/_device.py index 637c79cc637..b68bb844769 100644 --- a/pennylane/_device.py +++ b/pennylane/_device.py @@ -496,6 +496,65 @@ def batch_execute(self, circuits): return results + def execute_and_gradients(self, circuits, method="jacobian", **kwargs): + """Execute a batch of quantum circuits on the device, and return both the + results and the gradients. + + The circuits are represented by tapes, and they are executed + one-by-one using the device's ``execute`` method. The results and the + corresponding Jacobians are collected in a list. + + For plugin developers: This method should be overwritten if the device + can efficiently run multiple circuits on a backend, for example using + parallel and/or asynchronous executions, and return both the results and the + Jacobians. + + Args: + circuits (list[.tape.QuantumTape]): circuits to execute on the device + method (str): the device method to call to compute the Jacobian of a single circuit + gradient_kwargs (dict): dictionary of keyword argument to pass + when calling ``method``. 
+ + Returns: + tuple[list[array[float]], list[array[float]]]: Tuple containing list of measured value(s) + and list of Jacobians. Returned Jacobians should be of shape ``(output_shape, num_params)``. + """ + gradient_method = getattr(self, method) + + res = [] + jacs = [] + + for circuit in circuits: + # Evaluations and gradients are paired, so that + # we can re-use the device state for the adjoint method + res.append(circuit.execute(self)) + jacs.append(gradient_method(circuit, **kwargs)) + + return res, jacs + + def gradients(self, circuits, method="jacobian", **kwargs): + """Return the gradients of a batch of quantum circuits on the device. + + The gradient method ``method`` is called sequentially for each + circuit, and the corresponding Jacobians are collected in a list. + + For plugin developers: This method should be overwritten if the device + can efficiently compute the gradient of multiple circuits on a + backend, for example using parallel and/or asynchronous executions. + + Args: + circuits (list[.tape.QuantumTape]): circuits to execute on the device + method (str): the device method to call to compute the Jacobian of a single circuit + gradient_kwargs (dict): dictionary of keyword argument to pass + when calling ``method``. + + Returns: + list[array[float]]: List of Jacobians. Returned Jacobians should be of + shape ``(output_shape, num_params)``. + """ + gradient_method = getattr(self, method) + return [gradient_method(circuit, **kwargs) for circuit in circuits] + @property def op_queue(self): """The operation queue to be applied. diff --git a/pennylane/_qubit_device.py b/pennylane/_qubit_device.py index 6a799409f9c..5561d9cca49 100644 --- a/pennylane/_qubit_device.py +++ b/pennylane/_qubit_device.py @@ -826,21 +826,6 @@ def sample(self, observable, shot_range=None, bin_size=None): return samples.reshape((bin_size, -1)) - def execute_and_gradients(self, circuits): - res = [] - jacs = [] - - for circuit in circuits: - # Evaluations and gradients are paired, so that - # we can re-use the device state for the adjoint method - res.append(circuit.execute(self)) - jacs.append(self.adjoint_jacobian(circuit, use_device_state=True)) - - return res, jacs - - def gradients(self, circuits): - return [self.adjoint_jacobian(circuit) for circuit in circuits] - def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): """Implements the adjoint method outlined in `Jones and Gacon `__ to differentiate an input tape. diff --git a/pennylane/gradients/vjp.py b/pennylane/gradients/vjp.py index c7c1021d854..57f21ae64fd 100644 --- a/pennylane/gradients/vjp.py +++ b/pennylane/gradients/vjp.py @@ -48,7 +48,7 @@ def compute_vjp(dy, jac): return math.tensordot(jac, dy_row, [[0], [0]]) -def vjp(tape, dy, gradient_fn): +def vjp(tape, dy, gradient_fn, gradient_kwargs=None): """Generate the gradient tapes and processing function required to compute the vector-Jacobian products of a tape. @@ -58,6 +58,8 @@ def vjp(tape, dy, gradient_fn): matching the output shape of the corresponding tape. gradient_fn (callable): the gradient transform to use to differentiate the tape + gradient_kwargs (dict): dictionary of keyword arguments to pass when + determining the gradients of tapes Returns: tensor_like or None: Vector-Jacobian product. 
Returns None if the tape @@ -113,6 +115,7 @@ def vjp(tape, dy, gradient_fn): """ # t._par_info = {} # t._update() + gradient_kwargs = gradient_kwargs or {} num_params = len(tape.trainable_params) if num_params == 0: @@ -126,7 +129,7 @@ def vjp(tape, dy, gradient_fn): # and we can avoid a quantum computation. return [], lambda _: math.convert_like(np.zeros([num_params]), dy) - gradient_tapes, fn = gradient_fn(tape) + gradient_tapes, fn = gradient_fn(tape, **gradient_kwargs) def processing_fn(results): # postprocess results to compute the Jacobian @@ -136,7 +139,7 @@ def processing_fn(results): return gradient_tapes, processing_fn -def batch_vjp(tapes, dys, gradient_fn, reduction="append"): +def batch_vjp(tapes, dys, gradient_fn, reduction="append", gradient_kwargs=None): """Generate the gradient tapes and processing function required to compute the vector-Jacobian products of a batch of tapes. @@ -151,6 +154,8 @@ def batch_vjp(tapes, dys, gradient_fn, reduction="append"): If ``append``, then the output of the function will be of the form ``List[tensor_like]``, with each element corresponding to the VJP of each input tape. If ``extend``, then the output VJPs will be concatenated. + gradient_kwargs (dict): dictionary of keyword arguments to pass when + determining the gradients of tapes Returns: List[tensor_like or None]: list of vector-Jacobian products. ``None`` elements corresponds @@ -218,13 +223,15 @@ def ansatz(x): tensor([[-4.7924e-01, -9.0857e-01, -2.4198e-01], [-9.2973e-02, -1.0772e+00, 4.7184e-09]], dtype=torch.float64) """ + gradient_kwargs = gradient_kwargs or {} + reshape_info = [] gradient_tapes = [] processing_fns = [] # Loop through the tapes and dys vector for tape, dy in zip(tapes, dys): - g_tapes, fn = vjp(tape, dy, gradient_fn) + g_tapes, fn = vjp(tape, dy, gradient_fn, gradient_kwargs) reshape_info.append(len(g_tapes)) processing_fns.append(fn) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 62a461792e2..71abc8189a0 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -23,7 +23,7 @@ from .autograd import execute as execute_autograd -def execute(tapes, device, gradient_fn, interface="autograd", accumulation="forward"): +def execute(tapes, device, gradient_fn, interface="autograd", mode="best", gradient_kwargs=None): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. Args: @@ -37,12 +37,13 @@ def execute(tapes, device, gradient_fn, interface="autograd", accumulation="forw interface (str): The interface that will be used for classical autodifferentiation. This affects the types of parameters that can exist on the input tapes. Available options include ``autograd``, ``torch``, ``tf``, and ``jax``. - accumulation (str): Whether the gradients should be computed on the forward + mode (str): Whether the gradients should be computed on the forward pass (``forward``) or the backward pass (``backward``). Only applies if the device is queried for the gradient; gradient transform functions available in ``qml.gradients`` are only supported on the backward pass. - + gradient_kwargs (dict): dictionary of keyword arguments to pass when + determining the gradients of tapes Returns: list[list[float]]: A nested list of tape results. Each element in @@ -104,21 +105,26 @@ def cost_fn(params, x): """ # Default execution function; simply call device.batch_execute # and return no Jacobians. 
- execute_fn = lambda tapes: (device.batch_execute(tapes), []) + execute_fn = lambda tapes, **kwargs: (device.batch_execute(tapes), []) + gradient_kwargs = gradient_kwargs or {} if gradient_fn == "device": # gradient function is a device method - if accumulation == "forward": + if mode in ("forward", "best"): # replace the forward execution function to return # both results and gradients execute_fn = device.execute_and_gradients + gradient_fn = None - elif accumulation == "backward": + elif mode == "backward": # replace the backward gradient computation gradient_fn = device.gradients + elif mode == "forward": + raise ValueError("Gradient transforms cannot be used with mode='forward'") + if interface == "autograd": - return execute_autograd(tapes, device, execute_fn, gradient_fn) + return execute_autograd(tapes, device, execute_fn, gradient_fn, gradient_kwargs) raise ValueError(f"Unknown interface {interface}") diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 3ea6df1c6bb..6f7270c1bbc 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -24,7 +24,7 @@ from pennylane import numpy as np -def execute(tapes, device, execute_fn, gradient_fn, _n=1): +def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1): """Execute a batch of tapes with Autograd parameters on a device. Args: @@ -36,6 +36,8 @@ def execute(tapes, device, execute_fn, gradient_fn, _n=1): during the forward pass. This function must return a tuple ``(results, jacobians)``. If ``jacobians`` is an empty list, then ``gradient_fn`` is used to compute the gradients during the backwards pass. + gradient_kwargs (dict): dictionary of keyword arguments to pass when + determining the gradients of tapes gradient_fn (callable): the gradient function to use to compute quantum gradients _n (int): a positive integer used to track nesting of derivatives, for example if the nth-order derivative is requested. @@ -44,6 +46,10 @@ def execute(tapes, device, execute_fn, gradient_fn, _n=1): list[list[float]]: A nested list of tape results. Each element in the returned list corresponds in order to the provided tapes. """ + for tape in tapes: + # set the trainable parameters + params = tape.get_parameters(trainable_only=False) + tape.trainable_params = qml.math.get_trainable_indices(params) parameters = autograd.builtins.tuple( [autograd.builtins.list(t.get_parameters()) for t in tapes] @@ -55,13 +61,20 @@ def execute(tapes, device, execute_fn, gradient_fn, _n=1): device=device, execute_fn=execute_fn, gradient_fn=gradient_fn, + gradient_kwargs=gradient_kwargs, _n=_n, )[0] @autograd.extend.primitive def _execute( - parameters, tapes=None, device=None, execute_fn=None, gradient_fn=None, _n=1 + parameters, + tapes=None, + device=None, + execute_fn=None, + gradient_fn=None, + gradient_kwargs=None, + _n=1, ): # pylint: disable=dangerous-default-value,unused-argument """Autodifferentiable wrapper around ``Device.batch_execute``. @@ -84,13 +97,27 @@ def _execute( understand the consequences! 
""" with qml.tape.Unwrap(*tapes): - res, jacs = execute_fn(tapes) + res, jacs = execute_fn(tapes, **gradient_kwargs) - return [np.tensor(r) for r in res], jacs + for i, r in enumerate(res): + res[i] = np.tensor(r) + + if r.dtype == np.dtype("object"): + # For backwards compatibility, we flatten ragged tape outputs + res[i] = np.hstack(r) + + return res, jacs def vjp( - ans, parameters, tapes=None, device=None, execute_fn=None, gradient_fn=None, _n=1 + ans, + parameters, + tapes=None, + device=None, + execute_fn=None, + gradient_fn=None, + gradient_kwargs=None, + _n=1, ): # pylint: disable=dangerous-default-value,unused-argument """Returns the vector-Jacobian product operator for a batch of quantum tapes. @@ -107,6 +134,8 @@ def vjp( If ``jacobians`` is an empty list, then ``gradient_fn`` is used to compute the gradients during the backwards pass. gradient_fn (callable): the gradient function to use to compute quantum gradients + gradient_kwargs (dict): dictionary of keyword arguments to pass when + determining the gradients of tapes _n (int): a positive integer used to track nesting of derivatives, for example if the nth-order derivative is requested. @@ -124,8 +153,8 @@ def grad_fn(dy): if jacs: # Jacobians were computed on the forward pass (accumulation="forward") - # Simply compute the vjps classically here. - vjps = qml.gradients._vector_jacobian_products(dy, jacs, reduction="append") + # No additional quantum evaluations needed; simply compute the VJPs directly. + vjps = [qml.gradients.compute_vjp(d, jac) for d, jac in zip(dy, jacs)] else: # Need to compute the Jacobians on the backward pass (accumulation="backward") @@ -138,12 +167,16 @@ def grad_fn(dy): if "pennylane.gradients" in inspect.getmodule(gradient_fn).__name__: # Generate and execute the required gradient tapes - vjp_tapes, fn = qml.gradients.batch_vjp(tapes, dy, gradient_fn, reduction="append") + vjp_tapes, processing_fn = qml.gradients.batch_vjp( + tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs + ) # This is where the magic happens. Note that we call ``execute``. # This recursion, coupled with the fact that the gradient transforms # are differentiable, allows for arbitrary order differentiation. - vjps = fn(execute(vjp_tapes, device, execute_fn, gradient_fn, _n=_n + 1)) + vjps = processing_fn( + execute(vjp_tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=_n + 1) + ) elif inspect.ismethod(gradient_fn) and gradient_fn.__self__ is device: # Gradient function is a device method. @@ -155,9 +188,9 @@ def grad_fn(dy): # so we cannot support higher-order derivatives. with qml.tape.Unwrap(*tapes): - jacs = gradient_fn(tapes) + jacs = gradient_fn(tapes, **gradient_kwargs) - vjps = qml.gradients._vector_jacobian_products(dy, jacs, reduction="append") + vjps = [qml.gradients.compute_vjp(d, jac) for d, jac in zip(dy, jacs)] else: raise ValueError("Unknown gradient function.") diff --git a/tests/interfaces/batch/test_tape_autograd.py b/tests/interfaces/batch/test_tape_autograd.py new file mode 100644 index 00000000000..8ddf83552be --- /dev/null +++ b/tests/interfaces/batch/test_tape_autograd.py @@ -0,0 +1,529 @@ +# Copyright 2018-2020 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the autograd interface""" +import functools + +import pytest +from pennylane import numpy as np + +import pennylane as qml +from pennylane.gradients import param_shift +from pennylane.interfaces.batch import execute + + +class TestAutogradExecuteUnitTests: + """Unit tests for the autograd execution""" + + def test_jacobian_options(self, mocker, tol): + """Test setting jacobian options""" + spy = mocker.spy(qml.gradients, "param_shift") + + a = np.array([0.1, 0.2], requires_grad=True) + + dev = qml.device("default.qubit", wires=1) + + def cost(a, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute( + [tape], device, gradient_fn=param_shift, gradient_kwargs={"shift": np.pi / 4} + )[0] + + res = qml.jacobian(cost)(a, device=dev) + + for args in spy.call_args_list: + assert args[1]["shift"] == np.pi / 4 + + def test_unknown_gradient_fn_error(self): + """Test that an error is raised if an unknown gradient function + is passed""" + a = np.array([0.1, 0.2], requires_grad=True) + + dev = qml.device("default.qubit", wires=1) + + def cost(a, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute([tape], device, gradient_fn=lambda x: x)[0] + + with pytest.raises(ValueError, match="Unknown gradient function"): + res = qml.jacobian(cost)(a, device=dev) + + def test_incorrect_mode(self): + """Test that an error is raised if an gradient transform + is used with mode=forward""" + a = np.array([0.1, 0.2], requires_grad=True) + + dev = qml.device("default.qubit", wires=1) + + def cost(a, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute([tape], device, gradient_fn=param_shift, mode="forward")[0] + + with pytest.raises(ValueError, match="Gradient transforms cannot be used with mode"): + res = qml.jacobian(cost)(a, device=dev) + + def test_unknown_interface(self): + """Test that an error is raised if the interface is unknown""" + a = np.array([0.1, 0.2], requires_grad=True) + + dev = qml.device("default.qubit", wires=1) + + def cost(a, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute([tape], device, gradient_fn=param_shift, interface=None)[0] + + with pytest.raises(ValueError, match="Unknown interface"): + cost(a, device=dev) + + +execute_kwargs = [ + {"gradient_fn": param_shift}, + { + "gradient_fn": "device", + "mode": "forward", + "gradient_kwargs": {"method": "adjoint_jacobian", "use_device_state": True}, + }, + { + "gradient_fn": "device", + "mode": "backward", + "gradient_kwargs": {"method": "adjoint_jacobian"}, + }, +] + + +@pytest.mark.parametrize("execute_kwargs", execute_kwargs) +class TestAutogradExecuteIntegration: + """Test the autograd interface execute function + integrates well for both forward and backward execution""" + + def test_execution(self, execute_kwargs): + """Test execution""" + dev = 
qml.device("default.qubit", wires=1) + + def cost(a, b): + with qml.tape.JacobianTape() as tape1: + qml.RY(a, wires=0) + qml.RX(b, wires=0) + qml.expval(qml.PauliZ(0)) + + with qml.tape.JacobianTape() as tape2: + qml.RY(a, wires=0) + qml.RX(b, wires=0) + qml.expval(qml.PauliZ(0)) + + return execute([tape1, tape2], dev, **execute_kwargs) + + a = np.array(0.1, requires_grad=True) + b = np.array(0.2, requires_grad=False) + res = cost(a, b) + + assert len(res) == 2 + assert res[0].shape == (1,) + assert res[1].shape == (1,) + + def test_scalar_jacobian(self, execute_kwargs, tol): + """Test scalar jacobian calculation""" + a = np.array(0.1, requires_grad=True) + dev = qml.device("default.qubit", wires=2) + + def cost(a): + with qml.tape.JacobianTape() as tape: + qml.RY(a, wires=0) + qml.expval(qml.PauliZ(0)) + return execute([tape], dev, **execute_kwargs)[0] + + res = qml.jacobian(cost)(a) + assert res.shape == (1,) + + # compare to standard tape jacobian + with qml.tape.JacobianTape() as tape: + qml.RY(a, wires=0) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {0} + tapes, fn = param_shift(tape) + expected = fn(dev.batch_execute(tapes)) + + assert expected.shape == (1, 1) + assert np.allclose(res, np.squeeze(expected), atol=tol, rtol=0) + + def test_jacobian(self, execute_kwargs, tol): + """Test jacobian calculation""" + a = np.array(0.1, requires_grad=True) + b = np.array(0.2, requires_grad=True) + + def cost(a, b, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a, wires=0) + qml.RX(b, wires=1) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.expval(qml.PauliY(1)) + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2) + + res = cost(a, b, device=dev) + expected = [np.cos(a), -np.cos(a) * np.sin(b)] + assert np.allclose(res, expected, atol=tol, rtol=0) + + res = qml.jacobian(cost)(a, b, device=dev) + assert res.shape == (2, 2) + + expected = [[-np.sin(a), 0], [np.sin(a) * np.sin(b), -np.cos(a) * np.cos(b)]] + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_reusing_quantum_tape(self, execute_kwargs, tol): + """Test re-using a quantum tape by passing new parameters""" + a = np.array(0.1, requires_grad=True) + b = np.array(0.2, requires_grad=True) + + dev = qml.device("default.qubit", wires=2) + + with qml.tape.JacobianTape() as tape: + qml.RY(a, wires=0) + qml.RX(b, wires=1) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.expval(qml.PauliY(1)) + + assert tape.trainable_params == {0, 1} + + def cost(a, b): + tape.set_parameters([a, b]) + return execute([tape], dev, **execute_kwargs)[0] + + jac_fn = qml.jacobian(cost) + jac = jac_fn(a, b) + + a = np.array(0.54, requires_grad=True) + b = np.array(0.8, requires_grad=True) + + res2 = cost(2 * a, b) + expected = [np.cos(2 * a), -np.cos(2 * a) * np.sin(b)] + assert np.allclose(res2, expected, atol=tol, rtol=0) + + jac_fn = qml.jacobian(lambda a, b: cost(2 * a, b)) + jac = jac_fn(a, b) + expected = [ + [-2 * np.sin(2 * a), 0], + [2 * np.sin(2 * a) * np.sin(b), -np.cos(2 * a) * np.cos(b)], + ] + assert np.allclose(jac, expected, atol=tol, rtol=0) + + def test_classical_processing(self, execute_kwargs, tol): + """Test classical processing within the quantum tape""" + a = np.array(0.1, requires_grad=True) + b = np.array(0.2, requires_grad=False) + c = np.array(0.3, requires_grad=True) + + def cost(a, b, c, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a * c, wires=0) + qml.RZ(b, wires=0) + qml.RX(c + c ** 2 + np.sin(a), wires=0) + 
qml.expval(qml.PauliZ(0)) + + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2) + res = qml.jacobian(cost)(a, b, c, device=dev) + assert res.shape == (1, 2) + + def test_no_trainable_parameters(self, execute_kwargs, tol): + """Test evaluation and Jacobian if there are no trainable parameters""" + a = np.array(0.1, requires_grad=False) + b = np.array(0.2, requires_grad=False) + + def cost(a, b, device): + with qml.tape.JacobianTape() as tape: + qml.RY(a, wires=0) + qml.RX(b, wires=0) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.expval(qml.PauliZ(1)) + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2) + res = cost(a, b, device=dev) + assert res.shape == (2,) + + res = qml.jacobian(cost)(a, b, device=dev) + assert not res + + def loss(a, b): + return np.sum(cost(a, b, device=dev)) + + with pytest.warns(UserWarning, match="Output seems independent"): + res = qml.grad(loss)(a, b) + + assert not res + + def test_matrix_parameter(self, execute_kwargs, tol): + """Test that the autograd interface works correctly + with a matrix parameter""" + U = np.array([[0, 1], [1, 0]], requires_grad=False) + a = np.array(0.1, requires_grad=True) + + def cost(a, U, device): + with qml.tape.JacobianTape() as tape: + qml.QubitUnitary(U, wires=0) + qml.RY(a, wires=0) + qml.expval(qml.PauliZ(0)) + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2) + res = cost(a, U, device=dev) + assert np.allclose(res, -np.cos(a), atol=tol, rtol=0) + + jac_fn = qml.jacobian(cost) + res = jac_fn(a, U, device=dev) + assert np.allclose(res, np.sin(a), atol=tol, rtol=0) + + def test_differentiable_expand(self, execute_kwargs, tol): + """Test that operation and nested tapes expansion + is differentiable""" + + class U3(qml.U3): + def expand(self): + tape = qml.tape.JacobianTape() + theta, phi, lam = self.data + wires = self.wires + tape._ops += [ + qml.Rot(lam, theta, -lam, wires=wires), + qml.PhaseShift(phi + lam, wires=wires), + ] + return tape + + def cost_fn(a, p, device): + tape = qml.tape.JacobianTape() + + with tape: + qml.RX(a, wires=0) + U3(*p, wires=0) + qml.expval(qml.PauliX(0)) + + tape = tape.expand(stop_at=lambda obj: device.supports_operation(obj.name)) + return execute([tape], device, **execute_kwargs)[0] + + a = np.array(0.1, requires_grad=False) + p = np.array([0.1, 0.2, 0.3], requires_grad=True) + + dev = qml.device("default.qubit", wires=1) + res = cost_fn(a, p, device=dev) + expected = np.cos(a) * np.cos(p[1]) * np.sin(p[0]) + np.sin(a) * ( + np.cos(p[2]) * np.sin(p[1]) + np.cos(p[0]) * np.cos(p[1]) * np.sin(p[2]) + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + jac_fn = qml.jacobian(cost_fn) + res = jac_fn(a, p, device=dev) + expected = np.array( + [ + np.cos(p[1]) * (np.cos(a) * np.cos(p[0]) - np.sin(a) * np.sin(p[0]) * np.sin(p[2])), + np.cos(p[1]) * np.cos(p[2]) * np.sin(a) + - np.sin(p[1]) + * (np.cos(a) * np.sin(p[0]) + np.cos(p[0]) * np.sin(a) * np.sin(p[2])), + np.sin(a) + * (np.cos(p[0]) * np.cos(p[1]) * np.cos(p[2]) - np.sin(p[1]) * np.sin(p[2])), + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_probability_differentiation(self, execute_kwargs, tol): + """Tests correct output shape and evaluation for a tape + with prob outputs""" + + if execute_kwargs["gradient_fn"] == "device": + pytest.skip("Adjoint differentiation does not yet support probabilities") + + def cost(x, y, device): + with qml.tape.JacobianTape() 
as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=[0]) + qml.probs(wires=[1]) + + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2) + x = np.array(0.543, requires_grad=True) + y = np.array(-0.654, requires_grad=True) + + res = cost(x, y, device=dev) + expected = np.array( + [ + [np.cos(x / 2) ** 2, np.sin(x / 2) ** 2], + [(1 + np.cos(x) * np.cos(y)) / 2, (1 - np.cos(x) * np.cos(y)) / 2], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + jac_fn = qml.jacobian(cost) + res = jac_fn(x, y, device=dev) + assert res.shape == (2, 2, 2) + + expected = np.array( + [ + [[-np.sin(x) / 2, 0], [np.sin(x) / 2, 0]], + [ + [-np.sin(x) * np.cos(y) / 2, -np.cos(x) * np.sin(y) / 2], + [np.cos(y) * np.sin(x) / 2, np.cos(x) * np.sin(y) / 2], + ], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_ragged_differentiation(self, execute_kwargs, tol): + """Tests correct output shape and evaluation for a tape + with prob and expval outputs""" + if execute_kwargs["gradient_fn"] == "device": + pytest.skip("Adjoint differentiation does not yet support probabilities") + + def cost(x, y, device): + with qml.tape.JacobianTape() as tape: + qml.RX(x, wires=[0]) + qml.RY(y, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + qml.probs(wires=[1]) + + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2) + x = np.array(0.543, requires_grad=True) + y = np.array(-0.654, requires_grad=True) + + res = cost(x, y, device=dev) + expected = np.array( + [np.cos(x), (1 + np.cos(x) * np.cos(y)) / 2, (1 - np.cos(x) * np.cos(y)) / 2] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + jac_fn = qml.jacobian(cost) + res = jac_fn(x, y, device=dev) + expected = np.array( + [ + [-np.sin(x), 0], + [-np.sin(x) * np.cos(y) / 2, -np.cos(x) * np.sin(y) / 2], + [np.cos(y) * np.sin(x) / 2, np.cos(x) * np.sin(y) / 2], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_sampling(self, execute_kwargs): + """Test sampling works as expected""" + if execute_kwargs["gradient_fn"] == "device" and execute_kwargs["mode"] == "forward": + pytest.skip("Adjoint differentiation does not support samples") + + def cost(x, device): + with qml.tape.JacobianTape() as tape: + qml.Hadamard(wires=[0]) + qml.CNOT(wires=[0, 1]) + qml.sample(qml.PauliZ(0)) + qml.sample(qml.PauliX(1)) + + return execute([tape], device, **execute_kwargs)[0] + + dev = qml.device("default.qubit", wires=2, shots=10) + x = np.array(0.543, requires_grad=True) + res = cost(x, device=dev) + assert res.shape == (2, 10) + + +class TestHigherOrderDerivatives: + """Test that the autograd execute function can be differentiated""" + + def test_parameter_shift_hessian(self, tol): + """Tests that the output of the parameter-shift transform + can be differentiated using autograd, yielding second derivatives.""" + dev = qml.device("default.qubit.autograd", wires=2) + params = np.array([0.543, -0.654], requires_grad=True) + + def cost_fn(x): + with qml.tape.JacobianTape() as tape1: + qml.RX(x[0], wires=[0]) + qml.RY(x[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(x[0], wires=0) + qml.RY(x[0], wires=1) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=1) + + result = execute([tape1, tape2], dev, gradient_fn=param_shift) + return result[0] + result[1][0, 0] + + res = cost_fn(params) + x, y = params + expected = 
0.5 * (3 + np.cos(x) ** 2 * np.cos(2 * y)) + assert np.allclose(res, expected, atol=tol, rtol=0) + + res = qml.grad(cost_fn)(params) + expected = np.array( + [-np.cos(x) * np.cos(2 * y) * np.sin(x), -np.cos(x) ** 2 * np.sin(2 * y)] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + res = qml.jacobian(qml.grad(cost_fn))(params) + expected = np.array( + [ + [-np.cos(2 * x) * np.cos(2 * y), np.sin(2 * x) * np.sin(2 * y)], + [np.sin(2 * x) * np.sin(2 * y), -2 * np.cos(x) ** 2 * np.cos(2 * y)], + ] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + def test_adjoint_hessian(self, tol): + """Since the adjoint hessian is not a differentiable transform, + higher-order derivatives are not supported.""" + dev = qml.device("default.qubit.autograd", wires=2) + params = np.array([0.543, -0.654], requires_grad=True) + + def cost_fn(x): + with qml.tape.JacobianTape() as tape: + qml.RX(x[0], wires=[0]) + qml.RY(x[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + return execute( + [tape], + dev, + gradient_fn="device", + gradient_kwargs={"method": "adjoint_jacobian", "use_device_state": True}, + )[0] + + with pytest.warns(UserWarning, match="Output seems independent"): + res = qml.jacobian(qml.grad(cost_fn))(params) + + assert np.allclose(res, np.zeros([2, 2]), atol=tol, rtol=0) From 11f20b3b9d0b60c7dbae3733081e7e05ce41fb56 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 9 Aug 2021 20:11:21 +0800 Subject: [PATCH 13/45] more tests --- pennylane/interfaces/batch/autograd.py | 4 +- tests/interfaces/batch/test_tape_autograd.py | 55 ++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 6f7270c1bbc..3aab562bf6f 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -163,8 +163,10 @@ def grad_fn(dy): # For the moment, simply check if it is part of the `qml.gradients` package. 
# Longer term, we should have a way of checking this directly # (e.g., isinstance(gradient_fn, GradientTransform)) + module_name = getattr(inspect.getmodule(gradient_fn), "__name__", "") + print(gradient_fn, gradient_fn.__module__, inspect.ismethod(gradient_fn)) - if "pennylane.gradients" in inspect.getmodule(gradient_fn).__name__: + if "pennylane.gradients" in module_name: # Generate and execute the required gradient tapes vjp_tapes, processing_fn = qml.gradients.batch_vjp( diff --git a/tests/interfaces/batch/test_tape_autograd.py b/tests/interfaces/batch/test_tape_autograd.py index 8ddf83552be..dbf5e297d88 100644 --- a/tests/interfaces/batch/test_tape_autograd.py +++ b/tests/interfaces/batch/test_tape_autograd.py @@ -101,6 +101,61 @@ def cost(a, device): with pytest.raises(ValueError, match="Unknown interface"): cost(a, device=dev) + def test_forward_mode(self, mocker): + """Test that forward mode uses the `device.execute_and_gradients` pathway""" + dev = qml.device("default.qubit", wires=1) + spy = mocker.spy(dev, "execute_and_gradients") + + def cost(a): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute( + [tape], + dev, + gradient_fn="device", + gradient_kwargs={"method": "adjoint_jacobian", "use_device_state": True}, + )[0] + + a = np.array([0.1, 0.2], requires_grad=True) + cost(a) + + # adjoint method only performs a single device execution, but gets both result and gradient + assert dev.num_executions == 1 + spy.assert_called() + + def test_backward_mode(self, mocker): + """Test that backward mode uses the `device.batch_execute` and `device.gradients` pathway""" + dev = qml.device("default.qubit", wires=1) + spy_execute = mocker.spy(qml.devices.DefaultQubit, "batch_execute") + spy_gradients = mocker.spy(qml.devices.DefaultQubit, "gradients") + + def cost(a): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute( + [tape], + dev, + gradient_fn="device", + mode="backward", + gradient_kwargs={"method": "adjoint_jacobian"}, + )[0] + + a = np.array([0.1, 0.2], requires_grad=True) + cost(a) + + assert dev.num_executions == 1 + spy_execute.assert_called() + spy_gradients.assert_not_called() + + qml.jacobian(cost)(a) + spy_gradients.assert_called() + execute_kwargs = [ {"gradient_fn": param_shift}, From 122194c5c55bae766cfd18c6462bc67cf61e3ea2 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 9 Aug 2021 20:14:05 +0800 Subject: [PATCH 14/45] renamed --- .../{batch/test_tape_autograd.py => test_batch_autograd.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/interfaces/{batch/test_tape_autograd.py => test_batch_autograd.py} (100%) diff --git a/tests/interfaces/batch/test_tape_autograd.py b/tests/interfaces/test_batch_autograd.py similarity index 100% rename from tests/interfaces/batch/test_tape_autograd.py rename to tests/interfaces/test_batch_autograd.py From e98c835a70e65c91e22c6843425bd484d3d3dec0 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 9 Aug 2021 20:57:24 +0800 Subject: [PATCH 15/45] typo --- pennylane/_device.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pennylane/_device.py b/pennylane/_device.py index b68bb844769..1fbf03745c5 100644 --- a/pennylane/_device.py +++ b/pennylane/_device.py @@ -512,8 +512,7 @@ def execute_and_gradients(self, circuits, method="jacobian", **kwargs): Args: circuits (list[.tape.QuantumTape]): circuits to execute on the device method 
(str): the device method to call to compute the Jacobian of a single circuit - gradient_kwargs (dict): dictionary of keyword argument to pass - when calling ``method``. + **kwargs: keyword argument to pass when calling ``method``. Returns: tuple[list[array[float]], list[array[float]]]: Tuple containing list of measured value(s) @@ -545,8 +544,7 @@ def gradients(self, circuits, method="jacobian", **kwargs): Args: circuits (list[.tape.QuantumTape]): circuits to execute on the device method (str): the device method to call to compute the Jacobian of a single circuit - gradient_kwargs (dict): dictionary of keyword argument to pass - when calling ``method``. + **kwargs: keyword argument to pass when calling ``method``. Returns: list[array[float]]: List of Jacobians. Returned Jacobians should be of From 59569673432effca8993da52c9d70356ce719543 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Tue, 10 Aug 2021 01:07:13 +0800 Subject: [PATCH 16/45] Add caching to the autograd backend --- pennylane/gradients/parameter_shift.py | 28 +++++++++++++++++++++++++ pennylane/interfaces/batch/__init__.py | 6 +++--- pennylane/interfaces/batch/autograd.py | 10 +++++++-- pennylane/tape/tape.py | 19 +++++++++++++++++ tests/interfaces/test_batch_autograd.py | 20 ++++++++++++++++++ 5 files changed, 78 insertions(+), 5 deletions(-) diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index 38005ddc334..35368ae54d5 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -16,6 +16,7 @@ of a qubit-based quantum tape. """ # pylint: disable=protected-access,too-many-arguments +import functools import numpy as np import pennylane as qml @@ -34,6 +35,7 @@ """ +@functools.lru_cache def _square_observable(obs): """Returns the square of an observable.""" @@ -57,6 +59,7 @@ def _square_observable(obs): return NONINVOLUTORY_OBS[obs.name](obs) +@functools.lru_cache def _get_operation_recipe(tape, t_idx, shift=np.pi / 2): """Utility function to return the parameter-shift rule of the operation corresponding to trainable parameter @@ -91,6 +94,7 @@ def _process_gradient_recipe(gradient_recipe, tol=1e-10): return gradient_recipe[:, np.argsort(np.abs(gradient_recipe)[-1])] +@functools.lru_cache def _gradient_analysis(tape, use_graph=True): """Update the parameter information dictionary of the tape with gradient information of each parameter.""" @@ -349,6 +353,29 @@ def processing_fn(results): return gradient_tapes, processing_fn +def freezeargs(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args]) + + new_kwargs = {} + for k, v in kwargs.items(): + if k == "gradient_recipes": + new_kwargs[k] = tuple([tuple([tuple(z) for z in y]) for y in v]) + elif isinstance(v, list): + new_kwargs[k] = tuple(v) + elif isinstance(v, np.ndarray): + new_kwargs[k] = tuple(qml.math.array(v).tolist()) + else: + new_kwargs[k] = v + + return func(*args, **new_kwargs) + + return wrapped + + +@freezeargs +@functools.lru_cache def param_shift( tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None ): @@ -455,6 +482,7 @@ def param_shift( [[-0.38751721 -0.18884787 -0.38355704] [ 0.69916862 0.34072424 0.69202359]] """ + f0 = np.array(f0) if f0 is not None else None # perform gradient method validation if any(m.return_type is qml.operation.State for m in tape.measurements): diff --git a/pennylane/interfaces/batch/__init__.py 
b/pennylane/interfaces/batch/__init__.py index 71abc8189a0..35c62611efe 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -16,7 +16,7 @@ capabilities with different machine learning libraries. """ # pylint: disable=import-outside-toplevel) -from functools import partial +from functools import partial, lru_cache import pennylane as qml @@ -105,7 +105,7 @@ def cost_fn(params, x): """ # Default execution function; simply call device.batch_execute # and return no Jacobians. - execute_fn = lambda tapes, **kwargs: (device.batch_execute(tapes), []) + execute_fn = lru_cache(lambda tapes, **kwargs: (device.batch_execute(tapes), [])) gradient_kwargs = gradient_kwargs or {} if gradient_fn == "device": @@ -125,6 +125,6 @@ def cost_fn(params, x): raise ValueError("Gradient transforms cannot be used with mode='forward'") if interface == "autograd": - return execute_autograd(tapes, device, execute_fn, gradient_fn, gradient_kwargs) + return execute_autograd(tuple(tapes), device, execute_fn, gradient_fn, gradient_kwargs) raise ValueError(f"Unknown interface {interface}") diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 3aab562bf6f..378f0481dc5 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -164,7 +164,6 @@ def grad_fn(dy): # Longer term, we should have a way of checking this directly # (e.g., isinstance(gradient_fn, GradientTransform)) module_name = getattr(inspect.getmodule(gradient_fn), "__name__", "") - print(gradient_fn, gradient_fn.__module__, inspect.ismethod(gradient_fn)) if "pennylane.gradients" in module_name: @@ -177,7 +176,14 @@ def grad_fn(dy): # This recursion, coupled with the fact that the gradient transforms # are differentiable, allows for arbitrary order differentiation. 
vjps = processing_fn( - execute(vjp_tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=_n + 1) + execute( + tuple(vjp_tapes), + device, + execute_fn, + gradient_fn, + gradient_kwargs, + _n=_n + 1, + ) ) elif inspect.ismethod(gradient_fn) and gradient_fn.__self__ is device: diff --git a/pennylane/tape/tape.py b/pennylane/tape/tape.py index 4ea8c2f01ea..425da98c399 100644 --- a/pennylane/tape/tape.py +++ b/pennylane/tape/tape.py @@ -985,6 +985,25 @@ def diagonalizing_gates(self): return rotation_gates + def __hash__(self): + fingerprint = [] + + for p in self.get_parameters(trainable_only=False): + if isinstance(p, (int, float)): + fingerprint.append(p) + else: + q = qml.math.toarray(p).tolist() + + if isinstance(q, (int, float)): + fingerprint.append(q) + else: + fingerprint.append(tuple(qml.math.flatten(qml.math.toarray(p)).tolist())) + + fingerprint = tuple(fingerprint) + fingerprint += tuple(self.operations) + fingerprint += tuple(self.measurements) + return hash(fingerprint) + @property def graph(self): """Returns a directed acyclic graph representation of the recorded diff --git a/tests/interfaces/test_batch_autograd.py b/tests/interfaces/test_batch_autograd.py index dbf5e297d88..a0c108ac01d 100644 --- a/tests/interfaces/test_batch_autograd.py +++ b/tests/interfaces/test_batch_autograd.py @@ -156,6 +156,26 @@ def cost(a): qml.jacobian(cost)(a) spy_gradients.assert_called() + def test_caching(self, tol): + dev = qml.device("default.qubit", wires=1) + + def cost(a): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.expval(qml.PauliZ(0)) + + return execute([tape], dev, gradient_fn=param_shift)[0] + + params = np.array([0.1, 0.2]) + grad1 = qml.jacobian(cost)(params) + assert dev.num_executions == 5 + + grad2 = qml.jacobian(cost)(2 * params) + assert dev.num_executions == 10 + + assert not np.allclose(grad1, grad2, atol=tol, rtol=0) + execute_kwargs = [ {"gradient_fn": param_shift}, From 8e3159f22557cd9c3d4a31ca9e8919750f3975ba Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Tue, 10 Aug 2021 01:13:58 +0800 Subject: [PATCH 17/45] more --- pennylane/gradients/parameter_shift.py | 8 ++++---- pennylane/interfaces/batch/__init__.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index 35368ae54d5..87f64efca13 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -35,7 +35,7 @@ """ -@functools.lru_cache +@functools.lru_cache() def _square_observable(obs): """Returns the square of an observable.""" @@ -59,7 +59,7 @@ def _square_observable(obs): return NONINVOLUTORY_OBS[obs.name](obs) -@functools.lru_cache +@functools.lru_cache() def _get_operation_recipe(tape, t_idx, shift=np.pi / 2): """Utility function to return the parameter-shift rule of the operation corresponding to trainable parameter @@ -94,7 +94,7 @@ def _process_gradient_recipe(gradient_recipe, tol=1e-10): return gradient_recipe[:, np.argsort(np.abs(gradient_recipe)[-1])] -@functools.lru_cache +@functools.lru_cache() def _gradient_analysis(tape, use_graph=True): """Update the parameter information dictionary of the tape with gradient information of each parameter.""" @@ -375,7 +375,7 @@ def wrapped(*args, **kwargs): @freezeargs -@functools.lru_cache +@functools.lru_cache() def param_shift( tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None ): diff --git a/pennylane/interfaces/batch/__init__.py 
b/pennylane/interfaces/batch/__init__.py index 35c62611efe..e8a7ad50e65 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -105,7 +105,7 @@ def cost_fn(params, x): """ # Default execution function; simply call device.batch_execute # and return no Jacobians. - execute_fn = lru_cache(lambda tapes, **kwargs: (device.batch_execute(tapes), [])) + execute_fn = lru_cache()(lambda tapes, **kwargs: (device.batch_execute(tapes), [])) gradient_kwargs = gradient_kwargs or {} if gradient_fn == "device": From b36ec30a1c077858ab26e31509274d4e498783c4 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Tue, 10 Aug 2021 01:53:55 +0800 Subject: [PATCH 18/45] more --- pennylane/gradients/parameter_shift.py | 24 ++++++++++++++++++++++-- pennylane/tape/tape.py | 19 ------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index 87f64efca13..e33fe5f7a63 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -358,6 +358,19 @@ def freezeargs(func): def wrapped(*args, **kwargs): args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args]) + fingerprint = [] + + for p in args[0].get_parameters(trainable_only=False): + if isinstance(p, (int, float)): + fingerprint.append(p) + else: + q = qml.math.toarray(p).tolist() + + if isinstance(q, (int, float)): + fingerprint.append(q) + else: + fingerprint.append(tuple(qml.math.flatten(qml.math.toarray(p)).tolist())) + new_kwargs = {} for k, v in kwargs.items(): if k == "gradient_recipes": @@ -369,7 +382,7 @@ def wrapped(*args, **kwargs): else: new_kwargs[k] = v - return func(*args, **new_kwargs) + return func(*args, hash=tuple(fingerprint), **new_kwargs) return wrapped @@ -377,7 +390,13 @@ def wrapped(*args, **kwargs): @freezeargs @functools.lru_cache() def param_shift( - tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None + tape, + hash=None, + argnum=None, + shift=np.pi / 2, + gradient_recipes=None, + fallback_fn=finite_diff, + f0=None, ): r"""Generate the parameter-shift tapes and postprocessing methods required to compute the gradient of a gate parameter with respect to an @@ -482,6 +501,7 @@ def param_shift( [[-0.38751721 -0.18884787 -0.38355704] [ 0.69916862 0.34072424 0.69202359]] """ + print("hello") f0 = np.array(f0) if f0 is not None else None # perform gradient method validation diff --git a/pennylane/tape/tape.py b/pennylane/tape/tape.py index 425da98c399..4ea8c2f01ea 100644 --- a/pennylane/tape/tape.py +++ b/pennylane/tape/tape.py @@ -985,25 +985,6 @@ def diagonalizing_gates(self): return rotation_gates - def __hash__(self): - fingerprint = [] - - for p in self.get_parameters(trainable_only=False): - if isinstance(p, (int, float)): - fingerprint.append(p) - else: - q = qml.math.toarray(p).tolist() - - if isinstance(q, (int, float)): - fingerprint.append(q) - else: - fingerprint.append(tuple(qml.math.flatten(qml.math.toarray(p)).tolist())) - - fingerprint = tuple(fingerprint) - fingerprint += tuple(self.operations) - fingerprint += tuple(self.measurements) - return hash(fingerprint) - @property def graph(self): """Returns a directed acyclic graph representation of the recorded From 3bd36bf97c1e8d0a6741cc4b2c172b32e12999d2 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 01:03:40 +0800 Subject: [PATCH 19/45] more --- pennylane/_qubit_device.py | 1 - pennylane/gradients/parameter_shift.py | 61 
++++++++++----------- pennylane/interfaces/batch/__init__.py | 74 +++++++++++++++++++++++--- pennylane/interfaces/batch/autograd.py | 5 +- 4 files changed, 100 insertions(+), 41 deletions(-) diff --git a/pennylane/_qubit_device.py b/pennylane/_qubit_device.py index 5561d9cca49..30f60651ba4 100644 --- a/pennylane/_qubit_device.py +++ b/pennylane/_qubit_device.py @@ -268,7 +268,6 @@ def batch_execute(self, circuits): """ # TODO: This method and the tests can be globally implemented by Device # once it has the same signature in the execute() method - results = [] for circuit in circuits: # we need to reset the device here, else it will diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index e33fe5f7a63..632d36fbc02 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -353,45 +353,43 @@ def processing_fn(results): return gradient_tapes, processing_fn -def freezeargs(func): - @functools.wraps(func) - def wrapped(*args, **kwargs): - args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args]) +from cachetools import cached +from cachetools.keys import hashkey - fingerprint = [] - for p in args[0].get_parameters(trainable_only=False): - if isinstance(p, (int, float)): - fingerprint.append(p) - else: - q = qml.math.toarray(p).tolist() - - if isinstance(q, (int, float)): - fingerprint.append(q) - else: - fingerprint.append(tuple(qml.math.flatten(qml.math.toarray(p)).tolist())) - - new_kwargs = {} - for k, v in kwargs.items(): - if k == "gradient_recipes": - new_kwargs[k] = tuple([tuple([tuple(z) for z in y]) for y in v]) - elif isinstance(v, list): - new_kwargs[k] = tuple(v) - elif isinstance(v, np.ndarray): - new_kwargs[k] = tuple(qml.math.array(v).tolist()) - else: - new_kwargs[k] = v +def tape_hash(tape): + fingerprint = [] + fingerprint.extend( + ( + str(op.name), + tuple(op.wires.tolist()), + str(op.data), + ) + for op in tape.operations + ) + fingerprint.extend( + (str(op.name), tuple(op.wires.tolist()), str(op.data), op.return_type) + for op in tape.measurements + ) + fingerprint = tuple(item for sublist in fingerprint for item in sublist) + return hash(fingerprint) + + +def key( + tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None +): + f0 = qml.math.toarray(f0).tolist() if f0 is not None else None + argnum = tuple(argnum) if argnum is not None else None - return func(*args, hash=tuple(fingerprint), **new_kwargs) + if gradient_recipes is not None: + gradient_recipes = tuple(tuple(y) for y in gradient_recipes) - return wrapped + return hashkey((tape_hash(tape), argnum, shift, gradient_recipes, fallback_fn, f0)) -@freezeargs -@functools.lru_cache() +@cached(cache={}, key=key) def param_shift( tape, - hash=None, argnum=None, shift=np.pi / 2, gradient_recipes=None, @@ -501,7 +499,6 @@ def param_shift( [[-0.38751721 -0.18884787 -0.38355704] [ 0.69916862 0.34072424 0.69202359]] """ - print("hello") f0 = np.array(f0) if f0 is not None else None # perform gradient method validation diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index e8a7ad50e65..7a72f7c4338 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -16,13 +16,66 @@ capabilities with different machine learning libraries. 
""" # pylint: disable=import-outside-toplevel) -from functools import partial, lru_cache +from functools import partial import pennylane as qml from .autograd import execute as execute_autograd +from collections import OrderedDict + + +def tape_hash(tape): + fingerprint = [] + fingerprint.extend( + ( + str(op.name), + tuple(op.wires.tolist()), + str(op.data), + ) + for op in tape.operations + ) + fingerprint.extend( + (str(op.name), tuple(op.wires.tolist()), str(op.data), op.return_type) + for op in tape.measurements + ) + fingerprint = tuple(item for sublist in fingerprint for item in sublist) + return hash(fingerprint) + + +def execute_fn_wrapper(tapes, device, **kwargs): + cache = kwargs.pop("cache", None) + + if cache is None: + return device.batch_execute(tapes), [] + + execution_tapes = OrderedDict() + cached_results = {} + hashes = {} + + for i, tape in enumerate(tapes): + hashes[i] = tape_hash(tape) + + if hashes[i] in cache: + cached_results[i] = cache[hashes[i]] + else: + execution_tapes[i] = tape + + res = device.batch_execute(execution_tapes.values()) + final_res = [] + + for i, tape in enumerate(tapes): + if i in cached_results: + final_res.append(cached_results[i]) + else: + r = res.pop(0) + final_res.append(r) + cache[hashes[i]] = r + + return final_res, [] + + def execute(tapes, device, gradient_fn, interface="autograd", mode="best", gradient_kwargs=None): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. @@ -103,9 +156,6 @@ def cost_fn(params, x): [ 0.01983384, -0.97517033, 0. ], [ 0. , 0. , -0.95533649]]) """ - # Default execution function; simply call device.batch_execute - # and return no Jacobians. - execute_fn = lru_cache()(lambda tapes, **kwargs: (device.batch_execute(tapes), [])) gradient_kwargs = gradient_kwargs or {} if gradient_fn == "device": @@ -119,12 +169,24 @@ def cost_fn(params, x): elif mode == "backward": # replace the backward gradient computation + execute_fn = lambda tapes, **kwargs: (device.batch_execute(tapes), []) gradient_fn = device.gradients elif mode == "forward": raise ValueError("Gradient transforms cannot be used with mode='forward'") + else: + # gradient function is a transform + gradient_kwargs["cache"] = {} + execute_fn = lambda tapes, **kwargs: execute_fn_wrapper(tapes, device, **kwargs) + if interface == "autograd": - return execute_autograd(tuple(tapes), device, execute_fn, gradient_fn, gradient_kwargs) + res = execute_autograd(tuple(tapes), device, execute_fn, gradient_fn, gradient_kwargs) + else: + raise ValueError(f"Unknown interface {interface}") + + if "cache" in gradient_kwargs: + # clear the cache + gradient_kwargs["cache"] = {} - raise ValueError(f"Unknown interface {interface}") + return res diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 378f0481dc5..095b4c8bf41 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -143,6 +143,7 @@ def vjp( function: this function accepts the backpropagation gradient output vector, and computes the vector-Jacobian product """ + g_kwargs = {x: gradient_kwargs[x] for x in gradient_kwargs if "cache" not in x} def grad_fn(dy): """Returns the vector-Jacobian product with given @@ -169,7 +170,7 @@ def grad_fn(dy): # Generate and execute the required gradient tapes vjp_tapes, processing_fn = qml.gradients.batch_vjp( - tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs + tapes, dy, gradient_fn, reduction="append", gradient_kwargs=g_kwargs ) # This 
is where the magic happens. Note that we call ``execute``. @@ -177,7 +178,7 @@ def grad_fn(dy): # are differentiable, allows for arbitrary order differentiation. vjps = processing_fn( execute( - tuple(vjp_tapes), + vjp_tapes, device, execute_fn, gradient_fn, From d6442289216196e0a07bb99f86267ae30ac44b7c Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 01:32:36 +0800 Subject: [PATCH 20/45] more --- pennylane/interfaces/batch/__init__.py | 15 +++++++++++++-- pennylane/interfaces/batch/autograd.py | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 7a72f7c4338..63696d74571 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -19,6 +19,7 @@ from functools import partial import pennylane as qml +import numpy as np from .autograd import execute as execute_autograd @@ -32,7 +33,7 @@ def tape_hash(tape): ( str(op.name), tuple(op.wires.tolist()), - str(op.data), + str([np.mod(d, 2*np.pi) for d in op.data]), ) for op in tape.operations ) @@ -53,9 +54,17 @@ def execute_fn_wrapper(tapes, device, **kwargs): execution_tapes = OrderedDict() cached_results = {} hashes = {} + repeated = {} for i, tape in enumerate(tapes): - hashes[i] = tape_hash(tape) + h = tape_hash(tape) + + if h in hashes.values(): + idx = list(hashes.keys())[list(hashes.values()).index(h)] + repeated[i] = idx + continue + + hashes[i] = h if hashes[i] in cache: cached_results[i] = cache[hashes[i]] @@ -68,6 +77,8 @@ def execute_fn_wrapper(tapes, device, **kwargs): for i, tape in enumerate(tapes): if i in cached_results: final_res.append(cached_results[i]) + elif i in repeated: + final_res.append(final_res[repeated[i]]) else: r = res.pop(0) final_res.append(r) diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 095b4c8bf41..588c5652bc7 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -144,8 +144,9 @@ def vjp( gradient output vector, and computes the vector-Jacobian product """ g_kwargs = {x: gradient_kwargs[x] for x in gradient_kwargs if "cache" not in x} + cache = gradient_kwargs.get("cache", {}) - def grad_fn(dy): + def grad_fn(dy, cache=cache): """Returns the vector-Jacobian product with given parameter values and output gradient dy""" @@ -173,6 +174,8 @@ def grad_fn(dy): tapes, dy, gradient_fn, reduction="append", gradient_kwargs=g_kwargs ) + gradient_kwargs["cache"] = cache + # This is where the magic happens. Note that we call ``execute``. # This recursion, coupled with the fact that the gradient transforms # are differentiable, allows for arbitrary order differentiation. 
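The patches above introduce an explicit per-tape cache for the batch execution pipeline: each tape is fingerprinted, only cache misses are sent to ``device.batch_execute``, and duplicates within the same batch are executed once. Below is a minimal, self-contained sketch of that strategy under simplified assumptions — ``cached_batch_execute``, ``batch_execute`` and ``fingerprint`` are illustrative placeholders, not the PennyLane API.

.. code-block:: python

    def cached_batch_execute(tapes, batch_execute, cache, fingerprint):
        """Execute only the tapes whose fingerprint is not cached, reusing
        cached results and results of duplicates within the same batch."""
        order = []   # (source, key-or-index) pairs used to rebuild the output
        to_run = []  # tapes that genuinely require a device execution
        seen = {}    # fingerprint -> index into ``to_run``

        for tape in tapes:
            key = fingerprint(tape)
            if key in cache:
                order.append(("cached", key))
            elif key in seen:
                order.append(("fresh", seen[key]))
            else:
                seen[key] = len(to_run)
                to_run.append(tape)
                order.append(("fresh", seen[key]))

        fresh = batch_execute(to_run) if to_run else []

        # store newly computed results for future calls
        for key, idx in seen.items():
            cache[key] = fresh[idx]

        return [cache[ref] if src == "cached" else fresh[ref] for src, ref in order]

For example, ``cached_batch_execute(["t1", "t2", "t1"], lambda ts: [t.upper() for t in ts], {}, str)`` performs a single batched call on ``["t1", "t2"]`` and returns ``["T1", "T2", "T1"]``.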
From 81bd371fdd2fc45b1ecefb44186371d44b719456 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 01:54:39 +0800 Subject: [PATCH 21/45] caching --- pennylane/interfaces/batch/__init__.py | 19 ++++++++++++++++--- pennylane/interfaces/batch/autograd.py | 5 +---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 63696d74571..2938d992505 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -27,13 +27,23 @@ from collections import OrderedDict +def _process_data(op): + if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): + return str([np.mod(d, 2 * np.pi) for d in op.data]) + + if op.name in ("CRX", "CRY", "CRZ", "CRot"): + return str([np.mod(d, 4 * np.pi) for d in op.data]) + + return str(op.data) + + def tape_hash(tape): fingerprint = [] fingerprint.extend( ( str(op.name), tuple(op.wires.tolist()), - str([np.mod(d, 2*np.pi) for d in op.data]), + _process_data(op), ) for op in tape.operations ) @@ -87,7 +97,9 @@ def execute_fn_wrapper(tapes, device, **kwargs): return final_res, [] -def execute(tapes, device, gradient_fn, interface="autograd", mode="best", gradient_kwargs=None): +def execute( + tapes, device, gradient_fn, interface="autograd", mode="best", gradient_kwargs=None, cache=True +): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. Args: @@ -188,7 +200,8 @@ def cost_fn(params, x): else: # gradient function is a transform - gradient_kwargs["cache"] = {} + if cache: + gradient_kwargs["cache"] = {} execute_fn = lambda tapes, **kwargs: execute_fn_wrapper(tapes, device, **kwargs) if interface == "autograd": diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 588c5652bc7..095b4c8bf41 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -144,9 +144,8 @@ def vjp( gradient output vector, and computes the vector-Jacobian product """ g_kwargs = {x: gradient_kwargs[x] for x in gradient_kwargs if "cache" not in x} - cache = gradient_kwargs.get("cache", {}) - def grad_fn(dy, cache=cache): + def grad_fn(dy): """Returns the vector-Jacobian product with given parameter values and output gradient dy""" @@ -174,8 +173,6 @@ def grad_fn(dy, cache=cache): tapes, dy, gradient_fn, reduction="append", gradient_kwargs=g_kwargs ) - gradient_kwargs["cache"] = cache - # This is where the magic happens. Note that we call ``execute``. # This recursion, coupled with the fact that the gradient transforms # are differentiable, allows for arbitrary order differentiation. From 9a19ce25de8c74cfa1d2bc7aebc4e35b9902dd87 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 02:22:31 +0800 Subject: [PATCH 22/45] fix --- pennylane/interfaces/batch/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 2938d992505..39c95359b33 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -16,6 +16,7 @@ capabilities with different machine learning libraries. 
""" # pylint: disable=import-outside-toplevel) +from cachetools import LRUCache from functools import partial import pennylane as qml @@ -98,7 +99,14 @@ def execute_fn_wrapper(tapes, device, **kwargs): def execute( - tapes, device, gradient_fn, interface="autograd", mode="best", gradient_kwargs=None, cache=True + tapes, + device, + gradient_fn, + interface="autograd", + mode="best", + gradient_kwargs=None, + cache=None, + cachesize=10000, ): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. @@ -200,8 +208,7 @@ def cost_fn(params, x): else: # gradient function is a transform - if cache: - gradient_kwargs["cache"] = {} + gradient_kwargs["cache"] = cache or LRUCache(maxsize=cachesize, getsizeof=lambda x: len(x)) execute_fn = lambda tapes, **kwargs: execute_fn_wrapper(tapes, device, **kwargs) if interface == "autograd": @@ -209,8 +216,4 @@ def cost_fn(params, x): else: raise ValueError(f"Unknown interface {interface}") - if "cache" in gradient_kwargs: - # clear the cache - gradient_kwargs["cache"] = {} - return res From 44ca01dfb70ec5532422fdcd69579b4d2afe9f5a Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 02:32:41 +0800 Subject: [PATCH 23/45] fix --- pennylane/gradients/parameter_shift.py | 12 +++++++++++- requirements.txt | 1 + setup.py | 3 ++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index 632d36fbc02..652322cf4f8 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -357,13 +357,23 @@ def processing_fn(results): from cachetools.keys import hashkey +def _process_data(op): + if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): + return str([np.mod(d, 2 * np.pi) for d in op.data]) + + if op.name in ("CRX", "CRY", "CRZ", "CRot"): + return str([np.mod(d, 4 * np.pi) for d in op.data]) + + return str(op.data) + + def tape_hash(tape): fingerprint = [] fingerprint.extend( ( str(op.name), tuple(op.wires.tolist()), - str(op.data), + _process_data(op), ) for op in tape.operations ) diff --git a/requirements.txt b/requirements.txt index 27a55d2a73e..cf368ea8175 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ numpy scipy cvxpy cvxopt +cachetools networkx tensornetwork==0.3 autograd diff --git a/setup.py b/setup.py index 5531f525d82..6d99a623329 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ "toml", "appdirs", "semantic_version==2.6", - "autoray" + "autoray", + "cachetools" ] info = { From b4bb9d2ec737d68f33eb5a0c0e77bdcde3a99981 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 03:03:53 +0800 Subject: [PATCH 24/45] fix tests --- pennylane/gradients/parameter_shift.py | 16 +++++++++++----- pennylane/interfaces/batch/__init__.py | 20 +++++++++++++++----- tests/gradients/test_parameter_shift.py | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index 652322cf4f8..d102f7bea70 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -359,10 +359,10 @@ def processing_fn(results): def _process_data(op): if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): - return str([np.mod(d, 2 * np.pi) for d in op.data]) + return str([d % (2.0 * np.pi) for d in op.data]) if op.name in ("CRX", "CRY", "CRZ", "CRot"): - return str([np.mod(d, 4 * np.pi) for d in op.data]) + return str([d % (4.0 * np.pi) for d in op.data]) return str(op.data) 
@@ -378,9 +378,15 @@ def tape_hash(tape): for op in tape.operations ) fingerprint.extend( - (str(op.name), tuple(op.wires.tolist()), str(op.data), op.return_type) + ( + str(getattr(getattr(op, "obs", op), "name", op.name)), + tuple(op.wires.tolist()), + str(getattr(getattr(op, "obs", op), "data", op.data)), + op.return_type, + ) for op in tape.measurements ) + fingerprint.append(tape.trainable_params) fingerprint = tuple(item for sublist in fingerprint for item in sublist) return hash(fingerprint) @@ -388,11 +394,11 @@ def tape_hash(tape): def key( tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None ): - f0 = qml.math.toarray(f0).tolist() if f0 is not None else None + f0 = tuple(qml.math.toarray(f0).tolist()) if f0 is not None else None argnum = tuple(argnum) if argnum is not None else None if gradient_recipes is not None: - gradient_recipes = tuple(tuple(y) for y in gradient_recipes) + gradient_recipes = tuple(tuple(tuple(w) for w in y) for y in gradient_recipes) return hashkey((tape_hash(tape), argnum, shift, gradient_recipes, fallback_fn, f0)) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 39c95359b33..e863a298b78 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -30,10 +30,10 @@ def _process_data(op): if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): - return str([np.mod(d, 2 * np.pi) for d in op.data]) + return str([d % (2 * np.pi) for d in op.data]) if op.name in ("CRX", "CRY", "CRZ", "CRot"): - return str([np.mod(d, 4 * np.pi) for d in op.data]) + return str([d % (4 * np.pi) for d in op.data]) return str(op.data) @@ -49,9 +49,15 @@ def tape_hash(tape): for op in tape.operations ) fingerprint.extend( - (str(op.name), tuple(op.wires.tolist()), str(op.data), op.return_type) + ( + str(getattr(getattr(op, "obs", op), "name", op.name)), + tuple(op.wires.tolist()), + str(getattr(getattr(op, "obs", op), "data", op.data)), + op.return_type, + ) for op in tape.measurements ) + fingerprint.append(tape.trainable_params) fingerprint = tuple(item for sublist in fingerprint for item in sublist) return hash(fingerprint) @@ -105,7 +111,7 @@ def execute( interface="autograd", mode="best", gradient_kwargs=None, - cache=None, + cache=True, cachesize=10000, ): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. 
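With ``cache=True`` now the default above, a bounded ``cachetools.LRUCache`` keyed on tape hashes is constructed in the following hunk. A minimal sketch of how such a cache behaves — the integer key and the stored values are purely illustrative:

.. code-block:: python

    from cachetools import LRUCache

    # Least-recently-used cache with a bounded total size; ``getsizeof``
    # counts the number of result entries stored per tape.
    cache = LRUCache(maxsize=10000, getsizeof=len)

    cache[12345] = [0.54, 0.12, 0.34]  # e.g. keyed on a tape hash
    assert 12345 in cache              # present until evicted by newer entries
    assert cache.currsize == 3         # summed ``getsizeof`` of the stored values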
@@ -208,7 +214,11 @@ def cost_fn(params, x): else: # gradient function is a transform - gradient_kwargs["cache"] = cache or LRUCache(maxsize=cachesize, getsizeof=lambda x: len(x)) + if isinstance(cache, bool) and cache: + gradient_kwargs["cache"] = LRUCache(maxsize=cachesize, getsizeof=lambda x: len(x)) + elif not isinstance(cache, bool) and cache is not None: + gradient_kwargs["cache"] = cache + execute_fn = lambda tapes, **kwargs: execute_fn_wrapper(tapes, device, **kwargs) if interface == "autograd": diff --git a/tests/gradients/test_parameter_shift.py b/tests/gradients/test_parameter_shift.py index ad2cc2bd009..d3ce29c0eb5 100644 --- a/tests/gradients/test_parameter_shift.py +++ b/tests/gradients/test_parameter_shift.py @@ -267,7 +267,7 @@ def test_independent_parameters_analytic(self): class TestParameterShiftRule: """Tests for the parameter shift implementation""" - @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7)) + @pytest.mark.parametrize("theta", np.linspace(0, 2 * np.pi - 0.01, 7)) @pytest.mark.parametrize("shift", [np.pi / 2, 0.3, np.sqrt(2)]) @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ, qml.PhaseShift]) def test_pauli_rotation_gradient(self, mocker, G, theta, shift, tol): From 102d5515e40f0ae8a44e20ba54643f3bf040a2ce Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 03:11:18 +0800 Subject: [PATCH 25/45] final --- pennylane/gradients/parameter_shift.py | 43 +++----------------------- pennylane/interfaces/batch/__init__.py | 36 +-------------------- pennylane/tape/tape.py | 34 ++++++++++++++++++++ 3 files changed, 39 insertions(+), 74 deletions(-) diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index d102f7bea70..175b33d402a 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -17,6 +17,9 @@ """ # pylint: disable=protected-access,too-many-arguments import functools + +from cachetools import cached +from cachetools.keys import hashkey import numpy as np import pennylane as qml @@ -353,44 +356,6 @@ def processing_fn(results): return gradient_tapes, processing_fn -from cachetools import cached -from cachetools.keys import hashkey - - -def _process_data(op): - if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): - return str([d % (2.0 * np.pi) for d in op.data]) - - if op.name in ("CRX", "CRY", "CRZ", "CRot"): - return str([d % (4.0 * np.pi) for d in op.data]) - - return str(op.data) - - -def tape_hash(tape): - fingerprint = [] - fingerprint.extend( - ( - str(op.name), - tuple(op.wires.tolist()), - _process_data(op), - ) - for op in tape.operations - ) - fingerprint.extend( - ( - str(getattr(getattr(op, "obs", op), "name", op.name)), - tuple(op.wires.tolist()), - str(getattr(getattr(op, "obs", op), "data", op.data)), - op.return_type, - ) - for op in tape.measurements - ) - fingerprint.append(tape.trainable_params) - fingerprint = tuple(item for sublist in fingerprint for item in sublist) - return hash(fingerprint) - - def key( tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None ): @@ -400,7 +365,7 @@ def key( if gradient_recipes is not None: gradient_recipes = tuple(tuple(tuple(w) for w in y) for y in gradient_recipes) - return hashkey((tape_hash(tape), argnum, shift, gradient_recipes, fallback_fn, f0)) + return hashkey((tape.hash, argnum, shift, gradient_recipes, fallback_fn, f0)) @cached(cache={}, key=key) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py 
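For reference, a minimal sketch of the cache object that ``cache=True`` constructs in the batch-execution code above (the key ``1234567890`` is a made-up tape hash); any mutable mapping, such as a plain dictionary, can be supplied instead via ``cache=...``:

.. code-block:: python

    from cachetools import LRUCache

    # entries are keyed by tape hash, sized by the length of each result,
    # and evicted least-recently-used first once maxsize is exceeded
    cache = LRUCache(maxsize=10000, getsizeof=len)
    cache[1234567890] = [0.54, 0.46]      # made-up tape hash -> execution result
    print(cache.maxsize, cache.currsize)  # 10000 2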
index e863a298b78..59ede76cced 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -28,40 +28,6 @@ from collections import OrderedDict -def _process_data(op): - if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): - return str([d % (2 * np.pi) for d in op.data]) - - if op.name in ("CRX", "CRY", "CRZ", "CRot"): - return str([d % (4 * np.pi) for d in op.data]) - - return str(op.data) - - -def tape_hash(tape): - fingerprint = [] - fingerprint.extend( - ( - str(op.name), - tuple(op.wires.tolist()), - _process_data(op), - ) - for op in tape.operations - ) - fingerprint.extend( - ( - str(getattr(getattr(op, "obs", op), "name", op.name)), - tuple(op.wires.tolist()), - str(getattr(getattr(op, "obs", op), "data", op.data)), - op.return_type, - ) - for op in tape.measurements - ) - fingerprint.append(tape.trainable_params) - fingerprint = tuple(item for sublist in fingerprint for item in sublist) - return hash(fingerprint) - - def execute_fn_wrapper(tapes, device, **kwargs): cache = kwargs.pop("cache", None) @@ -74,7 +40,7 @@ def execute_fn_wrapper(tapes, device, **kwargs): repeated = {} for i, tape in enumerate(tapes): - h = tape_hash(tape) + h = tape.hash if h in hashes.values(): idx = list(hashes.keys())[list(hashes.values()).index(h)] diff --git a/pennylane/tape/tape.py b/pennylane/tape/tape.py index 4ea8c2f01ea..0ef9343544e 100644 --- a/pennylane/tape/tape.py +++ b/pennylane/tape/tape.py @@ -84,6 +84,16 @@ """ +def _process_data(op): + if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): + return str([d % (2 * np.pi) for d in op.data]) + + if op.name in ("CRX", "CRY", "CRZ", "CRot"): + return str([d % (4 * np.pi) for d in op.data]) + + return str(op.data) + + def get_active_tape(): """Returns the currently recording tape. If no tape is currently recording, ``None`` is returned. @@ -1302,6 +1312,30 @@ def copy(self, copy_operations=False, tape_cls=None): def __copy__(self): return self.copy(copy_operations=True) + @property + def hash(self): + fingerprint = [] + fingerprint.extend( + ( + str(op.name), + tuple(op.wires.tolist()), + _process_data(op), + ) + for op in self.operations + ) + fingerprint.extend( + ( + str(getattr(getattr(op, "obs", op), "name", op.name)), + tuple(op.wires.tolist()), + str(getattr(getattr(op, "obs", op), "data", op.data)), + op.return_type, + ) + for op in self.measurements + ) + fingerprint.append(self.trainable_params) + fingerprint = tuple(item for sublist in fingerprint for item in sublist) + return hash(fingerprint) + # ======================================================== # execution methods # ======================================================== From 55be8f2ae659f40d1125bff7299f429ba0233ed8 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 15:32:02 +0800 Subject: [PATCH 26/45] update changelog --- pennylane/interfaces/batch/__init__.py | 101 ++++++++++++++----------- pennylane/interfaces/batch/autograd.py | 9 ++- 2 files changed, 64 insertions(+), 46 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 59ede76cced..6f104c15160 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -16,8 +16,8 @@ capabilities with different machine learning libraries. 
""" # pylint: disable=import-outside-toplevel) +from functools import wraps from cachetools import LRUCache -from functools import partial import pennylane as qml import numpy as np @@ -28,46 +28,59 @@ from collections import OrderedDict -def execute_fn_wrapper(tapes, device, **kwargs): - cache = kwargs.pop("cache", None) +def cache_execute(fn, cache, pass_kwargs=False, return_jacs=True): + @wraps(fn) + def wrapper(tapes, **kwargs): - if cache is None: - return device.batch_execute(tapes), [] + if not pass_kwargs: + kwargs = {} - execution_tapes = OrderedDict() - cached_results = {} - hashes = {} - repeated = {} + if cache is None or (isinstance(cache, bool) and not cache): + if not return_jacs: + return fn(tapes, **kwargs) - for i, tape in enumerate(tapes): - h = tape.hash + return fn(tapes, **kwargs), [] - if h in hashes.values(): - idx = list(hashes.keys())[list(hashes.values()).index(h)] - repeated[i] = idx - continue + execution_tapes = OrderedDict() + cached_results = {} + hashes = {} + repeated = {} - hashes[i] = h + for i, tape in enumerate(tapes): + h = tape.hash - if hashes[i] in cache: - cached_results[i] = cache[hashes[i]] - else: - execution_tapes[i] = tape + if h in hashes.values(): + idx = list(hashes.keys())[list(hashes.values()).index(h)] + repeated[i] = idx + continue - res = device.batch_execute(execution_tapes.values()) - final_res = [] + hashes[i] = h - for i, tape in enumerate(tapes): - if i in cached_results: - final_res.append(cached_results[i]) - elif i in repeated: - final_res.append(final_res[repeated[i]]) - else: - r = res.pop(0) - final_res.append(r) - cache[hashes[i]] = r + if hashes[i] in cache: + cached_results[i] = cache[hashes[i]] + else: + execution_tapes[i] = tape - return final_res, [] + res = fn(execution_tapes.values(), **kwargs) + final_res = [] + + for i, tape in enumerate(tapes): + if i in cached_results: + final_res.append(cached_results[i]) + elif i in repeated: + final_res.append(final_res[repeated[i]]) + else: + r = res.pop(0) + final_res.append(r) + cache[hashes[i]] = r + + if not return_jacs: + return final_res + + return final_res, [] + + wrapper.fn = fn + return wrapper def execute( @@ -161,6 +174,13 @@ def cost_fn(params, x): """ gradient_kwargs = gradient_kwargs or {} + if isinstance(cache, bool) and cache: + # cache=True: create a LRUCache object + cache = LRUCache(maxsize=cachesize, getsizeof=lambda x: len(x)) + + # the default execution function is device.batch_execute + execute_fn = cache_execute(device.batch_execute, cache) + if gradient_fn == "device": # gradient function is a device method @@ -171,22 +191,17 @@ def cost_fn(params, x): gradient_fn = None elif mode == "backward": + # disable caching on the forward pass + execute_fn = cache_execute(device.batch_execute, cache=None) + # replace the backward gradient computation - execute_fn = lambda tapes, **kwargs: (device.batch_execute(tapes), []) - gradient_fn = device.gradients + gradient_fn = cache_execute( + device.gradients, cache, pass_kwargs=True, return_jacs=False + ) elif mode == "forward": raise ValueError("Gradient transforms cannot be used with mode='forward'") - else: - # gradient function is a transform - if isinstance(cache, bool) and cache: - gradient_kwargs["cache"] = LRUCache(maxsize=cachesize, getsizeof=lambda x: len(x)) - elif not isinstance(cache, bool) and cache is not None: - gradient_kwargs["cache"] = cache - - execute_fn = lambda tapes, **kwargs: execute_fn_wrapper(tapes, device, **kwargs) - if interface == "autograd": res = execute_autograd(tuple(tapes), 
device, execute_fn, gradient_fn, gradient_kwargs) else: diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 095b4c8bf41..c442a5d78d3 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -143,7 +143,6 @@ def vjp( function: this function accepts the backpropagation gradient output vector, and computes the vector-Jacobian product """ - g_kwargs = {x: gradient_kwargs[x] for x in gradient_kwargs if "cache" not in x} def grad_fn(dy): """Returns the vector-Jacobian product with given @@ -170,7 +169,7 @@ def grad_fn(dy): # Generate and execute the required gradient tapes vjp_tapes, processing_fn = qml.gradients.batch_vjp( - tapes, dy, gradient_fn, reduction="append", gradient_kwargs=g_kwargs + tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs ) # This is where the magic happens. Note that we call ``execute``. @@ -187,7 +186,11 @@ def grad_fn(dy): ) ) - elif inspect.ismethod(gradient_fn) and gradient_fn.__self__ is device: + elif ( + hasattr(gradient_fn, "fn") + and inspect.ismethod(gradient_fn.fn) + and gradient_fn.fn.__self__ is device + ): # Gradient function is a device method. # Note that unlike the previous branch: # From 49412da707cbb1a0fc64e59dda16068c4c6d3ab7 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 15:37:31 +0800 Subject: [PATCH 27/45] update --- pennylane/gradients/parameter_shift.py | 28 +- pennylane/gradients/parameter_shift_cv.py | 693 ++++++++++++++++++++++ tests/gradients/test_parameter_shift.py | 2 +- 3 files changed, 695 insertions(+), 28 deletions(-) create mode 100644 pennylane/gradients/parameter_shift_cv.py diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py index 175b33d402a..38005ddc334 100644 --- a/pennylane/gradients/parameter_shift.py +++ b/pennylane/gradients/parameter_shift.py @@ -16,10 +16,6 @@ of a qubit-based quantum tape. 
""" # pylint: disable=protected-access,too-many-arguments -import functools - -from cachetools import cached -from cachetools.keys import hashkey import numpy as np import pennylane as qml @@ -38,7 +34,6 @@ """ -@functools.lru_cache() def _square_observable(obs): """Returns the square of an observable.""" @@ -62,7 +57,6 @@ def _square_observable(obs): return NONINVOLUTORY_OBS[obs.name](obs) -@functools.lru_cache() def _get_operation_recipe(tape, t_idx, shift=np.pi / 2): """Utility function to return the parameter-shift rule of the operation corresponding to trainable parameter @@ -97,7 +91,6 @@ def _process_gradient_recipe(gradient_recipe, tol=1e-10): return gradient_recipe[:, np.argsort(np.abs(gradient_recipe)[-1])] -@functools.lru_cache() def _gradient_analysis(tape, use_graph=True): """Update the parameter information dictionary of the tape with gradient information of each parameter.""" @@ -356,26 +349,8 @@ def processing_fn(results): return gradient_tapes, processing_fn -def key( - tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None -): - f0 = tuple(qml.math.toarray(f0).tolist()) if f0 is not None else None - argnum = tuple(argnum) if argnum is not None else None - - if gradient_recipes is not None: - gradient_recipes = tuple(tuple(tuple(w) for w in y) for y in gradient_recipes) - - return hashkey((tape.hash, argnum, shift, gradient_recipes, fallback_fn, f0)) - - -@cached(cache={}, key=key) def param_shift( - tape, - argnum=None, - shift=np.pi / 2, - gradient_recipes=None, - fallback_fn=finite_diff, - f0=None, + tape, argnum=None, shift=np.pi / 2, gradient_recipes=None, fallback_fn=finite_diff, f0=None ): r"""Generate the parameter-shift tapes and postprocessing methods required to compute the gradient of a gate parameter with respect to an @@ -480,7 +455,6 @@ def param_shift( [[-0.38751721 -0.18884787 -0.38355704] [ 0.69916862 0.34072424 0.69202359]] """ - f0 = np.array(f0) if f0 is not None else None # perform gradient method validation if any(m.return_type is qml.operation.State for m in tape.measurements): diff --git a/pennylane/gradients/parameter_shift_cv.py b/pennylane/gradients/parameter_shift_cv.py new file mode 100644 index 00000000000..5db6b6b4526 --- /dev/null +++ b/pennylane/gradients/parameter_shift_cv.py @@ -0,0 +1,693 @@ +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains functions for computing the parameter-shift gradient +of a CV-based quantum tape. +""" +# pylint: disable=protected-access,too-many-arguments,too-many-statements,too-many-branches +import itertools +import warnings + +import numpy as np + +import pennylane as qml + +from .finite_difference import finite_diff, generate_shifted_tapes +from .parameter_shift import expval_param_shift, _get_operation_recipe, _process_gradient_recipe + + +def _grad_method(tape, idx): + """Determine the best CV parameter-shift gradient recipe for a given + parameter index of a tape. 
+ + Args: + tape (.QuantumTape): input tape + idx (int): positive integer corresponding to the parameter location + on the tape to inspect + + Returns: + str: a string containing either ``"A"`` (for first-order analytic method), + ``"A2"`` (second-order analytic method), ``"F"`` (finite differences), + or ``"0"`` (constant parameter). + """ + + op = tape._par_info[idx]["op"] + + if op.grad_method in (None, "F"): + return op.grad_method + + if op.grad_method != "A": + raise ValueError(f"Operation {op} has unknown gradient method {op.grad_method}") + + # Operation supports the CV parameter-shift rule. + # Create an empty list to store the 'best' partial derivative method + # for each observable + best = [] + + for m in tape.measurements: + + if (m.return_type is qml.operation.Probability) or (m.obs.ev_order not in (1, 2)): + # Higher-order observables (including probability) only support finite differences. + best.append("F") + continue + + # get the set of operations betweens the operation and the observable + ops_between = tape.graph.nodes_between(op, m.obs) + + if not ops_between: + # if there is no path between the operation and the observable, + # the operator has a zero gradient. + best.append("0") + continue + + # For parameter-shift compatible CV gates, we need to check both the + # intervening gates, and the type of the observable. + best_method = "A" + + if any(not k.supports_heisenberg for k in ops_between): + # non-Gaussian operators present in-between the operation + # and the observable. Must fallback to numeric differentiation. + best_method = "F" + + elif m.obs.ev_order == 2: + + if m.return_type is qml.operation.Expectation: + # If the observable is second-order, we must use the second-order + # CV parameter shift rule + best_method = "A2" + + elif m.return_type is qml.operation.Variance: + # we only support analytic variance gradients for + # first-order observables + best_method = "F" + + best.append(best_method) + + if all(k == "0" for k in best): + # if the operation is independent of *all* observables + # in the circuit, the gradient will be 0 + return "0" + + if "F" in best: + # one non-analytic observable path makes the whole operation + # gradient method fallback to finite-difference + return "F" + + if "A2" in best: + # one second-order observable makes the whole operation gradient + # require the second-order parameter-shift rule + return "A2" + + return "A" + + +def _gradient_analysis(tape): + """Update the parameter information dictionary of the tape with + gradient information of each parameter.""" + + if getattr(tape, "_gradient_fn", None) is param_shift_cv: + # gradient analysis has already been performed on this tape + return + + tape._gradient_fn = param_shift_cv + + for idx, info in tape._par_info.items(): + info["grad_method"] = _grad_method(tape, idx) + + +def _transform_observable(obs, Z, device_wires): + """Apply a Gaussian linear transformation to an observable. + + Args: + obs (.Observable): observable to transform + Z (array[float]): Heisenberg picture representation of the linear transformation + device_wires (.Wires): wires on the device the transformed observable is to be + measured on + + Returns: + .Observable: the transformed observable + """ + # Get the Heisenberg representation of the observable + # in the position/momentum basis. The returned matrix/vector + # will have been expanded to act on the entire device. 
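A toy numerical sketch of the transformation performed below, using made-up single-mode matrices in the ``[I, x, p]`` basis (not a physically meaningful example):

.. code-block:: python

    import numpy as np

    Z = np.array([[1.0, 0.0, 0.0],
                  [0.2, 0.9, 0.1],
                  [0.0, -0.1, 0.9]])   # assumed Heisenberg-picture linear transformation

    q = np.array([0.0, 1.0, 0.0])      # first-order observable, e.g. the x quadrature
    print(q @ Z)                       # transformed first-order observable

    A = np.outer(q, q)                 # second-order observable (x squared)
    A = A @ Z
    print(A + A.T)                     # symmetrised, as in the ``A.ndim == 2`` branch below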
+ if obs.ev_order > 2: + raise NotImplementedError("Transforming observables of order > 2 not implemented.") + + A = obs.heisenberg_obs(device_wires) + + if A.ndim != obs.ev_order: + raise ValueError( + "Mismatch between the polynomial order of observable and its Heisenberg representation" + ) + + # transform the observable by the linear transformation Z + A = A @ Z + + if A.ndim == 2: + A = A + A.T + + # TODO: if the A matrix corresponds to a known observable in PennyLane, + # for example qml.X, qml.P, qml.NumberOperator, we should return that + # instead. This will allow for greater device compatibility. + return qml.PolyXP(A, wires=device_wires) + + +def var_param_shift(tape, dev_wires, argnum=None, shift=np.pi / 2, gradient_recipes=None, f0=None): + r"""Partial derivative using the first-order or second-order parameter-shift rule of a tape + consisting of a mixture of expectation values and variances of observables. + + Expectation values may be of first- or second-order observables, + but variances can only be taken of first-order variables. + + .. warning:: + + This method can only be executed on devices that support the + :class:`~.PolyXP` observable. + + Args: + tape (.QuantumTape): quantum tape to differentiate + dev_wires (.Wires): wires on the device the parameter-shift method is computed on + argnum (int or list[int] or None): Trainable parameter indices to differentiate + with respect to. If not provided, the derivative with respect to all + trainable indices are returned. + shift (float): The shift value to use for the two-term parameter-shift formula. + Only valid if the operation in question supports the two-term parameter-shift + rule (that is, it has two distinct eigenvalues) and ``gradient_recipes`` + is ``None``. + gradient_recipes (tuple(list[list[float]] or None)): List of gradient recipes + for the parameter-shift method. One gradient recipe must be provided + per trainable parameter. + f0 (tensor_like[float] or None): Output of the evaluated input tape. If provided, + and the gradient recipe contains an unshifted term, this value is used, + saving a quantum evaluation. + + Returns: + tuple[list[QuantumTape], function]: A tuple containing a + list of generated tapes, in addition to a post-processing + function to be applied to the evaluated tapes. + """ + argnum = argnum or tape.trainable_params + + # Determine the locations of any variance measurements in the measurement queue. + var_mask = [m.return_type is qml.operation.Variance for m in tape.measurements] + var_idx = np.where(var_mask)[0] + + # Get , the expectation value of the tape with unshifted parameters. + expval_tape = tape.copy(copy_operations=True) + + # Convert all variance measurements on the tape into expectation values + for i in var_idx: + obs = expval_tape._measurements[i].obs + expval_tape._measurements[i] = qml.measure.MeasurementProcess( + qml.operation.Expectation, obs=obs + ) + + gradient_tapes = [expval_tape] + + # evaluate the analytic derivative of + pdA_tapes, pdA_fn = expval_param_shift(expval_tape, argnum, shift, gradient_recipes, f0) + gradient_tapes.extend(pdA_tapes) + + # Store the number of first derivative tapes, so that we know + # the number of results to post-process later. + tape_boundary = len(pdA_tapes) + 1 + expval_sq_tape = tape.copy(copy_operations=True) + + for i in var_idx: + # We need to calculate d/dp; to do so, we replace the + # observables A in the queue with A^2. 
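For reference, the identity implemented by the variance post-processing in this function is

.. math::

    \frac{\partial}{\partial p}\,\mathrm{Var}(A)
    = \frac{\partial \langle A^2 \rangle}{\partial p}
    - 2\,\langle A \rangle\,\frac{\partial \langle A \rangle}{\partial p},

which is why the observables :math:`A` are squared here, and why the unshifted expectation values :math:`\langle A \rangle` are evaluated from the first tape.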
+ obs = expval_sq_tape._measurements[i].obs + + # CV first-order observable + # get the heisenberg representation + # This will be a real 1D vector representing the + # first-order observable in the basis [I, x, p] + A = obs._heisenberg_rep(obs.parameters) + + # take the outer product of the heisenberg representation + # with itself, to get a square symmetric matrix representing + # the square of the observable + obs = qml.PolyXP(np.outer(A, A), wires=obs.wires) + expval_sq_tape._measurements[i] = qml.measure.MeasurementProcess( + qml.operation.Expectation, obs=obs + ) + + # Non-involutory observables are present; the partial derivative of + # may be non-zero. Here, we calculate the analytic derivatives of the + # observables. + pdA2_tapes, pdA2_fn = second_order_param_shift( + expval_sq_tape, dev_wires, argnum, shift, gradient_recipes + ) + gradient_tapes.extend(pdA2_tapes) + + def processing_fn(results): + mask = qml.math.convert_like(qml.math.reshape(var_mask, [-1, 1]), results[0]) + f0 = qml.math.expand_dims(results[0], -1) + + pdA = pdA_fn(results[1:tape_boundary]) + pdA2 = pdA2_fn(results[tape_boundary:]) + + # return d(var(A))/dp = d/dp -2 * * d/dp for the variances (mask==True) + # d/dp for plain expectations (mask==False) + return qml.math.where(mask, pdA2 - 2 * f0 * pdA, pdA) + + return gradient_tapes, processing_fn + + +def second_order_param_shift(tape, dev_wires, argnum=None, shift=np.pi / 2, gradient_recipes=None): + r"""Generate the second-order CV parameter-shift tapes and postprocessing methods required + to compute the gradient of a gate parameter with respect to an + expectation value. + + .. note:: + + The 2nd order method can handle also first-order observables, but + 1st order method may be more efficient unless it's really easy to + experimentally measure arbitrary 2nd order observables. + + .. warning:: + + The 2nd order method can only be executed on devices that support the + :class:`~.PolyXP` observable. + + Args: + tape (.QuantumTape): quantum tape to differentiate + dev_wires (.Wires): wires on the device the parameter-shift method is computed on + argnum (int or list[int] or None): Trainable parameter indices to differentiate + with respect to. If not provided, the derivative with respect to all + trainable indices are returned. + shift (float): The shift value to use for the two-term parameter-shift formula. + Only valid if the operation in question supports the two-term parameter-shift + rule (that is, it has two distinct eigenvalues) and ``gradient_recipes`` + is ``None``. + gradient_recipes (tuple(list[list[float]] or None)): List of gradient recipes + for the parameter-shift method. One gradient recipe must be provided + per trainable parameter. + + Returns: + tuple[list[QuantumTape], function]: A tuple containing a + list of generated tapes, in addition to a post-processing + function to be applied to the evaluated tapes. 
+ """ + argnum = argnum or list(tape.trainable_params) + gradient_recipes = gradient_recipes or [None] * len(argnum) + + gradient_tapes = [] + shapes = [] + obs_indices = [] + gradient_values = [] + + for idx, _ in enumerate(tape.trainable_params): + t_idx = list(tape.trainable_params)[idx] + op = tape._par_info[t_idx]["op"] + + if idx not in argnum: + # parameter has zero gradient + shapes.append(0) + obs_indices.append([]) + gradient_values.append([]) + continue + + shapes.append(1) + + # get the gradient recipe for the trainable parameter + recipe = gradient_recipes[argnum.index(idx)] + recipe = recipe or _get_operation_recipe(tape, idx, shift=shift) + recipe = _process_gradient_recipe(recipe) + coeffs, multipliers, shifts = recipe + + if len(shifts) != 2: + # The 2nd order CV parameter-shift rule only accepts two-term shifts + raise NotImplementedError( + "Taking the analytic gradient for order-2 operators is " + f"unsupported for operation {op} which has a " + "gradient recipe of more than two terms." + ) + + shifted_tapes = generate_shifted_tapes(tape, idx, shifts, multipliers) + + # evaluate transformed observables at the original parameter point + # first build the Heisenberg picture transformation matrix Z + Z0 = op.heisenberg_tr(dev_wires, inverse=True) + Z2 = shifted_tapes[0]._par_info[t_idx]["op"].heisenberg_tr(dev_wires) + Z1 = shifted_tapes[1]._par_info[t_idx]["op"].heisenberg_tr(dev_wires) + + # derivative of the operation + Z = Z2 * coeffs[0] + Z1 * coeffs[1] + Z = Z @ Z0 + + # conjugate Z with all the descendant operations + B = np.eye(1 + 2 * len(dev_wires)) + B_inv = B.copy() + + succ = tape.graph.descendants_in_order((op,)) + operation_descendents = itertools.filterfalse(qml.circuit_graph._is_observable, succ) + observable_descendents = filter(qml.circuit_graph._is_observable, succ) + + for BB in operation_descendents: + if not BB.supports_heisenberg: + # if the descendant gate is non-Gaussian in parameter-shift differentiation + # mode, then there must be no observable following it. + continue + + B = BB.heisenberg_tr(dev_wires) @ B + B_inv = B_inv @ BB.heisenberg_tr(dev_wires, inverse=True) + + Z = B @ Z @ B_inv # conjugation + + g_tape = tape.copy(copy_operations=True) + constants = [] + + # transform the descendant observables into their derivatives using Z + transformed_obs_idx = [] + + for obs in observable_descendents: + # get the index of the descendent observable + idx = tape.observables.index(obs) + transformed_obs_idx.append(idx) + + transformed_obs = _transform_observable(obs, Z, dev_wires) + + A = transformed_obs.parameters[0] + constant = None + + # Check if the transformed observable corresponds to a constant term. + if len(A.nonzero()[0]) == 1: + if A.ndim == 2 and A[0, 0] != 0: + constant = A[0, 0] + + elif A.ndim == 1 and A[0] != 0: + constant = A[0] + + constants.append(constant) + + g_tape._measurements[idx] = qml.measure.MeasurementProcess( + qml.operation.Expectation, _transform_observable(obs, Z, dev_wires) + ) + + if not any(i is None for i in constants): + # Check if *all* transformed observables corresponds to a constant term. + # term. If this is the case for all transformed observables on the tape, + # then = A = A, + # and we can avoid the device execution. 
+ shapes[-1] = 0 + obs_indices.append(transformed_obs_idx) + gradient_values.append(constants) + continue + + gradient_tapes.append(g_tape) + obs_indices.append(transformed_obs_idx) + gradient_values.append(None) + + def processing_fn(results): + grads = [] + start = 0 + + if not results: + results = [np.zeros([tape.output_dim])] + + interface = qml.math.get_interface(results[0]) + iterator = enumerate(zip(shapes, gradient_values, obs_indices)) + + for i, (shape, grad_value, obs_ind) in iterator: + + if shape == 0: + # parameter has zero gradient + g = qml.math.zeros_like(results[0], like=interface) + + if grad_value: + g = qml.math.scatter_element_add(g, obs_ind, grad_value, like=interface) + + grads.append(g) + continue + + obs_result = results[start : start + shape] + start = start + shape + + # compute the linear combination of results and coefficients + obs_result = qml.math.stack(obs_result[0]) + g = qml.math.zeros_like(obs_result, like=interface) + + if qml.math.get_interface(g) not in ("tensorflow", "autograd"): + obs_ind = (obs_ind,) + + g = qml.math.scatter_element_add(g, obs_ind, obs_result[obs_ind], like=interface) + grads.append(g) + + # The following is for backwards compatibility; currently, + # the device stacks multiple measurement arrays, even if not the same + # size, resulting in a ragged array. + # In the future, we might want to change this so that only tuples + # of arrays are returned. + for i, g in enumerate(grads): + g = qml.math.convert_like(g, results[0]) + if hasattr(g, "dtype") and g.dtype is np.dtype("object"): + grads[i] = qml.math.hstack(g) + + return qml.math.T(qml.math.stack(grads)) + + return gradient_tapes, processing_fn + + +def param_shift_cv( + tape, + dev, + argnum=None, + shift=np.pi / 2, + gradient_recipes=None, + fallback_fn=finite_diff, + f0=None, + force_order2=False, +): + r"""Generate the CV parameter-shift tapes and postprocessing methods required + to compute the gradient of a gate parameter with respect to the CV output. + + Args: + tape (.QuantumTape): quantum tape to differentiate + dev (.Device): device the parameter-shift method is to be computed on + argnum (int or list[int] or None): Trainable parameter indices to differentiate + with respect to. If not provided, the derivative with respect to all + trainable indices are returned. + shift (float): The shift value to use for the two-term parameter-shift formula. + Only valid if the operation in question supports the two-term parameter-shift + rule (that is, it has two distinct eigenvalues) and ``gradient_recipes`` + is ``None``. + gradient_recipes (tuple(list[list[float]] or None)): List of gradient recipes + for the parameter-shift method. One gradient recipe must be provided + per trainable parameter. + + This is a tuple with one nested list per parameter. For + parameter :math:`\phi_k`, the nested list contains elements of the form + :math:`[c_i, a_i, s_i]` where :math:`i` is the index of the + term, resulting in a gradient recipe of + + .. math:: \frac{\partial}{\partial\phi_k}f = \sum_{i} c_i f(a_i \phi_k + s_i). + + If ``None``, the default gradient recipe containing the two terms + :math:`[c_0, a_0, s_0]=[1/2, 1, \pi/2]` and :math:`[c_1, a_1, + s_1]=[-1/2, 1, -\pi/2]` is assumed for every parameter. + fallback_fn (None or Callable): a fallback grdient function to use for + any parameters that do not support the parameter-shift rule. + f0 (tensor_like[float] or None): Output of the evaluated input tape. 
If provided, + and the gradient recipe contains an unshifted term, this value is used, + saving a quantum evaluation. + force_order2 (bool): if True, use the order-2 method even if not necessary + + Returns: + tuple[list[QuantumTape], function]: A tuple containing a + list of generated tapes, in addition to a post-processing + function to be applied to the evaluated tapes. + + This transform supports analytic gradients of Gaussian CV operations using + the parameter-shift rule. This gradient method returns *exact* gradients, + and can be computed directly on quantum hardware. + + Analytic gradients of photonic circuits that satisfy + the following constraints with regards to measurements are supported: + + * Expectation values are restricted to observables that are first- and + second-order in :math:`\hat{x}` and :math:`\hat{p}` only. + This includes :class:`~.X`, :class:`~.P`, :class:`~.QuadOperator`, + :class:`~.PolyXP`, and :class:`~.NumberOperator`. + + For second-order observables, the device **must support** :class:`~.PolyXP`. + + * Variances are restricted to observables that are first-order + in :math:`\hat{x}` and :math:`\hat{p}` only. This includes :class:`~.X`, :class:`~.P`, + :class:`~.QuadOperator`, and *some* parameter values of :class:`~.PolyXP`. + + The device **must support** :class:`~.PolyXP`. + + .. warning:: + + Fock state probabilities (tapes that return :func:`~pennylane.probs` or + expectation values of :class:`~.FockStateProjector`) are not supported. + + In addition, the tape operations must fulfill the following requirements: + + * Only Gaussian operations are differentiable. + + * Non-differentiable Fock states and Fock operations may *precede* all differentiable Gaussian, + operations. For example, the following is permissible: + + .. code-block:: python + + with qml.tape.JacobianTape() as tape: + # Non-differentiable Fock operations + qml.FockState(2, wires=0) + qml.Kerr(0.654, wires=1) + + # differentiable Gaussian operations + qml.Displacement(0.6, 0.5, wires=0) + qml.Beamsplitter(0.5, 0.1, wires=[0, 1]) + qml.expval(qml.NumberOperator(0)) + + tape.trainable_params = {2, 3, 4} + + * If a Fock operation succeeds a Gaussian operation, the Fock operation must + not contribute to any measurements. For example, the following is allowed: + + .. code-block:: python + + with qml.tape.JacobianTape() as tape: + qml.Displacement(0.6, 0.5, wires=0) + qml.Beamsplitter(0.5, 0.1, wires=[0, 1]) + qml.Kerr(0.654, wires=1) # there is no measurement on wire 1 + qml.expval(qml.NumberOperator(0)) + + tape.trainable_params = {0, 1, 2} + + If any of the above constraints are not followed, the tape cannot be differentiated + via the CV parameter-shift rule. Please use numerical differentiation instead. + + **Example** + + >>> r0, phi0, r1, phi1 = [0.4, -0.3, -0.7, 0.2] + >>> dev = qml.device("default.gaussian", wires=1) + >>> with qml.tape.JacobianTape() as tape: + ... qml.Squeezing(r0, phi0, wires=[0]) + ... qml.Squeezing(r1, phi1, wires=[0]) + ... qml.expval(qml.NumberOperator(0)) # second-order + >>> tape.trainable_params = {0, 2} + >>> gradient_tapes, fn = qml.gradients.param_shift_cv(tape, dev) + >>> res = dev.batch_execute(gradient_tapes) + >>> fn(res) + array([[-0.32487113, -0.87049853]]) + """ + + # perform gradient method validation + if any(m.return_type is qml.operation.State for m in tape.measurements): + raise ValueError( + "Computing the gradient of circuits that return the state is not supported." 
+ ) + + _gradient_analysis(tape) + + gradient_tapes = [] + shapes = [] + fns = [] + + def _update(data): + """Utility function to update the list of gradient tapes, + the corresponding number of gradient tapes, and the processing functions""" + gradient_tapes.extend(data[0]) + shapes.append(len(data[0])) + fns.append(data[1]) + + # TODO: replace the JacobianTape._grad_method_validation + # functionality before deprecation. + diff_methods = tape._grad_method_validation("analytic" if fallback_fn is None else "best") + all_params_grad_method_zero = all(g == "0" for g in diff_methods) + + if not tape.trainable_params or all_params_grad_method_zero: + return gradient_tapes, lambda _: np.zeros([tape.output_dim, len(tape.trainable_params)]) + + # TODO: replace the JacobianTape._choose_params_with_methods + # functionality before deprecation. + method_map = dict(tape._choose_params_with_methods(diff_methods, argnum)) + var_present = any(m.return_type is qml.operation.Variance for m in tape.measurements) + + unsupported_params = [] + first_order_params = [] + second_order_params = [] + + for idx, g in method_map.items(): + if g == "F": + unsupported_params.append(idx) + + elif g == "A": + first_order_params.append(idx) + + elif g == "A2": + second_order_params.append(idx) + + if force_order2: + # all analytic parameters should be computed using the second-order method + second_order_params += first_order_params + first_order_params = [] + + if "PolyXP" not in dev.observables and (second_order_params or var_present): + warnings.warn( + f"The device {dev.short_name} does not support " + "the PolyXP observable. The analytic parameter-shift cannot be used for " + "second-order observables; falling back to finite-differences.", + UserWarning, + ) + + if var_present: + unsupported_params += first_order_params + first_order_params = [] + + unsupported_params += second_order_params + second_order_params = [] + + # If there are unsupported operations, call the fallback gradient function + if unsupported_params: + _update(fallback_fn(tape, argnum=unsupported_params)) + + # collect all the analytic parameters + argnum = first_order_params + second_order_params + + if not argnum: + # No analytic parameters. 
Return the existing fallback tapes/fn + return gradient_tapes, fns[-1] + + gradient_recipes = gradient_recipes or [None] * len(argnum) + + if var_present: + _update(var_param_shift(tape, dev.wires, argnum, shift, gradient_recipes, f0)) + + else: + # Only expectation values were specified + if first_order_params: + _update(expval_param_shift(tape, first_order_params, shift, gradient_recipes, f0)) + + if second_order_params: + _update( + second_order_param_shift( + tape, dev.wires, second_order_params, shift, gradient_recipes + ) + ) + + def processing_fn(results): + start = 0 + grads = [] + + for s, f in zip(shapes, fns): + grads.append(f(results[start : start + s])) + start += s + + return sum(grads) + + return gradient_tapes, processing_fn diff --git a/tests/gradients/test_parameter_shift.py b/tests/gradients/test_parameter_shift.py index d3ce29c0eb5..ad2cc2bd009 100644 --- a/tests/gradients/test_parameter_shift.py +++ b/tests/gradients/test_parameter_shift.py @@ -267,7 +267,7 @@ def test_independent_parameters_analytic(self): class TestParameterShiftRule: """Tests for the parameter shift implementation""" - @pytest.mark.parametrize("theta", np.linspace(0, 2 * np.pi - 0.01, 7)) + @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7)) @pytest.mark.parametrize("shift", [np.pi / 2, 0.3, np.sqrt(2)]) @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ, qml.PhaseShift]) def test_pauli_rotation_gradient(self, mocker, G, theta, shift, tol): From ff2ecb0f20b5b786c0dfd92604a87dc82ca50abe Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 15:40:25 +0800 Subject: [PATCH 28/45] more --- pennylane/_qubit_device.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pennylane/_qubit_device.py b/pennylane/_qubit_device.py index 30f60651ba4..5561d9cca49 100644 --- a/pennylane/_qubit_device.py +++ b/pennylane/_qubit_device.py @@ -268,6 +268,7 @@ def batch_execute(self, circuits): """ # TODO: This method and the tests can be globally implemented by Device # once it has the same signature in the execute() method + results = [] for circuit in circuits: # we need to reset the device here, else it will From efa7c497c09d17ec73cf35a5f37d4102c938a7c8 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 15:41:18 +0800 Subject: [PATCH 29/45] revert formatting --- pennylane/interfaces/batch/autograd.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index c442a5d78d3..40ab022e30f 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -176,14 +176,7 @@ def grad_fn(dy): # This recursion, coupled with the fact that the gradient transforms # are differentiable, allows for arbitrary order differentiation. 
vjps = processing_fn( - execute( - vjp_tapes, - device, - execute_fn, - gradient_fn, - gradient_kwargs, - _n=_n + 1, - ) + execute(vjp_tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=_n + 1) ) elif ( From 4f8342a5ce2783377000f0290e308fd20e7a8c65 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 20:00:08 +0800 Subject: [PATCH 30/45] more --- pennylane/interfaces/batch/__init__.py | 72 ++++++++++++++++++++++++-- pennylane/interfaces/batch/autograd.py | 54 +++++++++++++++---- pennylane/measure.py | 20 +++++++ pennylane/operation.py | 15 ++++++ pennylane/tape/tape.py | 35 ++----------- 5 files changed, 150 insertions(+), 46 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 6f104c15160..2ee6b13132f 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -28,7 +28,44 @@ from collections import OrderedDict -def cache_execute(fn, cache, pass_kwargs=False, return_jacs=True): +def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): + """Decorator that adds caching to a function that executes + multiple tapes on a device. + + This decorator makes use of :attr:`.QuantumTape.hash` to identify + unique tapes. + + - If a tape does not match a hash in the cache, then the tape + has not been previously executed. It is executed, and the result + added to the cache. + + - If a tape matches a hash in the cache, then the tape has been previously + executed. The corresponding cached result is + extracted, and the tape is not passed to the execution function. + + - Finally, there might be the case where one or more tapes in the current + set of tapes to be executed share a hash. If this is the case, duplicated + are removed, to avoid redundant evaluations. + + Args: + fn (callable): The execution function to add caching to. + This function should have the signature ``fn(tapes, **kwargs)``, + and it should return ``list[tensor_like]``, with the + same length as the input ``tapes``. + cache (None or dict or Cache): The cache to use. If ``None``, + caching will not occur. + pass_kwargs (bool): If ``False``, keyword arguments passed to the + wrapped function will be passed directly to ``fn``. If ``True``, + they will be ignored. + return_tuple (bool): If ``True``, the output of ``fn`` is returned + as a tuple ``(fn_ouput, [])``, to match the output of execution functions + that also return gradients. + + Returns: + function: a wrapped version of the execution function ``fn`` with caching + support + """ + @wraps(fn) def wrapper(tapes, **kwargs): @@ -36,7 +73,10 @@ def wrapper(tapes, **kwargs): kwargs = {} if cache is None or (isinstance(cache, bool) and not cache): - if not return_jacs: + # No caching. Simply execution the execution function + # and return the results. + + if not return_tuple: return fn(tapes, **kwargs) return fn(tapes, **kwargs), [] @@ -50,6 +90,9 @@ def wrapper(tapes, **kwargs): h = tape.hash if h in hashes.values(): + # Tape already exists within ``tapes``. Determine the + # index of the first occurance of the tape, store this, + # and continue to the next iteration. idx = list(hashes.keys())[list(hashes.values()).index(h)] repeated[i] = idx continue @@ -57,24 +100,33 @@ def wrapper(tapes, **kwargs): hashes[i] = h if hashes[i] in cache: + # Tape exists within the cache, store the cached result cached_results[i] = cache[hashes[i]] else: + # Tape does not exist within the cache, store the tape + # for execution via the execution function. 
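As an end-to-end sketch of what this caching wrapper buys (mirroring the new tests added later in this series; the import paths assume this branch), a parameter-shift Jacobian drops from 9 device evaluations to the optimal 5:

.. code-block:: python

    import pennylane as qml
    from pennylane import numpy as np
    from pennylane.interfaces.batch import execute
    from pennylane.gradients import param_shift

    dev = qml.device("default.qubit", wires=1)

    def cost(a):
        with qml.tape.JacobianTape() as tape:
            qml.RY(a[0], wires=0)
            qml.RX(a[1], wires=0)
            qml.probs(wires=0)
        return execute([tape], dev, gradient_fn=param_shift, cache=True)[0]

    params = np.array([0.1, 0.2], requires_grad=True)
    qml.jacobian(cost)(params)
    print(dev.num_executions)  # 5 with caching (1 forward pass + 2 shifts * 2 parameters), versus 9 with cache=None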
execution_tapes[i] = tape + # execute all unique tapes that do not exist in the cache res = fn(execution_tapes.values(), **kwargs) final_res = [] for i, tape in enumerate(tapes): if i in cached_results: + # insert cached results into the results vector final_res.append(cached_results[i]) + elif i in repeated: + # insert repeated results into the results vector final_res.append(final_res[repeated[i]]) + else: + # insert evaluated results into the results vector r = res.pop(0) final_res.append(r) cache[hashes[i]] = r - if not return_jacs: + if not return_tuple: return final_res return final_res, [] @@ -92,6 +144,7 @@ def execute( gradient_kwargs=None, cache=True, cachesize=10000, + max_diff=2, ): """Execute a batch of tapes on a device in an autodifferentiable-compatible manner. @@ -113,6 +166,13 @@ def execute( pass. gradient_kwargs (dict): dictionary of keyword arguments to pass when determining the gradients of tapes + cache (bool): Whether to cache evaluations. This can result in + a significant reduction in quantum evaluations during gradient computations. + cachesize (int): the size of the cache + max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies + the maximum number of derivatives to support. Increasing this value allows + for higher order derivatives to be extracted, at the cost of additional + (classical) computational overhead during the backwards pass. Returns: list[list[float]]: A nested list of tape results. Each element in @@ -196,14 +256,16 @@ def cost_fn(params, x): # replace the backward gradient computation gradient_fn = cache_execute( - device.gradients, cache, pass_kwargs=True, return_jacs=False + device.gradients, cache, pass_kwargs=True, return_tuple=False ) elif mode == "forward": raise ValueError("Gradient transforms cannot be used with mode='forward'") if interface == "autograd": - res = execute_autograd(tuple(tapes), device, execute_fn, gradient_fn, gradient_kwargs) + res = execute_autograd( + tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_diff=max_diff + ) else: raise ValueError(f"Unknown interface {interface}") diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 40ab022e30f..9f64d77e804 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -24,7 +24,7 @@ from pennylane import numpy as np -def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1): +def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_diff=2): """Execute a batch of tapes with Autograd parameters on a device. Args: @@ -41,6 +41,10 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1): gradient_fn (callable): the gradient function to use to compute quantum gradients _n (int): a positive integer used to track nesting of derivatives, for example if the nth-order derivative is requested. + max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies + the maximum number of derivatives to support. Increasing this value allows + for higher order derivatives to be extracted, at the cost of additional + (classical) computational overhead during the backwards pass. Returns: list[list[float]]: A nested list of tape results. 
Each element in @@ -63,6 +67,7 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1): gradient_fn=gradient_fn, gradient_kwargs=gradient_kwargs, _n=_n, + max_diff=max_diff, )[0] @@ -75,6 +80,7 @@ def _execute( gradient_fn=None, gradient_kwargs=None, _n=1, + max_diff=2, ): # pylint: disable=dangerous-default-value,unused-argument """Autodifferentiable wrapper around ``Device.batch_execute``. @@ -118,6 +124,7 @@ def vjp( gradient_fn=None, gradient_kwargs=None, _n=1, + max_diff=2, ): # pylint: disable=dangerous-default-value,unused-argument """Returns the vector-Jacobian product operator for a batch of quantum tapes. @@ -138,6 +145,10 @@ def vjp( determining the gradients of tapes _n (int): a positive integer used to track nesting of derivatives, for example if the nth-order derivative is requested. + max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies + the maximum number of derivatives to support. Increasing this value allows + for higher order derivatives to be extracted, at the cost of additional + (classical) computational overhead during the backwards pass. Returns: function: this function accepts the backpropagation @@ -168,16 +179,37 @@ def grad_fn(dy): if "pennylane.gradients" in module_name: # Generate and execute the required gradient tapes - vjp_tapes, processing_fn = qml.gradients.batch_vjp( - tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs - ) - - # This is where the magic happens. Note that we call ``execute``. - # This recursion, coupled with the fact that the gradient transforms - # are differentiable, allows for arbitrary order differentiation. - vjps = processing_fn( - execute(vjp_tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=_n + 1) - ) + if _n == max_diff: + with qml.tape.Unwrap(*tapes): + vjp_tapes, processing_fn = qml.gradients.batch_vjp( + tapes, + dy, + gradient_fn, + reduction="append", + gradient_kwargs=gradient_kwargs, + ) + + vjps = processing_fn(execute_fn(vjp_tapes)[0]) + + else: + vjp_tapes, processing_fn = qml.gradients.batch_vjp( + tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs + ) + + # This is where the magic happens. Note that we call ``execute``. + # This recursion, coupled with the fact that the gradient transforms + # are differentiable, allows for arbitrary order differentiation. + vjps = processing_fn( + execute( + vjp_tapes, + device, + execute_fn, + gradient_fn, + gradient_kwargs, + _n=_n + 1, + max_diff=max_diff, + ) + ) elif ( hasattr(gradient_fn, "fn") diff --git a/pennylane/measure.py b/pennylane/measure.py index fa30a2f7ae5..e48e7e9493f 100644 --- a/pennylane/measure.py +++ b/pennylane/measure.py @@ -202,6 +202,26 @@ def queue(self, context=qml.QueuingContext): return self + @property + def hash(self): + """int: returns an integer hash uniquely representing the measurement process""" + if self.obs is None: + fingerprint = ( + str(self.name), + tuple(self.wires.tolist()), + str(self.data), + self.return_type, + ) + else: + fingerprint = ( + str(self.obs.name), + tuple(self.wires.tolist()), + str(self.obs.data), + self.return_type, + ) + + return hash(fingerprint) + def expval(op): r"""Expectation value of the supplied observable. 
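A minimal sketch of the new fingerprints in isolation, assuming a PennyLane build that contains this patch series (operators, measurement processes, and tapes all expose a ``hash`` property):

.. code-block:: python

    import pennylane as qml

    op1, op2 = qml.RX(0.3, wires=0), qml.RX(0.3, wires=0)
    print(op1.hash == op2.hash)  # True: same name, wires and processed parameters

    m1, m2 = qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliZ(0))
    print(m1.hash == m2.hash)    # True: same observable, wires and return type

    with qml.tape.JacobianTape() as tape1:
        qml.RX(0.3, wires=0)
        qml.expval(qml.PauliZ(0))

    with qml.tape.JacobianTape() as tape2:  # identical circuit and measurement
        qml.RX(0.3, wires=0)
        qml.expval(qml.PauliZ(0))

    # equal tape hashes are what let cache_execute evaluate only one of the two tapes
    print(tape1.hash == tape2.hash)  # True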
diff --git a/pennylane/operation.py b/pennylane/operation.py index 5fdc762f30a..90610ea52f5 100644 --- a/pennylane/operation.py +++ b/pennylane/operation.py @@ -233,6 +233,16 @@ def classproperty(func): # ============================================================================= +def _process_data(op): + if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): + return str([d % (2 * np.pi) for d in op.data]) + + if op.name in ("CRX", "CRY", "CRZ", "CRot"): + return str([d % (4 * np.pi) for d in op.data]) + + return str(op.data) + + class Operator(abc.ABC): r"""Base class for quantum operators supported by a device. @@ -282,6 +292,11 @@ def __deepcopy__(self, memo): setattr(copied_op, attribute, copy.deepcopy(value, memo)) return copied_op + @property + def hash(self): + """int: returns an integer hash uniquely representing the operator""" + return hash((str(self.name), tuple(self.wires.tolist()), _process_data(self))) + @classmethod def _matrix(cls, *params): """Matrix representation of the operator diff --git a/pennylane/tape/tape.py b/pennylane/tape/tape.py index 0ef9343544e..064ce2ea6ec 100644 --- a/pennylane/tape/tape.py +++ b/pennylane/tape/tape.py @@ -84,16 +84,6 @@ """ -def _process_data(op): - if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"): - return str([d % (2 * np.pi) for d in op.data]) - - if op.name in ("CRX", "CRY", "CRZ", "CRot"): - return str([d % (4 * np.pi) for d in op.data]) - - return str(op.data) - - def get_active_tape(): """Returns the currently recording tape. If no tape is currently recording, ``None`` is returned. @@ -1314,27 +1304,12 @@ def __copy__(self): @property def hash(self): + """int: returns an integer hash uniquely representing the quantum tape""" fingerprint = [] - fingerprint.extend( - ( - str(op.name), - tuple(op.wires.tolist()), - _process_data(op), - ) - for op in self.operations - ) - fingerprint.extend( - ( - str(getattr(getattr(op, "obs", op), "name", op.name)), - tuple(op.wires.tolist()), - str(getattr(getattr(op, "obs", op), "data", op.data)), - op.return_type, - ) - for op in self.measurements - ) - fingerprint.append(self.trainable_params) - fingerprint = tuple(item for sublist in fingerprint for item in sublist) - return hash(fingerprint) + fingerprint.extend(op.hash for op in self.operations) + fingerprint.extend(m.hash for m in self.measurements) + fingerprint.extend(self.trainable_params) + return hash(tuple(fingerprint)) # ======================================================== # execution methods From 08181841642eec51644aacbd096727a5850c96e4 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 21:02:48 +0800 Subject: [PATCH 31/45] add tests --- tests/interfaces/test_batch_autograd.py | 206 +++++++++++++++++++++++- 1 file changed, 200 insertions(+), 6 deletions(-) diff --git a/tests/interfaces/test_batch_autograd.py b/tests/interfaces/test_batch_autograd.py index a0c108ac01d..2f159178f3d 100644 --- a/tests/interfaces/test_batch_autograd.py +++ b/tests/interfaces/test_batch_autograd.py @@ -156,26 +156,181 @@ def cost(a): qml.jacobian(cost)(a) spy_gradients.assert_called() - def test_caching(self, tol): + +class TestCaching: + """Test for caching behaviour""" + + def test_cache_maxsize(self, mocker): + """Test the cachesize property of the cache""" dev = qml.device("default.qubit", wires=1) + spy = mocker.spy(qml.interfaces.batch, "cache_execute") - def cost(a): + def cost(a, cachesize): with qml.tape.JacobianTape() as tape: qml.RY(a[0], wires=0) qml.RX(a[1], wires=0) - qml.expval(qml.PauliZ(0)) + qml.probs(wires=0) + + 
return execute([tape], dev, gradient_fn=param_shift, cachesize=cachesize)[0] + + params = np.array([0.1, 0.2]) + qml.jacobian(cost)(params, cachesize=2) + cache = spy.call_args[0][1] + + assert cache.maxsize == 2 + assert cache.currsize == 2 + assert len(cache) == 2 + + def test_custom_cache(self, mocker): + """Test the use of a custom cache object""" + dev = qml.device("default.qubit", wires=1) + spy = mocker.spy(qml.interfaces.batch, "cache_execute") + + def cost(a, cache): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.probs(wires=0) + + return execute([tape], dev, gradient_fn=param_shift, cache=cache)[0] + + custom_cache = {} + params = np.array([0.1, 0.2]) + qml.jacobian(cost)(params, cache=custom_cache) + + cache = spy.call_args[0][1] + assert cache is custom_cache + + def test_caching_param_shift(self, tol): + """Test that, when using parameter-shift transform, + caching reduces the number of evaluations to their optimum.""" + dev = qml.device("default.qubit", wires=1) + + def cost(a, cache): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.probs(wires=0) - return execute([tape], dev, gradient_fn=param_shift)[0] + return execute([tape], dev, gradient_fn=param_shift, cache=cache)[0] + # Without caching, 9 evaluations are required to compute + # the Jacobian: 1 (forward pass) + 2 (backward pass) * (2 shifts * 2 params) params = np.array([0.1, 0.2]) - grad1 = qml.jacobian(cost)(params) + qml.jacobian(cost)(params, cache=None) + assert dev.num_executions == 9 + + # With caching, 5 evaluations are required to compute + # the Jacobian: 1 (forward pass) + (2 shifts * 2 params) + dev._num_executions = 0 + jac_fn = qml.jacobian(cost) + grad1 = jac_fn(params, cache=True) assert dev.num_executions == 5 - grad2 = qml.jacobian(cost)(2 * params) + # Check that calling the cost function again + # continues to evaluate the device (that is, the cache + # is emptied between calls) + grad2 = jac_fn(params, cache=True) assert dev.num_executions == 10 + assert np.allclose(grad1, grad2, atol=tol, rtol=0) + # Check that calling the cost function again + # with different parameters produces a different Jacobian + grad2 = jac_fn(2 * params, cache=True) + assert dev.num_executions == 15 assert not np.allclose(grad1, grad2, atol=tol, rtol=0) + @pytest.mark.parametrize("num_params", [2, 3]) + def test_caching_param_shift_hessian(self, num_params, tol): + """Test that, when using parameter-shift transform, + caching reduces the number of evaluations to their optimum + when computing Hessians.""" + dev = qml.device("default.qubit", wires=2) + params = np.arange(1, num_params + 1) / 10 + + N = len(params) + + def cost(x, cache): + with qml.tape.JacobianTape() as tape: + qml.RX(x[0], wires=[0]) + qml.RY(x[1], wires=[1]) + + for i in range(2, num_params): + qml.RZ(x[i], wires=[i % 2]) + + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + return execute([tape], dev, gradient_fn=param_shift, cache=cache)[0] + + # No caching: number of executions is not ideal + hess1 = qml.jacobian(qml.grad(cost))(params, cache=False) + + if num_params == 2: + # compare to theoretical result + x, y, *_ = params + expected = np.array( + [ + [2 * np.cos(2 * x) * np.sin(y) ** 2, np.sin(2 * x) * np.sin(2 * y)], + [np.sin(2 * x) * np.sin(2 * y), -2 * np.cos(x) ** 2 * np.cos(2 * y)], + ] + ) + assert np.allclose(expected, hess1, atol=tol, rtol=0) + + expected_runs = 1 # forward pass + expected_runs += 2 * N # Jacobian + 
expected_runs += 4 * N + 1 # Hessian diagonal + expected_runs += 4 * N ** 2 # Hessian off-diagonal + assert dev.num_executions == expected_runs + + # No caching: number of executions is ideal + dev._num_executions = 0 + hess2 = qml.jacobian(qml.grad(cost))(params, cache=True) + assert np.allclose(hess1, hess2, atol=tol, rtol=0) + + expected_runs_ideal = 1 # forward pass + expected_runs_ideal += 2 * N # Jacobian + expected_runs_ideal += 2 * N + 1 # Hessian diagonal + expected_runs_ideal += 4 * N * (N - 1) // 2 # Hessian off-diagonal + assert dev.num_executions == expected_runs_ideal + assert expected_runs_ideal < expected_runs + + def test_caching_adjoint_backward(self): + """Test that caching reduces the number of adjoint evaluations + when mode=backward""" + dev = qml.device("default.qubit", wires=2) + params = np.array([0.1, 0.2, 0.3]) + + def cost(a, cache): + with qml.tape.JacobianTape() as tape: + qml.RY(a[0], wires=0) + qml.RX(a[1], wires=0) + qml.RY(a[2], wires=0) + qml.expval(qml.PauliZ(0)) + qml.expval(qml.PauliZ(1)) + + return execute( + [tape], + dev, + gradient_fn="device", + cache=cache, + mode="backward", + gradient_kwargs={"method": "adjoint_jacobian"}, + )[0] + + # Without caching, 3 evaluations are required. + # 1 for the forward pass, and one per output dimension + # on the backward pass. + qml.jacobian(cost)(params, cache=None) + assert dev.num_executions == 3 + + # With caching, only 2 evaluations are required. One + # for the forward pass, and one for the backward pass. + dev._num_executions = 0 + jac_fn = qml.jacobian(cost) + grad1 = jac_fn(params, cache=True) + assert dev.num_executions == 2 + execute_kwargs = [ {"gradient_fn": param_shift}, @@ -602,3 +757,42 @@ def cost_fn(x): res = qml.jacobian(qml.grad(cost_fn))(params) assert np.allclose(res, np.zeros([2, 2]), atol=tol, rtol=0) + + def test_max_diff(self, tol): + """Test that setting the max_diff parameter blocks higher-order + derivatives""" + dev = qml.device("default.qubit.autograd", wires=2) + params = np.array([0.543, -0.654], requires_grad=True) + + def cost_fn(x): + with qml.tape.JacobianTape() as tape1: + qml.RX(x[0], wires=[0]) + qml.RY(x[1], wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0) @ qml.PauliX(1)) + + with qml.tape.JacobianTape() as tape2: + qml.RX(x[0], wires=0) + qml.RY(x[0], wires=1) + qml.CNOT(wires=[0, 1]) + qml.probs(wires=1) + + result = execute([tape1, tape2], dev, gradient_fn=param_shift, max_diff=1) + return result[0] + result[1][0, 0] + + res = cost_fn(params) + x, y = params + expected = 0.5 * (3 + np.cos(x) ** 2 * np.cos(2 * y)) + assert np.allclose(res, expected, atol=tol, rtol=0) + + res = qml.grad(cost_fn)(params) + expected = np.array( + [-np.cos(x) * np.cos(2 * y) * np.sin(x), -np.cos(x) ** 2 * np.sin(2 * y)] + ) + assert np.allclose(res, expected, atol=tol, rtol=0) + + with pytest.warns(UserWarning, match="Output seems independent"): + res = qml.jacobian(qml.grad(cost_fn))(params) + + expected = np.zeros([2, 2]) + assert np.allclose(res, expected, atol=tol, rtol=0) From 815e1f367d250452350c6aa39079a3084fcd1eae Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 21:07:31 +0800 Subject: [PATCH 32/45] linting --- pennylane/interfaces/batch/__init__.py | 11 +++++------ pennylane/interfaces/batch/autograd.py | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 2ee6b13132f..792366280bd 100644 --- a/pennylane/interfaces/batch/__init__.py +++ 
b/pennylane/interfaces/batch/__init__.py @@ -15,19 +15,18 @@ This subpackage defines functions for interfacing devices' batch execution capabilities with different machine learning libraries. """ -# pylint: disable=import-outside-toplevel) +# pylint: disable=import-outside-toplevel,too-many-arguments +from collections import OrderedDict from functools import wraps + from cachetools import LRUCache +import numpy as np import pennylane as qml -import numpy as np from .autograd import execute as execute_autograd -from collections import OrderedDict - - def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): """Decorator that adds caching to a function that executes multiple tapes on a device. @@ -236,7 +235,7 @@ def cost_fn(params, x): if isinstance(cache, bool) and cache: # cache=True: create a LRUCache object - cache = LRUCache(maxsize=cachesize, getsizeof=lambda x: len(x)) + cache = LRUCache(maxsize=cachesize, getsizeof=len) # the default execution function is device.batch_execute execute_fn = cache_execute(device.batch_execute, cache) diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 9f64d77e804..b2de123d315 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -15,6 +15,7 @@ This module contains functions for adding the Autograd interface to a PennyLane Device class. """ +# pylint: disable=too-many-arguments import inspect import autograd From 378bcd419f43893ab820dd391181270b6c53c0b9 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 11 Aug 2021 21:19:57 +0800 Subject: [PATCH 33/45] merge master --- .github/CHANGELOG.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index bf84f0b7868..679bdc06ed3 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,14 @@

New features since last release

+* Vector-Jacobian product transforms have been added to the `qml.gradients` package. + [(#1494)](https://github.com/PennyLaneAI/pennylane/pull/1494) + + The new transforms include: + + - `qml.gradients.vjp` + - `qml.gradients.batch_vjp` +
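+  As a minimal usage sketch (assuming `numpy` is imported as `np` and a
+  device has already been created; the circuit and `dy` values below are
+  purely illustrative), the transform returns gradient tapes together with a
+  post-processing function that converts their execution results into the VJP:
+
+  ```python
+  dev = qml.device("default.qubit", wires=2)
+
+  with qml.tape.JacobianTape() as tape:
+      qml.RX(0.4, wires=0)
+      qml.CNOT(wires=[0, 1])
+      qml.expval(qml.PauliZ(1))
+
+  # dy must match the output shape of the tape (a single expectation value here)
+  dy = np.array([1.0])
+
+  vjp_tapes, fn = qml.gradients.vjp(tape, dy, qml.gradients.param_shift)
+
+  # executing the gradient tapes and post-processing yields the VJP
+  vjp = fn(dev.batch_execute(vjp_tapes))
+  ```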

Improvements

* The tape does not verify any more that all Observables have owners in the annotated queue. @@ -22,7 +30,7 @@ This release contains contributions from (in alphabetical order): -Maria Schuld. +Josh Izaac, Maria Schuld. # Release 0.17.0 (current release) @@ -184,13 +192,10 @@ Maria Schuld. [(#1476)](https://github.com/PennyLaneAI/pennylane/pull/1476) [(#1479)](https://github.com/PennyLaneAI/pennylane/pull/1479) [(#1486)](https://github.com/PennyLaneAI/pennylane/pull/1486) - [(#1494)](https://github.com/PennyLaneAI/pennylane/pull/1494) Available quantum gradient transforms include: - `qml.gradients.finite_diff` - - `qml.gradients.vjp` - - `qml.gradients.batch_vjp` - `qml.gradients.param_shift` - `qml.gradients.param_shift_cv` From 96b567e35a00941b9a929e6c22f81278b3898944 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 12 Aug 2021 16:35:57 +0800 Subject: [PATCH 34/45] Apply suggestions from code review Co-authored-by: Maria Schuld --- pennylane/interfaces/batch/__init__.py | 2 +- tests/interfaces/test_batch_autograd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 792366280bd..4ac811bf283 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -72,7 +72,7 @@ def wrapper(tapes, **kwargs): kwargs = {} if cache is None or (isinstance(cache, bool) and not cache): - # No caching. Simply execution the execution function + # No caching. Simply execute the execution function # and return the results. if not return_tuple: diff --git a/tests/interfaces/test_batch_autograd.py b/tests/interfaces/test_batch_autograd.py index 2f159178f3d..73a18ce45af 100644 --- a/tests/interfaces/test_batch_autograd.py +++ b/tests/interfaces/test_batch_autograd.py @@ -283,7 +283,7 @@ def cost(x, cache): expected_runs += 4 * N ** 2 # Hessian off-diagonal assert dev.num_executions == expected_runs - # No caching: number of executions is ideal + # Use caching: number of executions is ideal dev._num_executions = 0 hess2 = qml.jacobian(qml.grad(cost))(params, cache=True) assert np.allclose(hess1, hess2, atol=tol, rtol=0) From 2e5e9a9a1974e685d6fcffc713bfca34bf971492 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 12 Aug 2021 17:37:04 +0800 Subject: [PATCH 35/45] fix --- pennylane/interfaces/batch/__init__.py | 27 +++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 792366280bd..abf3285d306 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -16,7 +16,6 @@ capabilities with different machine learning libraries. """ # pylint: disable=import-outside-toplevel,too-many-arguments -from collections import OrderedDict from functools import wraps from cachetools import LRUCache @@ -74,13 +73,10 @@ def wrapper(tapes, **kwargs): if cache is None or (isinstance(cache, bool) and not cache): # No caching. Simply execution the execution function # and return the results. + res = fn(tapes, **kwargs) + return res, [] if return_tuple else res - if not return_tuple: - return fn(tapes, **kwargs) - - return fn(tapes, **kwargs), [] - - execution_tapes = OrderedDict() + execution_tapes = {} cached_results = {} hashes = {} repeated = {} @@ -106,8 +102,16 @@ def wrapper(tapes, **kwargs): # for execution via the execution function. 
execution_tapes[i] = tape - # execute all unique tapes that do not exist in the cache - res = fn(execution_tapes.values(), **kwargs) + # if there are no execution tapes, simply return! + if not execution_tapes: + if not repeated: + res = list(cached_results.values()) + return res, [] if return_tuple else res + + else: + # execute all unique tapes that do not exist in the cache + res = fn(execution_tapes.values(), **kwargs) + final_res = [] for i, tape in enumerate(tapes): @@ -125,10 +129,7 @@ def wrapper(tapes, **kwargs): final_res.append(r) cache[hashes[i]] = r - if not return_tuple: - return final_res - - return final_res, [] + return final_res, [] if return_tuple else final_res wrapper.fn = fn return wrapper From 2057b86ddcafe44b5c81150bf724733d05c91964 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Sun, 15 Aug 2021 22:05:08 +0800 Subject: [PATCH 36/45] Apply suggestions from code review Co-authored-by: Nathan Killoran --- pennylane/_device.py | 7 ++++--- pennylane/interfaces/batch/__init__.py | 3 +++ pennylane/interfaces/batch/autograd.py | 3 +-- tests/interfaces/test_batch_autograd.py | 10 ++++++---- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pennylane/_device.py b/pennylane/_device.py index 1fbf03745c5..32e165264de 100644 --- a/pennylane/_device.py +++ b/pennylane/_device.py @@ -512,7 +512,7 @@ def execute_and_gradients(self, circuits, method="jacobian", **kwargs): Args: circuits (list[.tape.QuantumTape]): circuits to execute on the device method (str): the device method to call to compute the Jacobian of a single circuit - **kwargs: keyword argument to pass when calling ``method``. + **kwargs: keyword argument to pass when calling ``method`` Returns: tuple[list[array[float]], list[array[float]]]: Tuple containing list of measured value(s) @@ -525,7 +525,8 @@ def execute_and_gradients(self, circuits, method="jacobian", **kwargs): for circuit in circuits: # Evaluations and gradients are paired, so that - # we can re-use the device state for the adjoint method + # devices can re-use the device state for the + # gradient computation (if applicable). res.append(circuit.execute(self)) jacs.append(gradient_method(circuit, **kwargs)) @@ -544,7 +545,7 @@ def gradients(self, circuits, method="jacobian", **kwargs): Args: circuits (list[.tape.QuantumTape]): circuits to execute on the device method (str): the device method to call to compute the Jacobian of a single circuit - **kwargs: keyword argument to pass when calling ``method``. + **kwargs: keyword argument to pass when calling ``method`` Returns: list[array[float]]: List of Jacobians. Returned Jacobians should be of diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 71abc8189a0..1a46b9d7dba 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -122,6 +122,9 @@ def cost_fn(params, x): gradient_fn = device.gradients elif mode == "forward": + # In "forward" mode, gradients are automatically handled + # within execute_and_gradients, so providing a gradient_fn + # in this case would have ambiguous behaviour. 
raise ValueError("Gradient transforms cannot be used with mode='forward'") if interface == "autograd": diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 3aab562bf6f..c7e3205c90b 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -152,7 +152,7 @@ def grad_fn(dy): jacs = ans[1] if jacs: - # Jacobians were computed on the forward pass (accumulation="forward") + # Jacobians were computed on the forward pass (mode="forward") # No additional quantum evaluations needed; simply compute the VJPs directly. vjps = [qml.gradients.compute_vjp(d, jac) for d, jac in zip(dy, jacs)] @@ -164,7 +164,6 @@ def grad_fn(dy): # Longer term, we should have a way of checking this directly # (e.g., isinstance(gradient_fn, GradientTransform)) module_name = getattr(inspect.getmodule(gradient_fn), "__name__", "") - print(gradient_fn, gradient_fn.__module__, inspect.ismethod(gradient_fn)) if "pennylane.gradients" in module_name: diff --git a/tests/interfaces/test_batch_autograd.py b/tests/interfaces/test_batch_autograd.py index dbf5e297d88..26e5ece94f7 100644 --- a/tests/interfaces/test_batch_autograd.py +++ b/tests/interfaces/test_batch_autograd.py @@ -1,4 +1,4 @@ -# Copyright 2018-2020 Xanadu Quantum Technologies Inc. +# Copyright 2018-2021 Xanadu Quantum Technologies Inc. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ class TestAutogradExecuteUnitTests: - """Unit tests for the autograd execution""" + """Unit tests for autograd execution""" def test_jacobian_options(self, mocker, tol): """Test setting jacobian options""" @@ -280,6 +280,8 @@ def cost(a, b): a = np.array(0.54, requires_grad=True) b = np.array(0.8, requires_grad=True) + # check that the cost function continues to depend on the + # values of the parameters for subsequent calls res2 = cost(2 * a, b) expected = [np.cos(2 * a), -np.cos(2 * a) * np.sin(b)] assert np.allclose(res2, expected, atol=tol, rtol=0) @@ -330,7 +332,7 @@ def cost(a, b, device): assert res.shape == (2,) res = qml.jacobian(cost)(a, b, device=dev) - assert not res + assert len(res) == 0 def loss(a, b): return np.sum(cost(a, b, device=dev)) @@ -338,7 +340,7 @@ def loss(a, b): with pytest.warns(UserWarning, match="Output seems independent"): res = qml.grad(loss)(a, b) - assert not res + assert np.allclose(res, 0) def test_matrix_parameter(self, execute_kwargs, tol): """Test that the autograd interface works correctly From c1ccb0d15be2582c25c551202927c0c06c26975d Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 16 Aug 2021 21:34:59 +0800 Subject: [PATCH 37/45] linting --- pennylane/interfaces/batch/__init__.py | 2 +- pennylane/interfaces/batch/autograd.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 1a46b9d7dba..5fc894cc94a 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -15,7 +15,7 @@ This subpackage defines functions for interfacing devices' batch execution capabilities with different machine learning libraries. 
""" -# pylint: disable=import-outside-toplevel) +# pylint: disable=import-outside-toplevel,too-many-arguments from functools import partial import pennylane as qml diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index c7e3205c90b..252caafed1e 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -15,6 +15,7 @@ This module contains functions for adding the Autograd interface to a PennyLane Device class. """ +# pylint: disable=too-many-arguments import inspect import autograd From 6aebd37ed31641cfce1abede7fe7b64f0a633ac4 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Mon, 16 Aug 2021 21:41:01 +0800 Subject: [PATCH 38/45] linting --- pennylane/interfaces/batch/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 5fc894cc94a..2dd08bb188f 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -16,8 +16,6 @@ capabilities with different machine learning libraries. """ # pylint: disable=import-outside-toplevel,too-many-arguments -from functools import partial - import pennylane as qml from .autograd import execute as execute_autograd From c540c53de69d15bf097b0916708fff4afcfc423c Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Tue, 17 Aug 2021 00:41:37 +0800 Subject: [PATCH 39/45] linting --- pennylane/interfaces/batch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index 4342bb229b2..c306db59e64 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -15,7 +15,7 @@ This subpackage defines functions for interfacing devices' batch execution capabilities with different machine learning libraries. """ -# pylint: disable=import-outside-toplevel,too-many-arguments +# pylint: disable=import-outside-toplevel,too-many-arguments,too-many-branches from functools import wraps from cachetools import LRUCache From cbbb5f0267526fafc77a2e0e2e55634784827f3b Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Tue, 17 Aug 2021 22:40:35 +0800 Subject: [PATCH 40/45] remove pass --- .coveragerc | 1 - 1 file changed, 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index 06bb6e08ca1..12e9d2ebec2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -23,7 +23,6 @@ exclude_lines = # Don't complain if non-runnable code isn't run: if 0: - pass if __name__ == .__main__.: # Ignore things that would have trivial tests From 64e0dd15cb91c50b63591c79c771a2441cba7df9 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 18 Aug 2021 01:25:18 +0800 Subject: [PATCH 41/45] changelog --- .github/CHANGELOG.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 05f61a7ed85..699e0075880 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -57,12 +57,15 @@ ``` * Support for differentiable execution of batches of circuits has been - added, via the beta `pennylane.batch` module. + added, via the beta `pennylane.interfaces.batch` module. 
[(#1501)](https://github.com/PennyLaneAI/pennylane/pull/1501) + [(#1508)](https://github.com/PennyLaneAI/pennylane/pull/1508) For example: ```python + from pennylane.interfaces.batch import execute + def cost_fn(x): with qml.tape.JacobianTape() as tape1: qml.RX(x[0], wires=[0]) @@ -76,7 +79,11 @@ qml.CNOT(wires=[0, 1]) qml.probs(wires=1) - result = execute([tape1, tape2], dev, gradient_fn=param_shift) + result = execute( + [tape1, tape2], dev, + gradient_fn=qml.gradients.param_shift, + interface="autograd" + ) return result[0] + result[1][0, 0] res = qml.grad(cost_fn)(params) From 77e5df11f0eb6f1b34f66c31d5d6c8040c0a3c52 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 18 Aug 2021 13:32:06 +0800 Subject: [PATCH 42/45] Apply suggestions from code review Co-authored-by: Tom Bromley <49409390+trbromley@users.noreply.github.com> --- pennylane/interfaces/batch/__init__.py | 4 ++-- pennylane/interfaces/batch/autograd.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index c306db59e64..d59c17a114d 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -42,8 +42,8 @@ def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): extracted, and the tape is not passed to the execution function. - Finally, there might be the case where one or more tapes in the current - set of tapes to be executed share a hash. If this is the case, duplicated - are removed, to avoid redundant evaluations. + set of tapes to be executed are identical and thus share a hash. If this is the case, + duplicates are removed, to avoid redundant evaluations. Args: fn (callable): The execution function to add caching to. diff --git a/pennylane/interfaces/batch/autograd.py b/pennylane/interfaces/batch/autograd.py index 9fe1d21038d..be71c9920f6 100644 --- a/pennylane/interfaces/batch/autograd.py +++ b/pennylane/interfaces/batch/autograd.py @@ -43,7 +43,7 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_d _n (int): a positive integer used to track nesting of derivatives, for example if the nth-order derivative is requested. max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies - the maximum number of derivatives to support. Increasing this value allows + the maximum order of derivatives to support. Increasing this value allows for higher order derivatives to be extracted, at the cost of additional (classical) computational overhead during the backwards pass. From 3d2b9b6ec2944a76014ffe01a7eb98b01242c0fe Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Wed, 18 Aug 2021 13:32:39 +0800 Subject: [PATCH 43/45] Update pennylane/interfaces/batch/__init__.py --- pennylane/interfaces/batch/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index d59c17a114d..f1d2b93ff17 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ -52,8 +52,8 @@ def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): same length as the input ``tapes``. cache (None or dict or Cache): The cache to use. If ``None``, caching will not occur. - pass_kwargs (bool): If ``False``, keyword arguments passed to the - wrapped function will be passed directly to ``fn``. 
If ``True``, + pass_kwargs (bool): If ``True``, keyword arguments passed to the + wrapped function will be passed directly to ``fn``. If ``False``, they will be ignored. return_tuple (bool): If ``True``, the output of ``fn`` is returned as a tuple ``(fn_ouput, [])``, to match the output of execution functions From 9e0eb7ad08c16121eae9541633c22025ec70a009 Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Thu, 19 Aug 2021 00:37:06 +0800 Subject: [PATCH 44/45] Add hashing tests --- tests/tape/test_tape.py | 222 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) diff --git a/tests/tape/test_tape.py b/tests/tape/test_tape.py index c9d81de51f8..61db748601a 100644 --- a/tests/tape/test_tape.py +++ b/tests/tape/test_tape.py @@ -1466,3 +1466,225 @@ def test_gate_tape(): assert qml.tape.get_active_tape() is tape1 assert qml.tape.get_active_tape() is None + + +class TestHashing: + """Test for tape hashing""" + + @pytest.mark.parametrize( + "m", + [ + qml.expval(qml.PauliZ(0)), + qml.state(), + qml.probs(wires=0), + qml.density_matrix(wires=0), + qml.var(qml.PauliY(0)), + ], + ) + def test_identical(self, m): + """Tests that the circuit hash of identical circuits are identical""" + a = 0.3 + b = 0.2 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.apply(m) + + with qml.tape.QuantumTape() as tape2: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.apply(m) + + assert tape1.hash == tape2.hash + + def test_identical_numeric(self): + """Tests that the circuit hash of identical circuits are identical + even though the datatype of the arguments may differ""" + a = 0.3 + b = 0.2 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(np.array(a), wires=[0]) + qml.RY(np.array(b), wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + assert tape1.hash == tape2.hash + + def test_different_wires(self): + """Tests that the circuit hash of circuits with the same operations + on different wires have different hashes""" + a = 0.3 + b = 0.2 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[1]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(np.array(a), wires=[0]) + qml.RY(np.array(b), wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + assert tape1.hash != tape2.hash + + def test_different_trainabilities(self): + """Tests that the circuit hash of identical circuits are identical + even though the datatype of the arguments may differ""" + a = 0.3 + b = 0.2 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) + + tape1.trainable_params = {0} + tape2.trainable_params = {0, 1} + assert tape1.hash != tape2.hash + + def test_different_parameters(self): + """Tests that the circuit hash of circuits with different + parameters differs""" + a = 0.3 + b = 0.2 + c = 0.6 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.QuantumTape() 
as tape2: + qml.RX(a, wires=[0]) + qml.RY(c, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + assert tape1.hash != tape2.hash + + def test_different_operations(self): + """Tests that the circuit hash of circuits with different + operations differs""" + a = 0.3 + b = 0.2 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RZ(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + assert tape1.hash != tape2.hash + + def test_different_measurements(self): + """Tests that the circuit hash of circuits with different + measurements differs""" + a = 0.3 + b = 0.2 + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.var(qml.PauliZ(0)) + + assert tape1.hash != tape2.hash + + def test_different_observables(self): + """Tests that the circuit hash of circuits with different + observables differs""" + a = 0.3 + b = 0.2 + + A = np.diag([1.0, 2.0]) + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with qml.tape.QuantumTape() as tape2: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(qml.Hermitian(A, wires=0)) + + assert tape1.hash != tape2.hash + + def test_rotation_modulo_identical(self): + """Tests that the circuit hash of circuits with single-qubit + rotations differing by multiples of 2pi have identical hash""" + a = np.array(np.pi / 2, dtype=np.float64) + b = np.array(np.pi / 4, dtype=np.float64) + + H = qml.Hamiltonian([0.1, 0.2], [qml.PauliX(0), qml.PauliZ(0) @ qml.PauliY(1)]) + + with qml.tape.QuantumTape() as tape1: + qml.RX(a, wires=[0]) + qml.RY(b, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(H) + + with qml.tape.QuantumTape() as tape2: + qml.RX(a - 2 * np.pi, wires=[0]) + qml.RY(b + 2 * np.pi, wires=[1]) + qml.CNOT(wires=[0, 1]) + qml.expval(H) + + assert tape1.hash == tape2.hash + + def test_controlled_rotation_modulo_identical(self): + """Tests that the circuit hash of circuits with single-qubit + rotations differing by multiples of 2pi have identical hash""" + a = np.array(np.pi / 2, dtype=np.float64) + b = np.array(np.pi / 2, dtype=np.float64) + + H = qml.Hamiltonian([0.1, 0.2], [qml.PauliX(0), qml.PauliZ(0) @ qml.PauliY(1)]) + + with qml.tape.QuantumTape() as tape1: + qml.CRX(a, wires=[0, 1]) + qml.CRY(b, wires=[0, 1]) + qml.CNOT(wires=[0, 1]) + qml.expval(H) + + with qml.tape.QuantumTape() as tape2: + qml.CRX(a - 4 * np.pi, wires=[0, 1]) + qml.CRY(b + 4 * np.pi, wires=[0, 1]) + qml.CNOT(wires=[0, 1]) + qml.expval(H) + + assert tape1.hash == tape2.hash From 354aec9ee5794b7f90bc9d5798b6888a5a07dedb Mon Sep 17 00:00:00 2001 From: Josh Izaac Date: Fri, 20 Aug 2021 13:34:48 +0800 Subject: [PATCH 45/45] Apply suggestions from code review Co-authored-by: Tom Bromley <49409390+trbromley@users.noreply.github.com> --- pennylane/interfaces/batch/__init__.py | 4 ++-- tests/tape/test_tape.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pennylane/interfaces/batch/__init__.py b/pennylane/interfaces/batch/__init__.py index f1d2b93ff17..36cfdf51d1b 100644 --- a/pennylane/interfaces/batch/__init__.py +++ b/pennylane/interfaces/batch/__init__.py @@ 
-50,7 +50,7 @@ def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True): This function should have the signature ``fn(tapes, **kwargs)``, and it should return ``list[tensor_like]``, with the same length as the input ``tapes``. - cache (None or dict or Cache): The cache to use. If ``None``, + cache (None or dict or Cache or bool): The cache to use. If ``None``, caching will not occur. pass_kwargs (bool): If ``True``, keyword arguments passed to the wrapped function will be passed directly to ``fn``. If ``False``, @@ -86,7 +86,7 @@ def wrapper(tapes, **kwargs): if h in hashes.values(): # Tape already exists within ``tapes``. Determine the - # index of the first occurance of the tape, store this, + # index of the first occurrence of the tape, store this, # and continue to the next iteration. idx = list(hashes.keys())[list(hashes.values()).index(h)] repeated[i] = idx diff --git a/tests/tape/test_tape.py b/tests/tape/test_tape.py index 61db748601a..2bfe534fdf8 100644 --- a/tests/tape/test_tape.py +++ b/tests/tape/test_tape.py @@ -1541,8 +1541,8 @@ def test_different_wires(self): assert tape1.hash != tape2.hash def test_different_trainabilities(self): - """Tests that the circuit hash of identical circuits are identical - even though the datatype of the arguments may differ""" + """Tests that the circuit hash of identical circuits differ + if the circuits have different trainable parameters""" a = 0.3 b = 0.2 @@ -1668,7 +1668,7 @@ def test_rotation_modulo_identical(self): assert tape1.hash == tape2.hash def test_controlled_rotation_modulo_identical(self): - """Tests that the circuit hash of circuits with single-qubit + """Tests that the circuit hash of circuits with controlled rotations differing by multiples of 2pi have identical hash""" a = np.array(np.pi / 2, dtype=np.float64) b = np.array(np.pi / 2, dtype=np.float64)