diff --git a/INSTALL.md b/INSTALL.md
index 6060c4bd29..6517dc86c6 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
 - Boost.Range: header-only, *only used for unit testing*
 - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If a usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*.
-- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 91fff76deba20c751d0646c54f2f1c1e07bd6156 .
+- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 58b3e2c623d772f6e4a2e9cf5758073de32ecc50 .
 Only the MADworld runtime and BLAS/LAPACK C API components of MADNESS are used by TiledArray. If a usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*.
diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp
index 5818fae22d..7ef5ca45c8 100644
--- a/examples/demo/demo2.cpp
+++ b/examples/demo/demo2.cpp
@@ -56,6 +56,10 @@ int main(int argc, char* argv[]) {
   assert((ρ.upbound() == Index{11, 9}));
   // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   assert((ρ.extent() == Index{10, 10}));
+  // 1st dimension of ρ is $\mathbb{Z}_{1,11}$
+  assert((ρ.dim(0) == Range1{1, 11}));
+  // 2nd dimension of ρ is $\mathbb{Z}_{-1,9}$
+  assert((ρ.dim(1) == Range1{-1, 9}));
   // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   assert(ρ.volume() == 100);
   // row-major order
@@ -100,12 +104,51 @@ int main(int argc, char* argv[]) {
   auto tile_0_0 = τ.tile({0, 0});
   assert((tile_0_0 == Range{{1, 5}, {-1, 5}}));

-  // default instance of $\code{DistArray}$ is a dense array of $\code{double}$s
-  DistArray array0(world, τ);
-  array0.fill(1.0);  // fill $\code{array0}$ with 1's
-
-  // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2.
-  auto t00 = array0.find({0, 0});
+  // clang-format off
+
+  // 2-d array of $\code{double}$ 0s, indexed by ρ
+  Tensor<double> t0(ρ, 0.);
+  // same as $\code{t0}$ but filled with ordinals
+  TensorD t1(ρ, [&ρ](auto&& idx) {
+    return ρ.ordinal(idx);
+  });
+  // print out "0 1 .. 99 "
+  for (auto&& v : t1) cout << v << " ";
+  // same as $\code{t0}$, using existing buffer
+  shared_ptr<double[]> v(new double[ρ.volume()]);
+  TensorD t2(ρ, v);  // t2 and v co-manage buffer lifetime
+  v[0] = 1.;
+  assert(t2(1, -1) == 1.);
+  // same as $\code{t0}$, using existing (unmanaged) buffer
+  auto t3 = make_map(v.get(), ρ);
+  v[0] = 2.;
+  assert(t3(1, -1) == 2.);
+  // Tensor has shallow-copy semantics
+  auto t4 = t0;
+  t0(1, -1) = 3.;
+  assert(t4(1, -1) == 3.);
+
+  // clang-format on
+
+  // default instance of $\code{DistArray}$ is
+  // a {\em dense} array of $\code{double}$s
+  // NB can use TArrayD instead of DistArray<>
+  DistArray<> a0(τ);
+  a0.fill(1.);  // fill $\code{a0}$ with 1s
+  // every tile exists in a dense array
+  assert(!a0.is_zero({0, 0}));
+  // grab a ${\em future}$ to the {0,0} tile
+  auto t00 = a0.find({0, 0});
+
+  // shape of a {\em sparse} array over τ
+  // tiles with even ordinals ({0,0}, {0,2}, {1,1}) are zero
+  SparseShape<float> s(TensorF(τ.tiles_range(), {0, 1, 0, 1, 0, 1}), τ);
+  // a sparse array of $\code{double}$s
+  // TSpArrayX $\equiv$ DistArray<Tensor<X>, SparsePolicy>
+  TSpArrayD a1(τ, s);
+  // only some tiles are nonzero in sparse array
+  assert(a1.is_zero({0, 0}));
+  assert(!a1.is_zero({0, 1}));
 #endif  // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)
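Reviewer note: the demo relies on `Tensor`'s shallow-copy semantics (`t4` aliases `t0`'s buffer). For contrast, a deep copy requires an explicit `clone()`; a minimal sketch of the distinction (variable names and the 3x3 range are ours, not from the demo):

```cpp
TA::TensorD a(TA::Range{3, 3}, 0.0);
auto b = a;          // shallow copy: b shares a's buffer
auto c = a.clone();  // deep copy: c owns a freshly allocated buffer
a(0, 0) = 42.0;
assert(b(0, 0) == 42.0);  // b observes writes made through a
assert(c(0, 0) == 0.0);   // c is unaffected
```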
99 " + for (auto&& v : t1) cout << v << " "; + // same as $\code{t0}$, using existing buffer + shared_ptr v(new double[ρ.volume()]); + TensorD t2(ρ, v); // t2 and v co-manage buffer lifetime + v[0] = 1.; + assert(t2(1, -1) == 1.); + // same as $\code{t0}$, using existing (unmanaged) buffer + auto t3 = make_map(v.get(), ρ); + v[0] = 2.; + assert(t3(1, -1) == 2.); + // Tensor has shallow-copy semantics + auto t4 = t0; + t0(1, -1) = 3.; + assert(t4(1, -1) == 3.); + + // clang-format on + + // default instance of $\code{DistArray}$ is + // a {\em dense} array of $\code{double}$s + // NB can use TArrayD instead of DistArray<> + DistArray<> a0(τ); + a0.fill(1.); // fill $\code{da}$ with 1s + // every tile exists in a dense array + assert(!a0.is_zero({0, 0})); + // grab a ${\em future}$ to the {0,0} tile + auto t00 = a0.find({0, 0}); + + // shape of a {\em sparse} array over τ + // tiles with even ordinals ({0,0}, {0,2}, {1,1}) are zero + SparseShape s(TensorF(τ.tiles_range(), {0, 1, 0, 1, 0, 1}), τ); + // a sparse array of $\code{double}$s + // TSpArrayX $\equiv$ DistArray + TSpArrayD a1(τ, s); + // only some tiles are nonzero in sparse array + assert(a1.is_zero({0, 0})); + assert(!a1.is_zero({0, 1})); #endif // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) diff --git a/external/versions.cmake b/external/versions.cmake index e9cfb45375..313b8d5d11 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 91fff76deba20c751d0646c54f2f1c1e07bd6156) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0b44ef319643cb9721fbe17d294987c146e6460e) +set(TA_TRACKED_MADNESS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG dc3294160209cbd683bfb57cb2b933bd5f86e07e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 09e99eda86..fd0450ed8e 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -158,7 +158,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { using rebind_numeric_t = typename rebind_numeric::type; private: - pimpl_type pimpl_; ///< managed ptr to Array implementation + pimpl_type pimpl_ = {}; ///< managed ptr to Array implementation bool defer_deleter_to_next_fence_ = false; ///< if true, the impl object is scheduled to be destroyed in the ///< next fence @@ -277,7 +277,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// array is uninitialized, but these arrays may be assign via a tensor /// expression assignment or the copy construction. - DistArray() : pimpl_() {} + DistArray() = default; /// Copy constructor @@ -298,6 +298,19 @@ class DistArray : public madness::archive::ParallelSerializableObject { const std::shared_ptr& pmap = {}) : pimpl_(init(world, trange, shape_type(1, trange), pmap)) {} + /// Dense array constructor + + /// Constructs an array with the given meta data in default World. + /// This constructor only initializes the array meta data; + /// the array tiles are empty and must be assigned by the user. + /// \param trange The tiled range object that will be used to set the array + /// tiling. 
diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h
index 09e99eda86..fd0450ed8e 100644
--- a/src/TiledArray/dist_array.h
+++ b/src/TiledArray/dist_array.h
@@ -158,7 +158,7 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   using rebind_numeric_t = typename rebind_numeric::type;

 private:
-  pimpl_type pimpl_;  ///< managed ptr to Array implementation
+  pimpl_type pimpl_ = {};  ///< managed ptr to Array implementation
   bool defer_deleter_to_next_fence_ =
       false;  ///< if true, the impl object is scheduled to be destroyed in the
               ///< next fence
@@ -277,7 +277,7 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   /// array is uninitialized, but these arrays may be assigned via a tensor
   /// expression assignment or the copy construction.

-  DistArray() : pimpl_() {}
+  DistArray() = default;

   /// Copy constructor
@@ -298,6 +298,19 @@ class DistArray : public madness::archive::ParallelSerializableObject {
             const std::shared_ptr<pmap_interface>& pmap = {})
       : pimpl_(init(world, trange, shape_type(1, trange), pmap)) {}

+  /// Dense array constructor
+
+  /// Constructs an array with the given meta data in default World.
+  /// This constructor only initializes the array meta data;
+  /// the array tiles are empty and must be assigned by the user.
+  /// \param trange The tiled range object that will be used to set the array
+  /// tiling.
+  /// \param pmap The tile index -> process map
+  explicit DistArray(const trange_type& trange,
+                     const std::shared_ptr<pmap_interface>& pmap = {})
+      : pimpl_(init(get_default_world(), trange, shape_type(1, trange), pmap)) {
+  }
+
   /// Sparse array constructor

   /// Constructs an array with the given meta data. This constructor only
@@ -312,6 +325,19 @@ class DistArray : public madness::archive::ParallelSerializableObject {
                 std::shared_ptr<pmap_interface>())
       : pimpl_(init(world, trange, shape, pmap)) {}

+  /// Sparse array constructor
+
+  /// Constructs an array with the given meta data in default World.
+  /// This constructor only initializes the array meta data; the array tiles
+  /// are empty and must be assigned by the user.
+  /// \param trange The tiled range object that will be used to set the array
+  /// tiling.
+  /// \param shape The array shape that defines zero and non-zero tiles
+  /// \param pmap The tile index -> process map
+  DistArray(const trange_type& trange, const shape_type& shape,
+            const std::shared_ptr<pmap_interface>& pmap =
+                std::shared_ptr<pmap_interface>())
+      : pimpl_(init(get_default_world(), trange, shape, pmap)) {}
+
   /// \name Initializer list constructors
   /// \brief Creates a new tensor containing the elements in the provided
   /// `std::initializer_list`.
@@ -374,6 +400,41 @@ class DistArray : public madness::archive::ParallelSerializableObject {
           std::initializer_list<std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<T>>>>>> il)
       : DistArray(array_from_il<DistArray_>(world, il)) {}
+
+  template <typename T>
+  explicit DistArray(std::initializer_list<T> il)  // N.B. clang does not like
+                                                   // detail::vector_il<T> here
+      : DistArray(array_from_il<DistArray_>(get_default_world(), il)) {}
+
+  template <typename T>
+  explicit DistArray(std::initializer_list<std::initializer_list<T>> il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), il)) {}
+
+  template <typename T>
+  explicit DistArray(
+      std::initializer_list<std::initializer_list<std::initializer_list<T>>>
+          il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), il)) {}
+
+  template <typename T>
+  explicit DistArray(
+      std::initializer_list<std::initializer_list<
+          std::initializer_list<std::initializer_list<T>>>>
+          il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), il)) {}
+
+  template <typename T>
+  explicit DistArray(
+      std::initializer_list<std::initializer_list<std::initializer_list<
+          std::initializer_list<std::initializer_list<T>>>>>
+          il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), il)) {}
+
+  template <typename T>
+  explicit DistArray(
+      std::initializer_list<std::initializer_list<std::initializer_list<
+          std::initializer_list<std::initializer_list<std::initializer_list<T>>>>>>
+          il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), il)) {}
   ///@}

   /// \name Tiling initializer list constructors
@@ -440,6 +501,44 @@ class DistArray : public madness::archive::ParallelSerializableObject {
           std::initializer_list<std::initializer_list<std::initializer_list<
               std::initializer_list<std::initializer_list<T>>>>>> il)
       : DistArray(array_from_il<DistArray_>(world, trange, il)) {}
+
+  template <typename T>
+  DistArray(const trange_type& trange, std::initializer_list<T> il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), trange, il)) {}
+
+  template <typename T>
+  DistArray(const trange_type& trange,
+            std::initializer_list<std::initializer_list<T>> il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), trange, il)) {}
+
+  template <typename T>
+  DistArray(
+      const trange_type& trange,
+      std::initializer_list<std::initializer_list<std::initializer_list<T>>>
+          il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), trange, il)) {}
+
+  template <typename T>
+  DistArray(const trange_type& trange,
+            std::initializer_list<std::initializer_list<
+                std::initializer_list<std::initializer_list<T>>>>
+                il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), trange, il)) {}
+
+  template <typename T>
+  DistArray(const trange_type& trange,
+            std::initializer_list<std::initializer_list<std::initializer_list<
+                std::initializer_list<std::initializer_list<T>>>>>
+                il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), trange, il)) {}
+
+  template <typename T>
+  DistArray(
+      const trange_type& trange,
+      std::initializer_list<std::initializer_list<std::initializer_list<
+          std::initializer_list<std::initializer_list<std::initializer_list<T>>>>>>
+          il)
+      : DistArray(array_from_il<DistArray_>(get_default_world(), trange, il)) {}
   /// @}

   /// converting copy constructor
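Taken together, the new default-`World` constructors let arrays be created without threading a `World&` through every call site. A hedged usage sketch (the tiling and fill values are illustrative, not from the patch):

```cpp
#include <tiledarray.h>

int main(int argc, char* argv[]) {
  TA::World& world = TA::initialize(argc, argv);  // becomes the default World

  TA::TiledRange trange{{0, 2, 4}, {0, 2, 4}};  // 2x2 tiles over a 4x4 grid

  TA::TArrayD a(trange);  // dense array, constructed in the default World
  a.fill(1.0);

  // nested-initializer-list + tiling constructor, also in the default World
  TA::TArrayD b(trange, {{1.0, 2.0, 3.0, 4.0},
                         {2.0, 3.0, 4.0, 5.0},
                         {3.0, 4.0, 5.0, 6.0},
                         {4.0, 5.0, 6.0, 7.0}});

  world.gop.fence();
  TA::finalize();
  return 0;
}
```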
diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h
index 7dbd115d4d..11971c269e 100644
--- a/src/TiledArray/external/btas.h
+++ b/src/TiledArray/external/btas.h
@@ -109,6 +109,34 @@ inline bool is_congruent(const btas::RangeNd<Order, Args...>& r1,
                     r2.extent_data());
 }

+/// Test if a BTAS range and a TA range are congruent
+
+/// This function tests that the rank and extent of
+/// \c r1 are equal to those of \c r2.
+/// \param r1 The first Range to compare
+/// \param r2 The second Range to compare
+template <CBLAS_ORDER Order, typename... Args>
+inline bool is_congruent(const btas::RangeNd<Order, Args...>& r1,
+                         const TiledArray::Range& r2) {
+  return (r1.rank() == r2.rank()) &&
+         std::equal(r1.extent_data(), r1.extent_data() + r1.rank(),
+                    r2.extent_data());
+}
+
+/// Test if a TA range and a BTAS range are congruent
+
+/// This function tests that the rank and extent of
+/// \c r1 are equal to those of \c r2.
+/// \param r1 The first Range to compare
+/// \param r2 The second Range to compare
+template <CBLAS_ORDER Order, typename... Args>
+inline bool is_congruent(const TiledArray::Range& r1,
+                         const btas::RangeNd<Order, Args...>& r2) {
+  return (r1.rank() == r2.rank()) &&
+         std::equal(r1.extent_data(), r1.extent_data() + r1.rank(),
+                    r2.extent_data());
+}
+
 template <typename T, typename... Args>
 decltype(auto) make_ti(const btas::Tensor<T, Args...>& arg) {
   return TiledArray::detail::TensorInterface
 #include
 #include
diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h
index 644039abe7..ac42f3bf1c 100644
--- a/src/TiledArray/external/umpire.h
+++ b/src/TiledArray/external/umpire.h
@@ -91,14 +91,16 @@ class umpire_allocator_impl {
       : umpalloc_(umpalloc) {}

   template <typename U>
-  umpire_allocator_impl(const umpire_allocator_impl<U>& rhs) noexcept
+  umpire_allocator_impl(
+      const umpire_allocator_impl<U>& rhs) noexcept
       : umpalloc_(rhs.umpalloc_) {}

   /// allocates memory using umpire dynamic pool
   pointer allocate(size_t n) {
     TA_ASSERT(umpalloc_);

-    size_t nbytes = n * sizeof(T);
+    // QuickPool::allocate_internal does not handle zero-size allocations
+    size_t nbytes = n == 0 ? 1 : n * sizeof(T);
     pointer result = nullptr;

     auto* allocation_strategy = umpalloc_->getAllocationStrategy();
@@ -117,7 +119,8 @@
   void deallocate(pointer ptr, size_t n) {
     TA_ASSERT(umpalloc_);

-    const auto nbytes = n * sizeof(T);
+    // QuickPool::allocate_internal does not handle zero-size allocations
+    const auto nbytes = n == 0 ? 1 : n * sizeof(T);
     auto* allocation_strategy = umpalloc_->getAllocationStrategy();
    // N.B. with multiple threads would have to do this test in
@@ -137,15 +140,15 @@ class umpire_allocator_impl {
   umpire::Allocator* umpalloc_;
 };  // class umpire_allocator

-template <class T>
-bool operator==(const umpire_allocator_impl<T>& lhs,
-                const umpire_allocator_impl<T>& rhs) noexcept {
+template <class T1, class T2>
+bool operator==(const umpire_allocator_impl<T1>& lhs,
+                const umpire_allocator_impl<T2>& rhs) noexcept {
   return lhs.umpire_allocator() == rhs.umpire_allocator();
 }

-template <class T>
-bool operator!=(const umpire_allocator_impl<T>& lhs,
-                const umpire_allocator_impl<T>& rhs) noexcept {
+template <class T1, class T2>
+bool operator!=(const umpire_allocator_impl<T1>& lhs,
+                const umpire_allocator_impl<T2>& rhs) noexcept {
   return !(lhs == rhs);
 }
diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h
index 87af0e6115..f09a98c0e5 100644
--- a/src/TiledArray/fwd.h
+++ b/src/TiledArray/fwd.h
@@ -117,6 +117,11 @@ namespace symmetry {
 class Permutation;
 }

+// shapes
+class DenseShape;
+template <typename T>
+class SparseShape;
+
 // TiledArray Arrays
 template <typename Tile, typename Policy>
 class DistArray;
diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h
index 9e221c42d7..dbb8f53b55 100644
--- a/src/TiledArray/host/allocator.h
+++ b/src/TiledArray/host/allocator.h
@@ -53,7 +53,9 @@ class host_allocator_impl
   template <class U>
   host_allocator_impl(const host_allocator_impl<U>& rhs) noexcept
-      : base_type(static_cast<const umpire_allocator_impl<U>&>(rhs)) {}
+      : base_type(static_cast<
+                  const umpire_allocator_impl<std::decay_t<U>>&>(
+            rhs)) {}

   template <class U>
   friend bool operator==(const host_allocator_impl<U>& lhs,
diff --git a/src/TiledArray/math/linalg/non-distributed/cholesky.h b/src/TiledArray/math/linalg/non-distributed/cholesky.h
index 4196002533..fc96a6bf1c 100644
--- a/src/TiledArray/math/linalg/non-distributed/cholesky.h
+++ b/src/TiledArray/math/linalg/non-distributed/cholesky.h
@@ -42,9 +42,7 @@ auto rank_local_cholesky(const DistArray& A) {
   World& world = A.world();
   auto A_eig = detail::make_matrix(A);
-  if (world.rank() == 0) {
-    linalg::rank_local::cholesky(A_eig);
-  }
+  TA_LAPACK_ON_RANK_ZERO(cholesky, world, A_eig);
   world.gop.broadcast_serializable(A_eig, 0);
   return A_eig;
 }
@@ -140,11 +138,20 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = TiledRange()) {

   // if need to return L use its copy to compute inverse
   decltype(L_eig) L_inv_eig;
+  std::optional<lapack::Error> error_opt;
   if (world.rank() == 0) {
-    if (Both) L_inv_eig = L_eig;
-    auto& L_inv_eig_ref = Both ? L_inv_eig : L_eig;
-    linalg::rank_local::cholesky_linv(L_inv_eig_ref);
-    detail::zero_out_upper_triangle(L_inv_eig_ref);
+    try {
+      if (Both) L_inv_eig = L_eig;
+      auto& L_inv_eig_ref = Both ? L_inv_eig : L_eig;
+      linalg::rank_local::cholesky_linv(L_inv_eig_ref);
+      detail::zero_out_upper_triangle(L_inv_eig_ref);
+    } catch (lapack::Error& err) {
+      error_opt = err;
+    }
+  }
+  world.gop.broadcast_serializable(error_opt, 0);
+  if (error_opt) {
+    throw error_opt.value();
   }
   world.gop.broadcast_serializable(Both ? L_inv_eig : L_eig, 0);
@@ -169,9 +176,7 @@ auto cholesky_solve(const Array& A, const Array& B,
   auto A_eig = detail::make_matrix(A);
   auto X_eig = detail::make_matrix(B);
   World& world = A.world();
-  if (world.rank() == 0) {
-    linalg::rank_local::cholesky_solve(A_eig, X_eig);
-  }
+  TA_LAPACK_ON_RANK_ZERO(cholesky_solve, world, A_eig, X_eig);
   world.gop.broadcast_serializable(X_eig, 0);
   if (x_trange.rank() == 0) x_trange = B.trange();
   return eigen_to_array<Array>(world, x_trange, X_eig);
@@ -192,9 +197,7 @@ auto cholesky_lsolve(Op transpose, const Array& A, const Array& B,
                 "scalar types");

   auto X_eig = detail::make_matrix(B);
-  if (world.rank() == 0) {
-    linalg::rank_local::cholesky_lsolve(transpose, L_eig, X_eig);
-  }
+  TA_LAPACK_ON_RANK_ZERO(cholesky_lsolve, world, transpose, L_eig, X_eig);
   world.gop.broadcast_serializable(X_eig, 0);
   if (l_trange.rank() == 0) l_trange = A.trange();
   if (x_trange.rank() == 0) x_trange = B.trange();
diff --git a/src/TiledArray/math/linalg/non-distributed/heig.h b/src/TiledArray/math/linalg/non-distributed/heig.h
index 5490b6b757..85079f356c 100644
--- a/src/TiledArray/math/linalg/non-distributed/heig.h
+++ b/src/TiledArray/math/linalg/non-distributed/heig.h
@@ -56,9 +56,7 @@ auto heig(const Array& A, TiledRange evec_trange = TiledRange()) {
   World& world = A.world();
   auto A_eig = detail::make_matrix(A);
   std::vector evals;
-  if (world.rank() == 0) {
-    linalg::rank_local::heig(A_eig, evals);
-  }
+  TA_LAPACK_ON_RANK_ZERO(heig, world, A_eig, evals);
   world.gop.broadcast_serializable(A_eig, 0);
   world.gop.broadcast_serializable(evals, 0);
   if (evec_trange.rank() == 0) evec_trange = A.trange();
@@ -99,9 +97,7 @@ auto heig(const ArrayA& A, const ArrayB& B,
   auto A_eig = detail::make_matrix(A);
   auto B_eig = detail::make_matrix(B);
   std::vector evals;
-  if (world.rank() == 0) {
-    linalg::rank_local::heig(A_eig, B_eig, evals);
-  }
+  TA_LAPACK_ON_RANK_ZERO(heig, world, A_eig, B_eig, evals);
   world.gop.broadcast_serializable(A_eig, 0);
   world.gop.broadcast_serializable(evals, 0);
   if (evec_trange.rank() == 0) evec_trange = A.trange();
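From the caller's perspective the macro-based error propagation is transparent. Assuming `heig` still returns a tuple of eigenvalues and the eigenvector array (as the broadcast/return structure above suggests), a hedged usage sketch:

```cpp
// A is a symmetric TA array; this line executes on every rank.
// LAPACK runs on rank 0 only; with this patch, a lapack::Error thrown
// there is broadcast and rethrown on all ranks instead of deadlocking
// the ranks that never entered the rank-0 branch.
auto [evals, evecs] = TiledArray::math::linalg::non_distributed::heig(A);
```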
diff --git a/src/TiledArray/math/linalg/non-distributed/lu.h b/src/TiledArray/math/linalg/non-distributed/lu.h
index d1b06bbb1c..6a3e1ea424 100644
--- a/src/TiledArray/math/linalg/non-distributed/lu.h
+++ b/src/TiledArray/math/linalg/non-distributed/lu.h
@@ -27,9 +27,9 @@

 #include

-#include
-#include
 #include
+#include
+#include

 namespace TiledArray::math::linalg::non_distributed {

@@ -37,15 +37,14 @@
 /**
  * @brief Solve a linear system via LU factorization
  */
 template <typename ArrayA, typename ArrayB>
-auto lu_solve(const ArrayA& A, const ArrayB& B, TiledRange x_trange = TiledRange()) {
+auto lu_solve(const ArrayA& A, const ArrayB& B,
+              TiledRange x_trange = TiledRange()) {
   (void)detail::array_traits<ArrayA>{};
   (void)detail::array_traits<ArrayB>{};
   auto& world = A.world();
   auto A_eig = detail::make_matrix(A);
   auto B_eig = detail::make_matrix(B);
-  if (world.rank() == 0) {
-    linalg::rank_local::lu_solve(A_eig, B_eig);
-  }
+  TA_LAPACK_ON_RANK_ZERO(lu_solve, world, A_eig, B_eig);
   world.gop.broadcast_serializable(B_eig, 0);
   if (x_trange.rank() == 0) x_trange = B.trange();
   return eigen_to_array<ArrayB>(world, x_trange, B_eig);
 }
@@ -59,14 +58,12 @@ auto lu_inv(const Array& A, TiledRange ainv_trange = TiledRange()) {
   (void)detail::array_traits<Array>{};
   auto& world = A.world();
   auto A_eig = detail::make_matrix(A);
-  if (world.rank() == 0) {
-    linalg::rank_local::lu_inv(A_eig);
-  }
+  TA_LAPACK_ON_RANK_ZERO(lu_inv, world, A_eig);
   world.gop.broadcast_serializable(A_eig, 0);
   if (ainv_trange.rank() == 0) ainv_trange = A.trange();
   return eigen_to_array<Array>(A.world(), ainv_trange, A_eig);
 }

-}  // namespace TiledArray::math::linalg::lapack
+}  // namespace TiledArray::math::linalg::non_distributed

 #endif  // TILEDARRAY_MATH_LINALG_NON_DISTRIBUTED_LU_H__INCLUDED
diff --git a/src/TiledArray/math/linalg/non-distributed/qr.h b/src/TiledArray/math/linalg/non-distributed/qr.h
index e43cec632d..b66ee222ea 100644
--- a/src/TiledArray/math/linalg/non-distributed/qr.h
+++ b/src/TiledArray/math/linalg/non-distributed/qr.h
@@ -3,35 +3,32 @@

 #include

-#include
-#include
 #include
+#include
+#include

 namespace TiledArray::math::linalg::non_distributed {

 template <bool QOnly, typename ArrayV>
-auto householder_qr( const ArrayV& V, TiledRange q_trange = TiledRange(),
-                     TiledRange r_trange = TiledRange() ) {
-
+auto householder_qr(const ArrayV& V, TiledRange q_trange = TiledRange(),
+                    TiledRange r_trange = TiledRange()) {
   (void)detail::array_traits<ArrayV>{};
   auto& world = V.world();
   auto V_eig = detail::make_matrix(V);
   decltype(V_eig) R_eig;

-  if( !world.rank() ) {
-    linalg::rank_local::householder_qr( V_eig, R_eig );
-  }
-  world.gop.broadcast_serializable( V_eig, 0 );
-  if(q_trange.rank() == 0) q_trange = V.trange();
-  auto Q = eigen_to_array<ArrayV>( world, q_trange, V_eig );
+  TA_LAPACK_ON_RANK_ZERO(householder_qr, world, V_eig, R_eig);
+  world.gop.broadcast_serializable(V_eig, 0);
+  if (q_trange.rank() == 0) q_trange = V.trange();
+  auto Q = eigen_to_array<ArrayV>(world, q_trange, V_eig);
   if constexpr (not QOnly) {
-    world.gop.broadcast_serializable( R_eig, 0 );
+    world.gop.broadcast_serializable(R_eig, 0);
     if (r_trange.rank() == 0) {
       // Generate a TRange based on column tiling of V
       auto col_tiling = V.trange().dim(1);
-      r_trange = TiledRange( {col_tiling, col_tiling} );
+      r_trange = TiledRange({col_tiling, col_tiling});
     }
-    auto R = eigen_to_array<ArrayV>( world, r_trange, R_eig );
-    return std::make_tuple( Q, R );
+    auto R = eigen_to_array<ArrayV>(world, r_trange, R_eig);
+    return std::make_tuple(Q, R);
   } else {
     return Q;
   }
diff --git a/src/TiledArray/math/linalg/non-distributed/svd.h b/src/TiledArray/math/linalg/non-distributed/svd.h
index e6ea5ef1da..e0094ef906 100644
--- a/src/TiledArray/math/linalg/non-distributed/svd.h
+++ b/src/TiledArray/math/linalg/non-distributed/svd.h
@@ -75,9 +75,7 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(),

   if constexpr (need_u) U = std::make_unique<matrix_type>();
   if constexpr (need_vt) VT = std::make_unique<matrix_type>();

-  if (world.rank() == 0) {
-    linalg::rank_local::svd(A_eig, S, U.get(), VT.get());
-  }
+  TA_LAPACK_ON_RANK_ZERO(svd, world, A_eig, S, U.get(), VT.get());

   world.gop.broadcast_serializable(S, 0);
   if (U) world.gop.broadcast_serializable(*U, 0);
diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp
index 74e1aac526..d23f3b4e3f 100644
--- a/src/TiledArray/math/linalg/rank-local.cpp
+++ b/src/TiledArray/math/linalg/rank-local.cpp
@@ -40,7 +40,7 @@ inline int ta_lapack_fortran_call(F f, Args... args) {
   return info;
 }

-#define TA_LAPACK_ERROR(F) throw std::runtime_error("lapack::" #F " failed")
+#define TA_LAPACK_ERROR(F) throw lapack::Error("lapack::" #F " failed")

 #define TA_LAPACK_FORTRAN_CALL(F, ARGS...) \
  ((ta_lapack_fortran_call(F, ARGS) == 0) || (TA_LAPACK_ERROR(F), 0))
diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h
index 5c46550bd3..f7db4abd01 100644
--- a/src/TiledArray/math/linalg/rank-local.h
+++ b/src/TiledArray/math/linalg/rank-local.h
@@ -71,4 +71,42 @@ void householder_qr(Matrix &V, Matrix &R);

 }  // namespace TiledArray::math::linalg::rank_local

+namespace madness::archive {
+
+/// Serialize (deserialize) a lapack::Error
+
+/// \tparam Archive The archive type.
+template <class Archive>
+struct ArchiveSerializeImpl<Archive, lapack::Error> {
+  static inline void serialize(const Archive &ar, lapack::Error &e) {
+    MAD_ARCHIVE_DEBUG(std::cout << "(de)serialize lapack::Error" << std::endl);
+    if constexpr (is_output_archive_v<Archive>) {  // serialize
+      const std::string msg = e.what();
+      ar &msg;
+    } else {
+      std::string msg;
+      ar &msg;
+      e = lapack::Error(msg);
+    }
+  }
+};
+
+}  // namespace madness::archive
+
+/// TA_LAPACK_ON_RANK_ZERO(fn,args...) invokes linalg::rank_local::fn(args...)
+/// on rank 0 and broadcasts/rethrows the exception, if any
+#define TA_LAPACK_ON_RANK_ZERO(fn, world, args...)  \
+  std::optional<lapack::Error> error_opt;           \
+  if (world.rank() == 0) {                          \
+    try {                                           \
+      linalg::rank_local::fn(args);                 \
+    } catch (lapack::Error & err) {                 \
+      error_opt = err;                              \
+    }                                               \
+  }                                                 \
+  world.gop.broadcast_serializable(error_opt, 0);   \
+  if (error_opt) {                                  \
+    throw error_opt.value();                        \
+  }
+
 #endif  // TILEDARRAY_MATH_LINALG_RANK_LOCAL_H__INCLUDED
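For reference, this is the boilerplate the macro collapses in each solver above; a hedged sketch of a wrapper in the style of `rank_local_cholesky` (the function name is ours):

```cpp
namespace TiledArray::math::linalg::non_distributed {

// hypothetical wrapper, mirroring rank_local_cholesky in this patch
template <typename Array>
auto cholesky_factor(const Array& A) {
  auto& world = A.world();
  auto A_eig = detail::make_matrix(A);
  // runs LAPACK on rank 0 only; a lapack::Error thrown there is serialized
  // (via the ArchiveSerializeImpl specialization above), broadcast, and
  // rethrown on every rank, so all ranks fail in lockstep
  TA_LAPACK_ON_RANK_ZERO(cholesky, world, A_eig);
  world.gop.broadcast_serializable(A_eig, 0);  // replicate the factor
  return A_eig;
}

}  // namespace TiledArray::math::linalg::non_distributed
```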
diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h
index b7a38d38b0..b4f2a0d48f 100644
--- a/src/TiledArray/range.h
+++ b/src/TiledArray/range.h
@@ -49,9 +49,12 @@ class Range {
   typedef Range Range_;                ///< This object type
   typedef TA_1INDEX_TYPE index1_type;  ///< 1-index type, to conform to
                                        ///< Tensor Working Group (TWG) spec
+  typedef std::make_signed_t<index1_type>
+      index1_difference_type;  ///< type representing difference of 1-indices
   typedef container::svector<index1_type>
-      index_type;  ///< Coordinate index type, to conform to
-                   ///< TWG spec
+      index_type;  ///< Coordinate index type, to conform to
+                   ///< TWG spec
+  typedef container::svector<index1_difference_type> index_difference_type;
   typedef index_type index;  ///< Coordinate index type (deprecated)
   typedef detail::SizeArray<const index1_type>
       index_view_type;  ///< Non-owning variant of index_type
diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h
index 7346f45d1c..bf51487922 100644
--- a/src/TiledArray/sparse_shape.h
+++ b/src/TiledArray/sparse_shape.h
@@ -26,6 +26,8 @@
 #ifndef TILEDARRAY_SPARSE_SHAPE_H__INCLUDED
 #define TILEDARRAY_SPARSE_SHAPE_H__INCLUDED

+#include
+
 #include
 #include
 #include
diff --git a/src/TiledArray/special/diagonal_array.h b/src/TiledArray/special/diagonal_array.h
index 825d66fd98..dd62db1498 100644
--- a/src/TiledArray/special/diagonal_array.h
+++ b/src/TiledArray/special/diagonal_array.h
@@ -267,7 +267,7 @@ Array diagonal_array(World &world, TiledRange const &trange, T val = 1) {
 /// \param[in] diagonals_begin the begin iterator of the range of the diagonals
 /// \param[in] diagonals_end the end iterator of the range of the diagonals;
 /// if not given, default initialized and thus will not be checked
-/// \return a constant diagonal DistArray
+/// \return a diagonal DistArray
 template <typename Array, typename RandomAccessIterator>
 std::enable_if_t<detail::is_iterator<RandomAccessIterator>::value, Array>
 diagonal_array(World &world, TiledRange const &trange,
diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h
index 81141f4982..2cd2d46fe3 100644
--- a/src/TiledArray/tensor/kernels.h
+++ b/src/TiledArray/tensor/kernels.h
@@ -28,6 +28,7 @@

 #include
 #include
+#include

 namespace TiledArray {

@@ -107,12 +108,14 @@ inline TR tensor_op(Op&& op, const Permutation& perm, const T1& tensor1,
 }

 /// provides transform functionality to class \p T, useful for nonintrusive
-/// extension of a tensor type \p T to be usable as element type \p T in \c
-/// Tensor \tparam T a tensor type \note The default implementation
+/// extension of a tensor type \p T to be usable as element type \p T in
+/// \c Tensor
+/// \tparam T a tensor type
+/// \note The default implementation
 /// constructs T, then computes it by coiterating over elements of the argument
 /// tensors and transforming with the transform \c Op .
-/// This should be specialized for classes like TiledArray::Tensor that
-/// already include the appropriate transform constructors already
+/// This should be specialized for classes like TiledArray::Tensor that
+/// already include the appropriate transform constructors
 template <typename T>
 struct transform {
   /// creates a result tensor in which element \c i is obtained by \c
@@ -283,29 +286,23 @@ inline void inplace_tensor_op(InputOp&& input_op, OutputOp&& output_op,
 /// \endcode
 /// The expected signature of the output
 /// operations is:
-/// \code void op(TR::value_type::value_type*, const
+/// \code
+/// void op(TR::value_type::value_type*, const
 /// TR::value_type::value_type)
 /// \endcode
-/// \tparam InputOp The input operation
-/// type
+/// \tparam InputOp The input operation type
 /// \tparam OutputOp The output operation type
-/// \tparam TR The result tensor
-/// type
+/// \tparam TR The result tensor type
 /// \tparam T1 The first argument tensor type
-/// \tparam Ts The remaining
-/// argument tensor types
+/// \tparam Ts The remaining argument tensor types
 /// \param[in] input_op The operation that is used to
 /// generate the output value from the input arguments
-/// \param[in] output_op The
-/// operation that is used to set the value of the result tensor given the
-/// element pointer and the result value
-/// \param[in] perm The permutation applied
-/// to the argument tensors
+/// \param[in] output_op The operation that is used to set the value
+/// of the result tensor given the element pointer and the result value
+/// \param[in] perm The permutation applied to the argument tensors
 /// \param[in,out] result The result tensor
-/// \param[in]
-/// tensor1 The first argument tensor
-/// \param[in] tensors The remaining argument
-/// tensors
+/// \param[in] tensor1 The first argument tensor
+/// \param[in] tensors The remaining argument tensors
 template <typename Op, typename TR, typename... Ts,
           typename std::enable_if<(is_tensor<TR, Ts...>::value ||
                                    is_tensor_of_tensor<TR, Ts...>::value) &&
                                   is_contiguous_tensor<TR, Ts...>::value>::type* = nullptr>
 inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) {
   TA_ASSERT(!empty(result, tensors...));
   TA_ASSERT(is_range_set_congruent(result, tensors...));

   const auto volume = result.range().volume();

-  const auto stride = inner_size(result, tensors...);
-  for (decltype(result.range().volume()) i = 0ul; i < volume; i += stride)
-    math::inplace_vector_op(std::forward<Op>(op), stride,
-                            result.data() + result.range().ordinal(i),
-                            (tensors.data() + tensors.range().ordinal(i))...);
+  if constexpr (detail::has_member_function_data_anyreturn_v<TR> &&
+                (detail::has_member_function_data_anyreturn_v<Ts> && ...)) {
+    const auto stride = inner_size(result, tensors...);
+    for (std::decay_t<decltype(volume)> i = 0ul; i < volume; i += stride)
+      math::inplace_vector_op(std::forward<Op>(op), stride,
+                              result.data() + result.range().ordinal(i),
+                              (tensors.data() + tensors.range().ordinal(i))...);
+  } else {  // if 1+ tensor lacks data() must iterate over individual elements
+    auto& result_rng = result.range();
+    using signed_idx_t = Range::index_difference_type;
+    auto result_lobound = signed_idx_t(result_rng.lobound());
+    for (auto&& idx : result_rng) {
+      using namespace container::operators;
+      std::forward<Op>(op)(
+          result[idx], (tensors[idx - result_lobound +
+                                signed_idx_t(tensors.range().lobound())])...);
+    }
+  }
 }

 /// In-place tensor of tensors operations with non-contiguous data

@@ -384,20 +394,33 @@ inline
 void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) {
   TA_ASSERT(!empty(result, tensors...));
   TA_ASSERT(is_range_set_congruent(result, tensors...));

-  const auto stride = inner_size(result, tensors...);
   const auto volume = result.range().volume();

-  auto inplace_tensor_range =
-      [&op, stride](
-          typename TR::pointer MADNESS_RESTRICT const result_data,
-          typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) {
-        for (decltype(result.range().volume()) i = 0ul; i < stride; ++i)
-          inplace_tensor_op(op, result_data[i], tensors_data[i]...);
-      };
-
-  for (decltype(result.range().volume()) ord = 0ul; ord < volume; ord += stride)
-    inplace_tensor_range(result.data() + result.range().ordinal(ord),
-                         (tensors.data() + tensors.range().ordinal(ord))...);
+  if constexpr (detail::has_member_function_data_anyreturn_v<TR> &&
+                (detail::has_member_function_data_anyreturn_v<Ts> && ...)) {
+    const auto stride = inner_size(result, tensors...);
+    auto inplace_tensor_range =
+        [&op, stride](
+            typename TR::pointer MADNESS_RESTRICT const result_data,
+            typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) {
+          for (decltype(result.range().volume()) i = 0ul; i < stride; ++i)
+            inplace_tensor_op(op, result_data[i], tensors_data[i]...);
+        };
+
+    for (std::decay_t<decltype(volume)> ord = 0ul; ord < volume; ord += stride)
+      inplace_tensor_range(result.data() + result.range().ordinal(ord),
+                           (tensors.data() + tensors.range().ordinal(ord))...);
+  } else {  // if 1+ tensor lacks data() must iterate over individual elements
+    auto& result_rng = result.range();
+    using signed_idx_t = Range::index_difference_type;
+    auto result_lobound = signed_idx_t(result_rng.lobound());
+    for (auto&& idx : result_rng) {
+      using namespace container::operators;
+      std::forward<Op>(op)(
+          result[idx], (tensors[idx - result_lobound +
+                                signed_idx_t(tensors.range().lobound())])...);
+    }
+  }
 }

 // -------------------------------------------------------------------------
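The recurring `if constexpr` dispatch above is worth isolating: contiguous kernels run over the raw buffer when every argument exposes `data()`, and fall back to index-based access (translated into each tensor's own, possibly non-zero, lower bound via the new signed `index_difference_type`) otherwise. A toy sketch of the pattern, assuming the `has_member_function_data_anyreturn_v` trait is reachable from the TA type-traits header (the function is ours):

```cpp
#include <tiledarray.h>

// fast path over the contiguous buffer when data() exists,
// generic indexed fallback otherwise -- the same shape as the kernels above
template <typename T>
auto sum_all(const T& t) {
  typename T::value_type s{};
  if constexpr (TiledArray::detail::has_member_function_data_anyreturn_v<T>) {
    const auto volume = t.range().volume();
    for (std::decay_t<decltype(volume)> i = 0ul; i < volume; ++i)
      s += t.data()[i];
  } else {
    for (auto&& idx : t.range()) s += t[idx];  // element-wise access
  }
  return s;
}
```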
-/// \tparam Op The element initialization -/// operation type +/// \tparam Op The element initialization operation type /// \tparam TR The result tensor type -/// \tparam T1 The first -/// argument tensor type +/// \tparam T1 The first argument tensor type /// \tparam Ts The argument tensor types -/// \param[in] op The -/// result tensor element initialization operation -/// \param[in] perm The -/// permutation that will be applied to tensor2 -/// \param[out] result The result -/// tensor +/// \param[in] op The result tensor element initialization operation +/// \param[in] perm The permutation that will be applied to tensor2 +/// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor -/// \param[in] tensors The -/// argument tensors +/// \param[in] tensors The argument tensors template < typename Op, typename TR, typename T1, typename... Ts, typename std::enable_if::value>::type* = nullptr> @@ -505,8 +522,7 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensor1[i], tensors[i]...) -/// \pre The memory of \c result has been -/// allocated but not initialized. +/// \pre The memory of \c result has been allocated but not initialized. /// \tparam Op The element initialization operation type /// \tparam Perm A permutation type /// \tparam TR The result tensor type @@ -546,18 +562,13 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensor1[i], tensors[i]...) -/// \pre The memory of \c tensor1 has been -/// allocated but not initialized. -/// \tparam Op The element initialization -/// operation type +/// \pre The memory of \c tensor1 has been allocated but not initialized. +/// \tparam Op The element initialization operation type /// \tparam T1 The result tensor type -/// \tparam Ts The argument -/// tensor types -/// \param[in] op The result tensor element initialization -/// operation +/// \tparam Ts The argument tensor types +/// \param[in] op The result tensor element initialization operation /// \param[out] result The result tensor -/// \param[in] tensor1 The first -/// argument tensor +/// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename T1, typename... 
     typename Op, typename TR, typename T1, typename... Ts,
@@ -569,7 +580,6 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
   TA_ASSERT(!empty(result, tensor1, tensors...));
   TA_ASSERT(is_range_set_congruent(result, tensor1, tensors...));

-  const auto stride = inner_size(tensor1, tensors...);
   const auto volume = tensor1.range().volume();

   auto wrapper_op = [&op](typename TR::pointer MADNESS_RESTRICT result_ptr,
@@ -578,11 +588,27 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
     new (result_ptr) typename T1::value_type(op(value1, values...));
   };

-  for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
-       ord += stride)
-    math::vector_ptr_op(wrapper_op, stride, result.data() + ord,
-                        (tensor1.data() + tensor1.range().ordinal(ord)),
-                        (tensors.data() + tensors.range().ordinal(ord))...);
+  if constexpr (detail::has_member_function_data_anyreturn_v<T1> &&
+                (detail::has_member_function_data_anyreturn_v<Ts> && ...)) {
+    const auto stride = inner_size(tensor1, tensors...);
+    for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
+         ord += stride)
+      math::vector_ptr_op(wrapper_op, stride, result.data() + ord,
+                          (tensor1.data() + tensor1.range().ordinal(ord)),
+                          (tensors.data() + tensors.range().ordinal(ord))...);
+  } else {  // if 1+ tensor lacks data() must iterate over individual elements
+    auto& result_rng = result.range();
+    using signed_idx_t = Range::index_difference_type;
+    auto result_lobound = signed_idx_t(result_rng.lobound());
+    for (auto&& idx : result_rng) {
+      using namespace container::operators;
+      const signed_idx_t relidx = idx - result_lobound;
+      wrapper_op(
+          &(result[idx]),
+          tensor1[relidx + signed_idx_t(tensor1.range().lobound())],
+          (tensors[relidx + signed_idx_t(tensors.range().lobound())])...);
+    }
+  }
 }

 /// Initialize tensor with one or more non-contiguous tensor arguments

@@ -591,13 +617,10 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
 /// This function initializes the \c i -th element of \c result with the result
 /// of \c op(tensor1[i],tensors[i]...)
 /// \pre The memory of \c tensor1 has been
 /// allocated but not initialized.
-/// \tparam Op The element initialization
-/// operation type
+/// \tparam Op The element initialization operation type
 /// \tparam T1 The result tensor type
-/// \tparam Ts The argument
-/// tensor types
-/// \param[in] op The result tensor element initialization
-/// operation
+/// \tparam Ts The argument tensor types
+/// \param[in] op The result tensor element initialization operation
 /// \param[out] result The result tensor
 /// \param[in] tensor1 The first
 /// argument tensor
@@ -612,24 +635,40 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
   TA_ASSERT(!empty(result, tensor1, tensors...));
   TA_ASSERT(is_range_set_congruent(result, tensor1, tensors...));

-  const auto stride = inner_size(tensor1, tensors...);
   const auto volume = tensor1.range().volume();

-  auto inplace_tensor_range =
-      [&op, stride](
-          typename TR::pointer MADNESS_RESTRICT const result_data,
-          typename T1::const_pointer MADNESS_RESTRICT const tensor1_data,
-          typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) {
-        for (decltype(result.range().volume()) i = 0ul; i < stride; ++i)
-          new (result_data + i)
-              typename TR::value_type(tensor_op<typename TR::value_type>(
-                  op, tensor1_data[i], tensors_data[i]...));
-      };
-
-  for (decltype(volume) ord = 0ul; ord < volume; ord += stride)
-    inplace_tensor_range(result.data() + ord,
-                         (tensor1.data() + tensor1.range().ordinal(ord)),
-                         (tensors.data() + tensors.range().ordinal(ord))...);
+  if constexpr (detail::has_member_function_data_anyreturn_v<T1> &&
+                (detail::has_member_function_data_anyreturn_v<Ts> && ...)) {
+    const auto stride = inner_size(tensor1, tensors...);
+    auto inplace_tensor_range =
+        [&op, stride](
+            typename TR::pointer MADNESS_RESTRICT const result_data,
+            typename T1::const_pointer MADNESS_RESTRICT const tensor1_data,
+            typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) {
+          for (std::decay_t<decltype(stride)> i = 0ul; i < stride; ++i)
+            new (result_data + i)
+                typename TR::value_type(tensor_op<typename TR::value_type>(
+                    op, tensor1_data[i], tensors_data[i]...));
+        };
+
+    for (std::decay_t<decltype(volume)> ord = 0ul; ord < volume; ord += stride)
+      inplace_tensor_range(result.data() + ord,
+                           (tensor1.data() + tensor1.range().ordinal(ord)),
+                           (tensors.data() + tensors.range().ordinal(ord))...);
+  } else {
+    auto& result_rng = result.range();
+    using signed_idx_t = Range::index_difference_type;
+    auto result_lobound = signed_idx_t(result_rng.lobound());
+    for (auto&& idx : result_rng) {
+      using namespace container::operators;
+      const signed_idx_t relidx = idx - result_lobound;
+
+      new (&(result[idx]))
+          typename TR::value_type(tensor_op<typename TR::value_type>(
+              op, tensor1[relidx + signed_idx_t(tensor1.range().lobound())],
+              (tensors[relidx + signed_idx_t(tensors.range().lobound())])...));
+    }
+  }
 }

 // -------------------------------------------------------------------------

@@ -639,12 +678,11 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
 /// Perform an element-wise reduction of the tensors by
 /// executing join_op(result, reduce_op(result, &tensor1[i],
-/// &tensors[i]...)) for each \c i in the index range of \c tensor1 . \c
-/// result is initialized to \c identity . If HAVE_INTEL_TBB is defined, the
-/// reduction will be executed in an undefined order, otherwise will execute in
-/// the order of increasing \c i .
-/// \tparam ReduceOp The element-wise reduction
-/// operation type
+/// &tensors[i]...)) for each \c i in the index range of \c tensor1 .
+/// \c result is initialized to \c identity . If `HAVE_INTEL_TBB` is defined,
+/// the reduction will be executed in an undefined order; otherwise it will
+/// execute in the order of increasing \c i .
+/// \tparam ReduceOp The element-wise reduction operation type
 /// \tparam JoinOp The result operation type
 /// \tparam Identity A type that can be used as an argument to ReduceOp
 /// \tparam T1 The first argument tensor type
@@ -708,10 +746,10 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,

 /// Perform reduction of the tensor-of-tensors' elements by
 /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for
-/// each \c i in the index range of \c tensor1 . \c result is initialized to \c
-/// identity . This will execute serially, in the order of increasing \c i (each
-/// element's reduction can however be executed in parallel, depending on the
-/// element type).
+/// each \c i in the index range of \c tensor1 . \c result is initialized to
+/// \c identity . This will execute serially, in the order of increasing
+/// \c i (each element's reduction can however be executed in parallel,
+/// depending on the element type).
 /// \tparam ReduceOp The tensor-wise reduction operation type
 /// \tparam JoinOp The result operation type
 /// \tparam Scalar A scalar type
@@ -751,10 +789,10 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,

 /// Perform an element-wise reduction of the tensors by
 /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for
-/// each \c i in the index range of \c tensor1 . \c result is initialized to \c
-/// identity . This will execute serially, in the order of increasing \c i (each
-/// element-wise reduction can however be executed in parallel, depending on the
-/// element type).
+/// each \c i in the index range of \c tensor1 . \c result is initialized to
+/// \c identity . This will execute serially, in the order of increasing
+/// \c i (each element-wise reduction can however be executed in parallel,
+/// depending on the element type).
 /// \tparam ReduceOp The element-wise reduction operation type
 /// \tparam JoinOp The result operation type
 /// \tparam Scalar A scalar type
@@ -777,17 +815,31 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
   TA_ASSERT(!empty(tensor1, tensors...));
   TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

-  const auto stride = inner_size(tensor1, tensors...);
   const auto volume = tensor1.range().volume();

   auto result = identity;
-  for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
-       ord += stride) {
-    auto temp = identity;
-    math::reduce_op(reduce_op, join_op, identity, stride, temp,
-                    tensor1.data() + tensor1.range().ordinal(ord),
-                    (tensors.data() + tensors.range().ordinal(ord))...);
-    join_op(result, temp);
+  if constexpr (detail::has_member_function_data_anyreturn_v<T1> &&
+                (detail::has_member_function_data_anyreturn_v<Ts> && ...)) {
+    const auto stride = inner_size(tensor1, tensors...);
+    for (std::decay_t<decltype(volume)> ord = 0ul; ord < volume;
+         ord += stride) {
+      auto temp = identity;
+      math::reduce_op(reduce_op, join_op, identity, stride, temp,
+                      tensor1.data() + tensor1.range().ordinal(ord),
+                      (tensors.data() + tensors.range().ordinal(ord))...);
+      join_op(result, temp);
+    }
+  } else {  // if 1+ tensor lacks data() must iterate over individual elements
+    auto& t1_rng = tensor1.range();
+    using signed_idx_t = Range::index_difference_type;
+    auto t1_lobound = signed_idx_t(t1_rng.lobound());
+    for (auto&& idx : t1_rng) {
+      using namespace container::operators;
+      const signed_idx_t relidx = idx - t1_lobound;
+      reduce_op(result, tensor1[idx],
+                (tensors[relidx +
+                         signed_idx_t(tensors.range().lobound())])...);
+    }
   }

   return result;
@@ -797,10 +849,11 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,

 /// Perform an element-wise reduction of the tensors by
 /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for
-/// each \c i in the index range of \c tensor1 . \c result is initialized to \c
-/// identity . This will execute serially, in the order of increasing \c i (each
-/// element-wise reduction can however be executed in parallel, depending on the
-/// element type). \tparam ReduceOp The element-wise reduction operation type
+/// each \c i in the index range of \c tensor1 . \c result is initialized to
+/// \c identity . This will execute serially, in the order of increasing
+/// \c i (each element-wise reduction can however be executed in parallel,
+/// depending on the element type).
+/// \tparam ReduceOp The element-wise reduction operation type
 /// \tparam JoinOp The result operation type
 /// \tparam Scalar A scalar type
 /// \tparam T1 The first argument tensor type
@@ -822,31 +875,49 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
   TA_ASSERT(!empty(tensor1, tensors...));
   TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

-  const auto stride = inner_size(tensor1, tensors...);
   const auto volume = tensor1.range().volume();

-  auto tensor_reduce_range =
-      [&reduce_op, &join_op, &identity, stride](
-          Scalar& MADNESS_RESTRICT result,
-          typename T1::const_pointer MADNESS_RESTRICT const tensor1_data,
-          typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) {
-        for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) {
-          Scalar temp = tensor_reduce(reduce_op, join_op, identity,
-                                      tensor1_data[i], tensors_data[i]...);
-          join_op(result, temp);
-        }
-      };
-
   Scalar result = identity;
-  for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
-       ord += stride) {
-    Scalar temp = tensor_reduce_range(
-        result, tensor1.data() + tensor1.range().ordinal(ord),
-        (tensors.data() + tensors.range().ordinal(ord))...);
-    join_op(result, temp);
+
+  if constexpr (detail::has_member_function_data_anyreturn_v<T1> &&
+                (detail::has_member_function_data_anyreturn_v<Ts> && ...)) {
+    const auto stride = inner_size(tensor1, tensors...);
+    auto tensor_reduce_range =
+        [&reduce_op, &join_op, &identity, stride](
+            Scalar& MADNESS_RESTRICT result,
+            typename T1::const_pointer MADNESS_RESTRICT const tensor1_data,
+            typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) {
+          for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) {
+            Scalar temp = tensor_reduce(reduce_op, join_op, identity,
+                                        tensor1_data[i], tensors_data[i]...);
+            join_op(result, temp);
+          }
+        };
+
+    for (std::decay_t<decltype(volume)> ord = 0ul; ord < volume;
+         ord += stride) {
+      Scalar temp = tensor_reduce_range(
+          result, tensor1.data() + tensor1.range().ordinal(ord),
+          (tensors.data() + tensors.range().ordinal(ord))...);
+      join_op(result, temp);
+    }
+  } else {  // if 1+ tensor lacks data() must iterate over individual elements
+    auto& t1_rng = tensor1.range();
+    using signed_idx_t = Range::index_difference_type;
+    auto t1_lobound = signed_idx_t(t1_rng.lobound());
+    for (auto&& idx : t1_rng) {
+      using namespace container::operators;
+      const signed_idx_t relidx = idx - t1_lobound;
+
+      Scalar temp =
+          tensor_reduce(reduce_op, join_op, identity, tensor1[idx],
+                        (tensors[relidx +
+                                 signed_idx_t(tensors.range().lobound())])...);
+      join_op(result, temp);
+    }
   }

-  return identity;
+  return result;
 }

 }  // namespace detail
diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h
index f7c7a5f2ae..4be515d3b3 100644
--- a/src/TiledArray/tensor/operators.h
+++ b/src/TiledArray/tensor/operators.h
@@ -41,11 +41,13 @@ namespace TiledArray {
 /// \param right The right-hand tensor argument
 /// \return A tensor where element \c i is equal to left[i] + right[i]
 template <typename T1, typename T2,
-          typename std::enable_if<detail::is_tensor<T1, T2>::value ||
-                                  detail::is_tensor_of_tensor<T1, T2>::value>::type* = nullptr>
-inline auto operator+(const T1& left, const T2& right) {
-  return add(left, right);
+          typename = std::enable_if_t<
+              detail::is_tensor<detail::remove_cvr_t<T1>,
+                                detail::remove_cvr_t<T2>>::value ||
+              detail::is_tensor_of_tensor<detail::remove_cvr_t<T1>,
+                                          detail::remove_cvr_t<T2>>::value>>
+inline decltype(auto) operator+(T1&& left, T2&& right) {
+  return add(std::forward<T1>(left), std::forward<T2>(right));
 }

 /// Tensor minus operator
 /// \param left The left-hand tensor argument
 /// \param right The right-hand tensor argument
 /// \return A tensor where element \c i is equal to left[i] - right[i]
-template <typename T1, typename T2,
-          typename std::enable_if<detail::is_tensor<T1, T2>::value ||
-                                  detail::is_tensor_of_tensor<T1, T2>::value>::type* = nullptr>
-inline auto operator-(const T1& left, const T2& right) {
-  return subt(left, right);
+template <
+    typename T1, typename T2,
+    typename std::enable_if<
+        detail::is_tensor<detail::remove_cvr_t<T1>,
+                          detail::remove_cvr_t<T2>>::value ||
+        detail::is_tensor_of_tensor<detail::remove_cvr_t<T1>,
+                                    detail::remove_cvr_t<T2>>::value>::type* =
+        nullptr>
+inline decltype(auto) operator-(T1&& left, T2&& right) {
+  return subt(std::forward<T1>(left), std::forward<T2>(right));
 }

 /// Tensor multiplication operator

 /// \param left The left-hand tensor argument
 /// \param right The right-hand tensor argument
 /// \return A tensor where element \c i is equal to left[i] * right[i]
-template <typename T1, typename T2,
-          typename std::enable_if<detail::is_tensor<T1, T2>::value ||
-                                  detail::is_tensor_of_tensor<T1, T2>::value>::type* = nullptr>
-inline auto operator*(const T1& left, const T2& right) {
-  return mult(left, right);
+template <
+    typename T1, typename T2,
+    typename std::enable_if<
+        detail::is_tensor<detail::remove_cvr_t<T1>,
+                          detail::remove_cvr_t<T2>>::value ||
+        detail::is_tensor_of_tensor<detail::remove_cvr_t<T1>,
+                                    detail::remove_cvr_t<T2>>::value>::type* =
+        nullptr>
+inline decltype(auto) operator*(T1&& left, T2&& right) {
+  return mult(std::forward<T1>(left), std::forward<T2>(right));
 }

 /// Create a copy of \c left that is scaled by \c right

 /// \param left The left-hand tensor argument
 /// \param right The right-hand scalar argument
 /// \return A tensor where element \c i is equal to left[i] * right
 template <typename T, typename N,
-          typename std::enable_if<(detail::is_tensor<T>::value ||
-                                   detail::is_tensor_of_tensor<T>::value) &&
-                                  detail::is_numeric_v<N>>::type* = nullptr>
-inline auto operator*(const T& left, N right) {
-  return scale(left, right);
+          typename std::enable_if<
+              (detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+               detail::is_tensor_of_tensor<detail::remove_cvr_t<T>>::value) &&
+              detail::is_numeric_v<N>>::type* = nullptr>
+inline decltype(auto) operator*(T&& left, N right) {
+  return scale(std::forward<T>(left), right);
 }

 /// Create a copy of \c right that is scaled by \c left

 /// \param left The left-hand scalar argument
 /// \param right The right-hand tensor argument
 /// \return A tensor where element \c i is equal to left * right[i]
-template <typename N, typename T,
-          typename std::enable_if<
-              detail::is_numeric_v<N> &&
-              (detail::is_tensor<T>::value ||
-               detail::is_tensor_of_tensor<T>::value)>::type* = nullptr>
-inline auto operator*(N left, const T& right) {
-  return scale(right, left);
+template <
+    typename N, typename T,
+    typename std::enable_if<
+        detail::is_numeric_v<N> &&
+        (detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+         detail::is_tensor_of_tensor<detail::remove_cvr_t<T>>::value)>::type* =
+        nullptr>
+inline decltype(auto) operator*(N left, T&& right) {
+  return scale(std::forward<T>(right), left);
 }

 /// Create a negated copy of \c arg

 /// \tparam T The element type of \c arg
 /// \param arg The argument tensor
 /// \return A tensor where element \c i is equal to \c -arg[i]
-template <typename T,
-          typename std::enable_if<detail::is_tensor<T>::value ||
-                                  detail::is_tensor_of_tensor<
-                                      T>::value>::type* = nullptr>
-inline auto operator-(const T& arg) -> decltype(arg.neg()) {
-  return neg(arg);
+template <typename T,
+          typename std::enable_if<
+              detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+              detail::is_tensor_of_tensor<
+                  detail::remove_cvr_t<T>>::value>::type* = nullptr>
+inline decltype(auto) operator-(T&& arg) {
+  return neg(std::forward<T>(arg));
 }

 /// Create a permuted copy of \c arg
 /// \tparam T The argument tensor type
 /// \param perm The permutation to be applied to \c arg
 /// \param arg The argument tensor to be permuted
-template <typename T,
-          typename std::enable_if<detail::is_tensor<T>::value ||
-                                  detail::is_tensor_of_tensor<
-                                      T>::value>::type* = nullptr>
-inline auto operator*(const Permutation& perm, const T& arg) {
-  return permute(arg, perm);
+template <typename T,
+          typename std::enable_if<
+              detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+              detail::is_tensor_of_tensor<
+                  detail::remove_cvr_t<T>>::value>::type* = nullptr>
+inline decltype(auto) operator*(const Permutation& perm, T&& arg) {
+  return permute(std::forward<T>(arg), perm);
 }

 /// Tensor plus operator

 /// \return A tensor where element \c i is equal to left[i] + right[i]
 template <typename T1, typename T2,
           typename std::enable_if<
-              detail::is_tensor<T1, T2>::value ||
-              detail::is_tensor_of_tensor<T1, T2>::value>::type* = nullptr>
-inline auto operator+=(T1& left, const T2& right) {
-  return add_to(left, right);
+              detail::is_tensor<detail::remove_cvr_t<T1>, T2>::value ||
+              detail::is_tensor_of_tensor<detail::remove_cvr_t<T1>,
+                                          T2>::value>::type* = nullptr>
+inline decltype(auto) operator+=(T1&& left, const T2& right) {
+  return add_to(std::forward<T1>(left), right);
 }

 /// Tensor minus operator

 /// \return A reference to \c left
 template <typename T1, typename T2,
           typename std::enable_if<
-              detail::is_tensor<T1, T2>::value ||
-              detail::is_tensor_of_tensor<T1, T2>::value>::type* = nullptr>
-inline auto operator-=(T1& left, const T2& right) {
-  return sub_to(left, right);
+              detail::is_tensor<detail::remove_cvr_t<T1>, T2>::value ||
+              detail::is_tensor_of_tensor<detail::remove_cvr_t<T1>,
+                                          T2>::value>::type* = nullptr>
+inline decltype(auto) operator-=(T1&& left, const T2& right) {
+  return subt_to(std::forward<T1>(left), right);
 }

 /// In place tensor multiplication

 /// \return A reference to \c left
 template <typename T1, typename T2,
           typename std::enable_if<
-              detail::is_tensor<T1, T2>::value ||
-              detail::is_tensor_of_tensor<T1, T2>::value>::type* = nullptr>
-inline auto operator*=(T1& left, const T2& right) {
-  return mult_to(left, right);
+              detail::is_tensor<detail::remove_cvr_t<T1>, T2>::value ||
+              detail::is_tensor_of_tensor<detail::remove_cvr_t<T1>,
+                                          T2>::value>::type* = nullptr>
+inline decltype(auto) operator*=(T1&& left, const T2& right) {
+  return mult_to(std::forward<T1>(left), right);
 }

 /// In place tensor add constant

 /// \param right The right-hand scalar argument
 /// \return A reference to \c left
 template <typename T, typename N,
-          typename std::enable_if<(detail::is_tensor<T>::value ||
-                                   detail::is_tensor_of_tensor<T>::value) &&
-                                  detail::is_numeric_v<N>>::type* = nullptr>
-inline auto operator+=(T& left, N right) {
-  return add_to(left, right);
+          typename std::enable_if<
+              (detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+               detail::is_tensor_of_tensor<detail::remove_cvr_t<T>>::value) &&
+              detail::is_numeric_v<N>>::type* = nullptr>
+inline decltype(auto) operator+=(T&& left, N right) {
+  return add_to(std::forward<T>(left), right);
 }

 /// In place tensor subtract constant

 /// \param right The right-hand scalar argument
 /// \return A reference to \c left
 template <typename T, typename N,
-          typename std::enable_if<(detail::is_tensor<T>::value ||
-                                   detail::is_tensor_of_tensor<T>::value) &&
-                                  detail::is_numeric_v<N>>::type* = nullptr>
-inline auto operator-=(T& left, N right) {
-  return subt_to(left, right);
+          typename std::enable_if<
+              (detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+               detail::is_tensor_of_tensor<detail::remove_cvr_t<T>>::value) &&
+              detail::is_numeric_v<N>>::type* = nullptr>
+inline decltype(auto) operator-=(T&& left, N right) {
+  return subt_to(std::forward<T>(left), right);
 }

 /// In place tensor scale

 /// \param right The right-hand scalar argument
 /// \return A reference to \c left
 template <typename T, typename N,
-          typename std::enable_if<(detail::is_tensor<T>::value ||
-                                   detail::is_tensor_of_tensor<T>::value) &&
-                                  detail::is_numeric_v<N>>::type* = nullptr>
-inline auto operator*=(T& left, N right) {
-  return scale_to(left, right);
+          typename std::enable_if<
+              (detail::is_tensor<detail::remove_cvr_t<T>>::value ||
+               detail::is_tensor_of_tensor<detail::remove_cvr_t<T>>::value) &&
+              detail::is_numeric_v<N>>::type* = nullptr>
+inline decltype(auto) operator*=(T&& left, N right) {
+  return scale_to(std::forward<T>(left), right);
 }

 }  // namespace TiledArray
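The net effect of the forwarding rewrite: operators now preserve the value category of their arguments, so a temporary produced mid-expression can hand its buffer to the next operation (via the rvalue-qualified `Tensor::add` added below) instead of being copied. A short sketch (shapes are illustrative):

```cpp
TA::TensorD x(TA::Range{2, 2}, 1.0);
TA::TensorD y(TA::Range{2, 2}, 2.0);
TA::TensorD z(TA::Range{2, 2}, 3.0);

// (x + y) yields a temporary; the forwarding operator+ passes it to add()
// as an rvalue, so its buffer can be reused for the final result
auto w = x + y + z;
```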
@@ -537,6 +552,10 @@ class Tensor {
       result = detail::tensor_op<Tensor>(
           [](const numeric_type value) -> numeric_type { return value; },
           *this);
+    } else if (range_) {  // corner case: null data_ implies
+                          // range_.volume() == 0
+      TA_ASSERT(range_.volume() == 0);
+      result = Tensor(range_);
     }
     return result;
   }
@@ -962,12 +981,14 @@ class Tensor {
 
   /// Read-only shared_ptr to the data
 
   /// \return A const shared_ptr to the tensor data
-  std::shared_ptr<const value_type> data_shared() const { return this->data_; }
+  std::shared_ptr<const value_type[]> data_shared() const {
+    return this->data_;
+  }
 
   /// Mutable shared_ptr to the data
 
   /// \return A mutable shared_ptr to the tensor data
-  std::shared_ptr<value_type> data_shared() { return this->data_; }
+  std::shared_ptr<value_type[]> data_shared() { return this->data_; }
 
   /// Test if the tensor is empty
 
@@ -1478,7 +1499,7 @@ class Tensor {
 
   // Addition operations
 
-  /// Add this and \c other to construct a new tensors
+  /// Add this and \c other to construct a new tensor
 
   /// \tparam Right The right-hand tensor type
   /// \param right The tensor that will be added to this tensor
@@ -1486,7 +1507,7 @@ class Tensor {
   /// \c this and \c other
   template <typename Right,
             typename std::enable_if<detail::is_tensor<Right>::value>::type* = nullptr>
-  Tensor add(const Right& right) const {
+  Tensor add(const Right& right) const& {
     return binary(
         right,
         [](const numeric_type l, const numeric_t<Right> r) -> numeric_type {
@@ -1494,6 +1515,19 @@ class Tensor {
           return l + r;
         });
   }
 
+  /// Add this and \c other to construct a new tensor
+
+  /// \tparam Right The right-hand tensor type
+  /// \param right The tensor that will be added to this tensor
+  /// \return A new tensor where the elements are the sum of the elements of
+  /// \c this and \c other
+  template <typename Right,
+            typename std::enable_if<detail::is_tensor<Right>::value>::type* = nullptr>
+  Tensor add(const Right& right) && {
+    add_to(right);
+    return std::move(*this);
+  }
+
   /// Add this and \c other to construct a new, permuted tensor
 
   /// \tparam Right The right-hand tensor type
diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h
index 9c0e02e558..879d2ed9d2 100644
--- a/src/TiledArray/tile_interface/add.h
+++ b/src/TiledArray/tile_interface/add.h
@@ -39,10 +39,21 @@ namespace TiledArray {
 /// \param left The left-hand argument to be added
 /// \param right The right-hand argument to be added
 /// \return A tile that is equal to (left + right)
-template <typename Left, typename Right>
-inline auto add(const Left& left, const Right& right)
-    -> decltype(left.add(right)) {
-  return left.add(right);
+template <typename Left, typename Right,
+          typename = std::enable_if_t<
+              detail::has_member_function_add_anyreturn_v<Left&&, Right&&> ||
+              detail::has_member_function_add_anyreturn_v<Right&&, Left&&>>>
+inline decltype(auto) add(Left&& left, Right&& right) {
+  constexpr auto left_right =
+      (detail::has_member_function_add_anyreturn_v<Left&&, Right&&> &&
+       detail::has_member_function_add_anyreturn_v<Right&&, Left&&> &&
+       !std::is_reference_v<Left> && std::is_reference_v<Right>) ||
+      (detail::has_member_function_add_anyreturn_v<Left&&, Right&&> &&
+       !detail::has_member_function_add_anyreturn_v<Right&&, Left&&>);
+  if constexpr (left_right)
+    return std::forward<Left>(left).add(std::forward<Right>(right));
+  else
+    return std::forward<Right>(right).add(std::forward<Left>(left));
 }
 
 /// Add and scale tile arguments
@@ -56,9 +67,26 @@ inline auto add(const Left& left, const Right& right)
 /// \return A tile that is equal to (left + right) * factor
 template <
     typename Left, typename Right, typename Scalar,
-    typename std::enable_if<detail::is_numeric_v<Scalar>>::type* = nullptr>
-inline auto add(const Left& left, const Right& right, const Scalar factor) {
-  return left.add(right, factor);
+    typename = std::enable_if_t<detail::is_numeric_v<Scalar> &&
+                                (detail::has_member_function_add_anyreturn_v<
+                                     Left&&, Right&&, const Scalar> ||
+                                 detail::has_member_function_add_anyreturn_v<
+                                     Right&&, Left&&, const Scalar>)>>
+inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor) {
+  constexpr auto left_right =
+      (detail::has_member_function_add_anyreturn_v<Left&&, Right&&,
+                                                   const Scalar> &&
+       detail::has_member_function_add_anyreturn_v<Right&&, Left&&,
+                                                   const Scalar> &&
+       !std::is_reference_v<Left> && std::is_reference_v<Right>) ||
+      (detail::has_member_function_add_anyreturn_v<Left&&, Right&&,
+                                                   const Scalar> &&
+       !detail::has_member_function_add_anyreturn_v<Right&&, Left&&,
+                                                    const Scalar>);
+  if constexpr (left_right)
+    return std::forward<Left>(left).add(std::forward<Right>(right), factor);
+  else
+    return std::forward<Right>(right).add(std::forward<Left>(left), factor);
 }
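[editor's sketch, not part of the diff] The `left_right` dispatch above prefers the operand whose `add` can consume its own storage: an rvalue operand becomes `*this` so the new `&&`-qualified `Tensor::add` can recycle its buffer. A sketch of the observable difference, assuming `TiledArray::TensorD` and the dispatch as reconstructed above; names are illustrative only:

    #include <utility>

    #include <tiledarray.h>

    void add_dispatch_sketch() {
      using TiledArray::Range;
      using TiledArray::TensorD;
      TensorD a(Range{4}, 1.0);
      TensorD b(Range{4}, 2.0);
      // both lvalues: the const&-qualified Tensor::add runs and
      // allocates a fresh result tensor
      auto c = TiledArray::add(a, b);
      // rvalue left operand: the &&-qualified overload runs, adds in
      // place, and moves a's buffer into the result
      auto d = TiledArray::add(std::move(a), b);
    }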
 
 /// Add and permute tile arguments
@@ -72,10 +100,25 @@ inline auto add(const Left& left, const Right& right, const Scalar factor) {
 /// \return A tile that is equal to perm ^ (left + right)
 template <
     typename Left, typename Right, typename Perm,
     typename = std::enable_if_t<detail::is_permutation_v<Perm> &&
-                                detail::has_member_function_add_anyreturn_v<
-                                    const Left, const Right&, const Perm&>>>
-inline auto add(const Left& left, const Right& right, const Perm& perm) {
-  return left.add(right, perm);
+                                (detail::has_member_function_add_anyreturn_v<
+                                     Left&&, Right&&, const Perm&> ||
+                                 detail::has_member_function_add_anyreturn_v<
+                                     Right&&, Left&&, const Perm&>)>>
+inline decltype(auto) add(Left&& left, Right&& right, const Perm& perm) {
+  constexpr auto left_right =
+      (detail::has_member_function_add_anyreturn_v<Left&&, Right&&,
+                                                   const Perm&> &&
+       detail::has_member_function_add_anyreturn_v<Right&&, Left&&,
+                                                   const Perm&> &&
+       !std::is_reference_v<Left> && std::is_reference_v<Right>) ||
+      (detail::has_member_function_add_anyreturn_v<Left&&, Right&&,
+                                                   const Perm&> &&
+       !detail::has_member_function_add_anyreturn_v<Right&&, Left&&,
+                                                    const Perm&>);
+  if constexpr (left_right)
+    return std::forward<Left>(left).add(std::forward<Right>(right), perm);
+  else
+    return std::forward<Right>(right).add(std::forward<Left>(left), perm);
 }
 
 /// Add, scale, and permute tile arguments
@@ -88,13 +131,31 @@ inline auto add(const Left& left, const Right& right, const Perm& perm) {
 /// \param factor The scaling factor
 /// \param perm The permutation to be applied to the result
 /// \return A tile that is equal to perm ^ (left + right) * factor
-template <
-    typename Left, typename Right, typename Scalar, typename Perm,
-    typename std::enable_if<detail::is_numeric_v<Scalar> &&
-                            detail::is_permutation_v<Perm>>::type* = nullptr>
-inline auto add(const Left& left, const Right& right, const Scalar factor,
-                const Perm& perm) {
-  return left.add(right, factor, perm);
+template <typename Left, typename Right, typename Scalar, typename Perm,
+          typename = std::enable_if_t<
+              detail::is_numeric_v<Scalar> && detail::is_permutation_v<Perm> &&
+              (detail::has_member_function_add_anyreturn_v<
+                   Left&&, Right&&, const Scalar, const Perm&> ||
+               detail::has_member_function_add_anyreturn_v<
+                   Right&&, Left&&, const Scalar, const Perm&>)>>
+inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor,
+                          const Perm& perm) {
+  constexpr auto left_right =
+      (detail::has_member_function_add_anyreturn_v<
+           Left&&, Right&&, const Scalar, const Perm&> &&
+       detail::has_member_function_add_anyreturn_v<
+           Right&&, Left&&, const Scalar, const Perm&> &&
+       !std::is_reference_v<Left> && std::is_reference_v<Right>) ||
+      (detail::has_member_function_add_anyreturn_v<
+           Left&&, Right&&, const Scalar, const Perm&> &&
+       !detail::has_member_function_add_anyreturn_v<
+           Right&&, Left&&, const Scalar, const Perm&>);
+  if constexpr (left_right)
+    return std::forward<Left>(left).add(std::forward<Right>(right), factor,
+                                        perm);
+  else
+    return std::forward<Right>(right).add(std::forward<Left>(left), factor,
+                                          perm);
 }
 
 /// Add to the result tile
@@ -104,9 +165,12 @@ inline auto add(const Left& left, const Right& right, const Scalar factor,
 /// \param result The result tile
 /// \param arg The argument to be added to the result
 /// \return A tile that is equal to result[i] += arg[i]
-template <typename Result, typename Arg>
-inline Result& add_to(Result& result, const Arg& arg) {
-  return result.add_to(arg);
+template <
+    typename Result, typename Arg,
+    typename = std::enable_if_t<
+        detail::has_member_function_add_to_anyreturn_v<Result&&, const Arg&>>>
+inline decltype(auto) add_to(Result&& result, const Arg& arg) {
+  return std::forward<Result>(result).add_to(arg);
 }
 
 /// Add and scale to the result tile
@@ -118,11 +182,14 @@ inline Result& add_to(Result& result, const Arg& arg) {
 /// \param arg The argument to be added to \c result
 /// \param factor The scaling factor
 /// \return A tile that is equal to (result[i] += arg[i]) *= factor
-template <
-    typename Result, typename Arg, typename Scalar,
-    typename std::enable_if<detail::is_numeric_v<Scalar>>::type* = nullptr>
-inline Result& add_to(Result& result, const Arg& arg, const Scalar factor) {
-  return result.add_to(arg, factor);
+template <typename Result, typename Arg, typename Scalar,
+          typename std::enable_if<
+              detail::is_numeric_v<Scalar> &&
+              detail::has_member_function_add_to_anyreturn_v<
+                  Result&&, const Arg&, const Scalar>>::type* = nullptr>
+inline decltype(auto) add_to(Result&& result, const Arg& arg,
+                             const Scalar factor) {
+  return std::forward<Result>(result).add_to(arg, factor);
 }
 
 namespace tile_interface {
diff --git a/src/TiledArray/tile_op/tile_interface.h b/src/TiledArray/tile_op/tile_interface.h
index 65d970ebeb..ee8c1093a2 100644
--- a/src/TiledArray/tile_op/tile_interface.h
+++ b/src/TiledArray/tile_op/tile_interface.h
@@ -372,8 +372,8 @@ inline auto subt(const Arg& arg, const Scalar value, const Perm& perm) {
 /// \param arg The argument to be subtracted from the result
 /// \return A tile that is equal to result[i] -= arg[i]
 template <typename Result, typename Arg>
-inline Result& subt_to(Result& result, const Arg& arg) {
-  return result.subt_to(arg);
+inline decltype(auto) subt_to(Result&& result, const Arg& arg) {
+  return std::forward<Result>(result).subt_to(arg);
 }
 
 /// Subtract and scale from the result tile
diff --git a/src/TiledArray/util/vector.h b/src/TiledArray/util/vector.h
index 12c5d0dfcd..6e69f523f4 100644
--- a/src/TiledArray/util/vector.h
+++ b/src/TiledArray/util/vector.h
@@ -27,13 +27,16 @@
 #define TILEDARRAY_UTIL_VECTOR_H
 
 #include <algorithm>
+#include <vector>
 
 // Boost.Container 1.75 and earlier uses standard exception classes, 1.76+ use
-// Boost.Container exceptions, unless BOOST_CONTAINER_USE_STD_EXCEPTIONS is defined:
+// Boost.Container exceptions, unless BOOST_CONTAINER_USE_STD_EXCEPTIONS is
+// defined:
 // https://www.boost.org/doc/libs/master/doc/html/container/release_notes.html#container.release_notes.release_notes_boost_1_76_00
-// Define BOOST_CONTAINER_USE_STD_EXCEPTIONS for Boost <1.76 so that exception checking can use this macro with all versions of Boost
+// Define BOOST_CONTAINER_USE_STD_EXCEPTIONS for Boost <1.76 so that exception
+// checking can use this macro with all versions of Boost
 #if BOOST_VERSION < 107600 && !defined(BOOST_CONTAINER_USE_STD_EXCEPTIONS)
-#  define BOOST_CONTAINER_USE_STD_EXCEPTIONS 1
+#define BOOST_CONTAINER_USE_STD_EXCEPTIONS 1
 #endif
 
 #include <boost/container/small_vector.hpp>
@@ -41,6 +44,7 @@
 #include <madness/world/archive.h>
 
 #include <TiledArray/utility.h>
+#include "TiledArray/error.h"
 
 namespace TiledArray {
 
@@ -90,6 +94,32 @@ constexpr auto iv(Int i0, Ints... rest) {
   return result;
 }
 
+namespace operators {
+
+template <typename T1, std::size_t N1, typename T2, std::size_t N2>
+decltype(auto) operator+(const boost::container::small_vector<T1, N1>& v1,
+                         const boost::container::small_vector<T2, N2>& v2) {
+  TA_ASSERT(v1.size() == v2.size());
+  boost::container::small_vector<std::common_type_t<T1, T2>, std::max(N1, N2)>
+      result(v1.size());
+  std::transform(v1.begin(), v1.end(), v2.begin(), result.begin(),
+                 [](auto&& a, auto&& b) { return a + b; });
+  return result;
+}
+
+template <typename T1, std::size_t N1, typename T2, std::size_t N2>
+decltype(auto) operator-(const boost::container::small_vector<T1, N1>& v1,
+                         const boost::container::small_vector<T2, N2>& v2) {
+  TA_ASSERT(v1.size() == v2.size());
+  boost::container::small_vector<std::common_type_t<T1, T2>, std::max(N1, N2)>
+      result(v1.size());
+  std::transform(v1.begin(), v1.end(), v2.begin(), result.begin(),
+                 [](auto&& a, auto&& b) { return a - b; });
+  return result;
+}
+
+}  // namespace operators
+
 }  // namespace container
 
 }  // namespace TiledArray
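[editor's sketch, not part of the diff] The new `operators` namespace is opt-in: element-wise `small_vector` arithmetic becomes visible only where a using-directive pulls it in, and mixed element types/preallocated sizes promote via `std::common_type_t`/`std::max`. A short usage sketch; `small_vector_ops_sketch` is illustrative only:

    #include <boost/container/small_vector.hpp>

    #include <TiledArray/util/vector.h>

    void small_vector_ops_sketch() {
      using namespace TiledArray::container::operators;
      boost::container::small_vector<int, 4> x{1, 2, 3};
      boost::container::small_vector<long, 8> y{10, 20, 30};
      auto s = x + y;  // small_vector<long, 8> holding {11, 22, 33}
      auto d = y - x;  // small_vector<long, 8> holding {9, 18, 27}
    }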
diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp
index 061c5fdd17..c2ac8262d0 100644
--- a/tests/dist_array.cpp
+++ b/tests/dist_array.cpp
@@ -76,6 +76,13 @@ BOOST_AUTO_TEST_CASE(constructors) {
   for (ArrayN::const_iterator it = ad.begin(); it != ad.end(); ++it)
     BOOST_CHECK(!it->probe());
 
+  // Construct a dense array in default world
+  {
+    BOOST_REQUIRE_NO_THROW(ArrayN ad(tr));
+    ArrayN ad(tr);
+    BOOST_CHECK_EQUAL(ad.world().id(), get_default_world().id());
+  }
+
   // Construct a sparse array
   BOOST_REQUIRE_NO_THROW(
       SpArrayN as(world, tr, TiledArray::SparseShape<float>(shape_tensor, tr)));
@@ -88,6 +95,14 @@ BOOST_AUTO_TEST_CASE(constructors) {
   // now fill it
   BOOST_REQUIRE_NO_THROW(as.fill(1));
 
+  // Construct a sparse array in default world
+  {
+    BOOST_REQUIRE_NO_THROW(
+        SpArrayN as(tr, TiledArray::SparseShape<float>(shape_tensor, tr)));
+    SpArrayN as(tr, TiledArray::SparseShape<float>(shape_tensor, tr));
+    BOOST_CHECK_EQUAL(as.world().id(), get_default_world().id());
+  }
+
   // Construct a sparse array from another sparse array
   {
     auto op = [](auto& result, const auto& input) { result = input.clone(); };
@@ -107,6 +122,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) {
         ++itr;
       }
     }
+
+    // now with default world
+    {
+      TArray<double> a_vector(il);
+      BOOST_CHECK_EQUAL(a_vector.world().id(), get_default_world().id());
+    }
   }
 
   // Create a matrix with an initializer list
@@ -122,6 +143,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) {
         }
       }
    }
+
+    // now with default world
+    {
+      TArray<double> a_matrix(il);
+      BOOST_CHECK_EQUAL(a_matrix.world().id(), get_default_world().id());
+    }
   }
 
   // Create a rank 3 tensor with an initializer list
@@ -144,6 +171,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) {
         }
      }
    }
+
+    // now with default world
+    {
+      TArray<double> a_tensor3(il);
+      BOOST_CHECK_EQUAL(a_tensor3.world().id(), get_default_world().id());
+    }
   }
 
   // Create a rank 4 tensor with an initializer list
@@ -168,6 +201,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) {
         }
      }
    }
+
+    // now with default world
+    {
+      TArray<double> a_tensor4(il);
+      BOOST_CHECK_EQUAL(a_tensor4.world().id(), get_default_world().id());
+    }
   }
 
   // Create a rank 5 tensor with an initializer list
@@ -194,6 +233,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) {
         }
      }
    }
+
+    // now with default world
+    {
+      TArray<double> a_tensor5(il);
+      BOOST_CHECK_EQUAL(a_tensor5.world().id(), get_default_world().id());
+    }
   }
 
   // Create a rank 6 tensor with an initializer list
@@ -222,6 +267,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) {
         }
      }
    }
+
+    // now with default world
+    {
+      TArray<double> a_tensor6(il);
+      BOOST_CHECK_EQUAL(a_tensor6.world().id(), get_default_world().id());
+    }
   }
 }
@@ -232,6 +283,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) {
     TiledRange tr{{0, 1, 3}};
     TArray<double> a_vector(world, tr, il);
     BOOST_CHECK_EQUAL(a_vector.size(), 2);
+
+    // now with default world
+    {
+      TArray<double> a_vector(tr, il);
+      BOOST_CHECK_EQUAL(a_vector.world().id(), get_default_world().id());
+    }
   }
 
   {
@@ -239,6 +296,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) {
     TiledRange tr{{0, 1, 2}, {0, 1, 3}};
     TArray<double> a_matrix(world, tr, il);
     BOOST_CHECK_EQUAL(a_matrix.size(), 4);
+
+    // now with default world
+    {
+      TArray<double> a_matrix(tr, il);
+      BOOST_CHECK_EQUAL(a_matrix.world().id(), get_default_world().id());
+    }
   }
 
   {
@@ -247,6 +310,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) {
     TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 3}};
     TArray<double> a_tensor(world, tr, il);
     BOOST_CHECK_EQUAL(a_tensor.size(), 8);
+
+    // now with default world
+    {
+      TArray<double> a_tensor(tr, il);
+      BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id());
+    }
   }
 
   {
@@ -257,6 +326,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) {
     TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 3}};
     TArray<double> a_tensor(world, tr, il);
     BOOST_CHECK_EQUAL(a_tensor.size(), 16);
+
+    // now with default world
+    {
+      TArray<double> a_tensor(tr, il);
+      BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id());
+    }
   }
 
   {
@@ -269,6 +344,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) {
     TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 3}};
     TArray<double> a_tensor(world, tr, il);
     BOOST_CHECK_EQUAL(a_tensor.size(), 32);
+
+    // now with default world
+    {
+      TArray<double> a_tensor(tr, il);
+      BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id());
+    }
  }
 
   {
@@ -286,6 +367,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) {
                   {0, 1, 2}, {0, 1, 2}, {0, 1, 3}};
     TArray<double> a_tensor(world, tr, il);
     BOOST_CHECK_EQUAL(a_tensor.size(), 64);
+
+    // now with default world
+    {
+      TArray<double> a_tensor(tr, il);
+      BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id());
+    }
   }
 }
diff --git a/tests/eigen.cpp b/tests/eigen.cpp
index bfa4f1a0db..d577804417 100644
--- a/tests/eigen.cpp
+++ b/tests/eigen.cpp
@@ -421,7 +421,12 @@ BOOST_AUTO_TEST_CASE(tensor_to_array) {
   decltype(tensor) tensor_copy;
   if (GlobalFixture::world->rank() == 1) tensor_copy = tensor;
   GlobalFixture::world->gop.broadcast_serializable(tensor_copy, 1);
+// Eigen::TensorBase::operator== is ambiguously defined in C++20
+#if __cplusplus >= 202002L
+  Eigen::Tensor<bool, 0> eq = ((tensor - tensor_copy).abs() == 0).all();
+#else
   Eigen::Tensor<bool, 0> eq = (tensor == tensor_copy).all();
+#endif
   BOOST_CHECK(eq() == true);
 }
 
diff --git a/tests/einsum.cpp b/tests/einsum.cpp
index 1c0172e554..0fcb71f072 100644
--- a/tests/einsum.cpp
+++ b/tests/einsum.cpp
@@ -764,7 +764,12 @@ bool isApprox(const Eigen::TensorBase<T1>& A,
   Eigen::Tensor<bool, 0> r;
   if constexpr (std::is_integral_v<typename T1::Scalar> &&
                 std::is_integral_v<typename T2::Scalar>) {
+// Eigen::TensorBase::operator== is ambiguously defined in C++20
+#if __cplusplus >= 202002L
+    r = ((derived(A) - derived(B)).abs() == 0).all();
+#else
     r = (derived(A) == derived(B)).all();
+#endif
   } else {  // soft floating-point comparison
     r = ((derived(A) - derived(B)).abs() <= abs_comparison_threshold).all();
   }
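[editor's sketch, not part of the diff] Both test fixes above guard the same pattern: under C++20, `Eigen::TensorBase::operator==` participates ambiguously in overload resolution, so integer tensors are compared as `|A - B| == 0` instead. A self-contained sketch of the guarded helper, assuming equal-shaped arguments; `tensors_equal` is illustrative only:

    #include <unsupported/Eigen/CXX11/Tensor>

    bool tensors_equal(const Eigen::Tensor<int, 2>& A,
                       const Eigen::Tensor<int, 2>& B) {
    #if __cplusplus >= 202002L
      // avoid the ambiguous TensorBase::operator== under C++20
      Eigen::Tensor<bool, 0> eq = ((A - B).abs() == 0).all();
    #else
      Eigen::Tensor<bool, 0> eq = (A == B).all();
    #endif
      return eq();
    }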
diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h
index 6c365334fa..1b7cdd07e1 100644
--- a/tests/sparse_tile.h
+++ b/tests/sparse_tile.h
@@ -122,10 +122,36 @@ class EigenSparseTile {
   matrix_type& matrix() { return std::get<0>(*impl_); }
 
   /// data read-write accessor
-  template <typename Index>
+  template <typename Index,
+            typename = std::enable_if_t<detail::is_integral_range_v<Index>>>
   value_type& operator[](const Index& idx) {
     auto start = range().lobound_data();
-    return std::get<0>(*impl_).coeffRef(idx[0] - start[0], idx[1] - start[1]);
+    return matrix().coeffRef(idx[0] - start[0], idx[1] - start[1]);
   }
 
+  /// data read-write accessor
+  template <typename Ordinal,
+            std::enable_if_t<std::is_integral_v<Ordinal>>* = nullptr>
+  value_type& operator[](const Ordinal& ord) {
+    auto idx = range().idx(ord);
+    auto start = range().lobound_data();
+    return matrix().coeffRef(idx[0] - start[0], idx[1] - start[1]);
+  }
+
+  /// data read-only accessor
+  template <typename Index,
+            typename = std::enable_if_t<detail::is_integral_range_v<Index>>>
+  value_type operator[](const Index& idx) const {
+    auto start = range().lobound_data();
+    return matrix().coeff(idx[0] - start[0], idx[1] - start[1]);
+  }
+
+  /// data read-only accessor
+  template <typename Ordinal,
+            typename = std::enable_if_t<std::is_integral_v<Ordinal>>>
+  value_type operator[](const Ordinal& ord) const {
+    auto idx = range().idx(ord);
+    auto start = range().lobound_data();
+    return matrix().coeff(idx[0] - start[0], idx[1] - start[1]);
+  }
+
   /// Maximum # of elements in the tile
@@ -138,8 +164,8 @@ class EigenSparseTile {
 
   // output
   template <typename Archive,
-            typename std::enable_if<madness::archive::is_output_archive<
-                Archive>::value>::type* = nullptr>
+            typename std::enable_if<
+                madness::is_output_archive_v<Archive>>::type* = nullptr>
   void serialize(Archive& ar) {
     if (impl_) {
       ar & true;
@@ -151,7 +177,7 @@ class EigenSparseTile {
       for (typename matrix_type::InnerIterator it(mat, k); it; ++it) {
         datavec.push_back(
            Eigen::Triplet<value_type>(it.row(), it.col(), it.value()));
       }
-      ar& datavec & this->range();
+      ar& datavec& this->range();
     } else {
       ar & false;
     }
@@ -159,8 +185,8 @@ class EigenSparseTile {
 
   // input
   template <typename Archive,
-            typename std::enable_if<madness::archive::is_input_archive<
-                Archive>::value>::type* = nullptr>
+            typename std::enable_if<
+                madness::is_input_archive_v<Archive>>::type* = nullptr>
   void serialize(Archive& ar) {
     bool have_impl = false;
     ar& have_impl;
@@ -229,22 +255,22 @@ EigenSparseTile<T> add(const EigenSparseTile<T>& arg1,
                       arg1.range());
 }
 
-// dense_result[i] = dense_arg1[i] + sparse_arg2[i]
-template <typename T>
-TiledArray::Tensor<T> add(const TiledArray::Tensor<T>& arg1,
-                          const EigenSparseTile<T>& arg2) {
-  TA_ASSERT(arg1.range() == arg2.range());
-
-  // this could be done better ...
-  return TiledArray::add(arg1, static_cast<TiledArray::Tensor<T>>(arg2));
-}
-
-// dense_result[i] = sparse_arg1[i] + dense_arg2[i]
-template <typename T>
-TiledArray::Tensor<T> add(const EigenSparseTile<T>& arg1,
-                          const TiledArray::Tensor<T>& arg2) {
-  return TiledArray::add(arg2, arg1);
-}
+//// dense_result[i] = dense_arg1[i] + sparse_arg2[i]
+// template <typename T>
+// TiledArray::Tensor<T> add(const TiledArray::Tensor<T>& arg1,
+//                           const EigenSparseTile<T>& arg2) {
+//   TA_ASSERT(arg1.range() == arg2.range());
+//
+//   // this could be done better ...
+//   return TiledArray::add(arg1, static_cast<TiledArray::Tensor<T>>(arg2));
+// }
+//
+//// dense_result[i] = sparse_arg1[i] + dense_arg2[i]
+// template <typename T>
+// TiledArray::Tensor<T> add(const EigenSparseTile<T>& arg1,
+//                           const TiledArray::Tensor<T>& arg2) {
+//   return TiledArray::add(arg2, static_cast<TiledArray::Tensor<T>>(arg1));
+// }
 
 // dense_result[perm ^ i] = dense_arg1[i] + sparse_arg2[i]
 template <
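[editor's sketch, not part of the diff] The four `operator[]` overloads above split element access into coordinate-index and ordinal forms; the ordinal form recovers the coordinate via `Range::idx()` before indexing the tile's local Eigen sparse matrix. A sketch under the enable_if predicates as reconstructed above, assuming the tile's Range contains the index {1, 2}; `sparse_tile_access_sketch` is illustrative only:

    #include <array>

    #include "sparse_tile.h"  // test-local header defining EigenSparseTile

    void sparse_tile_access_sketch(EigenSparseTile<double>& tile) {
      // coordinate access: any integral range (here std::array) is accepted
      tile[std::array<int, 2>{{1, 2}}] = 3.0;
      // ordinal access: a single integer, mapped back to {1, 2} internally
      const auto ord = tile.range().ordinal(std::array<int, 2>{{1, 2}});
      const double v = tile[ord];  // reads back 3.0
    }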
diff --git a/tests/tile_op_add.cpp b/tests/tile_op_add.cpp
index b264ae15bf..c2e08c170c 100644
--- a/tests/tile_op_add.cpp
+++ b/tests/tile_op_add.cpp
@@ -26,6 +26,7 @@
 #include "../src/TiledArray/tile_op/add.h"
 #include "../src/tiledarray.h"
 #include "range_fixture.h"
+#include "sparse_tile.h"
 #include "unit_test_config.h"
 
 // using TiledArray::detail::Add;
@@ -49,8 +50,7 @@ struct AddFixture : public RangeFixture {
 
 };  // AddFixture
 
-BOOST_FIXTURE_TEST_SUITE(tile_op_add_suite, AddFixture,
-                         TA_UT_LABEL_SERIAL)
+BOOST_FIXTURE_TEST_SUITE(tile_op_add_suite, AddFixture, TA_UT_LABEL_SERIAL)
 
 BOOST_AUTO_TEST_CASE(constructor) {
   // Check that the constructors can be called without throwing exceptions
@@ -398,4 +398,84 @@ BOOST_AUTO_TEST_CASE(binary_add_right_zero_perm_consume_right) {
   }
 }
 
+BOOST_AUTO_TEST_CASE(binary_add_heterogeneous) {
+  TensorD a(RangeFixture::r, [](auto&) { return TiledArray::drand(); });
+  EigenSparseTile<double> b(RangeFixture::r);
+
+  /////////////////
+  // dense + sparse
+  /////////////////
+  {
+    {  // a is persistent
+      auto c = add(a, b);
+
+      // Check that the result range is correct
+      BOOST_CHECK_EQUAL(c.range(), a.range());
+
+      // Check that neither a nor b was consumed
+      BOOST_CHECK_NE(a.data(), nullptr);
+      BOOST_CHECK_NE(c.data(), a.data());
+
+      // Check that the data in the new tile is correct
+      for (std::size_t i = 0ul; i < r.volume(); ++i) {
+        BOOST_CHECK_EQUAL(c[i], a[i] + b[i]);
+      }
+    }
+    {  // a is consumed
+      auto a_copy = a.clone();
+      if (r.rank() == 3) a.shift({-7, 7, 0});
+      auto c = add(std::move(a), std::move(b));
+
+      // Check that the result range is correct
+      BOOST_CHECK_EQUAL(c.range(), b.range());
+
+      // Check that a was consumed
+      BOOST_CHECK_EQUAL(a.data(), nullptr);
+
+      // Check that the data in the new tile is correct
+      for (std::size_t i = 0ul; i < r.volume(); ++i) {
+        BOOST_CHECK_EQUAL(c[i], a_copy[i] + b[i]);
+      }
+      a = a_copy;
+    }
+  }
+
+  /////////////////
+  // sparse + dense
+  /////////////////
+  {
+    {  // a is persistent
+      auto c = add(b, a);
+
+      // Check that the result range is correct
+      BOOST_CHECK_EQUAL(c.range(), b.range());
+
+      // Check that a was not consumed
+      BOOST_CHECK_NE(a.data(), nullptr);
+      BOOST_CHECK_NE(c.data(), a.data());
+
+      // Check that the data in the new tile is correct
+      for (std::size_t i = 0ul; i < r.volume(); ++i) {
+        BOOST_CHECK_EQUAL(c[i], b[i] + a[i]);
+      }
+    }
+    {  // a is consumed
+      auto a_copy = a.clone();
+      if (r.rank() == 3) a.shift({-7, 7, 0});
+      auto c = add(std::move(b), std::move(a));
+
+      // Check that the result range is correct
+      BOOST_CHECK_EQUAL(c.range(), b.range());
+
+      // Check that a was consumed
+      BOOST_CHECK_EQUAL(a.data(), nullptr);
+
+      // Check that the data in the new tile is correct
+      for (std::size_t i = 0ul; i < r.volume(); ++i) {
+        BOOST_CHECK_EQUAL(c[i], b[i] + a_copy[i]);
+      }
+      a = a_copy;
+    }
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END()