From 4bde729663e9309805af6b08aba84369e54960c7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 19 May 2023 10:19:25 -0400 Subject: [PATCH 01/14] created a placeholder test --- tests/einsum.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 0fcb71f072..f62be4e016 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -714,7 +714,55 @@ BOOST_AUTO_TEST_CASE(xxx) { BOOST_CHECK(are_equal); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_tot + +BOOST_AUTO_TEST_SUITE(einsum_tot_t) + +BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { + using t_type = DistArray<Tensor<double>, SparsePolicy>; + using tot_type = DistArray<Tensor<Tensor<double>>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il<Tensor<double>>; + auto& world = TiledArray::get_default_world(); + Tensor<double> lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor<double> lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor<double> lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor<double> lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor<double> lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor<double> lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor<double> lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor<double> lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0)}; + tot_type ref_result(world, ref_result_trange); + // TODO compute ref_result + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + // dist_array_t out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,k;m,n"); + // const bool are_equal = ToTArrayFixture::are_equal(corr, out); + // BOOST_CHECK(are_equal); +} + +BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices BOOST_AUTO_TEST_SUITE(einsum_index, TA_UT_LABEL_SERIAL) @@ -740,7 +788,7 @@ BOOST_AUTO_TEST_CASE(einsum_index) { BOOST_CHECK((v.range() == Range{src})); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_index #include "TiledArray/einsum/eigen.h" @@ -919,7 +967,7 @@ BOOST_AUTO_TEST_CASE(einsum_eigen_hji_jih_hj) { BOOST_CHECK(isApprox(reference, result)); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_eigen // TiledArray einsum expressions BOOST_AUTO_TEST_SUITE(einsum_tiledarray) @@ -1098,4 +1146,4 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_dot) { // BOOST_CHECK(hik_hkj_hji == hkj_hji_hik); // } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_tiledarray From 6f63289c391290bdae8c450daa7d082656aabc12 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 21 May 2023 16:44:40 -0400 Subject: [PATCH 02/14] [unit] re-enable type_traits --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 32a8e9ee6c..49b5c61f95 100644 ---
a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,7 @@ set(executable ta_test) set(ta_test_src_files ta_test.cpp range1.cpp range.cpp + type_traits.cpp tensor.cpp tensor_of_tensor.cpp tensor_tensor_view.cpp From 47223893caf8602436bf044e2565344e2d4625e2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 21 May 2023 17:31:08 -0400 Subject: [PATCH 03/14] introduced is_nested_tensor and tensors_have_equal_nested_rank also cleaned up implementation of is_tensor_of_tensor --- src/TiledArray/expressions/expr_engine.h | 2 +- src/TiledArray/tensor/type_traits.h | 97 +++++++++++++++++++++--- tests/type_traits.cpp | 28 +++++++ 3 files changed, 114 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/expressions/expr_engine.h b/src/TiledArray/expressions/expr_engine.h index bd4dbd9ccd..c364a5c1ba 100644 --- a/src/TiledArray/expressions/expr_engine.h +++ b/src/TiledArray/expressions/expr_engine.h @@ -73,7 +73,7 @@ class ExprEngine : private NO_DEFAULTS { World* world_; ///< The world where this expression will be evaluated BipartiteIndexList indices_; ///< The index list of this expression; bipartite due to need - ///< to support recursive tensors (i.e. Tensor-of-Tensor) + ///< to support nested tensors (e.g. tensors of tensors) bool permute_tiles_; ///< Result tile permutation flag (\c true == permute ///< tile) /// The permutation that will be applied to the outer tensor of tensors diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 62448336a3..2903e5e7f7 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -28,9 +28,9 @@ #include +#include #include #include -#include namespace Eigen { @@ -60,10 +60,23 @@ class ShiftWrapper; // Note: These type traits help differentiate different implementation // functions for tensors, so a tensor of tensors is not considered a tensor. +/// is true type if all `Ts...` are tensors of scalars template struct is_tensor; +/// is true type if all `Ts...` are tensors of tensors of scalars template struct is_tensor_of_tensor; +/// is true type if all `Ts...` are _nested_ tensors; a nested tensor is a +/// tensors of scalars or tensors of nested tensors +template +struct is_nested_tensor; +/// is true type if `T1`, `T2`, and `Ts...` are tensors of same nested +/// rank, i.e. they are all tensors of scalars or tensors of tensors of scalars, +/// etc. 
; +/// \warning the types must be tensors, hence +/// `tensors_have_equal_nested_rank` is false +template +struct tensors_have_equal_nested_rank; template struct is_tensor_helper : public std::false_type {}; @@ -83,23 +96,41 @@ struct is_tensor_helper> : public is_tensor_helper {}; template struct is_tensor_helper> : public is_tensor_helper {}; +//////////////////////////////////////////////////////////////////////////////// + +template <> +struct is_nested_tensor<> : public std::false_type {}; + template -struct is_tensor_of_tensor_helper : public std::false_type {}; +struct is_nested_tensor : is_tensor_helper {}; -template -struct is_tensor_of_tensor_helper> : public is_tensor_helper {}; +template +struct is_nested_tensor { + static constexpr bool value = + is_tensor_helper::value && is_nested_tensor::value; +}; -template -struct is_tensor_of_tensor_helper> - : public is_tensor_helper {}; +/// @tparam Ts a parameter pack +/// @c is_nested_tensor_v is an alias for @c +/// is_nested_tensor::value +template +constexpr const bool is_nested_tensor_v = is_nested_tensor::value; -template -struct is_tensor_of_tensor_helper> - : public is_tensor_of_tensor_helper {}; +//////////////////////////////////////////////////////////////////////////////// + +template +struct is_tensor_of_tensor_helper : public std::false_type {}; template -struct is_tensor_of_tensor_helper> - : public is_tensor_of_tensor_helper {}; +struct is_tensor_of_tensor_helper< + T, std::enable_if_t::value>> { + static constexpr bool value = + is_tensor_helper>::value && + !is_tensor_of_tensor_helper< + detail::remove_cvr_t>::value; +}; + +//////////////////////////////////////////////////////////////////////////////// template <> struct is_tensor<> : public std::false_type {}; @@ -121,6 +152,8 @@ struct is_tensor { template constexpr const bool is_tensor_v = is_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + template <> struct is_tensor_of_tensor<> : public std::false_type {}; @@ -141,6 +174,42 @@ struct is_tensor_of_tensor { template constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + +template +struct tensors_have_equal_nested_rank_helper : std::false_type {}; + +template +struct tensors_have_equal_nested_rank_helper< + T1, T2, std::enable_if_t>> { + static constexpr bool value = + tensors_have_equal_nested_rank_helper< + detail::remove_cvr_t, + detail::remove_cvr_t>::value || + (detail::is_numeric_v> && + detail::is_numeric_v>); +}; + +template +struct tensors_have_equal_nested_rank + : tensors_have_equal_nested_rank_helper {}; + +template +struct tensors_have_equal_nested_rank { + static constexpr bool value = + tensors_have_equal_nested_rank::value && + tensors_have_equal_nested_rank::value; +}; + +/// @tparam Ts a parameter pack +/// @c tensors_have_equal_nested_rank_v is an alias for @c +/// tensors_have_equal_nested_rank::value +template +constexpr const bool tensors_have_equal_nested_rank_v = + tensors_have_equal_nested_rank::value; + +//////////////////////////////////////////////////////////////////////////////// + template struct is_ta_tensor : public std::false_type {}; @@ -150,6 +219,8 @@ struct is_ta_tensor> : public std::true_type {}; template constexpr const bool is_ta_tensor_v = is_ta_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + // Test if the tensor is contiguous template @@ -198,6 +269,8 @@ template 
constexpr const bool is_contiguous_tensor_v = is_contiguous_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + // Test if the tensor is shifted template diff --git a/tests/type_traits.cpp b/tests/type_traits.cpp index 105ae6ff72..77940bcb6f 100644 --- a/tests/type_traits.cpp +++ b/tests/type_traits.cpp @@ -275,4 +275,32 @@ BOOST_AUTO_TEST_CASE(convertibility) { } } +BOOST_AUTO_TEST_CASE(tensor) { + using TI = TiledArray::Tensor; + using TTI = TiledArray::Tensor>; + using TTTI = TiledArray::Tensor>>; + using TD = TiledArray::Tensor; + using TTD = TiledArray::Tensor>; + using TTTD = + TiledArray::Tensor>>; + + using namespace TiledArray::detail; + BOOST_CHECK((is_tensor_v)); + BOOST_CHECK(!(is_tensor_v)); + BOOST_CHECK((is_tensor_of_tensor_v)); + BOOST_CHECK(!(is_tensor_of_tensor_v)); + BOOST_CHECK((!is_tensor_of_tensor_v)); + BOOST_CHECK((!is_tensor_of_tensor_v)); + BOOST_CHECK((is_nested_tensor_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); +} + BOOST_AUTO_TEST_SUITE_END() From 1f2c08c090e93e033af92bf633ed1eb60d80d0aa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 May 2023 23:44:19 -0600 Subject: [PATCH 04/14] Tensor::mult(other,...) can deal with other of different nested rank than this --- src/TiledArray/expressions/mult_engine.h | 14 +- src/TiledArray/tensor/kernels.h | 33 +++-- src/TiledArray/tensor/operators.h | 33 ++--- src/TiledArray/tensor/permute.h | 18 ++- src/TiledArray/tensor/tensor.h | 173 ++++++++++++++--------- tests/einsum.cpp | 16 ++- tests/sparse_tile.h | 46 +++++- tests/tensor_of_tensor.cpp | 62 +++++++- 8 files changed, 267 insertions(+), 128 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 19788505fd..a53133d4b0 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -189,12 +189,14 @@ struct EngineTrait> { /// Multiplication expression engine /// This implements any expression encoded with the multiplication operator. -/// This includes Hadamard product, e.g. \code (c("i,j")=)a("i,j")*b("i,j") -/// \endcode , and pure contractions, e.g. \code (c("i,j")=)a("i,k")*b("k,j") -/// \endcode . \internal mixed Hadamard-contraction case, e.g. \code -/// c("i,j,l")=a("i,l,k")*b("j,l,k") \endcode , is not supported since -/// this requires that the result labels are assigned by user (currently they -/// are computed by this engine) +/// This includes Hadamard product, e.g. +/// \code (c("i,j")=)a("i,j")*b("i,j") \endcode , +/// and pure contractions, e.g. \code (c("i,j")=)a("i,k")*b("k,j") \endcode . +/// \internal mixed Hadamard-contraction case, e.g. 
+/// \code c("i,j,l")=a("i,l,k")*b("j,l,k") \endcode , +/// is not supported since +/// this requires that the result labels are assigned by user (currently they +/// are computed by this engine) /// \tparam Left The left-hand engine type /// \tparam Right The right-hand engine type /// \tparam Result The result tile type diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 2cd2d46fe3..87db8c1cc6 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -61,13 +61,14 @@ struct transform; /// \param tensor1 The first argument tensor /// \param tensors The remaining argument tensors template ::value || - is_tensor_of_tensor::value>::type* = nullptr> + typename = std::enable_if_t< + detail::is_nested_tensor_v || + std::is_invocable_r_v>> inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... tensors) { if constexpr (std::is_invocable_r_v) { return std::forward(op)(tensor1, tensors...); } else { + static_assert(detail::is_nested_tensor_v); return TiledArray::detail::transform()(std::forward(op), tensor1, tensors...); } @@ -93,8 +94,7 @@ inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... tensors) { /// \param[in] tensors The remaining argument tensors template ::value || - is_tensor_of_tensor::value) && + is_nested_tensor_v && is_contiguous_tensor::value>::type* = nullptr> inline TR tensor_op(Op&& op, const Permutation& perm, const T1& tensor1, const Ts&... tensors) { @@ -219,7 +219,7 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template ::value && + !is_tensor_v && is_contiguous_tensor::value>::type* = nullptr> inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); @@ -228,7 +228,11 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { const auto volume = result.range().volume(); for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { - inplace_tensor_op(op, result.at_ordinal(ord), tensors.at_ordinal(ord)...); + if constexpr (std::is_invocable_r_v) + op(result.at_ordinal(ord), tensors.at_ordinal(ord)...); + else + inplace_tensor_op(op, result.at_ordinal(ord), tensors.at_ordinal(ord)...); } } @@ -457,7 +461,7 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { tensors.data()...); } -/// Initialize tensor of tensors with contiguous tensor arguments +/// Initialize nested tensor with contiguous tensor arguments /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensors[i]...) @@ -470,7 +474,8 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename... Ts, - typename std::enable_if::value && + typename std::enable_if<(is_nested_tensor::value && + !is_tensor::value) && is_contiguous_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); @@ -478,9 +483,13 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... 
tensors) { const auto volume = result.range().volume(); - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { - new (result.data() + ord) typename TR::value_type( - tensor_op(op, tensors.at_ordinal(ord)...)); + if constexpr (std::is_invocable_r_v) { + result = std::forward(op)(tensors...); + } else { + for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { + new (result.data() + ord) typename TR::value_type( + tensor_op(op, tensors.at_ordinal(ord)...)); + } } } diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index 4be515d3b3..b8ed77671d 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -41,11 +41,8 @@ namespace TiledArray { /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] + right[i] template , - detail::remove_cvr_t>::value || - detail::is_tensor_of_tensor, - detail::remove_cvr_t>::value>> + typename = std::enable_if_t, detail::remove_cvr_t>>> inline decltype(auto) operator+(T1&& left, T2&& right) { return add(std::forward(left), std::forward(right)); } @@ -58,14 +55,9 @@ inline decltype(auto) operator+(T1&& left, T2&& right) { /// \param left The left-hand tensor argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] - right[i] -template < - typename T1, typename T2, - typename std::enable_if< - detail::is_tensor, - detail::remove_cvr_t>::value || - detail::is_tensor_of_tensor, - detail::remove_cvr_t>::value>::type* = - nullptr> +template , detail::remove_cvr_t>>> inline decltype(auto) operator-(T1&& left, T2&& right) { return subt(std::forward(left), std::forward(right)); } @@ -80,12 +72,8 @@ inline decltype(auto) operator-(T1&& left, T2&& right) { /// \return A tensor where element \c i is equal to left[i] * right[i] template < typename T1, typename T2, - typename std::enable_if< - detail::is_tensor, - detail::remove_cvr_t>::value || - detail::is_tensor_of_tensor, - detail::remove_cvr_t>::value>::type* = - nullptr> + typename std::enable_if, detail::remove_cvr_t>>::type* = nullptr> inline decltype(auto) operator*(T1&& left, T2&& right) { return mult(std::forward(left), std::forward(right)); } @@ -100,8 +88,7 @@ inline decltype(auto) operator*(T1&& left, T2&& right) { /// \return A tensor where element \c i is equal to left[i] * right template >::value || - detail::is_tensor_of_tensor>::value) && + detail::is_nested_tensor_v> && detail::is_numeric_v>::type* = nullptr> inline decltype(auto) operator*(T&& left, N right) { return scale(std::forward(left), right); @@ -118,9 +105,7 @@ template < typename N, typename T, typename std::enable_if< detail::is_numeric_v && - (detail::is_tensor>::value || - detail::is_tensor_of_tensor>::value)>::type* = - nullptr> + detail::is_nested_tensor_v>>::type* = nullptr> inline decltype(auto) operator*(N left, T&& right) { return scale(std::forward(right), left); } diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 1b888e3a3d..7fb103217f 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -97,10 +97,14 @@ inline void fuse_dimensions(SizeType* MADNESS_RESTRICT const fused_size, /// The expected signature of the input operations is: /// \code -/// Result::value_type input_op(const Arg0::value_type, const -/// Args::value_type...) 
\endcode The expected signature of the output -/// operations is: \code void output_op(Result::value_type*, const -/// Result::value_type) \endcode \tparam InputOp The input operation type +/// Result::value_type input_op(const Arg0::value_type, +/// const Args::value_type...) +/// \endcode +/// The expected signature of the output +/// operations is: +/// \code void output_op(Result::value_type*, const Result::value_type) +/// \endcode +/// \tparam InputOp The input operation type /// \tparam OutputOp The output operation type /// \tparam Result The result tensor type /// \tparam Arg0 The first tensor argument type @@ -152,7 +156,7 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, // Copy the block math::vector_ptr_op(op, block_size, result.data() + perm_index, - arg0.data() + index, (args.data() + index)...); + &arg0[index], &args[index]...); } } else { @@ -194,8 +198,8 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, math::transpose(input_op, output_op, other_fused_size[1], other_fused_size[3], result_outer_stride, - result.data() + perm_index, other_fused_weight[1], - arg0.data() + index, (args.data() + index)...); + &result[perm_index], other_fused_weight[1], + &arg0[index], &args[index]...); } } } diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index d6ce1b62be..73e3fc0caf 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -99,6 +99,8 @@ class Tensor { scalar_type; ///< the scalar type that supports T private: + template + using value_t = typename X::value_type; template using numeric_t = typename TiledArray::detail::numeric_type::type; @@ -350,6 +352,12 @@ class Tensor { /// \param other The tensor to be copied /// \note this constructor is disabled if \p T1 already has a conversion /// operator to this type + /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather + /// than copied to make the semantics of this to be consistent + /// between tensors of scalars and tensors of scalars; specifically, + /// if `T1` is a tensor of scalars the constructed tensor is + /// is independent of \p other, thus should apply clone to inner + /// tensor nests to behave similarly for nested tensors template < typename T1, typename std::enable_if< @@ -357,7 +365,13 @@ class Tensor { !detail::has_conversion_operator_v>::type* = nullptr> explicit Tensor(const T1& other) : Tensor(detail::clone_range(other), 1, default_construct{false}) { - auto op = [](const numeric_t arg) -> numeric_t { return arg; }; + auto op = [](const value_type& arg) -> decltype(auto) { + // clone nested tensors + if constexpr (detail::is_tensor_v) + return arg.clone(); + else + return arg; + }; detail::tensor_init(op, *this, other); } @@ -368,13 +382,25 @@ class Tensor { /// \tparam Perm A permutation type /// \param other The tensor to be copied /// \param perm The permutation that will be applied to the copy + /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather + /// than copied to make the semantics of this to be consistent + /// between tensors of scalars and tensors of scalars; specifically, + /// if `T1` is a tensor of scalars the constructed tensor is + /// is independent of \p other, thus should apply clone to inner + /// tensor nests to behave similarly for nested tensors template < typename T1, typename Perm, - typename std::enable_if::value && + typename std::enable_if && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& 
perm) : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { - auto op = [](const numeric_t arg) -> numeric_t { return arg; }; + auto op = [](const value_type& arg) -> decltype(auto) { + // clone nested tensors + if constexpr (detail::is_tensor_v) + return arg.clone(); + else + return arg; + }; detail::tensor_init(op, outer(perm), *this, other); @@ -448,7 +474,7 @@ class Tensor { /// \param right The right-hand tensor argument /// \param op The element-wise operation template ::value>::type* = nullptr> + typename = std::enable_if_t>> Tensor(const T1& left, const T2& right, Op&& op) : Tensor(detail::clone_range(left), 1, default_construct{false}) { detail::tensor_init(op, *this, left, right); @@ -1331,8 +1357,10 @@ class Tensor { /// \c op(*this[i],other[i]) template ::value>::type* = nullptr> - Tensor binary(const Right& right, Op&& op) const { - return Tensor(*this, right, op); + auto binary(const Right& right, Op&& op) const { + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + return Tensor(*this, right, op); } /// Use a binary, element wise operation to construct a new, permuted tensor @@ -1341,7 +1369,7 @@ class Tensor { /// \tparam Op The binary operation type /// \tparam Perm A permutation tile /// \param right The right-hand argument in the binary operation - /// \param op The binary, element-wise operation + /// \param op The binary element-wise operation /// \param perm The permutation to be applied to this tensor /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i],other[i]) @@ -1349,7 +1377,7 @@ class Tensor { typename Right, typename Op, typename Perm, typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> - Tensor binary(const Right& right, Op&& op, const Perm& perm) const { + auto binary(const Right& right, Op&& op, const Perm& perm) const { constexpr bool is_tot = detail::is_tensor_of_tensor_v; [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; @@ -1357,16 +1385,19 @@ class Tensor { // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does // not match Tensor"); if constexpr (!is_tot) { + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + using ResultTensor = Tensor; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, right, op, outer(perm)); + return ResultTensor(*this, right, op, outer(perm)); } else - return Tensor(*this, right, op, perm); + return ResultTensor(*this, right, op, perm); } else { // AFAIK the other branch fundamentally relies on raw pointer arithmetic, // which won't work for ToTs. auto temp = binary(right, std::forward(op)); - Permute p; + Permute p; return p(temp, perm); } abort(); // unreachable @@ -1377,7 +1408,7 @@ class Tensor { /// \tparam Right The right-hand tensor type /// \tparam Op The binary operation type /// \param right The right-hand argument in the binary operation - /// \param op The binary, element-wise operation + /// \param op The binary element-wise operation /// \return A reference to this object /// \throw TiledArray::Exception When this tensor is empty. /// \throw TiledArray::Exception When \c other is empty. @@ -1385,7 +1416,8 @@ class Tensor { /// to the range of \c other. /// \throw TiledArray::Exception When this and \c other are the same. 
template ::value>::type* = nullptr> + typename std::enable_if>::type* = + nullptr> Tensor& inplace_binary(const Right& right, Op&& op) { detail::inplace_tensor_op(op, *this, right); return *this; @@ -1394,7 +1426,7 @@ class Tensor { /// Use a unary, element wise operation to construct a new tensor /// \tparam Op The unary operation type - /// \param op The unary, element-wise operation + /// \param op The unary element-wise operation /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i]) /// \throw TiledArray::Exception When this tensor is empty. @@ -1407,7 +1439,7 @@ class Tensor { /// \tparam Op The unary operation type /// \tparam Perm A permutation tile - /// \param op The unary operation + /// \param op The unary element-wise operation /// \param perm The permutation to be applied to this tensor /// \return A permuted tensor with elements that have been modified by \c op /// \throw TiledArray::Exception When this tensor is empty. @@ -1459,7 +1491,7 @@ class Tensor { template >::type* = nullptr> Tensor scale(const Scalar factor) const { - return unary([factor](const numeric_type a) -> numeric_type { + return unary([factor](const value_type& a) -> decltype(auto) { using namespace TiledArray::detail; return a * factor; }); @@ -1494,7 +1526,7 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> Tensor& scale_to(const Scalar factor) { return inplace_unary( - [factor](numeric_type& MADNESS_RESTRICT res) { res *= factor; }); + [factor](value_type& MADNESS_RESTRICT res) { res *= factor; }); } // Addition operations @@ -1510,7 +1542,7 @@ class Tensor { Tensor add(const Right& right) const& { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { return l + r; }); } @@ -1543,7 +1575,7 @@ class Tensor { Tensor add(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_type& r) -> decltype(auto) { return l + r; }, perm); @@ -1562,9 +1594,11 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor add(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l + r) * factor; }); + return binary( + right, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l + r) * factor; + }); } /// Scale and add this and \c other to construct a new, permuted tensor @@ -1584,8 +1618,9 @@ class Tensor { Tensor add(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l + r) * factor; }, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l + r) * factor; + }, perm); } @@ -1622,8 +1657,8 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l += r; }); + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, + const value_t r) { l += r; }); } /// Add \c other to this tensor, and scale the result @@ -1639,8 +1674,8 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> Tensor& add_to(const Right& right, const Scalar factor) { return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l 
+= r) *= factor; }); + right, [factor](value_type& MADNESS_RESTRICT l, + const value_t r) { (l += r) *= factor; }); } /// Add a constant to this tensor @@ -1661,11 +1696,11 @@ class Tensor { /// \return A new tensor where the elements are the different between the /// elements of \c this and \c right template ::value>::type* = nullptr> + typename = std::enable_if< + detail::tensors_have_equal_nested_rank_v, Right>>> Tensor subt(const Right& right) const { return binary( - right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + right, [](const value_type& l, const value_type& r) -> decltype(auto) { return l - r; }); } @@ -1685,7 +1720,7 @@ class Tensor { Tensor subt(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_type& r) -> decltype(auto) { return l - r; }, perm); @@ -1705,9 +1740,11 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor subt(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l - r) * factor; }); + return binary( + right, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l - r) * factor; + }); } /// Subtract \c right from this and return the result scaled by a scaling \c @@ -1728,8 +1765,9 @@ class Tensor { Tensor subt(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l - r) * factor; }, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l - r) * factor; + }, perm); } @@ -1760,8 +1798,8 @@ class Tensor { template ::value>::type* = nullptr> Tensor& subt_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l -= r; }); + return inplace_binary( + right, [](auto& MADNESS_RESTRICT l, const auto& r) { l -= r; }); } /// Subtract \c right from and scale this tensor @@ -1776,9 +1814,10 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor& subt_to(const Right& right, const Scalar factor) { - return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l -= r) *= factor; }); + return inplace_binary(right, + [factor](auto& MADNESS_RESTRICT l, const auto& r) { + (l -= r) *= factor; + }); } /// Subtract a constant from this tensor @@ -1795,11 +1834,12 @@ class Tensor { /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right template ::value>::type* = nullptr> - Tensor mult(const Right& right) const { + typename std::enable_if>::type* = + nullptr> + decltype(auto) mult(const Right& right) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { return l * r; }); } @@ -1814,12 +1854,12 @@ class Tensor { /// of \c this and \c right template < typename Right, typename Perm, - typename std::enable_if::value && + typename std::enable_if && detail::is_permutation_v>::type* = nullptr> - Tensor mult(const Right& right, const Perm& perm) const { + decltype(auto) mult(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> 
decltype(auto) { return l * r; }, perm); @@ -1835,12 +1875,12 @@ class Tensor { /// of \c this and \c right, scaled by \c factor template < typename Right, typename Scalar, - typename std::enable_if::value && + typename std::enable_if && detail::is_numeric_v>::type* = nullptr> - Tensor mult(const Right& right, const Scalar factor) const { + decltype(auto) mult(const Right& right, const Scalar factor) const { return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l * r) * factor; }); + [factor](const value_type& l, const value_t& r) + -> decltype(auto) { return (l * r) * factor; }); } /// Scale and multiply this by \c right to create a new, permuted tensor @@ -1853,15 +1893,17 @@ class Tensor { /// \param perm The permutation to be applied to this tensor /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right, scaled by \c factor - template ::value && detail::is_numeric_v && - detail::is_permutation_v>::type* = nullptr> - Tensor mult(const Right& right, const Scalar factor, const Perm& perm) const { + template < + typename Right, typename Scalar, typename Perm, + typename std::enable_if && + detail::is_numeric_v && + detail::is_permutation_v>::type* = nullptr> + decltype(auto) mult(const Right& right, const Scalar factor, + const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l * r) * factor; }, + [factor](const value_type& l, const value_t& r) + -> decltype(auto) { return (l * r) * factor; }, perm); } @@ -1871,10 +1913,11 @@ class Tensor { /// \param right The tensor that will be multiplied by this tensor /// \return A reference to this tensor template ::value>::type* = nullptr> + typename std::enable_if>::type* = + nullptr> Tensor& mult_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l *= r; }); + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, + const value_t& r) { l *= r; }); } /// Scale and multiply this tensor by \c right @@ -1886,12 +1929,12 @@ class Tensor { /// \return A reference to this tensor template < typename Right, typename Scalar, - typename std::enable_if::value && + typename std::enable_if && detail::is_numeric_v>::type* = nullptr> Tensor& mult_to(const Right& right, const Scalar factor) { return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l *= r) *= factor; }); + right, [factor](value_type& MADNESS_RESTRICT l, + const value_t& r) { (l *= r) *= factor; }); } // Negation operations diff --git a/tests/einsum.cpp b/tests/einsum.cpp index f62be4e016..ee06cf099f 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -755,11 +755,17 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { tot_type ref_result(world, ref_result_trange); // TODO compute ref_result - tot_type result; - BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // dist_array_t out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,k;m,n"); - // const bool are_equal = ToTArrayFixture::are_equal(corr, out); - // BOOST_CHECK(are_equal); + ///////////////////////////////////////////////////////// + // ToT * T + + // this is not supported by the expression layer since this is a + // - general product w.r.t. 
outer indices + // - involves ToT * T + // tot_type result; + // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + + // will try to make this work + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 1b7cdd07e1..360790b2de 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -47,6 +47,7 @@ class EigenSparseTile { typedef T value_type; // Element type typedef T numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type + typedef const T& const_reference; // other typedefs typedef Eigen::SparseMatrix matrix_type; @@ -139,19 +140,24 @@ class EigenSparseTile { } /// data read-only accessor - template >> - value_type operator[](const Index& idx) const { + template + std::enable_if_t, const value_type&> + operator[](const Index& idx) const { + static const value_type zero = 0; auto start = range().lobound_data(); - return matrix().coeff(idx[0] - start[0], idx[1] - start[1]); + auto* ptr = coeffPtr(idx[0] - start[0], idx[1] - start[1]); + return ptr == nullptr ? zero : *ptr; } /// data read-only accessor - template >> - value_type operator[](const Ordinal& ord) const { + template >> + const value_type& operator[](const Ordinal& ord) const { + static const value_type zero = 0; auto idx = range().idx(ord); auto start = range().lobound_data(); - return matrix().coeff(idx[0] - start[0], idx[1] - start[1]); + auto* ptr = coeffPtr(idx[0] - start[0], idx[1] - start[1]); + return ptr == nullptr ? zero : *ptr; } /// Maximum # of elements in the tile @@ -218,6 +224,32 @@ class EigenSparseTile { private: std::shared_ptr impl_; + // pointer-based coeffRef + const value_type* coeffPtr(Eigen::Index row, Eigen::Index col) const { + auto& mat = matrix(); + constexpr bool IsRowMajor = + std::decay_t::Flags & Eigen::RowMajorBit ? 1 : 0; + using Eigen::Index; + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + auto* outerIndexPtr = mat.outerIndexPtr(); + auto* innerNonZeros = mat.innerNonZeroPtr(); + const auto start = outerIndexPtr[outer]; + const auto end = innerNonZeros ? 
outerIndexPtr[outer] + innerNonZeros[outer] + : outerIndexPtr[outer + 1]; + TA_ASSERT(end >= start && + "you probably called coeffRef on a non finalized matrix"); + if (end <= start) return nullptr; + const Index p = mat.data().searchLowerIndex( + start, end - 1, + (typename std::decay_t::StorageIndex)inner); + if ((p < end) && (mat.data().index(p) == inner)) + return &(mat.data().value(p)); + else + return nullptr; + } + }; // class EigenSparseTile // configure TA traits to be usable as tile diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index 21d136b67c..f6fae22be5 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -47,7 +47,10 @@ struct TensorOfTensorFixture { TensorOfTensorFixture() : a(make_rand_tensor_of_tensor(Range(size))), b(make_rand_tensor_of_tensor(Range(size))), - c(a - b) + c(a - b), + aa(make_rand_tensor(Range(size))), + bb(make_rand_tensor(Range(size))), + cc(aa - bb) #ifdef TILEDARRAY_HAS_BTAS , d(make_rand_TobT(Range(size))), @@ -123,13 +126,15 @@ struct TensorOfTensorFixture { static const BipartitePermutation bperm; Tensor> a, b, c; + Tensor aa, bb, cc; #ifdef TILEDARRAY_HAS_BTAS Tensor d, e, f, g, h; #endif // defined(TILEDARRAY_HAS_BTAS) template Tensor& ToT(size_t idx); - + template + T& ToS(size_t idx); }; // TensorOfTensorFixture template <> @@ -158,6 +163,18 @@ Tensor& TensorOfTensorFixture::ToT(size_t idx) { } #endif +template <> +Tensor& TensorOfTensorFixture::ToS>(size_t idx) { + if (idx == 0) + return aa; + else if (idx == 1) + return bb; + else if (idx == 2) + return cc; + else + throw std::range_error("idx out of range"); +} + const std::array TensorOfTensorFixture::size{{10, 11}}; const Permutation TensorOfTensorFixture::perm{1, 0}; const BipartitePermutation TensorOfTensorFixture::bperm(Permutation{1, 0, 3, 2}, @@ -171,6 +188,7 @@ typedef boost::mpl::list, bTensorI> itensor_types; #else typedef boost::mpl::list> itensor_types; #endif +typedef boost::mpl::list> itensor_nobtas_types; BOOST_AUTO_TEST_CASE_TEMPLATE(default_constructor, ITensor, itensor_types) { BOOST_CHECK_NO_THROW(Tensor t); @@ -964,6 +982,46 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(scal_mult_to, ITensor, itensor_types) { } } +BOOST_AUTO_TEST_CASE_TEMPLATE(mixed_mult_TxS, ITensor, itensor_nobtas_types) { + const auto& a = ToT(0); + const auto& b = ToS(0); + Tensor t; + BOOST_CHECK_NO_THROW(t = a.mult(b)); + + BOOST_CHECK(!t.empty()); + BOOST_CHECK_EQUAL(t.range(), a.range()); + + for (decltype(t.range().extent(0)) i = 0; i < t.range().extent(0); ++i) { + for (decltype(t.range().extent(1)) j = 0; j < t.range().extent(1); ++j) { + BOOST_CHECK(!t(i, j).empty()); + BOOST_CHECK_EQUAL(t(i, j).range(), a(i, j).range()); + for (std::size_t index = 0ul; index < t(i, j).size(); ++index) { + BOOST_CHECK_EQUAL(t(i, j)[index], a(i, j)[index] * b(i, j)); + } + } + } +} + +BOOST_AUTO_TEST_CASE_TEMPLATE(mixed_mult_SxT, ITensor, itensor_nobtas_types) { + const auto& a = ToS(0); + const auto& b = ToT(0); + Tensor t; + BOOST_CHECK_NO_THROW(t = a.mult(b)); + + BOOST_CHECK(!t.empty()); + BOOST_CHECK_EQUAL(t.range(), a.range()); + + for (decltype(t.range().extent(0)) i = 0; i < t.range().extent(0); ++i) { + for (decltype(t.range().extent(1)) j = 0; j < t.range().extent(1); ++j) { + BOOST_CHECK(!t(i, j).empty()); + BOOST_CHECK_EQUAL(t(i, j).range(), b(i, j).range()); + for (std::size_t index = 0ul; index < t(i, j).size(); ++index) { + BOOST_CHECK_EQUAL(t(i, j)[index], a(i, j) * b(i, j)[index]); + } + } + } +} + BOOST_AUTO_TEST_CASE_TEMPLATE(neg, ITensor, itensor_types) { const 
auto& a = ToT(0); Tensor t; From a9e5029293b26d67ea2ad32c998758bcf8a90ed5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 May 2023 21:18:04 -0600 Subject: [PATCH 05/14] introduce TensorInterface::at_ordinal --- src/TiledArray/tensor/tensor_interface.h | 40 +++++++++++++++++------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index bc5e9abab2..a514959cab 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -219,25 +219,43 @@ class TensorInterface { /// Element subscript accessor - /// \param index The ordinal element index - /// \return A const reference to the element at \c index. - const_reference operator[](const ordinal_type index) const { - TA_ASSERT(range_.includes(index)); - return data_[range_.ordinal(index)]; + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + const_reference operator[](const ordinal_type index_ordinal) const { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; } /// Element subscript accessor /// \param index The ordinal element index - /// \return A const reference to the element at \c index. - reference operator[](const ordinal_type index) { - TA_ASSERT(range_.includes(index)); - return data_[range_.ordinal(index)]; + /// \return A const reference to the element at \c index_ordinal. + reference operator[](const ordinal_type index_ordinal) { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; + } + + /// Element accessor + + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + const_reference at_ordinal(const ordinal_type index_ordinal) const { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; + } + + /// Element accessor + + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + reference at_ordinal(const ordinal_type index_ordinal) { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; } /// Element accessor - /// \tparam Index An integral type pack or a single coodinate index type + /// \tparam Index An integral type pack or a single coordinate index type /// \param idx The index pack template reference operator()(const Index&... idx) { @@ -247,7 +265,7 @@ class TensorInterface { /// Element accessor - /// \tparam Index An integral type pack or a single coodinate index type + /// \tparam Index An integral type pack or a single coordinate index type /// \param idx The index pack template const_reference operator()(const Index&... idx) const { From 4d8d7019056febc5748450cd2e4637678350aaaa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 May 2023 21:19:50 -0600 Subject: [PATCH 06/14] TA::detail::permute prefers Tensor::at_ordinal bump BTAS tag to pull in https://github.com/ValeevGroup/BTAS/pull/158 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/tensor/permute.h | 20 ++++++++++---------- tests/sparse_tile.h | 9 +++++++++ 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 5a190a93aa..765d443235 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag fdb6aa000f4314b16d74e2dd35bfb527c268cac5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index ea45a87437..3362c306c6 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3) +set(TA_TRACKED_BTAS_TAG fdb6aa000f4314b16d74e2dd35bfb527c268cac5) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 7fb103217f..43fbfc9328 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -150,13 +150,13 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, }; // Permute the data - for (typename Result::ordinal_type index = 0ul; index < volume; - index += block_size) { - const typename Result::ordinal_type perm_index = perm_index_op(index); + for (typename Result::ordinal_type ord = 0ul; ord < volume; + ord += block_size) { + const typename Result::ordinal_type perm_ord = perm_index_op(ord); // Copy the block - math::vector_ptr_op(op, block_size, result.data() + perm_index, - &arg0[index], &args[index]...); + math::vector_ptr_op(op, block_size, result.data() + perm_ord, + &arg0.at_ordinal(ord), &args.at_ordinal(ord)...); } } else { @@ -190,16 +190,16 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, // Copy data from the input to the output matrix via a series of matrix // transposes. for (typename Result::ordinal_type i = 0ul; i < other_fused_size[0]; ++i) { - typename Result::ordinal_type index = i * other_fused_weight[0]; + typename Result::ordinal_type ord = i * other_fused_weight[0]; for (typename Result::ordinal_type j = 0ul; j < other_fused_size[2]; - ++j, index += other_fused_weight[2]) { + ++j, ord += other_fused_weight[2]) { // Compute the ordinal index of the input and output matrices. 
- typename Result::ordinal_type perm_index = perm_index_op(index); + typename Result::ordinal_type perm_ord = perm_index_op(ord); math::transpose(input_op, output_op, other_fused_size[1], other_fused_size[3], result_outer_stride, - &result[perm_index], other_fused_weight[1], - &arg0[index], &args[index]...); + &result.at_ordinal(perm_ord), other_fused_weight[1], + &arg0.at_ordinal(ord), &args.at_ordinal(ord)...); } } } diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 360790b2de..888c39811f 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -48,6 +48,7 @@ class EigenSparseTile { typedef T numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type typedef const T& const_reference; + typedef size_type ordinal_type; // other typedefs typedef Eigen::SparseMatrix matrix_type; @@ -160,6 +161,14 @@ class EigenSparseTile { return ptr == nullptr ? zero : *ptr; } + const value_type& at_ordinal(const ordinal_type index_ordinal) const { + return this->operator[](index_ordinal); + } + + value_type& at_ordinal(const ordinal_type index_ordinal) { + return this->operator[](index_ordinal); + } + /// Maximum # of elements in the tile size_type size() const { return std::get<0>(*impl_).volume(); } From a575b358d4f2b6bb8346ab4060798a921f9dba59 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:43:21 -0400 Subject: [PATCH 07/14] fix up casting in mixed expressions where casting is not provided by the tile evaluation e.g. complex = real --- src/TiledArray/expressions/expr.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index bcc65cb412..1a7bc2ff05 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -252,13 +252,13 @@ class Expr { >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { - auto eval_tile_fn = - &Expr_::template eval_tile, - Op>; - array.set(index, array.world().taskq.add( - eval_tile_fn, tile, - TiledArray::Cast(), op)); + auto eval_tile_fn = &Expr_::template eval_tile< + typename A::value_type, const T&, + TiledArray::Cast, Op>; + array.set(index, + array.world().taskq.add( + eval_tile_fn, tile, + TiledArray::Cast(), op)); } #ifdef TILEDARRAY_HAS_CUDA @@ -278,13 +278,13 @@ class Expr { ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { - auto eval_tile_fn = - &Expr_::template eval_tile, - Op>; - array.set(index, madness::add_cuda_task( - array.world(), eval_tile_fn, tile, - TiledArray::Cast(), op)); + auto eval_tile_fn = &Expr_::template eval_tile< + typename A::value_type, const T&, + TiledArray::Cast, Op>; + array.set(index, + madness::add_cuda_task( + array.world(), eval_tile_fn, tile, + TiledArray::Cast(), op)); } #endif From 751ec783c87a7f6e0ac519175d4c2eb59ab51a44 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:44:53 -0400 Subject: [PATCH 08/14] introduced TA::conversions::to that can be used for custom element conversions in e.g. 
TA::Tensor(TA::Tensor) --- src/TiledArray/fwd.h | 13 +++++++++++++ src/TiledArray/tensor/complex.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index f09a98c0e5..2d99c1078a 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -156,6 +156,19 @@ using Array enum class HostExecutor { Thread, MADWorld, Default = MADWorld }; +namespace conversions { + +/// user defined conversions + +/// must define +/// \code +/// To operator()(From&& from); +/// \endcode +template +struct to; + +} // namespace conversions + } // namespace TiledArray #ifndef TILEDARRAY_DISABLE_NAMESPACE_TA diff --git a/src/TiledArray/tensor/complex.h b/src/TiledArray/tensor/complex.h index cfa330101d..69a8971bf6 100644 --- a/src/TiledArray/tensor/complex.h +++ b/src/TiledArray/tensor/complex.h @@ -27,6 +27,7 @@ #define TILEDARRAY_SRC_TILEDARRAY_TENSOR_COMPLEX_H__INCLUDED #include +#include #include namespace TiledArray { @@ -301,6 +302,19 @@ TILEDARRAY_FORCE_INLINE } } // namespace detail + +namespace conversions { + +template +struct to> { + T operator()(const std::complex& v) { + TA_ASSERT(v.imag() == 0); + return v.real(); + } +}; + +} // namespace conversions + } // namespace TiledArray #endif // TILEDARRAY_SRC_TILEDARRAY_TENSOR_COMPLEX_H__INCLUDED From 56868245a2da7ef5d9b0fba9ea724d68b3983759 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:46:22 -0400 Subject: [PATCH 09/14] converting constructor of TA::Tensor, i.e. TA::Tensor(TA::Tensor) and TA::Tensor(TA::Tensor, TA::Permutation), can use custom element conversions provided by TA::conversions::to --- src/TiledArray/tensor/tensor.h | 35 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 73e3fc0caf..b0141faa19 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -214,6 +214,20 @@ class Tensor { #endif } + template + static decltype(auto) value_converter(const T_& arg) { + using arg_type = detail::remove_cvr_t; + if constexpr (detail::is_tensor_v) // clone nested tensors + return arg.clone(); + else if constexpr (!std::is_same_v) { // convert + if constexpr (std::is_convertible_v) + return static_cast(arg); + else + return conversions::to()(arg); + } else + return arg; + }; + range_type range_; ///< Range /// Number of `range_`-sized blocks in `data_` /// \note this is not used for (in)equality comparison @@ -365,15 +379,7 @@ class Tensor { !detail::has_conversion_operator_v>::type* = nullptr> explicit Tensor(const T1& other) : Tensor(detail::clone_range(other), 1, default_construct{false}) { - auto op = [](const value_type& arg) -> decltype(auto) { - // clone nested tensors - if constexpr (detail::is_tensor_v) - return arg.clone(); - else - return arg; - }; - - detail::tensor_init(op, *this, other); + detail::tensor_init(value_converter, *this, other); } /// Construct a permuted tensor copy @@ -394,15 +400,8 @@ class Tensor { detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& perm) : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { - auto op = [](const value_type& arg) -> decltype(auto) { - // clone nested tensors - if constexpr (detail::is_tensor_v) - return arg.clone(); - else - return arg; - }; - - detail::tensor_init(op, outer(perm), *this, other); + detail::tensor_init(value_converter, outer(perm), + *this, other); // If we actually have a ToT the inner permutation was not 
applied above so // we do that now From edc9f9f5bc4849c0981e62df40d7809d273e0666 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:46:59 -0400 Subject: [PATCH 10/14] converting constructor of TA::DistArray is not explicit to support TZArray = TArray --- src/TiledArray/dist_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index fd0450ed8e..c6d6cddb79 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -548,7 +548,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// initialized using TiledArray::Cast /// \param other The array to be copied template > - explicit DistArray(const DistArray& other) : pimpl_() { + DistArray(const DistArray& other) : pimpl_() { *this = foreach(other, [](Tile& result, const OtherTile& source) { result = TiledArray::Cast{}(source); }); From 7f9ffea286c34104478944225ac02e498beef631 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 30 May 2023 16:15:49 -0400 Subject: [PATCH 11/14] bump MAD tag to use master PaRSEC backend + associated TTG bump - pulls in https://github.com/m-a-d-n-e-s-s/madness/pull/472 - pull in https://github.com/TESSEorg/ttg/pull/252 --- INSTALL.md | 2 +- external/versions.cmake | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 765d443235..9279d59e26 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag fdb6aa000f4314b16d74e2dd35bfb527c268cac5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 41324ea8f2c04df687ac2095c9001230db83b5cc . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 3362c306c6..d5c71cb632 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) +set(TA_TRACKED_MADNESS_TAG 41324ea8f2c04df687ac2095c9001230db83b5cc) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) -set(TA_TRACKED_TTG_PREVIOUS_TAG 1251bec25e07a74a05e5cd4cdec181a95a9baa66) +set(TA_TRACKED_TTG_TAG 0adff52aa1ebdad013ab3843a7a68c2bb06b60a8) +set(TA_TRACKED_TTG_PREVIOUS_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) From dca3d5ecefc51bfb5ee82fe8fcb84aa991148c05 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 30 May 2023 19:47:00 -0400 Subject: [PATCH 12/14] microopt --- src/TiledArray/math/linalg/scalapack/block_cyclic.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/math/linalg/scalapack/block_cyclic.h b/src/TiledArray/math/linalg/scalapack/block_cyclic.h index 902312788b..34e2a17b38 100644 --- a/src/TiledArray/math/linalg/scalapack/block_cyclic.h +++ b/src/TiledArray/math/linalg/scalapack/block_cyclic.h @@ -133,7 +133,7 @@ class BlockCyclicMatrix : public madness::WorldObject> { template >> - Tile extract_submatrix(std::vector lo, std::vector up) { + Tile extract_submatrix(std::array lo, std::array up) { assert(bc_dist_.i_own(lo[0], lo[1])); auto [i_st, j_st] = bc_dist_.local_indx(lo[0], lo[1]); @@ -247,8 +247,10 @@ class BlockCyclicMatrix : public madness::WorldObject> { const auto j_block_end = std::min(n, j_block_begin + nb); // Cut block if necessary to adhere to tile dimensions - const auto i_last = std::min(i_block_end, static_cast(up[0])); - const auto j_last = std::min(j_block_end, static_cast(up[1])); + const auto i_last = + std::min(i_block_end, static_cast(up[0])); + const auto j_last = + std::min(j_block_end, static_cast(up[1])); // Calculate extents of the block to be copied i_extent = i_last - i; @@ -263,8 +265,8 @@ class BlockCyclicMatrix : public madness::WorldObject> { local_mat_.block(i_local, j_local, i_extent, j_extent); } else { - std::vector lo{i, j}; - std::vector up{i_last, j_last}; + std::array lo{i, j}; + std::array up{i_last, j_last}; madness::Future> remtile_fut = world_base_t::send( owner(i, j), &BlockCyclicMatrix::template extract_submatrix>, From dea24e846d2f7876780666462e9b3e6ab187fb99 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jul 2023 14:36:53 -0400 Subject: [PATCH 13/14] minor fixups in tensor.h to support operation with nonstandard allocators ... 
addresses https://github.com/ValeevGroup/mpqc4/issues/412 --- src/TiledArray/tensor/tensor.h | 57 ++++++++++++++++++---------------- tests/tensor.cpp | 14 +++++++++ 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index b0141faa19..3c10ba4077 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -60,9 +60,9 @@ template class Tensor { // meaningful error if T& is not assignable, see // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48101 - static_assert( - std::is_assignable, T>::value, - "Tensor: T must be an assignable type (e.g. cannot be const)"); + static_assert(std::is_assignable, T>::value, + "Tensor: T must be an assignable type (e.g. " + "cannot be const)"); #ifdef TA_TENSOR_MEM_TRACE template @@ -80,16 +80,17 @@ class Tensor { typedef typename range_type::ordinal_type size_type; ///< Size type (to meet the container concept) typedef Allocator allocator_type; ///< Allocator type - typedef - typename allocator_type::value_type value_type; ///< Array element type - typedef - typename allocator_type::reference reference; ///< Element reference type - typedef typename allocator_type::const_reference - const_reference; ///< Element reference type - typedef typename allocator_type::pointer pointer; ///< Element pointer type - typedef typename allocator_type::const_pointer + typedef typename std::allocator_traits::value_type + value_type; ///< Array element type + typedef std::add_lvalue_reference_t + reference; ///< Element (lvalue) reference type + typedef std::add_lvalue_reference_t> + const_reference; ///< Element (const lvalue) reference type + typedef typename std::allocator_traits::pointer + pointer; ///< Element pointer type + typedef typename std::allocator_traits::const_pointer const_pointer; ///< Element const pointer type - typedef typename allocator_type::difference_type + typedef typename std::allocator_traits::difference_type difference_type; ///< Difference type typedef pointer iterator; ///< Element iterator type typedef const_pointer const_iterator; ///< Element const iterator type @@ -1359,7 +1360,9 @@ class Tensor { auto binary(const Right& right, Op&& op) const { using result_value_type = decltype(op( std::declval(), std::declval&>())); - return Tensor(*this, right, op); + using result_allocator_type = typename std::allocator_traits< + Allocator>::template rebind_alloc; + return Tensor(*this, right, op); } /// Use a binary, element wise operation to construct a new, permuted tensor @@ -1386,7 +1389,9 @@ class Tensor { if constexpr (!is_tot) { using result_value_type = decltype(op( std::declval(), std::declval&>())); - using ResultTensor = Tensor; + using result_allocator_type = typename std::allocator_traits< + Allocator>::template rebind_alloc; + using ResultTensor = Tensor; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation return ResultTensor(*this, right, op, outer(perm)); @@ -1696,7 +1701,7 @@ class Tensor { /// elements of \c this and \c right template , Right>>> + detail::tensors_have_equal_nested_rank_v>> Tensor subt(const Right& right) const { return binary( right, [](const value_type& l, const value_type& r) -> decltype(auto) { @@ -2490,8 +2495,8 @@ std::size_t Tensor::trace_if_larger_than_ = std::numeric_limits::max(); #endif -template -Tensor operator*(const Permutation& p, const Tensor& t) { +template +Tensor operator*(const Permutation& p, const Tensor& t) { return t.permute(p); } @@ -2543,11 +2548,11 
@@ template void gemm(Alpha alpha, const Tensor& A, const Tensor& B, Beta beta, Tensor& C, const math::GemmHelper& gemm_helper) { - static_assert( - !detail::is_tensor_of_tensor_v, Tensor, - Tensor>, - "TA::Tensor::gemm without custom element op is only applicable to " - "plain tensors"); + static_assert(!detail::is_tensor_of_tensor_v, Tensor, + Tensor>, + "TA::Tensor::gemm without custom element op is " + "only applicable to " + "plain tensors"); { // Check that tensor C is not empty and has the correct rank TA_ASSERT(!C.empty()); @@ -2705,16 +2710,16 @@ bool operator!=(const Tensor& a, const Tensor& b) { namespace detail { -/// Implements taking the trace of a Tensor (\c T is a numeric type) +/// Implements taking the trace of a Tensor /// /// \tparam T The type of the elements in the tensor. For this specialization /// to be considered must satisfy the concept of numeric type. /// \tparam A The type of the allocator for the tensor template struct Trace, detail::enable_if_numeric_t> { - decltype(auto) operator()(const Tensor& t) const { - using size_type = typename Tensor::size_type; - using value_type = typename Tensor::value_type; + decltype(auto) operator()(const Tensor& t) const { + using size_type = typename Tensor::size_type; + using value_type = typename Tensor::value_type; const auto range = t.range(); // Get pointers to the range data diff --git a/tests/tensor.cpp b/tests/tensor.cpp index 1281e5d164..be214ef841 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -724,6 +724,20 @@ BOOST_AUTO_TEST_CASE(block) { #endif } +BOOST_AUTO_TEST_CASE(allocator) { + TensorD x(r, 1.0); + Tensor> y(r, 1.0); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + BOOST_REQUIRE_NO_THROW(x.add_to(y)); + BOOST_REQUIRE_NO_THROW(x.subt_to(y)); + BOOST_REQUIRE_NO_THROW(x.mult_to(y)); +} + BOOST_AUTO_TEST_CASE(rebind) { static_assert( std::is_same_v>, TensorZ>); From 843b746ebd1805af8f992b8a41faee7462021347 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 28 Jul 2023 07:06:19 -0400 Subject: [PATCH 14/14] amend https://github.com/ValeevGroup/tiledarray/commit/c6b987370755605f7d8538c59e5f4cf072a17d3c to use TA-allocated streams --- tests/librett.cpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/tests/librett.cpp b/tests/librett.cpp index cdb2bcf6ce..3785071071 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -27,8 +27,6 @@ #include #include "unit_test_config.h" -#include - struct LibreTTFixture { // LibreTTFixture() // : A(100), @@ -71,8 +69,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { TiledArray::permutation_to_col_major(perm); librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; status = @@ -121,8 +118,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({B, A}); @@ -181,8 +177,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = 
TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -245,8 +240,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({A, B, C}); @@ -303,8 +297,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({A, A}); @@ -354,8 +347,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({B, A}); @@ -405,8 +397,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; // b(k,i,j) = a(i,j,k)
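Because patch 13's changes sit in the middle of a long diff above, a compact illustration of what they enable is collected here rather than inline: with the `std::allocator_traits`-based member typedefs and the `rebind_alloc` used in `Tensor::binary()`, tensors that differ only in allocator type can be mixed in element-wise arithmetic. The sketch below simply mirrors the new `allocator` unit test and is not itself part of the patch set; `std::allocator<double>` is assumed to differ from `TensorD`'s default allocator.

  TiledArray::Range r{4, 4};
  TiledArray::TensorD x(r, 1.0);
  TiledArray::Tensor<double, std::allocator<double>> y(r, 2.0);

  // binary() rebinds x's allocator to the result value type, so mixed-allocator
  // expressions now compile:
  auto z = x.add(y);  // every element of z is 3.0
  x.add_to(y);        // the in-place variants exercised by the unit test also work
  x.subt_to(y);
  x.mult_to(y);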