From 4ceb416130733f9a01fb342e7436f759284a8633 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 8 Nov 2023 10:09:27 -0500 Subject: [PATCH 01/88] [unit] enabled tot x t test, does not compile @bimalgaudel will fix --- src/TiledArray/einsum/tiledarray.h | 6 +++--- tests/einsum.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index c248956066..7d4aca0425 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -422,9 +422,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B) { template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { - static_assert(std::is_same::value); - using E = expressions::TsrExpr; - return Einsum::einsum(E(A), E(B), Einsum::idx(cs), world); + using ECT = expressions::TsrExpr; + using ECU = expressions::TsrExpr; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ee06cf099f..45c4d3e399 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -765,7 +765,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From 65f437492715caa61d7177cb82b9bf6013662f58 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 13 Nov 2023 12:28:02 -0500 Subject: [PATCH 02/88] [WIP] T x ToT overload of einsum: first attempt. 
--- src/TiledArray/einsum/tiledarray.h | 225 +++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 7d4aca0425..52dab7477e 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -283,6 +283,231 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; +} // namespace + +template < + typename ArrayT_, typename ArrayToT_, typename... Indices, + typename = std::enable_if_t && IsArrayToT>> +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, + std::tuple, Indices...> cs, + World &world) { + using ArrayT = std::remove_cv_t; + using ArrayToT = std::remove_cv_t; + using Shape = typename ArrayToT::shape_type; + using T = typename ArrayT::value_type; + using ToT = typename ArrayToT::value_type; + + auto a = std::get<0>(Einsum::idx(A)); + auto b = std::get<0>(Einsum::idx(B)); + Einsum::Index c = std::get<0>(cs); + + struct { + std::string a, b, c; + } inner; + if constexpr (std::tuple_size::value == 2) { + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + inner.c = ";" + (std::string)std::get<1>(cs); + } + + // these are "Hadamard" (fused) indices + auto h = a & b & c; + + auto e = (a ^ b); + // contracted indices + auto i = (a & b) - h; + + // cannot be hadamard reduction type operation for this overload + TA_ASSERT(e); + + // no Hadamard indices => standard contraction (or even outer product) + // same a, b, and c => pure Hadamard + TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + + // maps Index to TiledRange1 + // (asserts same index maps to the same TR1 in A, and B) + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + + using ::Einsum::index::permutation; + using TiledArray::Permutation; + + auto arrayTermA = 
ArrayTerm{A.array(), a}; + auto arrayTermB = ArrayTerm{B.array(), b}; + + { + auto ei = (e + i & arrayTermA.idx); + if (arrayTermA.idx != h + ei) + arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); + arrayTermA.expr = ei; + } + + { + auto ei = (e + i & arrayTermB.idx); + if (arrayTermB.idx != h + ei) + arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); + arrayTermB.expr = ei; + } + + ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; + for (auto idx : e) { + C.tiles *= Range(range_map[idx].tiles_range()); + } + if (C.idx != h + e) { + C.permutation = permutation(h + e, C.idx); + } + C.expr = e; + + struct { + RangeProduct tiles; + std::vector> batch; + } H; + + for (auto idx : h) { + H.tiles *= Range(range_map[idx].tiles_range()); + H.batch.push_back({}); + for (auto r : range_map[idx]) { + H.batch.back().push_back(Range{r}.size()); + } + } + + using Index = Einsum::Index; + + // generalized contraction + { + auto ei = (e + i & arrayTermA.idx); + arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); + } + + { + auto ei = (e + i & arrayTermB.idx); + arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); + } + + std::vector> worlds; + std::vector> local_tiles; + + // iterates over tiles of hadamard indices + for (Index h : H.tiles) { + auto &A = arrayTermA; + auto &B = arrayTermB; + + auto own = A.own(h) || B.own(h); + auto comm = world.mpi.comm().Split(own, world.rank()); + worlds.push_back(std::make_unique(comm)); + auto &owners = worlds.back(); + if (!own) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } + + { + arrayTermA.local_tiles.clear(); + const Permutation &P = arrayTermA.permutation; + + for (Index ei : arrayTermA.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermA.array.is_local(idx)) continue; 
+ if (arrayTermA.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermA.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermA.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermA.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermA.array.pmap()->is_replicated(); + arrayTermA.ei = TiledArray::make_array( + *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), + arrayTermA.local_tiles.end(), replicated); + } + + { + arrayTermB.local_tiles.clear(); + const Permutation &P = arrayTermB.permutation; + + for (Index ei : arrayTermB.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermB.array.is_local(idx)) continue; + if (arrayTermB.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermB.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermB.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermB.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermB.array.pmap()->is_replicated(); + arrayTermB.ei = TiledArray::make_array( + *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), + arrayTermB.local_tiles.end(), replicated); + } + + // todo + // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + A.ei.defer_deleter_to_next_fence(); + B.ei.defer_deleter_to_next_fence(); + A.ei = ArrayT(); + B.ei = ArrayToT(); + // why omitting this fence leads to deadlock? 
+ owners->gop.fence(); + for (Index e : C.tiles) { + if (!C.ei.is_local(e)) continue; + if (C.ei.is_zero(e)) continue; + // TODO no need for immediate evaluation + auto tile = C.ei.find_local(e).get(); + assert(tile.batch_size() == batch); + const Permutation &P = C.permutation; + auto c = apply(P, h + e); + auto shape = C.array.trange().tile(c); + shape = apply_inverse(P, shape); + tile = tile.reshape(shape); + if (P) tile = tile.permute(P); + local_tiles.push_back({c, tile}); + } + // mark for lazy deletion + C.ei = ArrayToT(); + } + + if constexpr (!Shape::is_dense()) { + TiledRange tiled_range = TiledRange(range_map[c]); + std::vector> tile_norms; + for (auto &[index, tile] : local_tiles) { + tile_norms.push_back({index, tile.norm()}); + } + Shape shape(world, tile_norms, tiled_range); + C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + } + + for (auto &[index, tile] : local_tiles) { + if (C.array.is_zero(index)) continue; + C.array.set(index, tile); + } + + for (auto &w : worlds) { + w->gop.fence(); + } + + return C.array; +} + +template && IsArrayToT>> +auto einsum(expressions::TsrExpr B, expressions::TsrExpr A, + std::tuple, Indices...> cs, + World &world) { + return einsum(A, B, cs, world); +} + /// Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. 
From ab0698dc9f95fe0609ac52a3b428408bccef7ba2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 14 Nov 2023 14:34:05 -0500 Subject: [PATCH 03/88] tiny step towards supporting T*ToT in expr --- src/TiledArray/tensor/type_traits.h | 7 ++++--- src/TiledArray/tile_op/contract_reduce.h | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index eed84c6026..fd197c8cdf 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -114,7 +114,7 @@ struct is_nested_tensor { /// @c is_nested_tensor_v is an alias for @c /// is_nested_tensor::value template -constexpr const bool is_nested_tensor_v = is_nested_tensor::value; +inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -150,7 +150,7 @@ struct is_tensor { /// @tparam Ts a parameter pack /// @c is_tensor_v is an alias for @c is_tensor::value template -constexpr const bool is_tensor_v = is_tensor::value; +inline constexpr const bool is_tensor_v = is_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -172,7 +172,8 @@ struct is_tensor_of_tensor { /// @c is_tensor_of_tensor_v is an alias for @c /// is_tensor_of_tensor::value template -constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value; +inline constexpr const bool is_tensor_of_tensor_v = + is_tensor_of_tensor::value; //////////////////////////////////////////////////////////////////////////////// diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index 48b7936d26..d9d87d59c8 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -64,17 +64,20 @@ class ContractReduceBase { using elem_muladd_op_type = void(result_value_type&, const left_value_type&, const 
right_value_type&); - static_assert( - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v, - "ContractReduce can only handle plain tensors or nested tensors " - "(tensors-of-tensors); mixed contractions are not supported"); static constexpr bool plain_tensors = - !(TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v); + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v; + static constexpr bool nested_tensors = + TiledArray::detail::is_nested_tensor_v; + static constexpr bool mixed_tensors = !plain_tensors && !nested_tensors; + static_assert(!mixed_tensors || + (mixed_tensors && + TiledArray::detail::is_nested_tensor_v), + "ContractReduce applied to 1 plain tensor and 1 nested tensor " + "must produce a nested tensor " + "(tensors-of-tensors)"); private: struct Impl { From a9a6b58958c444b8b1900b345bae0993716d5c7d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 12:41:58 -0500 Subject: [PATCH 04/88] [WIP]: Make binary_egine less restrictive on left and right arg types. 
--- src/TiledArray/einsum/tiledarray.h | 21 ++++++++++++--------- src/TiledArray/expressions/binary_engine.h | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 52dab7477e..09640d31f6 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -309,7 +309,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, Einsum::Index c = std::get<0>(cs); struct { - std::string a, b, c; + std::string b, c; } inner; if constexpr (std::tuple_size::value == 2) { inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); @@ -319,16 +319,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // these are "Hadamard" (fused) indices auto h = a & b & c; - auto e = (a ^ b); // contracted indices auto i = (a & b) - h; + // contraction not allowed in tensor x tensor-of-tensor + TA_ASSERT(!i); - // cannot be hadamard reduction type operation for this overload - TA_ASSERT(e); - - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + // indices exclusively in 'a' or exclusively in 'b' + auto e = (a ^ b); // maps Index to TiledRange1 // (asserts same index maps to the same TR1 in A, and B) @@ -364,6 +361,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; + arrayTermB.expr += inner.b; + C.expr += inner.c; + struct { RangeProduct tiles; std::vector> batch; @@ -453,7 +453,10 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } // todo - // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + + // + A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); A.ei = ArrayT(); diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 4758ab0069..93192e2b5e 
100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -146,11 +146,10 @@ class BinaryEngine : public ExprEngine { TiledArray::detail::is_tensor_of_tensor_v; constexpr bool right_tile_is_tot = TiledArray::detail::is_tensor_of_tensor_v; - static_assert(!(left_tile_is_tot ^ right_tile_is_tot), - "ContEngine can only handle tensors of same nested-ness " - "(both plain or both ToT)"); constexpr bool args_are_plain_tensors = !left_tile_is_tot && !right_tile_is_tot; + constexpr bool args_are_mixed_tensors = + left_tile_is_tot ^ right_tile_is_tot; if (args_are_plain_tensors && (left_outer_permtype_ == PermutationType::matrix_transpose || left_outer_permtype_ == PermutationType::identity)) { @@ -175,6 +174,20 @@ class BinaryEngine : public ExprEngine { right_inner_permtype_ == PermutationType::identity))) { right_.permute_tiles(false); } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity))) { + left_.permute_tiles(false); + } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity))) { + right_.permute_tiles(false); + } } public: From e4eb2c9409385639a6c1fff5fae19b02ceb2ce8e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 20 Nov 2023 14:06:14 -0500 Subject: [PATCH 05/88] moar ToT * T progress --- src/TiledArray/expressions/cont_engine.h | 299 ++++++++++++++--------- src/TiledArray/expressions/mult_engine.h | 4 +- src/TiledArray/expressions/product.h | 3 + src/TiledArray/tile_op/scal.h | 2 + tests/einsum.cpp | 8 +- 5 files changed, 194 insertions(+), 122 deletions(-) diff --git 
a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 35c2f34199..9a1cb9f5f9 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -107,15 +107,26 @@ class ContEngine : public BinaryEngine { protected: op_type op_; ///< Tile operation - using tile_element_type = typename value_type::value_type; - std::function - inner_tile_nonreturn_op_; ///< Tile element operation (only non-null for - ///< nested tensor expressions) - std::function - inner_tile_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns - ///< the result + + // tile types of the result and (after evaluation) left and right arguments + using result_tile_type = value_type; + using left_tile_type = typename EngineTrait::eval_type; + using right_tile_type = typename EngineTrait::eval_type; + + // tile element types of the result and (after evaluation) left and right + // arguments + using result_tile_element_type = typename result_tile_type::value_type; + using left_tile_element_type = typename left_tile_type::value_type; + using right_tile_element_type = typename right_tile_type::value_type; + + std::function + element_nonreturn_op_; ///< Tile element operation (only non-null for + ///< nested tensor expressions) + std::function + element_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns + ///< the result TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction size_type K_ = 1; ///< Inner dimension size @@ -239,8 +250,8 @@ class ContEngine : public BinaryEngine { // precondition checks // 1. 
if ToT inner tile op has been initialized if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - TA_ASSERT(inner_tile_nonreturn_op_); - TA_ASSERT(inner_tile_return_op_); + TA_ASSERT(element_nonreturn_op_); + TA_ASSERT(element_return_op_); } // Initialize children @@ -271,7 +282,7 @@ class ContEngine : public BinaryEngine { op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), (permute_tiles_ ? perm_ : BipartitePermutation{}), - this->inner_tile_nonreturn_op_); + this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer(perm_)); shape_ = ContEngine_::make_shape(outer(perm_)); @@ -284,7 +295,7 @@ class ContEngine : public BinaryEngine { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - BipartitePermutation{}, this->inner_tile_nonreturn_op_); + BipartitePermutation{}, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -457,120 +468,172 @@ class ContEngine : public BinaryEngine { protected: void init_inner_tile_op(const IndexList& inner_target_indices) { - if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - using inner_tile_type = typename value_type::value_type; + if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { + constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< + result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || inner_prod == TensorProduct::Hadamard); if (inner_prod == TensorProduct::Contraction) { - using inner_tile_type = typename value_type::value_type; - using contract_inner_tile_type = - TiledArray::detail::ContractReduce; - // factor_ is absorbed into inner_tile_nonreturn_op_ - auto contrreduce_op = - (inner_target_indices != 
inner(this->indices_)) - ? contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_), - (this->permute_tiles_ ? inner(this->perm_) - : Permutation{})) - : contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_)); - this->inner_tile_nonreturn_op_ = [contrreduce_op]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - contrreduce_op(result, left, right); - }; + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + using op_type = TiledArray::detail::ContractReduce< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type>; + // factor_ is absorbed into inner_tile_nonreturn_op_ + auto contrreduce_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_), + (this->permute_tiles_ ? inner(this->perm_) + : Permutation{})) + : op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_)); + this->element_nonreturn_op_ = + [contrreduce_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + }; + } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { - // inner tile op depends on the outer op ... e.g. 
if outer op - // is contract then inner must implement (ternary) multiply-add; - // if the outer is hadamard then the inner is binary multiply - const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { - using base_op_type = - TiledArray::detail::Mult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(), this->permute_tiles_ - ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type()); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } - }; - } else { - using base_op_type = - TiledArray::detail::ScalMult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(this->factor_), - this->permute_tiles_ ? 
inner(this->perm_) - : Permutation{}) - : op_type(base_op_type(this->factor_)); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + // inner tile op depends on the outer op ... e.g. if outer op + // is contract then inner must implement (ternary) multiply-add; + // if the outer is hadamard then the inner is binary multiply + const auto outer_prod = this->product_type(); + if (this->factor_ == 1) { + using base_op_type = + TiledArray::detail::Mult; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(), this->permute_tiles_ + ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type()); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } else { + using base_op_type = TiledArray::detail::ScalMult< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type, false, false>; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(this->factor_), + this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type(this->factor_)); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } + } // ToT x ToT + } else if (inner_prod == TensorProduct::General) { + TA_ASSERT(!tot_x_tot); + constexpr bool tot_x_t = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + constexpr bool t_x_tot = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + if constexpr (tot_x_t || t_x_tot) { + using arg_tile_element_type = + std::conditional_t; + using scalar_type = + std::conditional_t; + + auto scal_op = [do_perm = this->permute_tiles_, + perm = this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}]( + const left_tile_element_type& left, + const right_tile_element_type& right) + -> result_tile_element_type { + using TiledArray::scale; + if constexpr (tot_x_t) { + if (do_perm) + return scale(left, right, perm); + else + return scale(left, right); + } else if constexpr (tot_x_t) { + if (do_perm) + return scale(right, left, perm); + else + return scale(right, left); + } else + abort(); // unreachable }; + this->element_nonreturn_op_ = + [scal_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + result = scal_op(left, right); + }; } } else abort(); // unsupported TensorProduct type - TA_ASSERT(inner_tile_nonreturn_op_); - this->inner_tile_return_op_ = - [inner_tile_nonreturn_op = this->inner_tile_nonreturn_op_]( - const inner_tile_type& left, const inner_tile_type& right) { - inner_tile_type result; - inner_tile_nonreturn_op(result, left, right); - return result; - }; + TA_ASSERT(element_nonreturn_op_); + this->element_return_op_ = [inner_tile_nonreturn_op = + this->element_nonreturn_op_]( + const left_tile_element_type& left, + const right_tile_element_type& right) { + result_tile_element_type result; + inner_tile_nonreturn_op(result, left, right); + return result; + }; } } diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index a53133d4b0..91924efeb2 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -406,7 +406,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_)); + return op_type(op_base_type(this->element_return_op_)); } else abort(); } else { // plain tensors @@ -431,7 +431,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type(), perm); } else if (inner_prod 
== TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_), perm); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index d364764964..381b1f485c 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -57,6 +57,9 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Hadamard; else result = TensorProduct::Contraction; + } else if ((left_indices && !right_indices) || + (!left_indices && right_indices)) { // used for ToT*T or T*ToT + result = TensorProduct::General; } return result; } diff --git a/src/TiledArray/tile_op/scal.h b/src/TiledArray/tile_op/scal.h index 54d5337ed4..a89770c5a7 100644 --- a/src/TiledArray/tile_op/scal.h +++ b/src/TiledArray/tile_op/scal.h @@ -128,6 +128,8 @@ class Scal { return Scal_::template eval(arg); } + void set_factor(const scalar_type factor) { factor_ = factor; } + }; // class Scal } // namespace detail diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 45c4d3e399..3033936381 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -764,8 +764,12 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work - tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // will try to make this work FIRST since this is used by the einsum code + // below + tot_type out; + out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); + // will try to make this work NEXT + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From b80d1c44c94963ce1b08d516aab5b873cbb3b8ec Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 06/88] [skip_ci] add permutation 
optimizer for general case: supports inner operation between tot * t. --- src/TiledArray/expressions/permopt.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..dc029b73a1 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -527,6 +527,18 @@ class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { } }; +/// +/// +/// +class GeneralPermutationOptimizer : public GEMMPermutationOptimizer { + public: + GeneralPermutationOptimizer(const GeneralPermutationOptimizer&) = default; + GeneralPermutationOptimizer& operator=(const GeneralPermutationOptimizer&) = + default; + virtual ~GeneralPermutationOptimizer() = default; + using GEMMPermutationOptimizer::GEMMPermutationOptimizer; +}; + inline std::shared_ptr make_permutation_optimizer( TensorProduct product_type, const IndexList& left_indices, const IndexList& right_indices, bool prefer_to_permute_left) { @@ -540,6 +552,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } @@ -559,6 +574,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } From c199457ec5729ccb20e403ff7b1a08e5ac5617f0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 07/88] add permutation optimizer for scaling --- src/CMakeLists.txt | 13 +-- src/TiledArray/expressions/permopt.cpp | 32 +++++++ src/TiledArray/expressions/permopt.h | 112 
+++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 src/TiledArray/expressions/permopt.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 55227c2093..6e6c708891 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,7 +100,6 @@ TiledArray/dist_eval/contraction_eval.h TiledArray/dist_eval/dist_eval.h TiledArray/dist_eval/unary_eval.h TiledArray/einsum/index.h -TiledArray/einsum/index.cpp TiledArray/einsum/range.h TiledArray/einsum/string.h TiledArray/expressions/add_engine.h @@ -195,13 +194,10 @@ TiledArray/util/bug.h TiledArray/util/function.h TiledArray/util/initializer_list.h TiledArray/util/logger.h -TiledArray/util/ptr_registry.cpp TiledArray/util/ptr_registry.h -TiledArray/util/random.cpp TiledArray/util/random.h TiledArray/util/singleton.h TiledArray/util/threads.h -TiledArray/util/threads.cpp TiledArray/util/thread_specific.h TiledArray/util/time.h TiledArray/util/vector.h @@ -243,10 +239,15 @@ TiledArray/tensor_impl.cpp TiledArray/array_impl.cpp TiledArray/dist_array.cpp TiledArray/version.cpp -TiledArray/util/backtrace.cpp -TiledArray/util/bug.cpp +TiledArray/einsum/index.cpp +TiledArray/expressions/permopt.cpp TiledArray/math/linalg/basic.cpp TiledArray/math/linalg/rank-local.cpp +TiledArray/util/backtrace.cpp +TiledArray/util/bug.cpp +TiledArray/util/ptr_registry.cpp +TiledArray/util/random.cpp +TiledArray/util/threads.cpp ) # feed TILEDARRAY_GIT_REVISION and TILEDARRAY_GIT_DESCRIPTION to TiledArray/version.cpp only to avoid recompiling everything set_source_files_properties( diff --git a/src/TiledArray/expressions/permopt.cpp b/src/TiledArray/expressions/permopt.cpp new file mode 100644 index 0000000000..9b125fdc04 --- /dev/null +++ b/src/TiledArray/expressions/permopt.cpp @@ -0,0 +1,32 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2020 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * permopt.cpp + * Nov 21, 2023 + * + */ + +#include + +namespace TiledArray::expressions { + +IndexList ScalePermutationOptimizer::null_indices_; + +} // namespace TiledArray::expressions diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..998ea78efe 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -51,6 +52,56 @@ inline blas::Op to_cblas_op(PermutationType permtype) { : math::blas::NoTranspose; } +/// Optimizer of permutations for a unary operation +class UnaryOpPermutationOptimizer { + public: + /// construct using initial indices for the argument + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& argument_indices) + : argument_indices_(argument_indices) {} + + /// construct using initial indices for the argument, + /// and the desired result indices + /// \param result_indices the desired result index list + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& result_indices, + const IndexList& argument_indices) + : result_indices_(result_indices), 
argument_indices_(argument_indices) { + TA_ASSERT(argument_indices_.is_permutation(argument_indices_)); + target_result_indices_ = argument_indices_; + } + + UnaryOpPermutationOptimizer() = delete; + UnaryOpPermutationOptimizer(const UnaryOpPermutationOptimizer&) = default; + UnaryOpPermutationOptimizer& operator=(const UnaryOpPermutationOptimizer&) = + default; + virtual ~UnaryOpPermutationOptimizer() = default; + + /// \return the desired result indices + const IndexList& result_indices() const { + TA_ASSERT(result_indices_); + return result_indices_; + } + /// \return initial argument indices + const IndexList& argument_indices() const { return argument_indices_; } + + /// \return the proposed argument index list + const IndexList& target_argument_indices() const { + return target_result_indices_; + } + /// \return the proposed result index list (not necessarily same as that + /// returned by result_indices()) + const IndexList& target_result_indices() const { + return target_result_indices_; + } + /// \return the type of permutation bringing the initial left index list to + /// the target left index list + PermutationType argument_permtype() const { return PermutationType::general; } + + private: + IndexList result_indices_, argument_indices_, target_result_indices_; +}; + /// Abstract optimizer of permutations for a binary operation class BinaryOpPermutationOptimizer { public: @@ -479,6 +530,61 @@ class HadamardPermutationOptimizer : public BinaryOpPermutationOptimizer { IndexList target_result_indices_; }; +// clang-format off +/// Implements BinaryOpPermutationOptimizer interface for a scale operation viewed as a binary tensor product, i.e. 
+/// a tensor product between an order-0 tensor and an arbitrary tensor +// clang-format on +class ScalePermutationOptimizer : public BinaryOpPermutationOptimizer { + public: + ScalePermutationOptimizer(const ScalePermutationOptimizer&) = default; + ScalePermutationOptimizer& operator=(const ScalePermutationOptimizer&) = + default; + ~ScalePermutationOptimizer() = default; + + ScalePermutationOptimizer(const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(left_indices, right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices), + target_result_indices_(left_argument_is_scalar_ ? right_indices + : left_indices) {} + + ScalePermutationOptimizer(const IndexList& result_indices, + const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(result_indices, left_indices, + right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices) { + const auto& arg_indices = + left_argument_is_scalar_ ? right_indices : left_indices; + TA_ASSERT(arg_indices.is_permutation(result_indices)); + target_result_indices_ = arg_indices; + } + + const IndexList& target_left_indices() const override final { + return !left_argument_is_scalar_ ? target_result_indices_ : null_indices_; + } + const IndexList& target_right_indices() const override final { + return left_argument_is_scalar_ ? 
target_result_indices_ : null_indices_; + } + const IndexList& target_result_indices() const override final { + return target_result_indices_; + } + PermutationType left_permtype() const override final { + return PermutationType::general; + } + PermutationType right_permtype() const override final { + return PermutationType::general; + } + TensorProduct op_type() const override final { return TensorProduct::Scale; } + + private: + bool left_argument_is_scalar_; + IndexList target_result_indices_; + static IndexList null_indices_; +}; + class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { public: NullBinaryOpPermutationOptimizer(const NullBinaryOpPermutationOptimizer&) = @@ -540,6 +646,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared(left_indices, + right_indices); default: abort(); } @@ -559,6 +668,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared( + target_indices, left_indices, right_indices); default: abort(); } From bff7d2888cd69e5ef4b9bb4ed86e775e6528c4db Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Nov 2023 16:33:46 -0500 Subject: [PATCH 08/88] expression-level support for ToT x T (and vice versa) implemented, need to test --- src/TiledArray/expressions/cont_engine.h | 19 ++++----- src/TiledArray/expressions/product.h | 5 ++- tests/einsum.cpp | 49 +++++++++++++++++++++--- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 9a1cb9f5f9..5ec69c7d0d 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -158,9 +158,10 @@ class 
ContEngine : public BinaryEngine { TensorProduct inner_product_type() const { TA_ASSERT(inner_product_type_ != TensorProduct::Invalid); // init_indices() must initialize this - /// only Hadamard and contraction are supported now + /// only Hadamard, contraction, and scale are supported now TA_ASSERT(inner_product_type_ == TensorProduct::Hadamard || - inner_product_type_ == TensorProduct::Contraction); + inner_product_type_ == TensorProduct::Contraction || + inner_product_type_ == TensorProduct::Scale); return inner_product_type_; } @@ -473,7 +474,8 @@ class ContEngine : public BinaryEngine { result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || - inner_prod == TensorProduct::Hadamard); + inner_prod == TensorProduct::Hadamard || + inner_prod == TensorProduct::Scale); if (inner_prod == TensorProduct::Contraction) { TA_ASSERT(tot_x_tot); if constexpr (tot_x_tot) { @@ -577,8 +579,8 @@ class ContEngine : public BinaryEngine { } }; } - } // ToT x ToT - } else if (inner_prod == TensorProduct::General) { + } // ToT x T or T x ToT + } else if (inner_prod == TensorProduct::Scale) { TA_ASSERT(!tot_x_tot); constexpr bool tot_x_t = TiledArray::detail::is_tensor_of_tensor_v { std::conditional_t; - auto scal_op = [do_perm = this->permute_tiles_, - perm = this->permute_tiles_ ? inner(this->perm_) + auto scal_op = [perm = this->permute_tiles_ ? 
inner(this->perm_) : Permutation{}]( const left_tile_element_type& left, const right_tile_element_type& right) -> result_tile_element_type { using TiledArray::scale; if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(left, right, perm); else return scale(left, right); } else if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(right, left, perm); else return scale(right, left); diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index 381b1f485c..7111b7831b 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -39,6 +39,9 @@ enum class TensorProduct { Contraction, /// free, fused, and contracted indices General, + /// no indices on one, free indices on the other; only used for inner index + /// products in mixed nested products (ToT x T) + Scale, /// invalid Invalid = -1 }; @@ -59,7 +62,7 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Contraction; } else if ((left_indices && !right_indices) || (!left_indices && right_indices)) { // used for ToT*T or T*ToT - result = TensorProduct::General; + result = TensorProduct::Scale; } return result; } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3033936381..ea5529e5b8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -718,6 +718,49 @@ BOOST_AUTO_TEST_SUITE_END() // einsum_tot BOOST_AUTO_TEST_SUITE(einsum_tot_t) +BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + 
Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0)}; + tot_type ref_result(world, ref_result_trange); + // TODO compute ref_result + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); + + // TODO check result against ref_result +} + BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; @@ -764,11 +807,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work FIRST since this is used by the einsum code - // below - tot_type out; - out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); - // will try to make this work NEXT + // will try to make this work // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } From 72e1bcb66e4675e86d067390103f868f0d028033 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 14:54:30 -0500 Subject: [PATCH 09/88] [ci skip] implement 'i,j;m,n * j,k -> i,j,k;m,n' reference evaluation manually. 
--- tests/einsum.cpp | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ea5529e5b8..800d51d3e0 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -793,10 +793,41 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; - tot_type ref_result(world, ref_result_trange); // TODO compute ref_result + // i,j;m,n * j,k => i,j,k;m,n + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), + rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + for (auto const& tile : ref_result) { + tot_type::value_type result_tile{tile.make_range()}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + + using Ix2 = std::array; + using Ix3 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{j, k})); + + res_el = lhs_el.scale(rhs_el); + } + + ref_result.set(tile.index(), result_tile); + } + + std::cout << ref_result << std::endl; ///////////////////////////////////////////////////////// // ToT * T From c6940539f68dfa7eec5b3ba5922d2eb8c77070e9 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 17:34:55 -0500 Subject: [PATCH 10/88] [ci skip] more manual tot * t reference evaluation --- tests/einsum.cpp | 68 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp 
b/tests/einsum.cpp index 800d51d3e0..6501d91a10 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -751,14 +751,58 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { rhs.fill_random(); TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; + rhs_trange.dim(0), lhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - // TODO compute ref_result + + // + // i,l,k,j;n,m = i,j;m,n * k,l + // + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto l = res_ix[1]; + auto k = res_ix[2]; + auto j = res_ix[3]; + + using Ix2 = std::array; + using Ix4 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, l})); + + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{1, 0}); // permute [0,1] -> [1,0] + } + return result_tile; + }; + + using std::begin; + using std::endl; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // TODO check result against ref_result + // todo: fix it + // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + // BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -799,8 +843,11 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { 
rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - for (auto const& tile : ref_result) { - tot_type::value_type result_tile{tile.make_range()}; + // + // why cannot lhs and rhs be captured by ref? + // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; for (auto&& res_ix : result_tile.range()) { auto i = res_ix[0]; auto j = res_ix[1]; @@ -823,11 +870,16 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { res_el = lhs_el.scale(rhs_el); } + return result_tile; + }; - ref_result.set(tile.index(), result_tile); - } + using std::begin; + using std::endl; - std::cout << ref_result << std::endl; + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } ///////////////////////////////////////////////////////// // ToT * T From 29b5dba22c87dd12d4265506e52593b9b026c997 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:04:59 -0500 Subject: [PATCH 11/88] Add equality comparison for SparseShape. --- src/TiledArray/sparse_shape.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index bf51487922..271857a72c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -1742,6 +1742,17 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } +template +constexpr inline bool operator==(const SparseShape& a, + const SparseShape& b) { + return true; +} +template +constexpr inline bool operator!=(const SparseShape& a, + const SparseShape& b) { + return !(a == b); +} + #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From f2945dad86058ee08f7e68acafddf391eb0d186c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:05:40 -0500 Subject: [PATCH 12/88] Validate outer-product type tot * t evaluation using expression layer. 
--- tests/einsum.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6501d91a10..aad4a00c0a 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,9 +800,8 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // todo: fix it - // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - // BOOST_CHECK(are_equal); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From be06fbe6380daeed181ace0815c778c170f8f36d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 11:42:05 -0500 Subject: [PATCH 13/88] [unit] einsum_tot_t pulls remote tiles using strick blocking (dowork=false) also fixed a few typos --- tests/einsum.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index aad4a00c0a..db2731a2e1 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -771,10 +771,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using Ix4 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); @@ -790,7 +790,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); @@ -856,10 +856,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using Ix3 = 
std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -873,7 +873,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); From 3cd64dbbda97a9071d36d67826a63d5b88d6f5c2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 12:04:54 -0500 Subject: [PATCH 14/88] [unit] einsum_tot_t must test ToT*T AND T*ToT (the latter is currently broken due to missing Tensor functionality for binary Scalar*Tensor ops) --- tests/einsum.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index db2731a2e1..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -802,6 +802,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); + + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -887,10 +894,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. 
outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From f246756bd707d319c33f2d536f698904fe9be0dd Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 27 Nov 2023 23:16:39 -0500 Subject: [PATCH 15/88] Avoid code-duplication by generalizing the existing einsum function. --- src/TiledArray/einsum/range.h | 3 +- src/TiledArray/einsum/tiledarray.h | 316 ++++++----------------------- tests/einsum.cpp | 12 +- 3 files changed, 72 insertions(+), 259 deletions(-) diff --git a/src/TiledArray/einsum/range.h b/src/TiledArray/einsum/range.h index 32eb669588..79b409e64d 100644 --- a/src/TiledArray/einsum/range.h +++ b/src/TiledArray/einsum/range.h @@ -14,7 +14,8 @@ using small_vector = TiledArray::container::svector; struct Range { using value_type = int64_t; using iterator = boost::counting_iterator; - template + template , bool> = true> explicit Range(Pair &&pair) : Range(pair.first, pair.second) {} Range(value_type begin, value_type end) : begin_(begin), end_(end) {} auto begin() const { return iterator(begin_); } diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 09640d31f6..1a3840f99f 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -64,13 +64,38 @@ struct ArrayTerm { } }; -template -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; + +template +constexpr bool AreArrayT = IsArrayT && IsArrayT; + +template +constexpr bool AreArrayToT = IsArrayToT && IsArrayToT; + 
+template +constexpr bool AreArraySame = + AreArrayT || AreArrayToT; + +} // namespace + +template +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, World &world) { - using Array = std::remove_cv_t; - using Tensor = typename Array::value_type; - using Shape = typename Array::shape_type; + using ArrayA = std::remove_cv_t; + using ArrayB = std::remove_cv_t; + using ArrayC = std::conditional_t< + AreArraySame, ArrayA, + std::conditional_t, ArrayA, ArrayB>>; + // using Array = ArrayC; + using ResultTensor = typename ArrayC::value_type; + using ResultShape = typename ArrayC::shape_type; auto a = std::get<0>(Einsum::idx(A)); auto b = std::get<0>(Einsum::idx(B)); @@ -91,7 +116,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // no Hadamard indices => standard contraction (or even outer product) // same a, b, and c => pure Hadamard if (!h || (!(a ^ b) && !(b ^ c))) { - Array C; + ArrayC C; C(std::string(c) + inner.c) = A * B; return C; } @@ -108,17 +133,22 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ::Einsum::index::permutation; using TiledArray::Permutation; - ArrayTerm AB[2] = {{A.array(), a}, {B.array(), b}}; + std::tuple, ArrayTerm> AB{{A.array(), a}, + {B.array(), b}}; - for (auto &term : AB) { + auto update_perm_and_indices = [&e = std::as_const(e), &i = std::as_const(i), + &h = std::as_const(h)](auto &term) { auto ei = (e + i & term.idx); if (term.idx != h + ei) { term.permutation = permutation(term.idx, h + ei); } term.expr = ei; - } + }; - ArrayTerm C = {Array(world, TiledRange(range_map[c])), c}; + std::invoke(update_perm_and_indices, std::get<0>(AB)); + std::invoke(update_perm_and_indices, std::get<1>(AB)); + + ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; for (auto idx : e) { C.tiles *= Range(range_map[idx].tiles_range()); } @@ -127,8 +157,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; - AB[0].expr += inner.a; - AB[1].expr += inner.b; + 
std::get<0>(AB).expr += inner.a; + std::get<1>(AB).expr += inner.b; + C.expr += inner.c; struct { @@ -163,7 +194,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - Tensor tile(TiledArray::Range{batch}, typename Tensor::value_type(0)); + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type(0)); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); @@ -193,16 +225,20 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // generalized contraction - for (auto &term : AB) { + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), + &range_map = std::as_const(range_map)](auto &term) { auto ei = (e + i & term.idx); term.ei_tiled_range = TiledRange(range_map[ei]); for (auto idx : ei) { term.tiles *= Range(range_map[idx].tiles_range()); } - } + }; + + std::invoke(update_tr, std::get<0>(AB)); + std::invoke(update_tr, std::get<1>(AB)); std::vector> worlds; - std::vector> local_tiles; + std::vector> local_tiles; // iterates over tiles of hadamard indices for (Index h : H.tiles) { @@ -216,7 +252,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - for (auto &term : AB) { + + auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { term.local_tiles.clear(); const Permutation &P = term.permutation; @@ -232,235 +269,18 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, term.local_tiles.push_back({ei, tile}); } bool replicated = term.array.pmap()->is_replicated(); - term.ei = TiledArray::make_array( + term.ei = TiledArray::make_array( *owners, term.ei_tiled_range, term.local_tiles.begin(), term.local_tiles.end(), replicated); - } - C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - A.ei.defer_deleter_to_next_fence(); - B.ei.defer_deleter_to_next_fence(); - A.ei = 
Array(); - B.ei = Array(); - // why omitting this fence leads to deadlock? - owners->gop.fence(); - for (Index e : C.tiles) { - if (!C.ei.is_local(e)) continue; - if (C.ei.is_zero(e)) continue; - // TODO no need for immediate evaluation - auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); - const Permutation &P = C.permutation; - auto c = apply(P, h + e); - auto shape = C.array.trange().tile(c); - shape = apply_inverse(P, shape); - tile = tile.reshape(shape); - if (P) tile = tile.permute(P); - local_tiles.push_back({c, tile}); - } - // mark for lazy deletion - C.ei = Array(); - } - - if constexpr (!Shape::is_dense()) { - TiledRange tiled_range = TiledRange(range_map[c]); - std::vector> tile_norms; - for (auto &[index, tile] : local_tiles) { - tile_norms.push_back({index, tile.norm()}); - } - Shape shape(world, tile_norms, tiled_range); - C.array = Array(world, TiledRange(range_map[c]), shape); - } - - for (auto &[index, tile] : local_tiles) { - if (C.array.is_zero(index)) continue; - C.array.set(index, tile); - } - - for (auto &w : worlds) { - w->gop.fence(); - } - - return C.array; -} - -namespace { -template -constexpr bool IsArrayT = detail::is_tensor_v; - -template -constexpr bool IsArrayToT = - detail::is_tensor_of_tensor_v; -} // namespace - -template < - typename ArrayT_, typename ArrayToT_, typename... 
Indices, - typename = std::enable_if_t && IsArrayToT>> -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, - std::tuple, Indices...> cs, - World &world) { - using ArrayT = std::remove_cv_t; - using ArrayToT = std::remove_cv_t; - using Shape = typename ArrayToT::shape_type; - using T = typename ArrayT::value_type; - using ToT = typename ArrayToT::value_type; - - auto a = std::get<0>(Einsum::idx(A)); - auto b = std::get<0>(Einsum::idx(B)); - Einsum::Index c = std::get<0>(cs); - - struct { - std::string b, c; - } inner; - if constexpr (std::tuple_size::value == 2) { - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); - inner.c = ";" + (std::string)std::get<1>(cs); - } + }; + std::invoke(retile, std::get<0>(AB)); + std::invoke(retile, std::get<1>(AB)); - // these are "Hadamard" (fused) indices - auto h = a & b & c; - - // contracted indices - auto i = (a & b) - h; - // contraction not allowed in tensor x tensor-of-tensor - TA_ASSERT(!i); - - // indices exclusively in 'a' or exclusively in 'b' - auto e = (a ^ b); - - // maps Index to TiledRange1 - // (asserts same index maps to the same TR1 in A, and B) - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - - using ::Einsum::index::permutation; - using TiledArray::Permutation; - - auto arrayTermA = ArrayTerm{A.array(), a}; - auto arrayTermB = ArrayTerm{B.array(), b}; - - { - auto ei = (e + i & arrayTermA.idx); - if (arrayTermA.idx != h + ei) - arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); - arrayTermA.expr = ei; - } - - { - auto ei = (e + i & arrayTermB.idx); - if (arrayTermB.idx != h + ei) - arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); - arrayTermB.expr = ei; - } - - ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; - for (auto idx : e) { - C.tiles *= Range(range_map[idx].tiles_range()); - } - if (C.idx != h + e) { - C.permutation = permutation(h + e, C.idx); - } - C.expr = e; - - arrayTermB.expr += inner.b; - C.expr += 
inner.c; - - struct { - RangeProduct tiles; - std::vector> batch; - } H; - - for (auto idx : h) { - H.tiles *= Range(range_map[idx].tiles_range()); - H.batch.push_back({}); - for (auto r : range_map[idx]) { - H.batch.back().push_back(Range{r}.size()); - } - } - - using Index = Einsum::Index; - - // generalized contraction - { - auto ei = (e + i & arrayTermA.idx); - arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); - } - - { - auto ei = (e + i & arrayTermB.idx); - arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); - } - - std::vector> worlds; - std::vector> local_tiles; - - // iterates over tiles of hadamard indices - for (Index h : H.tiles) { - auto &A = arrayTermA; - auto &B = arrayTermB; - - auto own = A.own(h) || B.own(h); - auto comm = world.mpi.comm().Split(own, world.rank()); - worlds.push_back(std::make_unique(comm)); - auto &owners = worlds.back(); - if (!own) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - - { - arrayTermA.local_tiles.clear(); - const Permutation &P = arrayTermA.permutation; - - for (Index ei : arrayTermA.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermA.array.is_local(idx)) continue; - if (arrayTermA.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermA.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermA.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermA.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermA.array.pmap()->is_replicated(); - arrayTermA.ei = TiledArray::make_array( - *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), - arrayTermA.local_tiles.end(), replicated); - } - - { - arrayTermB.local_tiles.clear(); - const Permutation &P = 
arrayTermB.permutation; - - for (Index ei : arrayTermB.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermB.array.is_local(idx)) continue; - if (arrayTermB.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermB.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermB.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermB.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermB.array.pmap()->is_replicated(); - arrayTermB.ei = TiledArray::make_array( - *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), - arrayTermB.local_tiles.end(), replicated); - } - - // todo C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - - // - A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); - A.ei = ArrayT(); - B.ei = ArrayToT(); + A.ei = ArrayA(); + B.ei = ArrayB(); // why omitting this fence leads to deadlock? owners->gop.fence(); for (Index e : C.tiles) { @@ -478,17 +298,17 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, local_tiles.push_back({c, tile}); } // mark for lazy deletion - C.ei = ArrayToT(); + C.ei = ArrayC(); } - if constexpr (!Shape::is_dense()) { + if constexpr (!ResultShape::is_dense()) { TiledRange tiled_range = TiledRange(range_map[c]); std::vector> tile_norms; for (auto &[index, tile] : local_tiles) { tile_norms.push_back({index, tile.norm()}); } - Shape shape(world, tile_norms, tiled_range); - C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + ResultShape shape(world, tile_norms, tiled_range); + C.array = ArrayC(world, TiledRange(range_map[c]), shape); } for (auto &[index, tile] : local_tiles) { @@ -503,14 +323,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } -template && IsArrayToT>> -auto einsum(expressions::TsrExpr B, expressions::TsrExpr A, - std::tuple, Indices...> cs, - World &world) { - return einsum(A, B, cs, world); -} - /// 
Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..8eea2884f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); - { // reverse the order - tot_type result; - BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - BOOST_CHECK(are_equal); - } +// { // reverse the order +// tot_type result; +// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); +// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); +// BOOST_CHECK(are_equal); +// } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From e5ec53161ccf22ffb40ddc40a9d2c1b3b29cb7c8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 Nov 2023 10:28:47 -0500 Subject: [PATCH 16/88] In einsum, handle inner index labels when tot times t, or, t times tot arguments are passed. 
--- src/TiledArray/einsum/tiledarray.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 1a3840f99f..eb317e0aef 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -93,7 +93,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ArrayC = std::conditional_t< AreArraySame, ArrayA, std::conditional_t, ArrayA, ArrayB>>; - // using Array = ArrayC; using ResultTensor = typename ArrayC::value_type; using ResultShape = typename ArrayC::shape_type; @@ -105,8 +104,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::string a, b, c; } inner; if constexpr (std::tuple_size::value == 2) { - inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT) + inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); + + if constexpr (IsArrayToT) + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + + static_assert(IsArrayToT || IsArrayToT); inner.c = ";" + (std::string)std::get<1>(cs); } From 8341bbb8cc5b902136cc87e374f19b56ccd2cddb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:00:36 -0500 Subject: [PATCH 17/88] amend https://github.com/ValeevGroup/tiledarray/commit/bff7d2888cd69e5ef4b9bb4ed86e775e6528c4db --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 5ec69c7d0d..21aceae14c 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -609,7 +609,7 @@ class ContEngine : public BinaryEngine { return scale(left, right, perm); else return scale(left, right); - } else if constexpr (tot_x_t) { + } else if constexpr (t_x_tot) { if (perm) return scale(right, left, perm); else From 
56b49a03464294eb629b38e63060e93b98695142 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:02:22 -0500 Subject: [PATCH 18/88] relax type requirements on tensor_init to support mixed (ToT alongside T) invocations, this allows T * ToT expr to compile and unit test to succeed --- src/TiledArray/tensor/kernels.h | 7 ++++--- tests/einsum.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 87db8c1cc6..97f7dc1e5b 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -541,9 +541,10 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors -template ::value>::type* = nullptr> +template < + typename Op, typename TR, typename T1, typename... Ts, + typename std::enable_if::value && + !is_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, const Permutation& perm, TR& result, const T1& tensor1, const Ts&... 
tensors) { TA_ASSERT(!empty(result, tensor1, tensors...)); diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8eea2884f9..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); -// { // reverse the order -// tot_type result; -// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); -// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); -// BOOST_CHECK(are_equal); -// } + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From b75b1fcac72a9f82c95529972e2a20cd6ab2ed56 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 30 Nov 2023 14:06:19 -0500 Subject: [PATCH 19/88] relax Tensor(left,right,binaryelemeop,permutation) ctor constraints --- src/TiledArray/tensor/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 3c10ba4077..f3076c4514 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -492,7 +492,7 @@ class Tensor { /// \param perm The permutation that will be applied to the arguments template < typename T1, typename T2, typename Op, typename Perm, - typename std::enable_if::value && + typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& left, const T2& right, Op&& op, const Perm& perm) : Tensor(outer(perm) * left.range(), 1, default_construct{false}) { From f8d41002c106e8cb54fa79ae02e8b1ca06216c7e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 7 Dec 2023 18:38:25 -0500 Subject: [PATCH 20/88] Support for pure hadamard product between a tot and a t: 
'i,j;m,n * i,j -> i,j;m,n' --- src/TiledArray/expressions/binary_engine.h | 6 +- src/TiledArray/expressions/mult_engine.h | 6 ++ tests/einsum.cpp | 92 ++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 93192e2b5e..411a1c7c13 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -204,8 +204,10 @@ class BinaryEngine : public ExprEngine { /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { if (permute_tiles_) { - TA_ASSERT(left_.indices().size() == target_indices.size()); - TA_ASSERT(right_.indices().size() == target_indices.size()); + TA_ASSERT(left_.indices().size() == target_indices.size() || + (left_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT(right_.indices().size() == target_indices.size() || + (right_.indices().second().size() ^ target_indices.second().size())); init_indices_(target_indices); diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 91924efeb2..9713e0b0df 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -407,6 +407,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); + } else if (inner_prod == TensorProduct::Scale) { + TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type()); } else abort(); } else { // plain tensors @@ -432,6 +435,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type(), perm); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); + } else if (inner_prod == TensorProduct::Scale) { + 
TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..9ea4dd39d3 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -900,6 +900,98 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } +BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_4_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_4_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_5_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_5_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}, + {lhs_elem_4_0, 
lhs_elem_4_1}, + {lhs_elem_5_0, lhs_elem_5_1}}; + TiledRange lhs_trange{{0, 2, 6}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + // + // i,j;m,n = j,i;n,m * i,j + // + TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + using Ix2 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false ); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); + auto rhs_el = + rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute + ); + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); + + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); +} + BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices From 726ebb893e6ad21cfcef92c70ce4600b42b6d9d3 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:56:34 -0500 Subject: [PATCH 21/88] SparseShape inequality comparison added. 
--- src/TiledArray/sparse_shape.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 271857a72c..b589dc73cf 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -797,6 +797,13 @@ class SparseShape { return equal; } + /// Bitwise comparison + /// \param other a SparseShape object + /// \return true if this object and @c other object are bitwise NOT identical + inline bool operator!=(const SparseShape& other) const { + return !(*this == other); + } + private: /// Create a copy of a sub-block of the shape @@ -1742,17 +1749,6 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } -template -constexpr inline bool operator==(const SparseShape& a, - const SparseShape& b) { - return true; -} -template -constexpr inline bool operator!=(const SparseShape& a, - const SparseShape& b) { - return !(a == b); -} - #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From 7fd52d54b02136857eb429da3bb2685f1ee4c77e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:57:16 -0500 Subject: [PATCH 22/88] Disable shape comparison in ToTArrayFixture. --- tests/tot_array_fixture.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 9d46fadcc7..1619a794c8 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -237,6 +237,7 @@ struct ToTArrayFixture { * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons + * TODO: shape comparisons */ template @@ -254,7 +255,7 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if (lhs.shape() != rhs.shape()) return false; + // if (lhs.shape() != rhs.shape()) return false; // Same pmap? 
// if(*lhs.pmap() != *rhs.pmap()) return false; From cdc9db23455dbccef01b7f906a0c7b3fafe11806 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:58:25 -0500 Subject: [PATCH 23/88] Default construction of result tensor tile in `einsum` made more generic. --- src/TiledArray/einsum/tiledarray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index eb317e0aef..48648407cb 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -199,7 +199,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, batch *= H.batch[i].at(h[i]); } ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type(0)); + typename ResultTensor::value_type{}); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); From d2fb429f93504a1996bca7b7355b818f27eefb00 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:00:17 -0500 Subject: [PATCH 24/88] Restore (optional) shape comparison on ToTArrayFixture::are_equal function. 
--- tests/einsum.cpp | 6 +++--- tests/tot_array_fixture.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 9ea4dd39d3..a1c26d1782 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,13 +800,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -988,7 +988,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 1619a794c8..21a9c956c6 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -231,16 +231,15 @@ struct ToTArrayFixture { * - Same type * - Either both are initialized or both are not initialized * - Same MPI context - * - Same shape + * - Same shape (unless the template parameter ShapeCmp is set false) * - Same distribution * - Same tiling * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons - * TODO: shape comparisons */ - template + template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { // Same type @@ -255,7 +254,8 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? 
- // if (lhs.shape() != rhs.shape()) return false; + if constexpr (ShapeCmp) + if (lhs.shape() != rhs.shape()) return false; // Same pmap? // if(*lhs.pmap() != *rhs.pmap()) return false; From 42fb41bd9e1bcd01d7f1171aae9a68dcb033d72b Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:03:38 -0500 Subject: [PATCH 25/88] Relax restricitons on this->product_type() values while calling make_tile_op(). --- src/TiledArray/expressions/mult_engine.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 9713e0b0df..20093b2cec 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -408,7 +408,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type()); } else abort(); @@ -436,7 +435,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type(this->element_return_op_), perm); } else abort(); From 7b7dbb8f8af59af85e0bfc38f3d734e9b2ef2fc7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 11 Dec 2023 07:35:16 -0500 Subject: [PATCH 26/88] Typo. 
--- tests/einsum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index a1c26d1782..ebd9784bfd 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -1269,7 +1269,7 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_abi_cdi_cdab) { "abi,cdi->cdab"); } -BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_ai_abcd) { +BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_bai_abcd) { einsum_tiledarray_check<3, 3, 4>(random(3, 12, 13), random(14, 15, 3), "icd,bai->abcd"); From 02a7db7ab1dc2545b98794d700e3b9854517f564 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 15 Dec 2023 09:28:57 -0500 Subject: [PATCH 27/88] [skip ci] einsum unit test for ij;mn * kj;mn -> ijk;mn --- tests/einsum.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ebd9784bfd..eb2ffe1869 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -580,6 +580,40 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { BOOST_CHECK(are_equal); } +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { + using dist_array_t = DistArray>, DensePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + + auto random_tot = [](TA::Range const& rng) { + TA::Range inner_rng{7,14}; + TA::Tensor t{inner_rng}; + TA::Tensor> result{rng}; + for (auto& e: result) e = t; + return result; + }; + + auto random_tot_darr = [&random_tot](World& world, + TiledRange const& tr) { + dist_array_t result(world, tr); + for (auto it = result.begin(); it != result.end(); ++it) { + auto tile = + TA::get_default_world().taskq.add(random_tot, it.make_range()); + *it = tile; + } + return result; + }; + + TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + auto lhs = random_tot_darr(world, lhs_trange); + + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + auto rhs = random_tot_darr(world, rhs_trange); + dist_array_t result; + BOOST_REQUIRE_NO_THROW( + result = einsum(lhs("i,j;m,n"), 
rhs("k,j;m,n"), "i,j,k;m,n")); +} + BOOST_AUTO_TEST_CASE(xxx) { using dist_array_t = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; @@ -1328,6 +1362,13 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_hji_jih_hj) { "hji,jih->hj"); } +BOOST_AUTO_TEST_CASE(einsum_tiledarray_ik_jk_ijk) { + einsum_tiledarray_check<2, 2, 3>(random(7, 5), + random(14, 5), "ik,jk->ijk"); + einsum_tiledarray_check<2, 2, 3>(sparse_zero(7, 5), sparse_zero(14, 5), + "ik,jk->ijk"); +} + BOOST_AUTO_TEST_CASE(einsum_tiledarray_replicated) { einsum_tiledarray_check<3, 3, 3>(replicated(random(7, 14, 3)), random(7, 15, 3), From f0be0c97d193b5c4df3653f4dfe4179695bb57e6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 15 Dec 2023 10:45:59 -0500 Subject: [PATCH 28/88] Tensor::gemm involving custom elem_op supports batching --- src/TiledArray/tensor/tensor.h | 75 ++++++++++++++++++++++++---------- tests/einsum.cpp | 4 +- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index f3076c4514..c901dc0f4b 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -292,10 +292,12 @@ class Tensor { /// Construct a tensor with a range equal to \c range. The data is /// uninitialized. 
/// \param range The range of the tensor - explicit Tensor(const range_type& range) - : Tensor(range, 1, default_construct{true}) {} + /// \param batch_size The batch size (default is 1) + explicit Tensor(const range_type& range, size_type batch_size = 1) + : Tensor(range, batch_size, default_construct{true}) {} - /// Construct a tensor with a fill value + /// Construct a tensor of tensor values, setting all elements to the same + /// value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements @@ -312,12 +314,14 @@ class Tensor { new (data + i) value_type(cloner(value)); } - /// Construct a tensor with a fill value + /// Construct a tensor of scalars, setting all elements to the same value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements - template >::type* = nullptr> + template && + !detail::is_tensor::value>::type* = + nullptr> Tensor(const range_type& range, const Value& value) : Tensor(range, 1, default_construct{false}) { detail::tensor_init([value]() -> Value { return value; }, *this); @@ -358,7 +362,7 @@ class Tensor { math::uninitialized_copy_vector(range.volume(), u, this->data()); } - Tensor(const Range& range, std::initializer_list il) + explicit Tensor(const Range& range, std::initializer_list il) : Tensor(range, il.begin()) {} /// Construct a copy of a tensor interface object @@ -1004,6 +1008,22 @@ class Tensor { /// \return A mutable pointer to the tensor data pointer data() { return this->data_.get(); } + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor data of the batch \p batch_idx + const_pointer batch_data(size_t batch_idx) const { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor 
data of the batch \p batch_idx + pointer batch_data(size_t batch_idx) { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + /// Read-only shared_ptr to the data /// \return A const shared_ptr to the tensor data @@ -2194,6 +2214,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + TA_ASSERT(left.batch_size() == right.batch_size()); + const auto batch_sz = left.batch_size(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2207,7 +2229,8 @@ class Tensor { if (this->empty()) { // initialize, if empty *this = Tensor(gemm_helper.make_result_range(left.range(), - right.range())); + right.range()), + batch_sz); } else { // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2230,6 +2253,9 @@ class Tensor { TA_ASSERT(ignore_tile_position() || gemm_helper.right_result_congruent( right.range().upbound_data(), this->range_.upbound_data())); + + // check that batch size of this matches that of left and right + TA_ASSERT(this->batch_size() == batch_sz); } // Compute gemm dimensions @@ -2243,20 +2269,25 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? N : K); - for (integer m = 0; m != M; ++m) { - for (integer n = 0; n != N; ++n) { - auto c_offset = m * N + n; - for (integer k = 0; k != K; ++k) { - auto a_offset = - gemm_helper.left_op() == TiledArray::math::blas::NoTranspose - ? m * lda + k - : k * lda + m; - auto b_offset = - gemm_helper.right_op() == TiledArray::math::blas::NoTranspose - ? 
k * ldb + n - : n * ldb + k; - elem_muladd_op(*(this->data() + c_offset), *(left.data() + a_offset), - *(right.data() + b_offset)); + for (integer b = 0; b != batch_size(); ++b) { + auto this_data = this->batch_data(b); + auto left_data = left.batch_data(b); + auto right_data = right.batch_data(b); + for (integer m = 0; m != M; ++m) { + for (integer n = 0; n != N; ++n) { + auto c_offset = m * N + n; + for (integer k = 0; k != K; ++k) { + auto a_offset = + gemm_helper.left_op() == TiledArray::math::blas::NoTranspose + ? m * lda + k + : k * lda + m; + auto b_offset = + gemm_helper.right_op() == TiledArray::math::blas::NoTranspose + ? k * ldb + n + : n * ldb + k; + elem_muladd_op(*(this_data + c_offset), *(left_data + a_offset), + *(right_data + b_offset)); + } } } } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb2ffe1869..eb976b31f5 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -604,10 +604,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { return result; }; - TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2, 5}}; auto lhs = random_tot_darr(world, lhs_trange); - TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; auto rhs = random_tot_darr(world, rhs_trange); dist_array_t result; BOOST_REQUIRE_NO_THROW( From 6e1868639fc1811ea2f60b65b4e85618a9b3e102 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 13:10:40 -0500 Subject: [PATCH 29/88] Make single-valued initializer lists explicit in ambiguous cases. 
--- tests/initializer_list.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/initializer_list.cpp b/tests/initializer_list.cpp index 4d051f957d..3f5ad27b80 100644 --- a/tests/initializer_list.cpp +++ b/tests/initializer_list.cpp @@ -471,7 +471,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(vector, T, scalar_type_list) { auto array = array_from_il>(world, tr, il); using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 2.0}), - tile_type(tr.make_tile_range(1), {3.0})}; + tile_type(tr.make_tile_range(1), std::initializer_list{3.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; tile_type tile = array.find(i); @@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(matrix, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; @@ -503,11 +503,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0}), tile_type(tr.make_tile_range(4), {10.0, 13.0}), tile_type(tr.make_tile_range(5), {11.0, 12.0, 14.0, 15.0}), - tile_type(tr.make_tile_range(6), {16.0}), + tile_type(tr.make_tile_range(6), std::initializer_list{16.0}), tile_type(tr.make_tile_range(7), {17.0, 18.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; From 2520fe54218419f41b64a5f7bc6f9288e31b1207 Mon Sep 17 
00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:34:25 -0500 Subject: [PATCH 30/88] Use .data() method to access elements by ordinal in tensor_reduce function. --- src/TiledArray/tensor/kernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 97f7dc1e5b..f1ec6d99c5 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -787,8 +787,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, auto result = identity; for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.at_ordinal(ord), - tensors.at_ordinal(ord)...); + tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } From eacc22bf803941407bbd9716a51a1cd2baa9fc80 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:36:08 -0500 Subject: [PATCH 31/88] Implement Tot x T (and reverse) generalized contraction. 
--- src/TiledArray/einsum/tiledarray.h | 84 +++++++++++++++--------------- tests/einsum.cpp | 14 +++-- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 48648407cb..2bd548df5c 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -181,50 +181,51 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) { - TA_ASSERT(e); - } else if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); - RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); - } - auto pa = A.permutation; - auto pb = B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); + if constexpr (std::tuple_size::value > 1) TA_ASSERT(e); + if constexpr (AreArraySame) { + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); } - ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type{}); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) 
continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } + } + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); + return C.array; } - return C.array; } // generalized contraction @@ -468,7 +469,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { using ECT = expressions::TsrExpr; using ECU = expressions::TsrExpr; - return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); + using ResultExprT = std::conditional_t, T, U>; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb976b31f5..3e7b502da9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -845,7 +845,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { } } -BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; 
using matrix_il = TiledArray::detail::matrix_il>; @@ -877,7 +877,6 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - // TODO compute ref_result // i,j;m,n * j,k => i,j,k;m,n TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), rhs_trange.dim(1)}; @@ -928,10 +927,17 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("i,j,k;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); + tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + { + result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); + are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + } } BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { From 54362997ea05c26128fa7c68d667492b9a4173fd Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Dec 2023 16:16:07 -0500 Subject: [PATCH 32/88] bump pybind11 version to VG/v2.11 --- python/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 690b35979d..168bfa2984 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.12) FetchContent_Declare( pybind11 GIT_REPOSITORY https://github.com/ValeevGroup/pybind11.git - GIT_TAG 80d452484c5409444b0ec19383faa84bb7a4d351 # v2.4.3 + GIT_TAG ValeevGroup/v2.11 ) FetchContent_MakeAvailable(pybind11) @@ -39,11 +39,11 @@ if (BUILD_TESTING) # check for presence of prerequisite modules foreach(_mod pytest numpy) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import ${_mod}" + execute_process(COMMAND 
${Python_EXECUTABLE} -c "import ${_mod}" OUTPUT_QUIET ERROR_QUIET RESULTS_VARIABLE check_for_${_mod}) if (check_for_${_mod}) - message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${PYTHON_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") + message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${Python_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") endif(check_for_${_mod}) endforeach(_mod) @@ -51,7 +51,7 @@ if (BUILD_TESTING) add_test( NAME tiledarray/unit/python/run # need to use pytest to find tiledarray module properly - COMMAND ${PYTHON_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v + COMMAND ${Python_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v WORKING_DIRECTORY ${PROJECT_BINARY_DIR} ) set_tests_properties(tiledarray/unit/python/run From f7e206d3a3fb70dde483e9003900b45fca28de87 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 8 Nov 2023 10:09:27 -0500 Subject: [PATCH 33/88] [unit] enabled tot x t test, does not compile @bimalgaudel will fix --- src/TiledArray/einsum/tiledarray.h | 6 +++--- tests/einsum.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index c248956066..7d4aca0425 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -422,9 +422,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B) { template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { - static_assert(std::is_same::value); - using E = expressions::TsrExpr; - return Einsum::einsum(E(A), E(B), Einsum::idx(cs), world); + using ECT = expressions::TsrExpr; + using ECU = expressions::TsrExpr; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ee06cf099f..45c4d3e399 100644 --- a/tests/einsum.cpp 
+++ b/tests/einsum.cpp @@ -765,7 +765,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From e62a6757c1df6863a703d8163736495b30a7dc11 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 13 Nov 2023 12:28:02 -0500 Subject: [PATCH 34/88] [WIP] T x ToT overload of einsum: first attempt. --- src/TiledArray/einsum/tiledarray.h | 225 +++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 7d4aca0425..52dab7477e 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -283,6 +283,231 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; +} // namespace + +template < + typename ArrayT_, typename ArrayToT_, typename... 
Indices, + typename = std::enable_if_t && IsArrayToT>> +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, + std::tuple, Indices...> cs, + World &world) { + using ArrayT = std::remove_cv_t; + using ArrayToT = std::remove_cv_t; + using Shape = typename ArrayToT::shape_type; + using T = typename ArrayT::value_type; + using ToT = typename ArrayToT::value_type; + + auto a = std::get<0>(Einsum::idx(A)); + auto b = std::get<0>(Einsum::idx(B)); + Einsum::Index c = std::get<0>(cs); + + struct { + std::string a, b, c; + } inner; + if constexpr (std::tuple_size::value == 2) { + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + inner.c = ";" + (std::string)std::get<1>(cs); + } + + // these are "Hadamard" (fused) indices + auto h = a & b & c; + + auto e = (a ^ b); + // contracted indices + auto i = (a & b) - h; + + // cannot be hadamard reduction type operation for this overload + TA_ASSERT(e); + + // no Hadamard indices => standard contraction (or even outer product) + // same a, b, and c => pure Hadamard + TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + + // maps Index to TiledRange1 + // (asserts same index maps to the same TR1 in A, and B) + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + + using ::Einsum::index::permutation; + using TiledArray::Permutation; + + auto arrayTermA = ArrayTerm{A.array(), a}; + auto arrayTermB = ArrayTerm{B.array(), b}; + + { + auto ei = (e + i & arrayTermA.idx); + if (arrayTermA.idx != h + ei) + arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); + arrayTermA.expr = ei; + } + + { + auto ei = (e + i & arrayTermB.idx); + if (arrayTermB.idx != h + ei) + arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); + arrayTermB.expr = ei; + } + + ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; + for (auto idx : e) { + C.tiles *= Range(range_map[idx].tiles_range()); + } + if (C.idx != h + e) { + C.permutation = permutation(h + e, C.idx); + } + C.expr = e; + + struct { + 
RangeProduct tiles; + std::vector> batch; + } H; + + for (auto idx : h) { + H.tiles *= Range(range_map[idx].tiles_range()); + H.batch.push_back({}); + for (auto r : range_map[idx]) { + H.batch.back().push_back(Range{r}.size()); + } + } + + using Index = Einsum::Index; + + // generalized contraction + { + auto ei = (e + i & arrayTermA.idx); + arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); + } + + { + auto ei = (e + i & arrayTermB.idx); + arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); + } + + std::vector> worlds; + std::vector> local_tiles; + + // iterates over tiles of hadamard indices + for (Index h : H.tiles) { + auto &A = arrayTermA; + auto &B = arrayTermB; + + auto own = A.own(h) || B.own(h); + auto comm = world.mpi.comm().Split(own, world.rank()); + worlds.push_back(std::make_unique(comm)); + auto &owners = worlds.back(); + if (!own) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } + + { + arrayTermA.local_tiles.clear(); + const Permutation &P = arrayTermA.permutation; + + for (Index ei : arrayTermA.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermA.array.is_local(idx)) continue; + if (arrayTermA.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermA.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermA.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermA.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermA.array.pmap()->is_replicated(); + arrayTermA.ei = TiledArray::make_array( + *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), + arrayTermA.local_tiles.end(), replicated); + } + + { + arrayTermB.local_tiles.clear(); + const Permutation &P = arrayTermB.permutation; + + for (Index ei : 
arrayTermB.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermB.array.is_local(idx)) continue; + if (arrayTermB.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermB.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermB.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermB.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermB.array.pmap()->is_replicated(); + arrayTermB.ei = TiledArray::make_array( + *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), + arrayTermB.local_tiles.end(), replicated); + } + + // todo + // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + A.ei.defer_deleter_to_next_fence(); + B.ei.defer_deleter_to_next_fence(); + A.ei = ArrayT(); + B.ei = ArrayToT(); + // why omitting this fence leads to deadlock? + owners->gop.fence(); + for (Index e : C.tiles) { + if (!C.ei.is_local(e)) continue; + if (C.ei.is_zero(e)) continue; + // TODO no need for immediate evaluation + auto tile = C.ei.find_local(e).get(); + assert(tile.batch_size() == batch); + const Permutation &P = C.permutation; + auto c = apply(P, h + e); + auto shape = C.array.trange().tile(c); + shape = apply_inverse(P, shape); + tile = tile.reshape(shape); + if (P) tile = tile.permute(P); + local_tiles.push_back({c, tile}); + } + // mark for lazy deletion + C.ei = ArrayToT(); + } + + if constexpr (!Shape::is_dense()) { + TiledRange tiled_range = TiledRange(range_map[c]); + std::vector> tile_norms; + for (auto &[index, tile] : local_tiles) { + tile_norms.push_back({index, tile.norm()}); + } + Shape shape(world, tile_norms, tiled_range); + C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + } + + for (auto &[index, tile] : local_tiles) { + if (C.array.is_zero(index)) continue; + C.array.set(index, tile); + } + + for (auto &w : worlds) { + w->gop.fence(); + } + + return C.array; +} + +template && IsArrayToT>> +auto 
einsum(expressions::TsrExpr B, expressions::TsrExpr A, + std::tuple, Indices...> cs, + World &world) { + return einsum(A, B, cs, world); +} + /// Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. From dce1bdc40203e78e7c3252ae30cc38eeff8528aa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 14 Nov 2023 14:34:05 -0500 Subject: [PATCH 35/88] tiny step towards supporting T*ToT in expr --- src/TiledArray/tensor/type_traits.h | 7 ++++--- src/TiledArray/tile_op/contract_reduce.h | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index eed84c6026..fd197c8cdf 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -114,7 +114,7 @@ struct is_nested_tensor { /// @c is_nested_tensor_v is an alias for @c /// is_nested_tensor::value template -constexpr const bool is_nested_tensor_v = is_nested_tensor::value; +inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -150,7 +150,7 @@ struct is_tensor { /// @tparam Ts a parameter pack /// @c is_tensor_v is an alias for @c is_tensor::value template -constexpr const bool is_tensor_v = is_tensor::value; +inline constexpr const bool is_tensor_v = is_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -172,7 +172,8 @@ struct is_tensor_of_tensor { /// @c is_tensor_of_tensor_v is an alias for @c /// is_tensor_of_tensor::value template -constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value; +inline constexpr const bool is_tensor_of_tensor_v = + is_tensor_of_tensor::value; //////////////////////////////////////////////////////////////////////////////// diff --git a/src/TiledArray/tile_op/contract_reduce.h 
b/src/TiledArray/tile_op/contract_reduce.h index 48b7936d26..d9d87d59c8 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -64,17 +64,20 @@ class ContractReduceBase { using elem_muladd_op_type = void(result_value_type&, const left_value_type&, const right_value_type&); - static_assert( - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v, - "ContractReduce can only handle plain tensors or nested tensors " - "(tensors-of-tensors); mixed contractions are not supported"); static constexpr bool plain_tensors = - !(TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v); + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v; + static constexpr bool nested_tensors = + TiledArray::detail::is_nested_tensor_v; + static constexpr bool mixed_tensors = !plain_tensors && !nested_tensors; + static_assert(!mixed_tensors || + (mixed_tensors && + TiledArray::detail::is_nested_tensor_v), + "ContractReduce applied to 1 plain tensor and 1 nested tensor " + "must produce a nested tensor " + "(tensors-of-tensors)"); private: struct Impl { From 8230b165159b20a0600f3d195fb3db1474f5e268 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 12:41:58 -0500 Subject: [PATCH 36/88] [WIP]: Make binary_egine less restrictive on left and right arg types. 
--- src/TiledArray/einsum/tiledarray.h | 21 ++++++++++++--------- src/TiledArray/expressions/binary_engine.h | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 52dab7477e..09640d31f6 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -309,7 +309,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, Einsum::Index c = std::get<0>(cs); struct { - std::string a, b, c; + std::string b, c; } inner; if constexpr (std::tuple_size::value == 2) { inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); @@ -319,16 +319,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // these are "Hadamard" (fused) indices auto h = a & b & c; - auto e = (a ^ b); // contracted indices auto i = (a & b) - h; + // contraction not allowed in tensor x tensor-of-tensor + TA_ASSERT(!i); - // cannot be hadamard reduction type operation for this overload - TA_ASSERT(e); - - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + // indices exclusively in 'a' or exclusively in 'b' + auto e = (a ^ b); // maps Index to TiledRange1 // (asserts same index maps to the same TR1 in A, and B) @@ -364,6 +361,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; + arrayTermB.expr += inner.b; + C.expr += inner.c; + struct { RangeProduct tiles; std::vector> batch; @@ -453,7 +453,10 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } // todo - // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + + // + A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); A.ei = ArrayT(); diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 4758ab0069..93192e2b5e 
100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -146,11 +146,10 @@ class BinaryEngine : public ExprEngine { TiledArray::detail::is_tensor_of_tensor_v; constexpr bool right_tile_is_tot = TiledArray::detail::is_tensor_of_tensor_v; - static_assert(!(left_tile_is_tot ^ right_tile_is_tot), - "ContEngine can only handle tensors of same nested-ness " - "(both plain or both ToT)"); constexpr bool args_are_plain_tensors = !left_tile_is_tot && !right_tile_is_tot; + constexpr bool args_are_mixed_tensors = + left_tile_is_tot ^ right_tile_is_tot; if (args_are_plain_tensors && (left_outer_permtype_ == PermutationType::matrix_transpose || left_outer_permtype_ == PermutationType::identity)) { @@ -175,6 +174,20 @@ class BinaryEngine : public ExprEngine { right_inner_permtype_ == PermutationType::identity))) { right_.permute_tiles(false); } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity))) { + left_.permute_tiles(false); + } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity))) { + right_.permute_tiles(false); + } } public: From a129754727a63b8fe7a2840b323fc726f32b0399 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 20 Nov 2023 14:06:14 -0500 Subject: [PATCH 37/88] moar ToT * T progress --- src/TiledArray/expressions/cont_engine.h | 299 ++++++++++++++--------- src/TiledArray/expressions/mult_engine.h | 4 +- src/TiledArray/expressions/product.h | 3 + src/TiledArray/tile_op/scal.h | 2 + tests/einsum.cpp | 8 +- 5 files changed, 194 insertions(+), 122 deletions(-) diff --git 
a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 35c2f34199..9a1cb9f5f9 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -107,15 +107,26 @@ class ContEngine : public BinaryEngine { protected: op_type op_; ///< Tile operation - using tile_element_type = typename value_type::value_type; - std::function - inner_tile_nonreturn_op_; ///< Tile element operation (only non-null for - ///< nested tensor expressions) - std::function - inner_tile_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns - ///< the result + + // tile types of the result and (after evaluation) left and right arguments + using result_tile_type = value_type; + using left_tile_type = typename EngineTrait::eval_type; + using right_tile_type = typename EngineTrait::eval_type; + + // tile element types of the result and (after evaluation) left and right + // arguments + using result_tile_element_type = typename result_tile_type::value_type; + using left_tile_element_type = typename left_tile_type::value_type; + using right_tile_element_type = typename right_tile_type::value_type; + + std::function + element_nonreturn_op_; ///< Tile element operation (only non-null for + ///< nested tensor expressions) + std::function + element_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns + ///< the result TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction size_type K_ = 1; ///< Inner dimension size @@ -239,8 +250,8 @@ class ContEngine : public BinaryEngine { // precondition checks // 1. 
if ToT inner tile op has been initialized if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - TA_ASSERT(inner_tile_nonreturn_op_); - TA_ASSERT(inner_tile_return_op_); + TA_ASSERT(element_nonreturn_op_); + TA_ASSERT(element_return_op_); } // Initialize children @@ -271,7 +282,7 @@ class ContEngine : public BinaryEngine { op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), (permute_tiles_ ? perm_ : BipartitePermutation{}), - this->inner_tile_nonreturn_op_); + this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer(perm_)); shape_ = ContEngine_::make_shape(outer(perm_)); @@ -284,7 +295,7 @@ class ContEngine : public BinaryEngine { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - BipartitePermutation{}, this->inner_tile_nonreturn_op_); + BipartitePermutation{}, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -457,120 +468,172 @@ class ContEngine : public BinaryEngine { protected: void init_inner_tile_op(const IndexList& inner_target_indices) { - if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - using inner_tile_type = typename value_type::value_type; + if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { + constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< + result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || inner_prod == TensorProduct::Hadamard); if (inner_prod == TensorProduct::Contraction) { - using inner_tile_type = typename value_type::value_type; - using contract_inner_tile_type = - TiledArray::detail::ContractReduce; - // factor_ is absorbed into inner_tile_nonreturn_op_ - auto contrreduce_op = - (inner_target_indices != 
inner(this->indices_)) - ? contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_), - (this->permute_tiles_ ? inner(this->perm_) - : Permutation{})) - : contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_)); - this->inner_tile_nonreturn_op_ = [contrreduce_op]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - contrreduce_op(result, left, right); - }; + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + using op_type = TiledArray::detail::ContractReduce< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type>; + // factor_ is absorbed into inner_tile_nonreturn_op_ + auto contrreduce_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_), + (this->permute_tiles_ ? inner(this->perm_) + : Permutation{})) + : op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_)); + this->element_nonreturn_op_ = + [contrreduce_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + }; + } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { - // inner tile op depends on the outer op ... e.g. 
if outer op - // is contract then inner must implement (ternary) multiply-add; - // if the outer is hadamard then the inner is binary multiply - const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { - using base_op_type = - TiledArray::detail::Mult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(), this->permute_tiles_ - ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type()); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } - }; - } else { - using base_op_type = - TiledArray::detail::ScalMult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(this->factor_), - this->permute_tiles_ ? 
inner(this->perm_) - : Permutation{}) - : op_type(base_op_type(this->factor_)); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + // inner tile op depends on the outer op ... e.g. if outer op + // is contract then inner must implement (ternary) multiply-add; + // if the outer is hadamard then the inner is binary multiply + const auto outer_prod = this->product_type(); + if (this->factor_ == 1) { + using base_op_type = + TiledArray::detail::Mult; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(), this->permute_tiles_ + ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type()); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } else { + using base_op_type = TiledArray::detail::ScalMult< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type, false, false>; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(this->factor_), + this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type(this->factor_)); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } + } // ToT x ToT + } else if (inner_prod == TensorProduct::General) { + TA_ASSERT(!tot_x_tot); + constexpr bool tot_x_t = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + constexpr bool t_x_tot = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + if constexpr (tot_x_t || t_x_tot) { + using arg_tile_element_type = + std::conditional_t; + using scalar_type = + std::conditional_t; + + auto scal_op = [do_perm = this->permute_tiles_, + perm = this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}]( + const left_tile_element_type& left, + const right_tile_element_type& right) + -> result_tile_element_type { + using TiledArray::scale; + if constexpr (tot_x_t) { + if (do_perm) + return scale(left, right, perm); + else + return scale(left, right); + } else if constexpr (tot_x_t) { + if (do_perm) + return scale(right, left, perm); + else + return scale(right, left); + } else + abort(); // unreachable }; + this->element_nonreturn_op_ = + [scal_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + result = scal_op(left, right); + }; } } else abort(); // unsupported TensorProduct type - TA_ASSERT(inner_tile_nonreturn_op_); - this->inner_tile_return_op_ = - [inner_tile_nonreturn_op = this->inner_tile_nonreturn_op_]( - const inner_tile_type& left, const inner_tile_type& right) { - inner_tile_type result; - inner_tile_nonreturn_op(result, left, right); - return result; - }; + TA_ASSERT(element_nonreturn_op_); + this->element_return_op_ = [inner_tile_nonreturn_op = + this->element_nonreturn_op_]( + const left_tile_element_type& left, + const right_tile_element_type& right) { + result_tile_element_type result; + inner_tile_nonreturn_op(result, left, right); + return result; + }; } } diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index a53133d4b0..91924efeb2 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -406,7 +406,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_)); + return op_type(op_base_type(this->element_return_op_)); } else abort(); } else { // plain tensors @@ -431,7 +431,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type(), perm); } else if (inner_prod 
== TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_), perm); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index d364764964..381b1f485c 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -57,6 +57,9 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Hadamard; else result = TensorProduct::Contraction; + } else if ((left_indices && !right_indices) || + (!left_indices && right_indices)) { // used for ToT*T or T*ToT + result = TensorProduct::General; } return result; } diff --git a/src/TiledArray/tile_op/scal.h b/src/TiledArray/tile_op/scal.h index 54d5337ed4..a89770c5a7 100644 --- a/src/TiledArray/tile_op/scal.h +++ b/src/TiledArray/tile_op/scal.h @@ -128,6 +128,8 @@ class Scal { return Scal_::template eval(arg); } + void set_factor(const scalar_type factor) { factor_ = factor; } + }; // class Scal } // namespace detail diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 45c4d3e399..3033936381 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -764,8 +764,12 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work - tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // will try to make this work FIRST since this is used by the einsum code + // below + tot_type out; + out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); + // will try to make this work NEXT + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From bf959a241633501810dd0f04e5910983dc394c84 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 38/88] [skip_ci] add permutation 
optimizer for general case: supports inner operation between tot * t. --- src/TiledArray/expressions/permopt.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..dc029b73a1 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -527,6 +527,18 @@ class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { } }; +/// +/// +/// +class GeneralPermutationOptimizer : public GEMMPermutationOptimizer { + public: + GeneralPermutationOptimizer(const GeneralPermutationOptimizer&) = default; + GeneralPermutationOptimizer& operator=(const GeneralPermutationOptimizer&) = + default; + virtual ~GeneralPermutationOptimizer() = default; + using GEMMPermutationOptimizer::GEMMPermutationOptimizer; +}; + inline std::shared_ptr make_permutation_optimizer( TensorProduct product_type, const IndexList& left_indices, const IndexList& right_indices, bool prefer_to_permute_left) { @@ -540,6 +552,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } @@ -559,6 +574,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } From 8dd614ec8c2a946191c4ddf5811ea61ebb8bf7b8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 39/88] add permutation optimizer for scaling --- src/CMakeLists.txt | 13 +-- src/TiledArray/expressions/permopt.cpp | 32 ++++++ src/TiledArray/expressions/permopt.h | 130 
+++++++++++++++++++++---- 3 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 src/TiledArray/expressions/permopt.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 55227c2093..6e6c708891 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,7 +100,6 @@ TiledArray/dist_eval/contraction_eval.h TiledArray/dist_eval/dist_eval.h TiledArray/dist_eval/unary_eval.h TiledArray/einsum/index.h -TiledArray/einsum/index.cpp TiledArray/einsum/range.h TiledArray/einsum/string.h TiledArray/expressions/add_engine.h @@ -195,13 +194,10 @@ TiledArray/util/bug.h TiledArray/util/function.h TiledArray/util/initializer_list.h TiledArray/util/logger.h -TiledArray/util/ptr_registry.cpp TiledArray/util/ptr_registry.h -TiledArray/util/random.cpp TiledArray/util/random.h TiledArray/util/singleton.h TiledArray/util/threads.h -TiledArray/util/threads.cpp TiledArray/util/thread_specific.h TiledArray/util/time.h TiledArray/util/vector.h @@ -243,10 +239,15 @@ TiledArray/tensor_impl.cpp TiledArray/array_impl.cpp TiledArray/dist_array.cpp TiledArray/version.cpp -TiledArray/util/backtrace.cpp -TiledArray/util/bug.cpp +TiledArray/einsum/index.cpp +TiledArray/expressions/permopt.cpp TiledArray/math/linalg/basic.cpp TiledArray/math/linalg/rank-local.cpp +TiledArray/util/backtrace.cpp +TiledArray/util/bug.cpp +TiledArray/util/ptr_registry.cpp +TiledArray/util/random.cpp +TiledArray/util/threads.cpp ) # feed TILEDARRAY_GIT_REVISION and TILEDARRAY_GIT_DESCRIPTION to TiledArray/version.cpp only to avoid recompiling everything set_source_files_properties( diff --git a/src/TiledArray/expressions/permopt.cpp b/src/TiledArray/expressions/permopt.cpp new file mode 100644 index 0000000000..9b125fdc04 --- /dev/null +++ b/src/TiledArray/expressions/permopt.cpp @@ -0,0 +1,32 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2020 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * permopt.cpp + * Nov 21, 2023 + * + */ + +#include + +namespace TiledArray::expressions { + +IndexList ScalePermutationOptimizer::null_indices_; + +} // namespace TiledArray::expressions diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index dc029b73a1..998ea78efe 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -51,6 +52,56 @@ inline blas::Op to_cblas_op(PermutationType permtype) { : math::blas::NoTranspose; } +/// Optimizer of permutations for a unary operation +class UnaryOpPermutationOptimizer { + public: + /// construct using initial indices for the argument + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& argument_indices) + : argument_indices_(argument_indices) {} + + /// construct using initial indices for the argument, + /// and the desired result indices + /// \param result_indices the desired result index list + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& result_indices, + const IndexList& argument_indices) + : result_indices_(result_indices), 
argument_indices_(argument_indices) { + TA_ASSERT(argument_indices_.is_permutation(argument_indices_)); + target_result_indices_ = argument_indices_; + } + + UnaryOpPermutationOptimizer() = delete; + UnaryOpPermutationOptimizer(const UnaryOpPermutationOptimizer&) = default; + UnaryOpPermutationOptimizer& operator=(const UnaryOpPermutationOptimizer&) = + default; + virtual ~UnaryOpPermutationOptimizer() = default; + + /// \return the desired result indices + const IndexList& result_indices() const { + TA_ASSERT(result_indices_); + return result_indices_; + } + /// \return initial argument indices + const IndexList& argument_indices() const { return argument_indices_; } + + /// \return the proposed argument index list + const IndexList& target_argument_indices() const { + return target_result_indices_; + } + /// \return the proposed result index list (not necessarily same as that + /// returned by result_indices()) + const IndexList& target_result_indices() const { + return target_result_indices_; + } + /// \return the type of permutation bringing the initial left index list to + /// the target left index list + PermutationType argument_permtype() const { return PermutationType::general; } + + private: + IndexList result_indices_, argument_indices_, target_result_indices_; +}; + /// Abstract optimizer of permutations for a binary operation class BinaryOpPermutationOptimizer { public: @@ -479,6 +530,61 @@ class HadamardPermutationOptimizer : public BinaryOpPermutationOptimizer { IndexList target_result_indices_; }; +// clang-format off +/// Implements BinaryOpPermutationOptimizer interface for a scale operation viewed as a binary tensor product, i.e. 
+/// a tensor product between an order-0 tensor and an arbitrary tensor +// clang-format on +class ScalePermutationOptimizer : public BinaryOpPermutationOptimizer { + public: + ScalePermutationOptimizer(const ScalePermutationOptimizer&) = default; + ScalePermutationOptimizer& operator=(const ScalePermutationOptimizer&) = + default; + ~ScalePermutationOptimizer() = default; + + ScalePermutationOptimizer(const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(left_indices, right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices), + target_result_indices_(left_argument_is_scalar_ ? right_indices + : left_indices) {} + + ScalePermutationOptimizer(const IndexList& result_indices, + const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(result_indices, left_indices, + right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices) { + const auto& arg_indices = + left_argument_is_scalar_ ? right_indices : left_indices; + TA_ASSERT(arg_indices.is_permutation(result_indices)); + target_result_indices_ = arg_indices; + } + + const IndexList& target_left_indices() const override final { + return !left_argument_is_scalar_ ? target_result_indices_ : null_indices_; + } + const IndexList& target_right_indices() const override final { + return left_argument_is_scalar_ ? 
target_result_indices_ : null_indices_; + } + const IndexList& target_result_indices() const override final { + return target_result_indices_; + } + PermutationType left_permtype() const override final { + return PermutationType::general; + } + PermutationType right_permtype() const override final { + return PermutationType::general; + } + TensorProduct op_type() const override final { return TensorProduct::Scale; } + + private: + bool left_argument_is_scalar_; + IndexList target_result_indices_; + static IndexList null_indices_; +}; + class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { public: NullBinaryOpPermutationOptimizer(const NullBinaryOpPermutationOptimizer&) = @@ -527,18 +633,6 @@ class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { } }; -/// -/// -/// -class GeneralPermutationOptimizer : public GEMMPermutationOptimizer { - public: - GeneralPermutationOptimizer(const GeneralPermutationOptimizer&) = default; - GeneralPermutationOptimizer& operator=(const GeneralPermutationOptimizer&) = - default; - virtual ~GeneralPermutationOptimizer() = default; - using GEMMPermutationOptimizer::GEMMPermutationOptimizer; -}; - inline std::shared_ptr make_permutation_optimizer( TensorProduct product_type, const IndexList& left_indices, const IndexList& right_indices, bool prefer_to_permute_left) { @@ -552,9 +646,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); - case TensorProduct::General: - return std::make_shared( - left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared(left_indices, + right_indices); default: abort(); } @@ -574,9 +668,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); - case TensorProduct::General: - return 
std::make_shared( - left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared( + target_indices, left_indices, right_indices); default: abort(); } From 43d61f02fec226a2c26744b210d8f93970299f24 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Nov 2023 16:33:46 -0500 Subject: [PATCH 40/88] expression-level support for ToT x T (and vice versa) implemented, need to test --- src/TiledArray/expressions/cont_engine.h | 19 ++++----- src/TiledArray/expressions/product.h | 5 ++- tests/einsum.cpp | 49 +++++++++++++++++++++--- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 9a1cb9f5f9..5ec69c7d0d 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -158,9 +158,10 @@ class ContEngine : public BinaryEngine { TensorProduct inner_product_type() const { TA_ASSERT(inner_product_type_ != TensorProduct::Invalid); // init_indices() must initialize this - /// only Hadamard and contraction are supported now + /// only Hadamard, contraction, and scale are supported now TA_ASSERT(inner_product_type_ == TensorProduct::Hadamard || - inner_product_type_ == TensorProduct::Contraction); + inner_product_type_ == TensorProduct::Contraction || + inner_product_type_ == TensorProduct::Scale); return inner_product_type_; } @@ -473,7 +474,8 @@ class ContEngine : public BinaryEngine { result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || - inner_prod == TensorProduct::Hadamard); + inner_prod == TensorProduct::Hadamard || + inner_prod == TensorProduct::Scale); if (inner_prod == TensorProduct::Contraction) { TA_ASSERT(tot_x_tot); if constexpr (tot_x_tot) { @@ -577,8 +579,8 @@ class ContEngine : public BinaryEngine { } }; } - } // ToT x ToT - } else if (inner_prod == 
TensorProduct::General) { + } // ToT x T or T x ToT + } else if (inner_prod == TensorProduct::Scale) { TA_ASSERT(!tot_x_tot); constexpr bool tot_x_t = TiledArray::detail::is_tensor_of_tensor_v { std::conditional_t; - auto scal_op = [do_perm = this->permute_tiles_, - perm = this->permute_tiles_ ? inner(this->perm_) + auto scal_op = [perm = this->permute_tiles_ ? inner(this->perm_) : Permutation{}]( const left_tile_element_type& left, const right_tile_element_type& right) -> result_tile_element_type { using TiledArray::scale; if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(left, right, perm); else return scale(left, right); } else if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(right, left, perm); else return scale(right, left); diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index 381b1f485c..7111b7831b 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -39,6 +39,9 @@ enum class TensorProduct { Contraction, /// free, fused, and contracted indices General, + /// no indices on one, free indices on the other; only used for inner index + /// products in mixed nested products (ToT x T) + Scale, /// invalid Invalid = -1 }; @@ -59,7 +62,7 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Contraction; } else if ((left_indices && !right_indices) || (!left_indices && right_indices)) { // used for ToT*T or T*ToT - result = TensorProduct::General; + result = TensorProduct::Scale; } return result; } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3033936381..ea5529e5b8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -718,6 +718,49 @@ BOOST_AUTO_TEST_SUITE_END() // einsum_tot BOOST_AUTO_TEST_SUITE(einsum_tot_t) +BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = 
TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0)}; + tot_type ref_result(world, ref_result_trange); + // TODO compute ref_result + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); + + // TODO check result against ref_result +} + BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; @@ -764,11 +807,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work FIRST since this is used by the einsum code - // below - tot_type out; - out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); - // will try to make this work 
NEXT + // will try to make this work // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } From 74e5e78a4897430e73e9e9af0133a3fca8188cd7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 14:54:30 -0500 Subject: [PATCH 41/88] [ci skip] implement 'i,j;m,n * j,k -> i,j,k;m,n' reference evaluation manually. --- tests/einsum.cpp | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ea5529e5b8..800d51d3e0 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -793,10 +793,41 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; - tot_type ref_result(world, ref_result_trange); // TODO compute ref_result + // i,j;m,n * j,k => i,j,k;m,n + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), + rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + for (auto const& tile : ref_result) { + tot_type::value_type result_tile{tile.make_range()}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + + using Ix2 = std::array; + using Ix3 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{j, k})); + + res_el = lhs_el.scale(rhs_el); + } + + ref_result.set(tile.index(), result_tile); + } + + std::cout << ref_result << std::endl; ///////////////////////////////////////////////////////// // ToT * T From 
86f287768baacf5fcbda63795622487a08d0b54a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 17:34:55 -0500 Subject: [PATCH 42/88] [ci skip] more manual tot * t reference evaluation --- tests/einsum.cpp | 68 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 800d51d3e0..6501d91a10 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -751,14 +751,58 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { rhs.fill_random(); TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; + rhs_trange.dim(0), lhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - // TODO compute ref_result + + // + // i,l,k,j;n,m = i,j;m,n * k,l + // + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto l = res_ix[1]; + auto k = res_ix[2]; + auto j = res_ix[3]; + + using Ix2 = std::array; + using Ix4 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, l})); + + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{1, 0}); // permute [0,1] -> [1,0] + } + return result_tile; + }; + + using std::begin; + using std::endl; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } tot_type result; 
BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // TODO check result against ref_result + // todo: fix it + // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + // BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -799,8 +843,11 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - for (auto const& tile : ref_result) { - tot_type::value_type result_tile{tile.make_range()}; + // + // why cannot lhs and rhs be captured by ref? + // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; for (auto&& res_ix : result_tile.range()) { auto i = res_ix[0]; auto j = res_ix[1]; @@ -823,11 +870,16 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { res_el = lhs_el.scale(rhs_el); } + return result_tile; + }; - ref_result.set(tile.index(), result_tile); - } + using std::begin; + using std::endl; - std::cout << ref_result << std::endl; + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } ///////////////////////////////////////////////////////// // ToT * T From e40d882ada11464bec3b25b6999cacc9767d229a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:04:59 -0500 Subject: [PATCH 43/88] Add equality comparison for SparseShape. 
--- src/TiledArray/sparse_shape.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index bf51487922..271857a72c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -1742,6 +1742,17 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } +template +constexpr inline bool operator==(const SparseShape& a, + const SparseShape& b) { + return true; +} +template +constexpr inline bool operator!=(const SparseShape& a, + const SparseShape& b) { + return !(a == b); +} + #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From f9e4f0db11f1a9f07b85f0b5250935b3aa507d62 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:05:40 -0500 Subject: [PATCH 44/88] Validate outer-product type tot * t evaluation using expression layer. --- tests/einsum.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6501d91a10..aad4a00c0a 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,9 +800,8 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // todo: fix it - // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - // BOOST_CHECK(are_equal); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From 42a1dc708397325ea768d7543a448a4050ddae71 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 11:42:05 -0500 Subject: [PATCH 45/88] [unit] einsum_tot_t pulls remote tiles using strick blocking (dowork=false) also fixed a few typos --- tests/einsum.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index aad4a00c0a..db2731a2e1 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp 
@@ -771,10 +771,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using Ix4 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); @@ -790,7 +790,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); @@ -856,10 +856,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -873,7 +873,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); From 076f488905ca69150140bb97b4377f9690cd8a58 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 12:04:54 -0500 Subject: [PATCH 46/88] [unit] einsum_tot_t must test ToT*T AND T*ToT (the latter is currently broken due to missing Tensor functionality for binary Scalar*Tensor ops) --- tests/einsum.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git 
a/tests/einsum.cpp b/tests/einsum.cpp index db2731a2e1..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -802,6 +802,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); + + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -887,10 +894,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From 7b2a90b490bff387f0a52f7d335e98bc7440f968 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 27 Nov 2023 23:16:39 -0500 Subject: [PATCH 47/88] Avoid code-duplication by generalizing the existing einsum function. 
--- src/TiledArray/einsum/range.h | 3 +- src/TiledArray/einsum/tiledarray.h | 316 ++++++----------------------- tests/einsum.cpp | 12 +- 3 files changed, 72 insertions(+), 259 deletions(-) diff --git a/src/TiledArray/einsum/range.h b/src/TiledArray/einsum/range.h index 32eb669588..79b409e64d 100644 --- a/src/TiledArray/einsum/range.h +++ b/src/TiledArray/einsum/range.h @@ -14,7 +14,8 @@ using small_vector = TiledArray::container::svector; struct Range { using value_type = int64_t; using iterator = boost::counting_iterator; - template + template , bool> = true> explicit Range(Pair &&pair) : Range(pair.first, pair.second) {} Range(value_type begin, value_type end) : begin_(begin), end_(end) {} auto begin() const { return iterator(begin_); } diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 09640d31f6..1a3840f99f 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -64,13 +64,38 @@ struct ArrayTerm { } }; -template -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; + +template +constexpr bool AreArrayT = IsArrayT && IsArrayT; + +template +constexpr bool AreArrayToT = IsArrayToT && IsArrayToT; + +template +constexpr bool AreArraySame = + AreArrayT || AreArrayToT; + +} // namespace + +template +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, World &world) { - using Array = std::remove_cv_t; - using Tensor = typename Array::value_type; - using Shape = typename Array::shape_type; + using ArrayA = std::remove_cv_t; + using ArrayB = std::remove_cv_t; + using ArrayC = std::conditional_t< + AreArraySame, ArrayA, + std::conditional_t, ArrayA, ArrayB>>; + // using Array = ArrayC; + using ResultTensor = typename ArrayC::value_type; + using ResultShape = typename ArrayC::shape_type; auto a = 
std::get<0>(Einsum::idx(A)); auto b = std::get<0>(Einsum::idx(B)); @@ -91,7 +116,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // no Hadamard indices => standard contraction (or even outer product) // same a, b, and c => pure Hadamard if (!h || (!(a ^ b) && !(b ^ c))) { - Array C; + ArrayC C; C(std::string(c) + inner.c) = A * B; return C; } @@ -108,17 +133,22 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ::Einsum::index::permutation; using TiledArray::Permutation; - ArrayTerm AB[2] = {{A.array(), a}, {B.array(), b}}; + std::tuple, ArrayTerm> AB{{A.array(), a}, + {B.array(), b}}; - for (auto &term : AB) { + auto update_perm_and_indices = [&e = std::as_const(e), &i = std::as_const(i), + &h = std::as_const(h)](auto &term) { auto ei = (e + i & term.idx); if (term.idx != h + ei) { term.permutation = permutation(term.idx, h + ei); } term.expr = ei; - } + }; - ArrayTerm C = {Array(world, TiledRange(range_map[c])), c}; + std::invoke(update_perm_and_indices, std::get<0>(AB)); + std::invoke(update_perm_and_indices, std::get<1>(AB)); + + ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; for (auto idx : e) { C.tiles *= Range(range_map[idx].tiles_range()); } @@ -127,8 +157,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; - AB[0].expr += inner.a; - AB[1].expr += inner.b; + std::get<0>(AB).expr += inner.a; + std::get<1>(AB).expr += inner.b; + C.expr += inner.c; struct { @@ -163,7 +194,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - Tensor tile(TiledArray::Range{batch}, typename Tensor::value_type(0)); + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type(0)); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); @@ -193,16 +225,20 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // generalized contraction - 
for (auto &term : AB) { + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), + &range_map = std::as_const(range_map)](auto &term) { auto ei = (e + i & term.idx); term.ei_tiled_range = TiledRange(range_map[ei]); for (auto idx : ei) { term.tiles *= Range(range_map[idx].tiles_range()); } - } + }; + + std::invoke(update_tr, std::get<0>(AB)); + std::invoke(update_tr, std::get<1>(AB)); std::vector> worlds; - std::vector> local_tiles; + std::vector> local_tiles; // iterates over tiles of hadamard indices for (Index h : H.tiles) { @@ -216,7 +252,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - for (auto &term : AB) { + + auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { term.local_tiles.clear(); const Permutation &P = term.permutation; @@ -232,235 +269,18 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, term.local_tiles.push_back({ei, tile}); } bool replicated = term.array.pmap()->is_replicated(); - term.ei = TiledArray::make_array( + term.ei = TiledArray::make_array( *owners, term.ei_tiled_range, term.local_tiles.begin(), term.local_tiles.end(), replicated); - } - C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - A.ei.defer_deleter_to_next_fence(); - B.ei.defer_deleter_to_next_fence(); - A.ei = Array(); - B.ei = Array(); - // why omitting this fence leads to deadlock? 
- owners->gop.fence(); - for (Index e : C.tiles) { - if (!C.ei.is_local(e)) continue; - if (C.ei.is_zero(e)) continue; - // TODO no need for immediate evaluation - auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); - const Permutation &P = C.permutation; - auto c = apply(P, h + e); - auto shape = C.array.trange().tile(c); - shape = apply_inverse(P, shape); - tile = tile.reshape(shape); - if (P) tile = tile.permute(P); - local_tiles.push_back({c, tile}); - } - // mark for lazy deletion - C.ei = Array(); - } - - if constexpr (!Shape::is_dense()) { - TiledRange tiled_range = TiledRange(range_map[c]); - std::vector> tile_norms; - for (auto &[index, tile] : local_tiles) { - tile_norms.push_back({index, tile.norm()}); - } - Shape shape(world, tile_norms, tiled_range); - C.array = Array(world, TiledRange(range_map[c]), shape); - } - - for (auto &[index, tile] : local_tiles) { - if (C.array.is_zero(index)) continue; - C.array.set(index, tile); - } - - for (auto &w : worlds) { - w->gop.fence(); - } - - return C.array; -} - -namespace { -template -constexpr bool IsArrayT = detail::is_tensor_v; - -template -constexpr bool IsArrayToT = - detail::is_tensor_of_tensor_v; -} // namespace - -template < - typename ArrayT_, typename ArrayToT_, typename... 
Indices, - typename = std::enable_if_t && IsArrayToT>> -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, - std::tuple, Indices...> cs, - World &world) { - using ArrayT = std::remove_cv_t; - using ArrayToT = std::remove_cv_t; - using Shape = typename ArrayToT::shape_type; - using T = typename ArrayT::value_type; - using ToT = typename ArrayToT::value_type; - - auto a = std::get<0>(Einsum::idx(A)); - auto b = std::get<0>(Einsum::idx(B)); - Einsum::Index c = std::get<0>(cs); - - struct { - std::string b, c; - } inner; - if constexpr (std::tuple_size::value == 2) { - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); - inner.c = ";" + (std::string)std::get<1>(cs); - } + }; + std::invoke(retile, std::get<0>(AB)); + std::invoke(retile, std::get<1>(AB)); - // these are "Hadamard" (fused) indices - auto h = a & b & c; - - // contracted indices - auto i = (a & b) - h; - // contraction not allowed in tensor x tensor-of-tensor - TA_ASSERT(!i); - - // indices exclusively in 'a' or exclusively in 'b' - auto e = (a ^ b); - - // maps Index to TiledRange1 - // (asserts same index maps to the same TR1 in A, and B) - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - - using ::Einsum::index::permutation; - using TiledArray::Permutation; - - auto arrayTermA = ArrayTerm{A.array(), a}; - auto arrayTermB = ArrayTerm{B.array(), b}; - - { - auto ei = (e + i & arrayTermA.idx); - if (arrayTermA.idx != h + ei) - arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); - arrayTermA.expr = ei; - } - - { - auto ei = (e + i & arrayTermB.idx); - if (arrayTermB.idx != h + ei) - arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); - arrayTermB.expr = ei; - } - - ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; - for (auto idx : e) { - C.tiles *= Range(range_map[idx].tiles_range()); - } - if (C.idx != h + e) { - C.permutation = permutation(h + e, C.idx); - } - C.expr = e; - - arrayTermB.expr += inner.b; - C.expr += 
inner.c; - - struct { - RangeProduct tiles; - std::vector> batch; - } H; - - for (auto idx : h) { - H.tiles *= Range(range_map[idx].tiles_range()); - H.batch.push_back({}); - for (auto r : range_map[idx]) { - H.batch.back().push_back(Range{r}.size()); - } - } - - using Index = Einsum::Index; - - // generalized contraction - { - auto ei = (e + i & arrayTermA.idx); - arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); - } - - { - auto ei = (e + i & arrayTermB.idx); - arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); - } - - std::vector> worlds; - std::vector> local_tiles; - - // iterates over tiles of hadamard indices - for (Index h : H.tiles) { - auto &A = arrayTermA; - auto &B = arrayTermB; - - auto own = A.own(h) || B.own(h); - auto comm = world.mpi.comm().Split(own, world.rank()); - worlds.push_back(std::make_unique(comm)); - auto &owners = worlds.back(); - if (!own) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - - { - arrayTermA.local_tiles.clear(); - const Permutation &P = arrayTermA.permutation; - - for (Index ei : arrayTermA.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermA.array.is_local(idx)) continue; - if (arrayTermA.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermA.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermA.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermA.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermA.array.pmap()->is_replicated(); - arrayTermA.ei = TiledArray::make_array( - *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), - arrayTermA.local_tiles.end(), replicated); - } - - { - arrayTermB.local_tiles.clear(); - const Permutation &P = 
arrayTermB.permutation; - - for (Index ei : arrayTermB.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermB.array.is_local(idx)) continue; - if (arrayTermB.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermB.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermB.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermB.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermB.array.pmap()->is_replicated(); - arrayTermB.ei = TiledArray::make_array( - *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), - arrayTermB.local_tiles.end(), replicated); - } - - // todo C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - - // - A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); - A.ei = ArrayT(); - B.ei = ArrayToT(); + A.ei = ArrayA(); + B.ei = ArrayB(); // why omitting this fence leads to deadlock? owners->gop.fence(); for (Index e : C.tiles) { @@ -478,17 +298,17 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, local_tiles.push_back({c, tile}); } // mark for lazy deletion - C.ei = ArrayToT(); + C.ei = ArrayC(); } - if constexpr (!Shape::is_dense()) { + if constexpr (!ResultShape::is_dense()) { TiledRange tiled_range = TiledRange(range_map[c]); std::vector> tile_norms; for (auto &[index, tile] : local_tiles) { tile_norms.push_back({index, tile.norm()}); } - Shape shape(world, tile_norms, tiled_range); - C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + ResultShape shape(world, tile_norms, tiled_range); + C.array = ArrayC(world, TiledRange(range_map[c]), shape); } for (auto &[index, tile] : local_tiles) { @@ -503,14 +323,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } -template && IsArrayToT>> -auto einsum(expressions::TsrExpr B, expressions::TsrExpr A, - std::tuple, Indices...> cs, - World &world) { - return einsum(A, B, cs, world); -} - /// 
Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..8eea2884f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); - { // reverse the order - tot_type result; - BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - BOOST_CHECK(are_equal); - } +// { // reverse the order +// tot_type result; +// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); +// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); +// BOOST_CHECK(are_equal); +// } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From c8f9542866a08ccfae45e6bbf4dd42d65c1641b8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 Nov 2023 10:28:47 -0500 Subject: [PATCH 48/88] In einsum, handle inner index labels when tot times t, or, t times tot arguments are passed. 
--- src/TiledArray/einsum/tiledarray.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 1a3840f99f..eb317e0aef 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -93,7 +93,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ArrayC = std::conditional_t< AreArraySame, ArrayA, std::conditional_t, ArrayA, ArrayB>>; - // using Array = ArrayC; using ResultTensor = typename ArrayC::value_type; using ResultShape = typename ArrayC::shape_type; @@ -105,8 +104,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::string a, b, c; } inner; if constexpr (std::tuple_size::value == 2) { - inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT) + inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); + + if constexpr (IsArrayToT) + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + + static_assert(IsArrayToT || IsArrayToT); inner.c = ";" + (std::string)std::get<1>(cs); } From f04a94358e4bbc8e0121363b563b6550a412569d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:00:36 -0500 Subject: [PATCH 49/88] amend https://github.com/ValeevGroup/tiledarray/commit/bff7d2888cd69e5ef4b9bb4ed86e775e6528c4db --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 5ec69c7d0d..21aceae14c 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -609,7 +609,7 @@ class ContEngine : public BinaryEngine { return scale(left, right, perm); else return scale(left, right); - } else if constexpr (tot_x_t) { + } else if constexpr (t_x_tot) { if (perm) return scale(right, left, perm); else From 
178393b84e229a967b2120838db3907ad4531f4c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:02:22 -0500 Subject: [PATCH 50/88] relax type requirements on tensor_init to support mixed (ToT alongside T) invocations, this allows T * ToT expr to compile and unit test to succeed --- src/TiledArray/tensor/kernels.h | 7 ++++--- tests/einsum.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 87db8c1cc6..97f7dc1e5b 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -541,9 +541,10 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors -template ::value>::type* = nullptr> +template < + typename Op, typename TR, typename T1, typename... Ts, + typename std::enable_if::value && + !is_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, const Permutation& perm, TR& result, const T1& tensor1, const Ts&... 
tensors) { TA_ASSERT(!empty(result, tensor1, tensors...)); diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8eea2884f9..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); -// { // reverse the order -// tot_type result; -// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); -// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); -// BOOST_CHECK(are_equal); -// } + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From 3eb8280d9cd7c84b31c1050e369ed27c6ed27ac7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 30 Nov 2023 14:06:19 -0500 Subject: [PATCH 51/88] relax Tensor(left,right,binaryelemeop,permutation) ctor constraints --- src/TiledArray/tensor/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 3c10ba4077..f3076c4514 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -492,7 +492,7 @@ class Tensor { /// \param perm The permutation that will be applied to the arguments template < typename T1, typename T2, typename Op, typename Perm, - typename std::enable_if::value && + typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& left, const T2& right, Op&& op, const Perm& perm) : Tensor(outer(perm) * left.range(), 1, default_construct{false}) { From 0f4e8183e13ce92a78219866f70afd7bda0a2bb7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 7 Dec 2023 18:38:25 -0500 Subject: [PATCH 52/88] Support for pure hadamard product between a tot and a t: 
'i,j;m,n * i,j -> i,j;m,n' --- src/TiledArray/expressions/binary_engine.h | 6 +- src/TiledArray/expressions/mult_engine.h | 6 ++ tests/einsum.cpp | 92 ++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 93192e2b5e..411a1c7c13 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -204,8 +204,10 @@ class BinaryEngine : public ExprEngine { /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { if (permute_tiles_) { - TA_ASSERT(left_.indices().size() == target_indices.size()); - TA_ASSERT(right_.indices().size() == target_indices.size()); + TA_ASSERT(left_.indices().size() == target_indices.size() || + (left_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT(right_.indices().size() == target_indices.size() || + (right_.indices().second().size() ^ target_indices.second().size())); init_indices_(target_indices); diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 91924efeb2..9713e0b0df 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -407,6 +407,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); + } else if (inner_prod == TensorProduct::Scale) { + TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type()); } else abort(); } else { // plain tensors @@ -432,6 +435,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type(), perm); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); + } else if (inner_prod == TensorProduct::Scale) { + 
TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..9ea4dd39d3 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -900,6 +900,98 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } +BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_4_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_4_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_5_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_5_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}, + {lhs_elem_4_0, 
lhs_elem_4_1}, + {lhs_elem_5_0, lhs_elem_5_1}}; + TiledRange lhs_trange{{0, 2, 6}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + // + // i,j;m,n = j,i;n,m * i,j + // + TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + using Ix2 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false ); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); + auto rhs_el = + rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute + ); + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); + + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); +} + BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices From ba2b9a3b90a8d80340427139bb0a9dc04e76f827 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:56:34 -0500 Subject: [PATCH 53/88] SparseShape inequality comparison added. 
--- src/TiledArray/sparse_shape.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 271857a72c..b589dc73cf 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -797,6 +797,13 @@ class SparseShape { return equal; } + /// Bitwise comparison + /// \param other a SparseShape object + /// \return true if this object and @c other object are bitwise NOT identical + inline bool operator!=(const SparseShape& other) const { + return !(*this == other); + } + private: /// Create a copy of a sub-block of the shape @@ -1742,17 +1749,6 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } -template -constexpr inline bool operator==(const SparseShape& a, - const SparseShape& b) { - return true; -} -template -constexpr inline bool operator!=(const SparseShape& a, - const SparseShape& b) { - return !(a == b); -} - #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From be8e07a5667c02bbc9b1b516f9763db89038187d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:57:16 -0500 Subject: [PATCH 54/88] Disable shape comparison in ToTArrayFixture. --- tests/tot_array_fixture.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 9d46fadcc7..1619a794c8 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -237,6 +237,7 @@ struct ToTArrayFixture { * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons + * TODO: shape comparisons */ template @@ -254,7 +255,7 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if (lhs.shape() != rhs.shape()) return false; + // if (lhs.shape() != rhs.shape()) return false; // Same pmap? 
// if(*lhs.pmap() != *rhs.pmap()) return false; From e96df681b3f20328808b129ef16776c89e62dbe5 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:58:25 -0500 Subject: [PATCH 55/88] Default construction of result tensor tile in `einsum` made more generic. --- src/TiledArray/einsum/tiledarray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index eb317e0aef..48648407cb 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -199,7 +199,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, batch *= H.batch[i].at(h[i]); } ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type(0)); + typename ResultTensor::value_type{}); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); From 5b7c3dd5ed7f43d03ece64f93da8e28a7b5011a0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:00:17 -0500 Subject: [PATCH 56/88] Restore (optional) shape comparison on ToTArrayFixture::are_equal function. 
--- tests/einsum.cpp | 6 +++--- tests/tot_array_fixture.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 9ea4dd39d3..a1c26d1782 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,13 +800,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -988,7 +988,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 1619a794c8..21a9c956c6 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -231,16 +231,15 @@ struct ToTArrayFixture { * - Same type * - Either both are initialized or both are not initialized * - Same MPI context - * - Same shape + * - Same shape (unless the template parameter ShapeCmp is set false) * - Same distribution * - Same tiling * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons - * TODO: shape comparisons */ - template + template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { // Same type @@ -255,7 +254,8 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? 
- // if (lhs.shape() != rhs.shape()) return false; + if constexpr (ShapeCmp) + if (lhs.shape() != rhs.shape()) return false; // Same pmap? // if(*lhs.pmap() != *rhs.pmap()) return false; From df240014a838cf2e43c408f82dff91fd00ac75a0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:03:38 -0500 Subject: [PATCH 57/88] Relax restricitons on this->product_type() values while calling make_tile_op(). --- src/TiledArray/expressions/mult_engine.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 9713e0b0df..20093b2cec 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -408,7 +408,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type()); } else abort(); @@ -436,7 +435,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type(this->element_return_op_), perm); } else abort(); From cbf06b1c8c20aa38bb0d1c65487f75de06f02a23 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 11 Dec 2023 07:35:16 -0500 Subject: [PATCH 58/88] Typo. 
--- tests/einsum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index a1c26d1782..ebd9784bfd 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -1269,7 +1269,7 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_abi_cdi_cdab) { "abi,cdi->cdab"); } -BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_ai_abcd) { +BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_bai_abcd) { einsum_tiledarray_check<3, 3, 4>(random(3, 12, 13), random(14, 15, 3), "icd,bai->abcd"); From c86b7d027560320f52179d8f402ceb460d61fc06 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 15 Dec 2023 09:28:57 -0500 Subject: [PATCH 59/88] [skip ci] einsum unit test for ij;mn * kj;mn -> ijk;mn --- tests/einsum.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ebd9784bfd..eb2ffe1869 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -580,6 +580,40 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { BOOST_CHECK(are_equal); } +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { + using dist_array_t = DistArray>, DensePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + + auto random_tot = [](TA::Range const& rng) { + TA::Range inner_rng{7,14}; + TA::Tensor t{inner_rng}; + TA::Tensor> result{rng}; + for (auto& e: result) e = t; + return result; + }; + + auto random_tot_darr = [&random_tot](World& world, + TiledRange const& tr) { + dist_array_t result(world, tr); + for (auto it = result.begin(); it != result.end(); ++it) { + auto tile = + TA::get_default_world().taskq.add(random_tot, it.make_range()); + *it = tile; + } + return result; + }; + + TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + auto lhs = random_tot_darr(world, lhs_trange); + + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + auto rhs = random_tot_darr(world, rhs_trange); + dist_array_t result; + BOOST_REQUIRE_NO_THROW( + result = einsum(lhs("i,j;m,n"), 
rhs("k,j;m,n"), "i,j,k;m,n")); +} + BOOST_AUTO_TEST_CASE(xxx) { using dist_array_t = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; @@ -1328,6 +1362,13 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_hji_jih_hj) { "hji,jih->hj"); } +BOOST_AUTO_TEST_CASE(einsum_tiledarray_ik_jk_ijk) { + einsum_tiledarray_check<2, 2, 3>(random(7, 5), + random(14, 5), "ik,jk->ijk"); + einsum_tiledarray_check<2, 2, 3>(sparse_zero(7, 5), sparse_zero(14, 5), + "ik,jk->ijk"); +} + BOOST_AUTO_TEST_CASE(einsum_tiledarray_replicated) { einsum_tiledarray_check<3, 3, 3>(replicated(random(7, 14, 3)), random(7, 15, 3), From c72f3f4f0915e921498beeb66f562be32fca805f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 15 Dec 2023 10:45:59 -0500 Subject: [PATCH 60/88] Tensor::gemm involving custom elem_op supports batching --- src/TiledArray/tensor/tensor.h | 75 ++++++++++++++++++++++++---------- tests/einsum.cpp | 4 +- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index f3076c4514..c901dc0f4b 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -292,10 +292,12 @@ class Tensor { /// Construct a tensor with a range equal to \c range. The data is /// uninitialized. 
/// \param range The range of the tensor - explicit Tensor(const range_type& range) - : Tensor(range, 1, default_construct{true}) {} + /// \param batch_size The batch size (default is 1) + explicit Tensor(const range_type& range, size_type batch_size = 1) + : Tensor(range, batch_size, default_construct{true}) {} - /// Construct a tensor with a fill value + /// Construct a tensor of tensor values, setting all elements to the same + /// value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements @@ -312,12 +314,14 @@ class Tensor { new (data + i) value_type(cloner(value)); } - /// Construct a tensor with a fill value + /// Construct a tensor of scalars, setting all elements to the same value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements - template >::type* = nullptr> + template && + !detail::is_tensor::value>::type* = + nullptr> Tensor(const range_type& range, const Value& value) : Tensor(range, 1, default_construct{false}) { detail::tensor_init([value]() -> Value { return value; }, *this); @@ -358,7 +362,7 @@ class Tensor { math::uninitialized_copy_vector(range.volume(), u, this->data()); } - Tensor(const Range& range, std::initializer_list il) + explicit Tensor(const Range& range, std::initializer_list il) : Tensor(range, il.begin()) {} /// Construct a copy of a tensor interface object @@ -1004,6 +1008,22 @@ class Tensor { /// \return A mutable pointer to the tensor data pointer data() { return this->data_.get(); } + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor data of the batch \p batch_idx + const_pointer batch_data(size_t batch_idx) const { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor 
data of the batch \p batch_idx + pointer batch_data(size_t batch_idx) { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + /// Read-only shared_ptr to the data /// \return A const shared_ptr to the tensor data @@ -2194,6 +2214,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + TA_ASSERT(left.batch_size() == right.batch_size()); + const auto batch_sz = left.batch_size(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2207,7 +2229,8 @@ class Tensor { if (this->empty()) { // initialize, if empty *this = Tensor(gemm_helper.make_result_range(left.range(), - right.range())); + right.range()), + batch_sz); } else { // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2230,6 +2253,9 @@ class Tensor { TA_ASSERT(ignore_tile_position() || gemm_helper.right_result_congruent( right.range().upbound_data(), this->range_.upbound_data())); + + // check that batch size of this matches that of left and right + TA_ASSERT(this->batch_size() == batch_sz); } // Compute gemm dimensions @@ -2243,20 +2269,25 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? N : K); - for (integer m = 0; m != M; ++m) { - for (integer n = 0; n != N; ++n) { - auto c_offset = m * N + n; - for (integer k = 0; k != K; ++k) { - auto a_offset = - gemm_helper.left_op() == TiledArray::math::blas::NoTranspose - ? m * lda + k - : k * lda + m; - auto b_offset = - gemm_helper.right_op() == TiledArray::math::blas::NoTranspose - ? 
k * ldb + n - : n * ldb + k; - elem_muladd_op(*(this->data() + c_offset), *(left.data() + a_offset), - *(right.data() + b_offset)); + for (integer b = 0; b != batch_size(); ++b) { + auto this_data = this->batch_data(b); + auto left_data = left.batch_data(b); + auto right_data = right.batch_data(b); + for (integer m = 0; m != M; ++m) { + for (integer n = 0; n != N; ++n) { + auto c_offset = m * N + n; + for (integer k = 0; k != K; ++k) { + auto a_offset = + gemm_helper.left_op() == TiledArray::math::blas::NoTranspose + ? m * lda + k + : k * lda + m; + auto b_offset = + gemm_helper.right_op() == TiledArray::math::blas::NoTranspose + ? k * ldb + n + : n * ldb + k; + elem_muladd_op(*(this_data + c_offset), *(left_data + a_offset), + *(right_data + b_offset)); + } } } } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb2ffe1869..eb976b31f5 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -604,10 +604,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { return result; }; - TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2, 5}}; auto lhs = random_tot_darr(world, lhs_trange); - TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; auto rhs = random_tot_darr(world, rhs_trange); dist_array_t result; BOOST_REQUIRE_NO_THROW( From 657a12887c119bd63366d509595cd486ec5cb081 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 13:10:40 -0500 Subject: [PATCH 61/88] Make single-valued initializer lists explicit in ambiguous cases. 
--- tests/initializer_list.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/initializer_list.cpp b/tests/initializer_list.cpp index 4d051f957d..3f5ad27b80 100644 --- a/tests/initializer_list.cpp +++ b/tests/initializer_list.cpp @@ -471,7 +471,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(vector, T, scalar_type_list) { auto array = array_from_il>(world, tr, il); using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 2.0}), - tile_type(tr.make_tile_range(1), {3.0})}; + tile_type(tr.make_tile_range(1), std::initializer_list{3.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; tile_type tile = array.find(i); @@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(matrix, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; @@ -503,11 +503,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0}), tile_type(tr.make_tile_range(4), {10.0, 13.0}), tile_type(tr.make_tile_range(5), {11.0, 12.0, 14.0, 15.0}), - tile_type(tr.make_tile_range(6), {16.0}), + tile_type(tr.make_tile_range(6), std::initializer_list{16.0}), tile_type(tr.make_tile_range(7), {17.0, 18.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; From a08026c0a5d84343fbbf88118cc935de6e0c45c4 Mon Sep 17 
00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:34:25 -0500 Subject: [PATCH 62/88] Use .data() method to access elements by ordinal in tensor_reduce function. --- src/TiledArray/tensor/kernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 97f7dc1e5b..f1ec6d99c5 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -787,8 +787,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, auto result = identity; for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.at_ordinal(ord), - tensors.at_ordinal(ord)...); + tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } From a5b253b5429bc6dbcafc2ee177c259f71502117f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:36:08 -0500 Subject: [PATCH 63/88] Implement Tot x T (and reverse) generalized contraction. 
--- src/TiledArray/einsum/tiledarray.h | 84 +++++++++++++++--------------- tests/einsum.cpp | 14 +++-- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 48648407cb..2bd548df5c 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -181,50 +181,51 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) { - TA_ASSERT(e); - } else if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); - RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); - } - auto pa = A.permutation; - auto pb = B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); + if constexpr (std::tuple_size::value > 1) TA_ASSERT(e); + if constexpr (AreArraySame) { + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); } - ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type{}); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) 
continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } + } + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); + return C.array; } - return C.array; } // generalized contraction @@ -468,7 +469,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { using ECT = expressions::TsrExpr; using ECU = expressions::TsrExpr; - return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); + using ResultExprT = std::conditional_t, T, U>; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb976b31f5..3e7b502da9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -845,7 +845,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { } } -BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; 
using matrix_il = TiledArray::detail::matrix_il>; @@ -877,7 +877,6 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - // TODO compute ref_result // i,j;m,n * j,k => i,j,k;m,n TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), rhs_trange.dim(1)}; @@ -928,10 +927,17 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("i,j,k;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); + tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + { + result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); + are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + } } BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { From f001847d09461a37d5686c34a1155f50b1a1fb63 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 21 Dec 2023 15:05:19 -0500 Subject: [PATCH 64/88] einsum tot x tot 'i,j;m,n * j,k;m,n -> i,jk;m,n' unit-test compares results --- tests/einsum.cpp | 51 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3e7b502da9..3e66e4b05b 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -581,13 +581,16 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { } BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { - using dist_array_t = DistArray>, DensePolicy>; + using tot_type = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; auto& world = TiledArray::get_default_world(); auto random_tot = [](TA::Range const& rng) { TA::Range inner_rng{7,14}; TA::Tensor t{inner_rng}; + 
std::generate(t.begin(),t.end(),[]()->double{ + return TA::detail::MakeRandom::generate_value(); + }); TA::Tensor> result{rng}; for (auto& e: result) e = t; return result; @@ -595,7 +598,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto random_tot_darr = [&random_tot](World& world, TiledRange const& tr) { - dist_array_t result(world, tr); + tot_type result(world, tr); for (auto it = result.begin(); it != result.end(); ++it) { auto tile = TA::get_default_world().taskq.add(random_tot, it.make_range()); @@ -609,9 +612,51 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; auto rhs = random_tot_darr(world, rhs_trange); - dist_array_t result; + tot_type result; BOOST_REQUIRE_NO_THROW( result = einsum(lhs("i,j;m,n"), rhs("k,j;m,n"), "i,j,k;m,n")); + + // i,j,k;m,n = i,j;m,n * k,j;m,n + TiledRange ref_result_trange{lhs.trange().dim(0), lhs.trange().dim(1), + rhs.trange().dim(0)}; + tot_type ref_result(world, ref_result_trange); + + // + // why cannot lhs and rhs be captured by ref? 
+ // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix: result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + using Ix2 = std::array; + using Ix3 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, j}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, j})); + res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); } BOOST_AUTO_TEST_CASE(xxx) { From f4bba8e9fd6bc879dd2e92ca342827249701bbfc Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 21 Dec 2023 15:19:35 -0500 Subject: [PATCH 65/88] Make shape comparison flags more explicit. 
--- tests/einsum.cpp | 12 ++++++------ tests/tot_array_fixture.h | 10 ++++++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3e66e4b05b..e518626e97 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -655,7 +655,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); *it = tile; } - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); } @@ -879,13 +879,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -976,11 +976,11 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { // will try to make this work tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); { result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); - are_equal = ToTArrayFixture::are_equal(result, ref_result); + are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); } } @@ -1073,7 +1073,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool 
are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 21a9c956c6..c01399dbba 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -88,6 +88,12 @@ using input_archive_type = madness::archive::BinaryFstreamInputArchive; // Type of an output archive using output_archive_type = madness::archive::BinaryFstreamOutputArchive; +enum class ShapeComp { + True, + False +}; + + /* * * When generating arrays containing tensors of tensors (ToT) we adopt simple @@ -238,7 +244,7 @@ struct ToTArrayFixture { * * TODO: pmap comparisons */ - template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { @@ -254,7 +260,7 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if constexpr (ShapeCmp) + if constexpr (ShapeCompFlag == ShapeComp::True) if (lhs.shape() != rhs.shape()) return false; // Same pmap? From 0c30bb349dcbb1fd9489d07fb146e3de7d7fb413 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 23 Dec 2023 07:53:44 -0500 Subject: [PATCH 66/88] use version-controlled clang-format.sh from https://github.com/ValeevGroup/DevOps/blob/master/tools/clang-format/clang-format.sh --- .pre-commit-config.yaml | 4 +- bin/admin/clang-format.sh | 94 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 2 deletions(-) create mode 100755 bin/admin/clang-format.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23f1509ca1..fd5c27bf6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,5 +38,5 @@ repos: name: Format C/C++ code using clang-format. 
language: system files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ - entry: clang-format -i - args: [--style=file] + entry: bin/admin/clang-format.sh + args: [--style=file -i] diff --git a/bin/admin/clang-format.sh b/bin/admin/clang-format.sh new file mode 100755 index 0000000000..3531dcc1b3 --- /dev/null +++ b/bin/admin/clang-format.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# these are the versions of clang-format that are supported required +# should be ordered from oldest to newest to make sure the newest is picked +supported_clang_format_versions="16 17" +preferred_clang_format_version="" # prefer most recent supported clang-format version +for v in $supported_clang_format_versions; do + preferred_clang_format_version=$v +done + +# append common locations of clang-format to PATH +unameOut="$(uname -s)" +case "${unameOut}" in + Darwin*) + extra_path="" + # this prefers more recent versions + for v in $supported_clang_format_versions; do + extra_path=/opt/homebrew/opt/llvm@$v/bin:/opt/homebrew/opt/clang-format@$v/bin:$extra_path + done + # prepend paths + export PATH=$extra_path:$PATH:/opt/homebrew/bin + ;; +esac + +path_to_clang_format=`which clang-format` +have_supported_clang_format_version=0 +if [[ "X$path_to_clang_format" != "X" ]]; then + + # check clang-format version + clang_format_version=`clang-format --version | sed 's/.* version //' | awk -F'[.]' '{print $1}'` + + #echo "supported_clang_format_versions=\"$supported_clang_format_versions\" clang_format_version=$clang_format_version" + + # if found clang-format, but wrong version, check if docker is available + for v in $supported_clang_format_versions; do + if [[ $clang_format_version -eq $v ]]; then + have_supported_clang_format_version=1 + break + fi + done +fi + +if [[ $have_supported_clang_format_version -eq 0 ]]; then + echo "WARNING: found clang-format with unsupported version $clang_format_version (supported versions: $supported_clang_format_versions)" + + # look for docker + path_to_docker=`which docker` + if [[ 
"X$path_to_docker" = "X" ]]; then + echo "ERROR: docker is not found either, PATH=$PATH, install one of supported clang-format versions (any of these: $supported_clang_format_versions) or install docker" + exit 1 + fi + + # if docker up? + docker info >/dev/null 2>&1 + if [[ $? -ne 0 ]]; then + echo "ERROR: docker is found but not running, start it" + exit 1 + fi + + # use docker to run clang-format + mount_path=$(readlink -f "$HOME") + + # convert file names in the arguments to relative paths + args="" + for i in "$@"; do + # skip options + if [[ "$i" == -* ]]; then + args="$args $i" + continue + fi + abs_file_path=$(readlink -f "$i") + if [[ "X$abs_file_path" = "X" ]]; then + echo "ERROR: given file $i is not found" + exit 1 + fi + + dir=$(dirname $abs_file_path) + file_path_relative_to_project_root=$(basename $abs_file_path) + while [[ "$dir" != "$mount_path" && "$dir" != "/" ]]; do + file_path_relative_to_project_root="$(basename $dir)/$file_path_relative_to_project_root" + dir=$(dirname $dir) + #echo "dir=$dir file_path_relative_to_project_root=$file_path_relative_to_project_root" + done + if [[ "$dir" == "/" ]]; then + echo "ERROR: given file $i (absolute path $abs_file_path) is not under \$HOME=$mount_path, cannot use docker-based clang-format in this case" + exit 1 + fi + args="$args /hostHOME/$file_path_relative_to_project_root" + done + docker run --platform linux/x86_64 -v $mount_path:/hostHOME xianpengshen/clang-tools:$preferred_clang_format_version clang-format $args +else + #echo "found $path_to_clang_format with required version $clang_format_version" + clang-format $* +fi From ba0be00b5e7ea9fc6b31a7789be81bd4a4cae959 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 23 Dec 2023 07:56:35 -0500 Subject: [PATCH 67/88] [ut] einsum_tot/ijk_mn_eq_ij_mn_times_kj_mn : how NOT to compute ref_result --- tests/einsum.cpp | 78 ++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/tests/einsum.cpp 
b/tests/einsum.cpp index e518626e97..22a6ddc326 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -586,18 +586,17 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto& world = TiledArray::get_default_world(); auto random_tot = [](TA::Range const& rng) { - TA::Range inner_rng{7,14}; + TA::Range inner_rng{7, 14}; TA::Tensor t{inner_rng}; - std::generate(t.begin(),t.end(),[]()->double{ + std::generate(t.begin(), t.end(), []() -> double { return TA::detail::MakeRandom::generate_value(); }); TA::Tensor> result{rng}; - for (auto& e: result) e = t; + for (auto& e : result) e = t; return result; }; - auto random_tot_darr = [&random_tot](World& world, - TiledRange const& tr) { + auto random_tot_darr = [&random_tot](World& world, TiledRange const& tr) { tot_type result(world, tr); for (auto it = result.begin(); it != result.end(); ++it) { auto tile = @@ -621,12 +620,9 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { rhs.trange().dim(0)}; tot_type ref_result(world, ref_result_trange); - // - // why cannot lhs and rhs be captured by ref? 
- // - auto make_tile = [lhs, rhs](TA::Range const& rng) { + auto make_tile = [&lhs, &rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; - for (auto&& res_ix: result_tile.range()) { + for (auto&& res_ix : result_tile.range()) { auto i = res_ix[0]; auto j = res_ix[1]; auto k = res_ix[2]; @@ -643,7 +639,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto const& lhs_el = lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, j})); - res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n + res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n } return result_tile; }; @@ -651,12 +647,28 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using std::begin; using std::end; - for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + const auto have_spare_threads = madness::ThreadPool::size() > 0; + if (have_spare_threads) { + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + // using tasks does not work because: + // - make_tile pulls possibly remote data + // - but it also blocks thread on a remote tile futures, whose + // fulfillment requires available threads in the pool + // + // *it = world.taskq.add(make_tile, it.make_range()); + + // this technically will only work if the number of free threads in the + // pool is > 0 (i.e. 
main is not part of the pool or pool has 2 threads) + // + // OK, fine, @bosilca, blocking in tasks is BAD + *it = make_tile(it.make_range()); + } + } + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); } - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - BOOST_REQUIRE(are_equal); } BOOST_AUTO_TEST_CASE(xxx) { @@ -879,13 +891,15 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -976,11 +990,13 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { // will try to make this work tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); { result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); - are_equal = ToTArrayFixture::are_equal(result, ref_result); + are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); } } @@ -1014,12 +1030,9 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); Tensor lhs_elem_5_1( Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); - matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, - {lhs_elem_1_0, lhs_elem_1_1}, - {lhs_elem_2_0, lhs_elem_2_1}, - {lhs_elem_3_0, lhs_elem_3_1}, - {lhs_elem_4_0, lhs_elem_4_1}, - {lhs_elem_5_0, lhs_elem_5_1}}; + 
matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, {lhs_elem_3_0, lhs_elem_3_1}, + {lhs_elem_4_0, lhs_elem_4_1}, {lhs_elem_5_0, lhs_elem_5_1}}; TiledRange lhs_trange{{0, 2, 6}, {0, 2}}; tot_type lhs(world, lhs_trange, lhs_il); @@ -1046,17 +1059,15 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false ); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); auto const& lhs_el = lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); - auto rhs_el = - rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); - res_el = tot_type::element_type( - lhs_el.scale(rhs_el), // scale - TiledArray::Permutation{0, 1} // permute + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + res_el = tot_type::element_type(lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute ); } return result_tile; @@ -1073,7 +1084,8 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } From 987040b68c06c69c10cd11728f493dfa55cedf0f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 23 Dec 2023 08:05:34 -0500 Subject: [PATCH 68/88] [ut] einsum_tot/ijk_mn_eq_ij_mn_times_kj_mn : how to compute ref_result --- tests/einsum.cpp | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 22a6ddc326..12692dc515 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -620,6 +620,10 @@ 
BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { rhs.trange().dim(0)}; tot_type ref_result(world, ref_result_trange); + // to be able to pull remote tiles make them local AND ready + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); auto make_tile = [&lhs, &rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; for (auto&& res_ix : result_tile.range()) { @@ -630,9 +634,9 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, j}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -647,28 +651,14 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using std::begin; using std::end; - const auto have_spare_threads = madness::ThreadPool::size() > 0; - if (have_spare_threads) { - for (auto it = begin(ref_result); it != end(ref_result); ++it) { - if (ref_result.is_local(it.index())) { - // using tasks does not work because: - // - make_tile pulls possibly remote data - // - but it also blocks thread on a remote tile futures, whose - // fulfillment requires available threads in the pool - // - // *it = world.taskq.add(make_tile, it.make_range()); - - // this technically will only work if the number of free threads in the - // pool is > 0 (i.e. 
main is not part of the pool or pool has 2 threads) - // - // OK, fine, @bosilca, blocking in tasks is BAD - *it = make_tile(it.make_range()); - } + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + *it = world.taskq.add(make_tile, it.make_range()); } - bool are_equal = - ToTArrayFixture::are_equal(result, ref_result); - BOOST_REQUIRE(are_equal); } + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); } BOOST_AUTO_TEST_CASE(xxx) { From 2392f2018d005c89ba804a2db78c891e24b7eb8c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 23 Dec 2023 09:35:14 -0500 Subject: [PATCH 69/88] [ut] ref result manual computation pattern from previous commit applied to more cases. --- tests/einsum.cpp | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 12692dc515..57a31a48e8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -839,6 +839,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { // i,l,k,j;n,m = i,j;m,n * k,l // + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + // why cannot lhs and rhs be captured by ref? 
auto make_tile = [lhs, rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; @@ -852,10 +856,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using Ix4 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); @@ -874,8 +878,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } } tot_type result; @@ -931,6 +937,9 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); + lhs.make_replicated(); + rhs.make_replicated(); + // // why cannot lhs and rhs be captured by ref? 
// @@ -945,10 +954,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -965,8 +974,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } } ///////////////////////////////////////////////////////// @@ -1036,6 +1047,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + // why cannot lhs and rhs be captured by ref? 
auto make_tile = [lhs, rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; @@ -1046,10 +1061,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { using Ix2 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); @@ -1067,8 +1082,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } } tot_type result; From 8b365a91ad6834071491f1525c9b426e66f02b81 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 23 Dec 2023 10:35:24 -0500 Subject: [PATCH 70/88] [ut] typo --- tests/einsum.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 57a31a48e8..49e6812cac 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -939,6 +939,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { lhs.make_replicated(); rhs.make_replicated(); + world.gop.fence(); // // why cannot lhs and rhs be captured by ref? From 6c7a9f498b12101da345b519d496a4f9c33f89fd Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 23 Dec 2023 12:33:18 -0500 Subject: [PATCH 71/88] [ci skip] add .batched_size() method to Tensor that returns size() multiplied by batch_size(). 
--- src/TiledArray/tensor/tensor.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index c901dc0f4b..e6c98b0cf0 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -672,6 +672,10 @@ class Tensor { /// \return The number of elements in the tensor ordinal_type size() const { return (this->range().volume()); } + /// \return The number of elements in the tensor by summing up the sizes of + /// the batches. + ordinal_type batched_size() const { return size() * batch_size(); } + /// Tensor data size (in bytes) accessor /// \return The number of bytes occupied by this tensor's data @@ -1064,10 +1068,10 @@ class Tensor { bool empty = this->empty(); auto range = this->range_; auto batch_size = this->batch_size_; - ar& empty; + ar & empty; if (!empty) { - ar& range; - ar& batch_size; + ar & range; + ar & batch_size; if constexpr (madness::is_input_archive_v) { *this = Tensor(std::move(range), batch_size, default_construct{true}); } From 60327021442f33bfff3e4e8d60ab7adce4c337a5 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 24 Dec 2023 12:31:11 -0500 Subject: [PATCH 72/88] Tensor reduce works on batch_size() * volume() many elements. 
--- src/TiledArray/tensor/kernels.h | 41 ++++++++++++++++++++++------- src/TiledArray/tensor/type_traits.h | 17 ++++++++++++ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index f1ec6d99c5..c2f7c0897d 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -714,7 +714,12 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); auto init = std::forward(identity); math::reduce_op(std::forward(reduce_op), @@ -782,13 +787,17 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); auto result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { - auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], - tensors.data()[ord]...); + for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { + auto temp = tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } @@ -825,7 +834,12 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); auto result 
= identity; if constexpr (detail::has_member_function_data_anyreturn_v && @@ -840,6 +854,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, join_op(result, temp); } } else { // if 1+ tensor lacks data() must iterate over individual elements + // TA_ASSERT(tensor1.batch_size() == 1); // todo: asser the same for the + // remaining tensors auto& t1_rng = tensor1.range(); using signed_idx_t = Range::index_difference_type; auto t1_lobound = signed_idx_t(t1_rng.lobound()); @@ -884,8 +900,15 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); + // TA_ASSERT(tensor1.batch_size() == 1); // todo: assert the same for the + // remaining tensors - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); Scalar result = identity; @@ -897,7 +920,7 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar& MADNESS_RESTRICT result, typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { + for (std::remove_cv_t i = 0ul; i < stride; ++i) { Scalar temp = tensor_reduce(reduce_op, join_op, identity, tensor1_data[i], tensors_data[i]...); join_op(result, temp); diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index fd197c8cdf..10fdb70204 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -400,6 +400,23 @@ struct ordinal_traits>> { std::decay_t().range())>>::type; }; +template +class has_batch_size { + /// true case + template + static auto __test(U* p) -> decltype(p->batch_size(), std::true_type()); + /// false case + template + static std::false_type __test(...); + + public: + static constexpr const bool value = + std::is_same(0))>::value; +}; + +template +constexpr inline bool has_batch_size_v = has_batch_size::value; + } // namespace detail } // namespace TiledArray From 959c84fe3f99b59b6e8cc3173ccea4a46557ea0f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 25 Dec 2023 15:32:44 -0500 Subject: [PATCH 73/88] Rename TA::Tensor member function 'batched_size' to 'total_size'. 
--- src/TiledArray/tensor/kernels.h | 16 ++++++++-------- src/TiledArray/tensor/tensor.h | 2 +- src/TiledArray/tensor/type_traits.h | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index c2f7c0897d..d87007205b 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -715,8 +715,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); @@ -788,8 +788,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); @@ -835,8 +835,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); @@ -904,8 +904,8 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, // remaining tensors const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index e6c98b0cf0..15f2dcdd3e 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -674,7 +674,7 @@ class Tensor { 
/// \return The number of elements in the tensor by summing up the sizes of /// the batches. - ordinal_type batched_size() const { return size() * batch_size(); } + ordinal_type total_size() const { return size() * batch_size(); } /// Tensor data size (in bytes) accessor diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 10fdb70204..89f8da70a2 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -401,10 +401,10 @@ struct ordinal_traits>> { }; template -class has_batch_size { +class has_total_size { /// true case template - static auto __test(U* p) -> decltype(p->batch_size(), std::true_type()); + static auto __test(U* p) -> decltype(p->total_size(), std::true_type()); /// false case template static std::false_type __test(...); @@ -415,7 +415,7 @@ class has_batch_size { }; template -constexpr inline bool has_batch_size_v = has_batch_size::value; +constexpr inline bool has_total_size_v = has_total_size::value; } // namespace detail From f0cd2a9b1b5166e8c856c768b8c602990be3480c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 25 Dec 2023 19:43:07 -0500 Subject: [PATCH 74/88] [cmake] disable clang-format use by umpire/blt --- external/umpire.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/external/umpire.cmake b/external/umpire.cmake index aa98f27b1e..efa0a0da36 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -87,6 +87,7 @@ else() -DENABLE_EXAMPLES=OFF -DENABLE_LOGGING=OFF -DENABLE_ASSERTS=${enable_umpire_asserts} + -DENABLE_CLANGFORMAT=OFF ) # caveat: on recent Ubuntu default libstdc++ provides filesystem, but if using older gcc (gcc-8) must link against From 0d4d2b6dc60adeabdeab08c3cc80efd5553f5bea Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 25 Dec 2023 20:46:37 -0500 Subject: [PATCH 75/88] Rename TA::Tensor and TA::Tile member function 'batch_size' to more revealing 'nbatch'. 
--- src/TiledArray/einsum/tiledarray.h | 2 +- src/TiledArray/tensor.h | 4 +- src/TiledArray/tensor/kernels.h | 4 +- src/TiledArray/tensor/tensor.h | 184 +++++++++++----------- src/TiledArray/tile.h | 10 +- src/TiledArray/tile_op/binary_reduction.h | 4 +- 6 files changed, 103 insertions(+), 105 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 2bd548df5c..18a3871f0b 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -293,7 +293,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, if (C.ei.is_zero(e)) continue; // TODO no need for immediate evaluation auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); + assert(tile.nbatch() == batch); const Permutation &P = C.permutation; auto c = apply(P, h + e); auto shape = C.array.trange().tile(c); diff --git a/src/TiledArray/tensor.h b/src/TiledArray/tensor.h index edb7ba2e47..20ecab9e0e 100644 --- a/src/TiledArray/tensor.h +++ b/src/TiledArray/tensor.h @@ -63,8 +63,8 @@ inline std::ostream& operator<<(std::ostream& os, const T& t) { os << t.range() << " { "; const auto n = t.range().volume(); std::size_t offset = 0ul; - const auto more_than_1_batch = t.batch_size() > 1; - for (auto b = 0ul; b != t.batch_size(); ++b) { + const auto more_than_1_batch = t.nbatch() > 1; + for (auto b = 0ul; b != t.nbatch(); ++b) { if (more_than_1_batch) { os << "[batch " << b << "]{ "; } diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index d87007205b..682cb1b209 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -854,7 +854,7 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, join_op(result, temp); } } else { // if 1+ tensor lacks data() must iterate over individual elements - // TA_ASSERT(tensor1.batch_size() == 1); // todo: asser the same for the + // TA_ASSERT(tensor1.nbatch() == 1); // todo: assert the same for the // remaining tensors 
auto& t1_rng = tensor1.range(); using signed_idx_t = Range::index_difference_type; @@ -900,7 +900,7 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - // TA_ASSERT(tensor1.batch_size() == 1); // todo: assert the same for the + // TA_ASSERT(tensor1.nbatch() == 1); // todo: assert the same for the // remaining tensors const auto volume = [&tensor1]() { diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 15f2dcdd3e..1b5beff19d 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -139,9 +139,9 @@ class Tensor { private: using default_construct = bool; - Tensor(const range_type& range, size_t batch_size, bool default_construct) - : range_(range), batch_size_(batch_size) { - size_t size = range_.volume() * batch_size; + Tensor(const range_type& range, size_t nbatch, bool default_construct) + : range_(range), nbatch_(nbatch) { + size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); if (default_construct) { @@ -177,9 +177,9 @@ class Tensor { #endif } - Tensor(range_type&& range, size_t batch_size, bool default_construct) - : range_(std::move(range)), batch_size_(batch_size) { - size_t size = range_.volume() * batch_size; + Tensor(range_type&& range, size_t nbatch, bool default_construct) + : range_(std::move(range)), nbatch_(nbatch) { + size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); if (default_construct) { @@ -232,7 +232,7 @@ class Tensor { range_type range_; ///< Range /// Number of `range_`-sized blocks in `data_` /// \note this is not used for (in)equality comparison - size_t batch_size_ = 1; + size_t nbatch_ = 1; std::shared_ptr data_; ///< Shared pointer to the data public: @@ -246,9 +246,7 @@ class Tensor { /// \post `*this` is a shallow copy of \p other , /// i.e. 
`*this == other && this->data()==other.data()` Tensor(const Tensor& other) - : range_(other.range_), - batch_size_(other.batch_size_), - data_(other.data_) { + : range_(other.range_), nbatch_(other.nbatch_), data_(other.data_) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -266,7 +264,7 @@ class Tensor { /// \post `other.empty()` Tensor(Tensor&& other) : range_(std::move(other.range_)), - batch_size_(std::move(other.batch_size_)), + nbatch_(std::move(other.nbatch_)), data_(std::move(other.data_)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -292,9 +290,9 @@ class Tensor { /// Construct a tensor with a range equal to \c range. The data is /// uninitialized. /// \param range The range of the tensor - /// \param batch_size The batch size (default is 1) - explicit Tensor(const range_type& range, size_type batch_size = 1) - : Tensor(range, batch_size, default_construct{true}) {} + /// \param nbatch The number of batches (default is 1) + explicit Tensor(const range_type& range, size_type nbatch = 1) + : Tensor(range, nbatch, default_construct{true}) {} /// Construct a tensor of tensor values, setting all elements to the same /// value @@ -519,15 +517,15 @@ class Tensor { /// Construct a tensor with a range equal to \c range using existing data /// \param range The range of the tensor - /// \param batch_size The batch size + /// \param nbatch The number of batches /// \param data shared pointer to the data - Tensor(const range_type& range, size_t batch_size, + Tensor(const range_type& range, size_t nbatch, std::shared_ptr data) - : range_(range), batch_size_(batch_size), data_(std::move(data)) { + : range_(range), nbatch_(nbatch), data_(std::move(data)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( - this, make_string("TA::Tensor(range, batch_size, data)::data_.get()=", + this, make_string("TA::Tensor(range, nbatch, data)::data_.get()=", 
data_.get())); } #endif @@ -537,7 +535,7 @@ class Tensor { /// assuming unit batch size \param range The range of the tensor \param data /// shared pointer to the data Tensor(const range_type& range, std::shared_ptr data) - : range_(range), batch_size_(1), data_(std::move(data)) { + : range_(range), nbatch_(1), data_(std::move(data)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -550,14 +548,14 @@ class Tensor { /// The batch size accessor /// @return the size of tensor batch represented by `*this` - size_t batch_size() const { return this->batch_size_; } + size_t nbatch() const { return this->nbatch_; } /// @param[in] idx the batch index - /// @pre `idx < this->batch_size()` - /// @return (plain, i.e. batch_size=1) Tensor representing element \p idx of + /// @pre `idx < this->nbatch()` + /// @return (plain, i.e. nbatch=1) Tensor representing element \p idx of /// the batch Tensor batch(size_t idx) const { - TA_ASSERT(idx < this->batch_size()); + TA_ASSERT(idx < this->nbatch()); std::shared_ptr data(this->data_, this->data_.get() + idx * this->size()); return Tensor(this->range(), 1, data); @@ -566,13 +564,13 @@ class Tensor { /// Returns Tensor representing the data using another range and batch size /// @param[in] range the Range of the result - /// @param[in] batch_size the batch size of the result + /// @param[in] nbatch the number of batches of the result /// @return Tensor object representing `this->data()` using @p range and @p - /// batch_size - auto reshape(const range_type& range, size_t batch_size = 1) const { - TA_ASSERT(this->range().volume() * this->batch_size() == - range.volume() * batch_size); - return Tensor(range, batch_size, this->data_); + /// nbatch + auto reshape(const range_type& range, size_t nbatch = 1) const { + TA_ASSERT(this->range().volume() * this->nbatch() == + range.volume() * nbatch); + return Tensor(range, nbatch, this->data_); } /// @return a deep copy of `*this` @@ -617,7 +615,7 
@@ class Tensor { } #endif range_ = other.range_; - batch_size_ = other.batch_size_; + nbatch_ = other.nbatch_; data_ = other.data_; #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -650,7 +648,7 @@ class Tensor { } #endif range_ = std::move(other.range_); - batch_size_ = std::move(other.batch_size_); + nbatch_ = std::move(other.nbatch_); data_ = std::move(other.data_); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -674,14 +672,14 @@ class Tensor { /// \return The number of elements in the tensor by summing up the sizes of /// the batches. - ordinal_type total_size() const { return size() * batch_size(); } + ordinal_type total_size() const { return size() * nbatch(); } /// Tensor data size (in bytes) accessor /// \return The number of bytes occupied by this tensor's data /// \warning this only returns valid value if this is a tensor of scalars std::size_t nbytes() const { - return this->range().volume() * this->batch_size_ * sizeof(T); + return this->range().volume() * this->nbatch_ * sizeof(T); } /// Const element accessor @@ -690,7 +688,7 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> const_reference operator[](const Ordinal ord) const { @@ -700,7 +698,7 @@ class Tensor { TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -711,7 +709,7 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> reference operator[](const Ordinal ord) { @@ -721,7 +719,7 @@ class Tensor { TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -732,12 +730,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> const_reference at_ordinal(const Ordinal ord) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -748,12 +746,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> reference at_ordinal(const Ordinal ord) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -764,12 +762,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator[](const Index& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -781,12 +779,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator[](const Index& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -798,12 +796,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator[](const std::initializer_list& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -815,12 +813,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator[](const std::initializer_list& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -832,12 +830,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const Ordinal& ord) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) // thus assume at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && @@ -853,12 +851,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const Ordinal& ord) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) // thus assume at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && @@ -874,12 +872,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const Index& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -891,12 +889,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const Index& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -908,12 +906,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const std::initializer_list& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -925,12 +923,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const std::initializer_list& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -943,14 +941,14 @@ class Tensor { /// \param[in] i an index \return Const reference to the element at position /// \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template < typename... Index, std::enable_if_t<(sizeof...(Index) > 1ul) && detail::is_integral_list::value>* = nullptr> const_reference operator()(const Index&... i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -965,14 +963,14 @@ class Tensor { /// \param[in] i an index \return Reference to the element at position \c i /// . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template < typename... Index, std::enable_if_t<(sizeof...(Index) > 1ul) && detail::is_integral_list::value>* = nullptr> reference operator()(const Index&... 
i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -1013,18 +1011,18 @@ class Tensor { pointer data() { return this->data_.get(); } /// @param[in] batch_idx the batch index - /// @pre `batch_idx < this->batch_size()` + /// @pre `batch_idx < this->nbatch()` /// @return A const pointer to the tensor data of the batch \p batch_idx const_pointer batch_data(size_t batch_idx) const { - TA_ASSERT(batch_idx < this->batch_size()); + TA_ASSERT(batch_idx < this->nbatch()); return data() + batch_idx * size(); } /// @param[in] batch_idx the batch index - /// @pre `batch_idx < this->batch_size()` + /// @pre `batch_idx < this->nbatch()` /// @return A const pointer to the tensor data of the batch \p batch_idx pointer batch_data(size_t batch_idx) { - TA_ASSERT(batch_idx < this->batch_size()); + TA_ASSERT(batch_idx < this->nbatch()); return data() + batch_idx * size(); } @@ -1049,9 +1047,9 @@ class Tensor { /// (`this->empty()` is equivalent to `*this == Tensor{}`), /// but is not identical /// to a default-constructed Tensor (e.g., `this->empty()` does not - /// imply `this->batch_size() == Tensor{}.batch_size()`) + /// imply `this->nbatch() == Tensor{}.nbatch()`) bool empty() const { - // empty data_ implies default values for range_ (but NOT batch_size_) + // empty data_ implies default values for range_ (but NOT nbatch_) TA_ASSERT( (this->data_.use_count() == 0 && !this->range_) || (this->data_.use_count() != 0 && this->range_)); // range is empty @@ -1067,16 +1065,16 @@ class Tensor { void serialize(Archive& ar) { bool empty = this->empty(); auto range = this->range_; - auto batch_size = this->batch_size_; + auto nbatch = this->nbatch_; ar & empty; if (!empty) { ar & range; - ar & batch_size; + ar & nbatch; if constexpr (madness::is_input_archive_v) { - *this = Tensor(std::move(range), batch_size, 
default_construct{true}); + *this = Tensor(std::move(range), nbatch, default_construct{true}); } ar& madness::archive::wrap(this->data_.get(), - this->range_.volume() * batch_size); + this->range_.volume() * nbatch); } else { if constexpr (madness::is_input_archive_v) { *this = Tensor{}; @@ -1105,7 +1103,7 @@ class Tensor { #endif std::swap(data_, other.data_); std::swap(range_, other.range_); - std::swap(batch_size_, other.batch_size_); + std::swap(nbatch_, other.nbatch_); #ifdef TA_TENSOR_MEM_TRACE if (other_to_be_traced) { ptr_registry()->insert( @@ -2123,11 +2121,11 @@ class Tensor { if (this->empty()) { *this = Tensor(gemm_helper.make_result_range(A.range_, B.range()), - A.batch_size(), default_construct{true}); + A.nbatch(), default_construct{true}); beta = 0; } - TA_ASSERT(this->batch_size() == A.batch_size()); - TA_ASSERT(this->batch_size() == B.batch_size()); + TA_ASSERT(this->nbatch() == A.nbatch()); + TA_ASSERT(this->nbatch() == B.nbatch()); // may need to split gemm into multiply + accumulate for tracing purposes #ifdef TA_ENABLE_TILE_OPS_LOGGING @@ -2138,11 +2136,11 @@ class Tensor { std::unique_ptr data_copy; size_t tile_volume; if (twostep) { - tile_volume = range().volume() * batch_size(); + tile_volume = range().volume() * nbatch(); data_copy = std::make_unique(tile_volume); std::copy(data_.get(), data_.get() + tile_volume, data_copy.get()); } - for (size_t i = 0; i < this->batch_size(); ++i) { + for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); TiledArray::gemm(alpha, A.batch(i), B.batch(i), twostep ? 
numeric_type(0) : numeric_type(1), Ci, @@ -2183,7 +2181,7 @@ class Tensor { TiledArray::TileOpsLogger::get_instance().gemm_printer( *logger.log, tformed_left_range, A.data(), tformed_right_range, B.data(), tformed_right_range, - this->data(), this->batch_size()); + this->data(), this->nbatch()); } } } @@ -2196,7 +2194,7 @@ class Tensor { } } #else // TA_ENABLE_TILE_OPS_LOGGING - for (size_t i = 0; i < this->batch_size(); ++i) { + for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); TiledArray::gemm(alpha, A.batch(i), B.batch(i), beta, Ci, gemm_helper); } @@ -2218,8 +2216,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); - TA_ASSERT(left.batch_size() == right.batch_size()); - const auto batch_sz = left.batch_size(); + TA_ASSERT(left.nbatch() == right.nbatch()); + const auto batch_sz = left.nbatch(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2259,7 +2257,7 @@ class Tensor { right.range().upbound_data(), this->range_.upbound_data())); // check that batch size of this matches that of left and right - TA_ASSERT(this->batch_size() == batch_sz); + TA_ASSERT(this->nbatch() == batch_sz); } // Compute gemm dimensions @@ -2273,7 +2271,7 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
N : K); - for (integer b = 0; b != batch_size(); ++b) { + for (integer b = 0; b != nbatch(); ++b) { auto this_data = this->batch_data(b); auto left_data = left.batch_data(b); auto right_data = right.batch_data(b); @@ -2599,9 +2597,9 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, TA_ASSERT(!B.empty()); TA_ASSERT(B.range().rank() == gemm_helper.right_rank()); - TA_ASSERT(A.batch_size() == 1); - TA_ASSERT(B.batch_size() == 1); - TA_ASSERT(C.batch_size() == 1); + TA_ASSERT(A.nbatch() == 1); + TA_ASSERT(B.nbatch() == 1); + TA_ASSERT(C.nbatch() == 1); // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2699,7 +2697,7 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, TiledArray::TileOpsLogger::get_instance().gemm_printer( *logger.log, tformed_left_range, A.data(), tformed_right_range, B.data(), tformed_right_range, C.data(), - C.batch_size()); + C.nbatch()); } } } @@ -2725,8 +2723,8 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, /// \param[in] a a Tensor object /// \param[in] b another Tensor object /// \return true if ranges and data of \p a and \p b are equal -/// \internal this does not compare batch_size so any -/// 2 empty tensors are equal even if their batch_size +/// \internal this does not compare nbatch so any +/// 2 empty tensors are equal even if their nbatch /// differ template bool operator==(const Tensor& a, const Tensor& b) { diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index b8242fbf19..1091362287 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -589,7 +589,7 @@ class Tile { void serialize(Archive& ar) const { // Serialize data for empty tile check bool empty = !static_cast(pimpl_); - ar& empty; + ar & empty; if (!empty) { // Serialize tile data ar&* pimpl_; @@ -602,12 +602,12 @@ class Tile { void serialize(Archive& ar) { // Check for empty tile bool empty = false; - ar& empty; + ar & empty; if (!empty) { // Deserialize tile data tensor_type 
tensor; - ar& tensor; + ar & tensor; // construct a new pimpl pimpl_ = std::make_shared(std::move(tensor)); @@ -617,10 +617,10 @@ class Tile { } } - constexpr static std::size_t batch_size() { return 1; } + constexpr static std::size_t nbatch() { return 1; } const auto& batch(std::size_t idx) const { - TA_ASSERT(idx < this->batch_size()); + TA_ASSERT(idx < this->nbatch()); return *this; } diff --git a/src/TiledArray/tile_op/binary_reduction.h b/src/TiledArray/tile_op/binary_reduction.h index d65d133f32..4bbac16bcf 100644 --- a/src/TiledArray/tile_op/binary_reduction.h +++ b/src/TiledArray/tile_op/binary_reduction.h @@ -63,8 +63,8 @@ class DotReduction { void operator()(result_type& result, const first_argument_type& left, const second_argument_type& right) const { using TiledArray::dot; - TA_ASSERT(left.batch_size() == right.batch_size()); - size_t nb = left.batch_size(); + TA_ASSERT(left.nbatch() == right.nbatch()); + size_t nb = left.nbatch(); for (size_t i = 0; i < nb; ++i) { result += dot(left.batch(i), right.batch(i)); } From efb852e9efa864d965fd29dff5d7bb5100694da1 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 2 Jan 2024 14:43:38 -0500 Subject: [PATCH 76/88] Generic scalar_type instead of a cpp literal value --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 21aceae14c..2a658dc886 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -511,7 +511,7 @@ class ContEngine : public BinaryEngine { // is contract then inner must implement (ternary) multiply-add; // if the outer is hadamard then the inner is binary multiply const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { + if (this->factor_ == scalar_type{1}) { using base_op_type = TiledArray::detail::Mult Date: Wed, 3 Jan 2024 10:29:58 -0500 Subject: [PATCH 77/88] bump MADNESS tag to 
pull in https://github.com/m-a-d-n-e-s-s/madness/pull/512 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/dist_eval/dist_eval.h | 2 +- tests/dist_op_communicator.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 8624da6e01..cbdbc817a2 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag c0c4ea543439c740e3ee848fdd055c633a47f6c5 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0cb3920715c9a659bbb8158f9a31db1bd97d4614 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index eff687a3fe..1780dbbfb1 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 03c82cf2780d9e96298cc9140ac128c73eacd3b1) +set(TA_TRACKED_MADNESS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index 2fd6329de5..c6d0442174 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -110,7 +110,7 @@ class DistEvalImpl : public TensorImpl, const std::shared_ptr& pmap, const Permutation& perm) : TensorImpl_(world, trange, shape, pmap), - id_(world.unique_obj_id()), + id_(world.make_unique_obj_id()), source_to_target_(), target_to_source_(), task_count_(-1), diff --git a/tests/dist_op_communicator.cpp b/tests/dist_op_communicator.cpp index 4eac7a135c..28922e8d6c 100644 --- a/tests/dist_op_communicator.cpp +++ b/tests/dist_op_communicator.cpp @@ -30,9 +30,9 @@ struct DistOpFixture { DistOpFixture() : group_list(), world_group_list(), - group_did(GlobalFixture::world->unique_obj_id(), + group_did(GlobalFixture::world->make_unique_obj_id(), GlobalFixture::world->rank() % 2), - world_did(GlobalFixture::world->unique_obj_id(), + world_did(GlobalFixture::world->make_unique_obj_id(), GlobalFixture::world->size()) { for (ProcessID p = GlobalFixture::world->rank() % 2; p < GlobalFixture::world->size(); p += 2) From 74759c77fedd7876616253f76bbb922023e60802 Mon Sep 17 
00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 10:35:48 -0500 Subject: [PATCH 78/88] introduced TensorImpl::local_nnz --- src/TiledArray/tensor_impl.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor_impl.h b/src/TiledArray/tensor_impl.h index 6811fc6cb2..7ead791fd2 100644 --- a/src/TiledArray/tensor_impl.h +++ b/src/TiledArray/tensor_impl.h @@ -53,6 +53,8 @@ class TensorImpl : private NO_DEFAULTS { const trange_type trange_; ///< Tiled range type std::shared_ptr shape_; ///< Tensor shape std::shared_ptr pmap_; ///< Process map for tiles + mutable std::atomic> + local_nnz_; ///< Number of nonzero tiles assigned to this rank (memoized) public: /// Constructor @@ -74,6 +76,7 @@ class TensorImpl : private NO_DEFAULTS { trange_(trange), shape_(std::make_shared(shape)), pmap_(pmap) { + local_nnz_ = -1; // ensure that shapes are identical on every rank if (replicate_shape && !shape.is_dense()) world.gop.broadcast_serializable(*shape_, 0); @@ -115,8 +118,8 @@ class TensorImpl : private NO_DEFAULTS { /// Tensor tile volume accessor - /// \return The number of tiles in the tensor - /// \throw nothing + /// \return The number of tiles in the tensor, equivalent to + /// `this->trange().tiles_range().volume()` \throw nothing ordinal_type size() const { return trange_.tiles_range().volume(); } /// Max count of local tiles @@ -131,6 +134,27 @@ class TensorImpl : private NO_DEFAULTS { return static_cast(pmap_->local_size()); } + /// Count of nonzero local tiles + + /// This function is primarily available for debugging purposes. 
+ /// \return The count of nonzero local tiles; for dense array this will be + /// equal to the value produced by local_size(), for a sparse array this will + /// be less than the value produced by local_size() + ordinal_type local_nnz() const { + if (local_nnz_ == -1) { + if (is_dense()) + local_nnz_ = local_size(); + else { + ordinal_type count = 0; + for (auto&& idx : trange_.tiles_range()) { + if (is_local(idx) && !is_zero(idx)) ++count; + } + local_nnz_ = count; + } + } + return local_nnz_; + } + /// Query a tile owner /// \tparam Index The sized integral range type From f3716f836e6289a89d499df27ed38b24a00d9467 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 10:52:39 -0500 Subject: [PATCH 79/88] annotate virtual DistEval class members with override --- src/TiledArray/dist_eval/array_eval.h | 10 +++------- src/TiledArray/dist_eval/binary_eval.h | 6 +++--- src/TiledArray/dist_eval/contraction_eval.h | 6 +++--- src/TiledArray/dist_eval/dist_eval.h | 2 +- src/TiledArray/dist_eval/unary_eval.h | 6 +++--- 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h b/src/TiledArray/dist_eval/array_eval.h index 3bb34742cf..bb1ac49ae4 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -250,7 +250,7 @@ class ArrayEvalImpl /// Virtual destructor virtual ~ArrayEvalImpl() {} - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { // Get the array index that corresponds to the target index auto array_index = DistEvalImpl_::perm_index_to_source(i); @@ -266,11 +266,7 @@ class ArrayEvalImpl return eval_tile(tile, consumable_tile); } - /// Discard a tile that is not needed - - /// This function handles the cleanup for tiles that are not needed in - /// subsequent computation. 
- virtual void discard_tile(ordinal_type) const { + void discard_tile(ordinal_type i) const override { const_cast(this)->notify(); } @@ -305,7 +301,6 @@ class ArrayEvalImpl /// This function will evaluate the children of this distributed evaluator /// and evaluate the tiles for this distributed evaluator. /// \return The number of tiles that will be set by this process - virtual int internal_eval() { // Counter for the number of tasks submitted by this object int task_count = 0; @@ -325,6 +320,7 @@ class ArrayEvalImpl } return task_count; + int internal_eval() override { } }; // class ArrayEvalImpl diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index fa33d74d9c..e343c087b3 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -100,7 +100,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); @@ -118,7 +118,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Task function for evaluating tiles @@ -160,7 +160,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { // Evaluate child tensors left_.eval(); right_.eval(); diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 18aac80c57..8ff0d80091 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -1560,7 +1560,7 @@ class Summa /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); @@ -1584,7 +1584,7 @@ class Summa /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Adjust iteration depth based on memory constraints @@ -1647,7 +1647,7 @@ class Summa /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: start eval children rank=%i\n", TensorImpl_::world().rank()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index c6d0442174..7585b7e4bf 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -176,7 +176,7 @@ class DistEvalImpl : public TensorImpl, } /// Tile set notification - virtual void notify() { set_counter_++; } + void notify() override { set_counter_++; } /// Wait for all tiles to be assigned void wait() const { diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index 191d247aef..d687fcb4af 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -85,7 +85,7 @@ class UnaryEvalImpl /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); const auto source = arg_.owner(DistEvalImpl_::perm_index_to_source(i)); @@ -98,7 +98,7 @@ class UnaryEvalImpl /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Input tile argument type @@ -144,7 +144,7 @@ class UnaryEvalImpl /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { // Convert pimpl to this object type so it can be used in tasks std::shared_ptr self = std::enable_shared_from_this::shared_from_this(); From ee1b36765cc07c6afa0c88e45f7804a89208ffe5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 10:59:42 -0500 Subject: [PATCH 80/88] if MADNESS configured with ENABLE_WORLDOBJECT_FUTURE_TRACE trace futures associated with DistributedStorage bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/514 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/distributed_storage.h | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index cbdbc817a2..c3b7b0659f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0cb3920715c9a659bbb8158f9a31db1bd97d4614 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag cf3c98053453329f35b775c8b9f561301f6a997e . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 1780dbbfb1..9499354eba 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) +set(TA_TRACKED_MADNESS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h index 27c2885dcd..47c52ead2a 100644 --- a/src/TiledArray/distributed_storage.h +++ b/src/TiledArray/distributed_storage.h @@ -234,6 +234,13 @@ class DistributedStorage : public madness::WorldObject > { // Return the local element. const_accessor acc; [[maybe_unused]] const bool inserted = data_.insert(acc, i); +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + if (inserted) { + auto& f_nonconst_ref = + const_castsecond)>&>(acc->second); + this->trace(f_nonconst_ref); + } +#endif return acc->second; } @@ -249,6 +256,13 @@ class DistributedStorage : public madness::WorldObject > { // Return the local element. 
accessor acc; [[maybe_unused]] const bool inserted = data_.insert(acc, i); +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + if (inserted) { + auto& f_nonconst_ref = + const_castsecond)>&>(acc->second); + this->trace(f_nonconst_ref); + } +#endif return acc->second; } @@ -308,6 +322,14 @@ class DistributedStorage : public madness::WorldObject > { // Set the future existing_f.set(f); } +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + else { + auto& f_nonconst_ref = + const_castsecond)>&>( + acc->second); + this->trace(f_nonconst_ref); + } +#endif } else { if (f.probe()) { set_remote(i, f); From 886ec199cfae45d19eab876e0ab45c4504b9ba09 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 11:01:31 -0500 Subject: [PATCH 81/88] binary_wrapper.h: hush warnings due to implicitly capture of `this` --- src/TiledArray/tile_op/binary_wrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index b66be2986d..dac995f94b 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -294,10 +294,10 @@ class BinaryWrapper { if (perm_) return meta::invoke(op_, eval_left, eval_right, perm_); - auto op_left = [=](eval_t& _left, eval_t& _right) { + auto op_left = [=, this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); }; - auto op_right = [=](eval_t& _left, eval_t& _right) { + auto op_right = [=, this](eval_t& _left, eval_t& _right) { return op_.consume_right(_left, _right); }; // Override consumable From c3a36dc247200212cb6a3de4949b986f9f283fed Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 11:02:55 -0500 Subject: [PATCH 82/88] reimplement ArrayEvalImpl::internal_eval() using TensorImpl::local_nnz() --- src/TiledArray/dist_eval/array_eval.h | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h 
b/src/TiledArray/dist_eval/array_eval.h index bb1ac49ae4..10ad0543e0 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -301,27 +301,7 @@ class ArrayEvalImpl /// This function will evaluate the children of this distributed evaluator /// and evaluate the tiles for this distributed evaluator. /// \return The number of tiles that will be set by this process - // Counter for the number of tasks submitted by this object - int task_count = 0; - - // Get a count of the number of local tiles. - if (TensorImpl_::shape().is_dense()) { - task_count = TensorImpl_::pmap()->local_size(); - } else { - // Create iterator to tiles that are local for this evaluator. - typename array_type::pmap_interface::const_iterator it = - TensorImpl_::pmap()->begin(); - const typename array_type::pmap_interface::const_iterator end = - TensorImpl_::pmap()->end(); - - for (; it != end; ++it) { - if (!TensorImpl_::is_zero(*it)) ++task_count; - } - } - - return task_count; - int internal_eval() override { - } + int internal_eval() override { return TensorImpl_::local_nnz(); } }; // class ArrayEvalImpl From a9c7e62d6a58695c5e4c48c7799c591d8dd1d032 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 3 Jan 2024 17:06:51 -0500 Subject: [PATCH 83/88] Bug fix. 
--- src/TiledArray/einsum/tiledarray.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 18a3871f0b..1851973709 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -181,7 +181,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) TA_ASSERT(e); if constexpr (AreArraySame) { if (!e) { // hadamard reduction auto &[A, B] = AB; From c16ecc14542a110dcda615d8a2bcffaecbde909f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 4 Jan 2024 08:36:05 -0500 Subject: [PATCH 84/88] Remove [=] capture when not needed. [=, this] is C++20 extension. A warning is issued by clang-17 at least. --- src/TiledArray/tile_op/binary_wrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index dac995f94b..4c02b84318 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -294,10 +294,10 @@ class BinaryWrapper { if (perm_) return meta::invoke(op_, eval_left, eval_right, perm_); - auto op_left = [=, this](eval_t& _left, eval_t& _right) { + auto op_left = [this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); }; - auto op_right = [=, this](eval_t& _left, eval_t& _right) { + auto op_right = [this](eval_t& _left, eval_t& _right) { return op_.consume_right(_left, _right); }; // Override consumable From bc1b712d1315ef7ae352776ef3b4309701d38bff Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:35:22 -0500 Subject: [PATCH 85/88] introduced TA_TRACE_GLOBAL_COMM_STATS CMake option that enables tracing stats of communication within global objects (DistEval's + DistributedStorage) --- CMakeLists.txt | 4 ++++ src/TiledArray/config.h.in | 3 +++ 2 files changed, 7 insertions(+) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 9a47fbd989..7f98e3fbf2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,6 +165,10 @@ if(TA_ENABLE_TILE_OPS_LOGGING AND NOT DEFINED TA_TILE_OPS_LOG_LEVEL) set(TA_TILE_OPS_LOG_LEVEL 1) endif(TA_ENABLE_TILE_OPS_LOGGING AND NOT DEFINED TA_TILE_OPS_LOG_LEVEL) +option(TA_TRACE_GLOBAL_COMM_STATS "Enable tracing of communication stats of global objects (DistEval's and DIstributedStorage) TiledArray" OFF) +add_feature_info(TASK_TRACE_DEBUG TA_TRACE_GLOBAL_COMM_STATS "Debug communication stats of global objects (DistEval's and DIstributedStorage) TiledArray") +set(TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ${TA_TRACE_GLOBAL_COMM_STATS}) + option(TA_RANGEV3 "Enable Range-V3 library" OFF) add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library") diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 1c38298623..79f9f0932a 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -174,6 +174,9 @@ #cmakedefine TA_ENABLE_TILE_OPS_LOGGING 1 #define TA_TILE_OPS_LOG_LEVEL 0@TA_TILE_OPS_LOG_LEVEL@ +/* Enables collection of communication statistics for global objects (DistEval and DistributedStorage) */ +#cmakedefine TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE 1 + /* ----------- pragma helpers ---------------*/ #define TILEDARRAY_PRAGMA(x) _Pragma(#x) /* same as TILEDARRAY_PRAGMA(x), but expands x */ From 56e0e2efb82570cfc24b5745874fd6c30b4ef1a3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:42:25 -0500 Subject: [PATCH 86/88] if configured with TA_TRACE_GLOBAL_COMM_STATS will collect stats of DistEval comms --- src/TiledArray/dist_eval/array_eval.h | 160 ++++++++++++++++++-- src/TiledArray/dist_eval/binary_eval.h | 68 ++++++++- src/TiledArray/dist_eval/contraction_eval.h | 44 +++++- src/TiledArray/dist_eval/dist_eval.h | 94 ++++++++++-- src/TiledArray/dist_eval/unary_eval.h | 33 +++- 5 files changed, 366 insertions(+), 33 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h 
b/src/TiledArray/dist_eval/array_eval.h index 10ad0543e0..6dade3dc2b 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -198,6 +198,26 @@ class ArrayEvalImpl std::shared_ptr op_; ///< The tile operation BlockRange block_range_; ///< Sub-block range +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // tracing artifacts + using pending_counter_t = std::atomic[]; // 1 counter per rank + mutable std::shared_ptr + ntiles_pending_; // number of pending tiles from each rank + mutable std::shared_ptr + ntasks_pending_; // number of pending tasks using data from each rank + + struct AtomicCounterDecreaser : public madness::CallbackInterface { + std::shared_ptr> counter; + + AtomicCounterDecreaser(std::shared_ptr> counter) + : counter(std::move(counter)) {} + void notify() override { + --(*counter); + delete this; + } + }; +#endif + public: /// Construct with full array range @@ -217,7 +237,28 @@ class ArrayEvalImpl : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), array_(array), op_(std::make_shared(op)), - block_range_() {} + block_range_() +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ntiles_pending_(new std::atomic[world.size()]), + ntasks_pending_(new std::atomic[world.size()]) +#endif + { +#if 0 + std::stringstream ss; + ss << "ArrayEvalImpl: id=" << this->id(); + if (array_) ss << " array.id()=" << array_.id(); + ss << "\n"; + std::cout << ss.str(); +#endif + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ntiles_pending_[rank] = 0; + ntasks_pending_[rank] = 0; + } +#endif + } /// Constructor with sub-block range @@ -245,10 +286,42 @@ class ArrayEvalImpl : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), array_(array), op_(std::make_shared(op)), - block_range_(array.trange().tiles_range(), lower_bound, upper_bound) {} + block_range_(array.trange().tiles_range(), lower_bound, upper_bound) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + 
, + ntiles_pending_(new std::atomic[world.size()]), + ntasks_pending_(new std::atomic[world.size()]) +#endif + { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ntiles_pending_[rank] = 0; + ntasks_pending_[rank] = 0; + } +#endif + } /// Virtual destructor - virtual ~ArrayEvalImpl() {} + virtual ~ArrayEvalImpl() { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + if (std::find_if(ntiles_pending_.get(), + ntiles_pending_.get() + this->world().size(), + [](const auto& v) { return v != 0; }) != + ntiles_pending_.get() + this->world().size()) { + madness::print_error( + "ArrayEvalImpl: pending tiles at destruction! (id=", this->id(), ")"); + abort(); + } + if (std::find_if(ntasks_pending_.get(), + ntasks_pending_.get() + this->world().size(), + [](const auto& v) { return v != 0; }) != + ntasks_pending_.get() + this->world().size()) { + madness::print_error( + "ArrayEvalImpl: pending tasks at destruction! (id=", this->id(), ")"); + abort(); + } +#endif + } Future get_tile(ordinal_type i) const override { // Get the array index that corresponds to the target index @@ -258,15 +331,49 @@ class ArrayEvalImpl // index to the correct location. if (block_range_.rank()) array_index = block_range_.ordinal(array_index); - // Get the tile from array_, which may be located on a remote node. 
- Future tile = array_.find(array_index); + const bool arg_tile_is_remote = !array_.is_local(array_index); + const ProcessID arg_tile_owner = array_.owner(array_index); - const bool consumable_tile = !array_.is_local(array_index); - - return eval_tile(tile, consumable_tile); + Future result; + bool task_created = false; + if (arg_tile_is_remote) { + TA_ASSERT(arg_tile_owner != this->world().rank()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ntiles_pending_[arg_tile_owner]++; +#endif + auto arg_tile = array_.find(array_index); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + arg_tile.register_callback( + new AtomicCounterDecreaser(std::shared_ptr>( + ntiles_pending_, ntiles_pending_.get() + arg_tile_owner))); +#endif + std::tie(result, task_created) = + eval_tile(arg_tile, /* consumable_tile = */ true +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_tile_owner +#endif + ); + } else { + TA_ASSERT(arg_tile_owner == this->world().rank()); + std::tie(result, task_created) = eval_tile(array_.find_local(array_index), + /* consumable_tile = */ false +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_tile_owner +#endif + ); + } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(ntiles_pending_[this->world().rank()] == 0); + // even if data is local we may have created a task to evaluate it + // TA_ASSERT(ntasks_pending_[this->world().rank()] == 0); +#endif + return result; } void discard_tile(ordinal_type i) const override { + TA_ASSERT(this->is_local(i)); const_cast(this)->notify(); } @@ -277,23 +384,36 @@ class ArrayEvalImpl } /// Evaluate a single LazyArrayTile - madness::Future eval_tile( + /// @return A pair of the future to the tile and a boolean indicating whether + /// a task was created to produce the tile + [[nodiscard]] std::pair, bool> eval_tile( const madness::Future& tile, - const bool consumable_tile) const { + const bool consumable_tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + const ProcessID 
tile_owner +#endif + ) const { // Insert the tile into this evaluator for subsequent processing if (tile.probe()) { // Skip the task since the tile is ready Future result; result.set(make_tile(tile, consumable_tile)); const_cast(this)->notify(); - return result; + return {result, false}; } else { // Spawn a task to set the tile when the input tile is not ready. Future result = TensorImpl_::world().taskq.add( shared_from_this(), &ArrayEvalImpl_::make_tile, tile, consumable_tile, madness::TaskAttributes::hipri()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ntasks_pending_[tile_owner]++; + result.register_callback( + new AtomicCounterDecreaser(std::shared_ptr>( + ntasks_pending_, ntasks_pending_.get() + tile_owner))); +#endif result.register_callback(const_cast(this)); - return result; + return {result, true}; } } /// Evaluate the tiles of this tensor @@ -303,6 +423,22 @@ class ArrayEvalImpl /// \return The number of tiles that will be set by this process int internal_eval() override { return TensorImpl_::local_nnz(); } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + std::string status() const override { + std::stringstream ss; + ss << "ArrayEvalImpl: array.id()=" << array_.id(); + ss << " ntiles_pending=["; + for (auto rank = 0; rank != this->world().size(); ++rank) { + ss << " " << ntiles_pending_[rank]; + } + ss << "] ntasks_pending=["; + for (auto rank = 0; rank != this->world().size(); ++rank) { + ss << " " << ntasks_pending_[rank]; + } + ss << "]\n"; + return ss.str(); + } +#endif }; // class ArrayEvalImpl } // namespace detail diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index e343c087b3..62bbdb64ce 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -68,6 +68,16 @@ class BinaryEvalImpl : public DistEvalImpl, right_type right_; ///< Right argument op_type op_; ///< binary element operator +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // artifacts 
of tracing + mutable ordinal_type left_ntiles_used_; // # of tiles used from left_ + mutable ordinal_type right_ntiles_used_; // # of tiles used from right_ + mutable ordinal_type + left_ntiles_discarded_; // # of tiles discarded from left_ + mutable ordinal_type + right_ntiles_discarded_; // # of tiles discarded from right_ +#endif + public: /// Construct a binary evaluator @@ -88,7 +98,15 @@ class BinaryEvalImpl : public DistEvalImpl, : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), left_(left), right_(right), - op_(op) { + op_(op) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + left_ntiles_used_(0), + right_ntiles_used_(0), + left_ntiles_discarded_(0), + right_ntiles_discarded_(0) +#endif + { TA_ASSERT(left.trange() == right.trange()); } @@ -105,9 +123,9 @@ class BinaryEvalImpl : public DistEvalImpl, TA_ASSERT(!TensorImpl_::is_zero(i)); const auto source_index = DistEvalImpl_::perm_index_to_source(i); - const ProcessID source = - left_.owner(source_index); // Left and right - // should have the same owner + const ProcessID source = left_.owner(source_index); + // Left and right should have the same owner + TA_ASSERT(source == right_.owner(source_index)); const madness::DistributedID key(DistEvalImpl_::id(), i); return TensorImpl_::world().gop.template recv(source, key); @@ -195,6 +213,12 @@ class BinaryEvalImpl : public DistEvalImpl, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(source_index), right_.get(source_index)); + TA_ASSERT(left_.is_local(source_index)); + TA_ASSERT(right_.is_local(source_index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; + right_ntiles_used_++; +#endif ++task_count; } @@ -213,32 +237,64 @@ class BinaryEvalImpl : public DistEvalImpl, &BinaryEvalImpl_::template eval_tile, target_index, ZeroTensor(), right_.get(index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + right_ntiles_used_++; +#endif } else if (right_.is_zero(index)) { TensorImpl_::world().taskq.add( self, 
&BinaryEvalImpl_::template eval_tile, target_index, left_.get(index), ZeroTensor()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; +#endif } else { + TA_ASSERT(!left_.is_zero(index) && !right_.is_zero(index)); TensorImpl_::world().taskq.add( self, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(index), right_.get(index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; + right_ntiles_used_++; +#endif } ++task_count; } else { // Cleanup unused tiles - if (!left_.is_zero(index)) left_.discard(index); - if (!right_.is_zero(index)) right_.discard(index); + if (!left_.is_zero(index)) { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_discarded_++; +#endif + left_.discard(index); + } + if (!right_.is_zero(index)) { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + right_ntiles_discarded_++; +#endif + right_.discard(index); + } } } } // Wait for child tensors to be evaluated, and process tasks while waiting. +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(left_.local_nnz() == left_ntiles_used_ + left_ntiles_discarded_); + TA_ASSERT(right_.local_nnz() == + right_ntiles_used_ + right_ntiles_discarded_); +#endif left_.wait(); right_.wait(); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // for some evaluators like SUMMA real task counts are not available even + // after wait() TA_ASSERT(left_.task_count() >= left_ntiles_used_ + + // left_ntiles_discarded_); TA_ASSERT(right_.task_count() >= + // right_ntiles_used_ + right_ntiles_discarded_); +#endif return task_count; } diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 8ff0d80091..2da66628fc 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -118,6 +118,7 @@ class Summa typedef std::pair col_datum; ///< Datum element type for a left-hand argument column + // various tracing/debugging artifacts static constexpr 
const bool trace_tasks = #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE true @@ -125,6 +126,16 @@ class Summa false #endif ; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + mutable std::atomic + left_ntiles_used_; // # of tiles used from left_ + mutable std::atomic + right_ntiles_used_; // # of tiles used from right_ + mutable std::atomic + left_ntiles_discarded_; // # of tiles discarded from left_ + mutable std::atomic + right_ntiles_discarded_; // # of tiles discarded from right_ +#endif protected: // Import base class functions @@ -705,11 +716,17 @@ class Summa if (do_broadcast) { // Broadcast the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++left_ntiles_used_; +#endif const madness::DistributedID key(DistEvalImpl_::id(), index); auto tile = get_tile(left_, index); TensorImpl_::world().gop.bcast(key, tile, group_root, row_group); } else { // Discard the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++left_ntiles_discarded_; +#endif left_.discard(index); } } @@ -748,12 +765,18 @@ class Summa if (do_broadcast) { // Broadcast the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++right_ntiles_used_; +#endif const madness::DistributedID key(DistEvalImpl_::id(), index + left_.size()); auto tile = get_tile(right_, index); TensorImpl_::world().gop.bcast(key, tile, group_root, col_group); } else { // Discard the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++right_ntiles_discarded_; +#endif right_.discard(index); } } @@ -1550,7 +1573,16 @@ class Summa left_stride_(k), left_stride_local_(proc_grid.proc_rows() * k), right_stride_(1ul), - right_stride_local_(proc_grid.proc_cols()) {} + right_stride_local_(proc_grid.proc_cols()) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + left_ntiles_used_(0), + right_ntiles_used_(0), + left_ntiles_discarded_(0), + right_ntiles_discarded_(0) +#endif + { + } virtual ~Summa() {} @@ -1728,6 +1760,16 @@ class Summa // Wait for child tensors to be evaluated, and process tasks while waiting. 
left_.wait(); right_.wait(); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // values of left_ntiles_used_ etc. are not available until all broadcasts + // have been completed ... +// TA_ASSERT(left_.local_nnz() == left_ntiles_used_ + +// left_ntiles_discarded_); TA_ASSERT(right_.local_nnz() == +// right_ntiles_used_ + right_ntiles_discarded_); +// TA_ASSERT(left_.task_count() >= left_ntiles_used_ + +// left_ntiles_discarded_); TA_ASSERT(right_.task_count() >= +// right_ntiles_used_ + right_ntiles_discarded_); +#endif #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: finished wait children rank=%i\n", diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index 7585b7e4bf..9e0157cb8b 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -123,6 +123,28 @@ class DistEvalImpl : public TensorImpl, source_to_target_ = PermIndex(source_range, perm); target_to_source_ = PermIndex(trange.tiles_range(), inv_perm); } + +#if 0 + { + // print out expected number of tiles on each rank + std::vector ntiles_per_rank(world.size(), 0); + for (auto& i : trange.tiles_range()) { + if (!TensorImpl_::is_zero(i)) { + ntiles_per_rank[TensorImpl_::owner(i)]++; + } + } + std::stringstream ss; + ss << "DistEvalImpl: id=" << id_; + if (perm) + ss << " perm=" << perm; + ss << " ntiles=["; + for (auto& i : ntiles_per_rank) { + ss << i << " "; + } + ss << "]"; + std::cout << ss.str() << std::endl; + } +#endif } virtual ~DistEvalImpl() {} @@ -142,7 +164,8 @@ class DistEvalImpl : public TensorImpl, /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. 
- /// \param i The index of the tile + /// \param i The index of the local tile to discard + /// \pre `this->is_local(i)` virtual void discard_tile(ordinal_type i) const = 0; /// Set tensor value @@ -234,13 +257,36 @@ class DistEvalImpl : public TensorImpl, TA_ASSERT(task_count_ >= 0); } + /// \return The number of tasks spawned on this rank (after invoking eval() + /// this should be equal to local_nnz() for simple evaluators like + /// unary/binary, or greater than that for more complex evaluators like SUMMA + ordinal_type task_count() const { + if (task_count_ == -1) + return 0; + else + return task_count_; + } + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + /// reports evaluator status + + /// intended for debugging purposes + /// @return string containing log of the current status of evaluator (empty + /// string, unless overridden in the specialization) + [[nodiscard]] virtual std::string status() const { return {}; } +#endif }; // class DistEvalImpl -/// Tensor expression object +/// Tensor expression evaluator wrapper -/// This object holds a tensor expression. It is used to store various type -/// of tensor expressions that depend on the pimpl used to construct the -/// expression. +/// This object holds a tensor expression evaluator (DistEvalImpl). +/// +/// \note Tensor expression evaluators (DistEval and DistEvalImpl) +/// are similar to DistArray in that they has tensorial structure +/// (TensorImpl), with shape and policy, but their semantics that +/// differs from DistArray (e.g., data is not stored +/// persistently). 
+/// /// \tparam Tile The output tile type /// \tparam Policy The tensor policy class template @@ -333,7 +379,7 @@ class DistEval { return pimpl_->pmap(); } - /// Query the density of the tensor + /// Query if the tensor is dense /// \return \c true if the tensor is dense, otherwise false bool is_dense() const { return pimpl_->is_dense(); } @@ -348,7 +394,7 @@ class DistEval { /// \return The tiled range of the tensor const trange_type& trange() const { return pimpl_->trange(); } - /// Tile move + /// Tile accessor /// Tile is removed after it is set. /// \param i The tile index @@ -359,8 +405,12 @@ class DistEval { /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. - /// \param i The index of the tile - virtual void discard(ordinal_type i) const { pimpl_->discard_tile(i); } + /// \param i The index of a local tile to discard + /// \pre `this->is_local(i)` + virtual void discard(ordinal_type i) const { + TA_ASSERT(this->is_local(i)); + pimpl_->discard_tile(i); + } /// World object accessor @@ -372,9 +422,35 @@ class DistEval { /// \return The unique id for this object madness::uniqueidT id() const { return pimpl_->id(); } + /// \return Number of nonzero tiles on this rank + /// \sa TensorImpl::local_nnz() + ordinal_type local_nnz() const { return pimpl_->local_nnz(); } + + /// \return The number of tasks spawned on this rank (after invoking eval() + /// this should be same as the value returned by local_nnz(), if everything is + /// well) + ordinal_type task_count() const { return pimpl_->task_count(); } + /// Wait for all local tiles to be evaluated void wait() const { pimpl_->wait(); } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + /// reports evaluator status + + /// intended for debugging purposes + /// @return string containing log of the current status of evaluator (empty + /// string, unless overridden in the specialization) + std::string status() const { + std::ostringstream oss; + oss << "DistEval 
status: id=" << id() + << " impl_type_name=" << typeid(*(pimpl_.get())).name() + << " "; + oss << pimpl_->status(); + oss << "\n"; + return oss.str(); + } +#endif + }; // class DistEval } // namespace detail diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index d687fcb4af..66ab742ada 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -74,7 +74,13 @@ class UnaryEvalImpl const Perm& perm, const op_type& op) : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), arg_(arg), - op_(op) {} + op_(op) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_ntiles_used_(0) +#endif + { + } /// Virtual destructor virtual ~UnaryEvalImpl() {} @@ -152,10 +158,12 @@ class UnaryEvalImpl // Evaluate argument arg_.eval(); - // Counter for the number of tasks submitted by this object + // Counter for the number of tasks that will use local tiles of arg_ ordinal_type task_count = 0ul; - // Make sure all local tiles are present. + // now create tasks that will produce result tiles and push them to the + // destination N.B. 
data is pushed, rather than pulled, to be able to manage + // the lifetime of the argument const typename pmap_interface::const_iterator end = arg_.pmap()->end(); typename pmap_interface::const_iterator it = arg_.pmap()->begin(); for (; it != end; ++it) { @@ -165,8 +173,10 @@ class UnaryEvalImpl if (!arg_.is_zero(index)) { // Get target tile index const auto target_index = DistEvalImpl_::perm_index_to_target(index); + TA_ASSERT(!this->is_zero(target_index)); // Schedule tile evaluation task + TA_ASSERT(arg_.is_local(index)); #ifdef TILEDARRAY_HAS_DEVICE TensorImpl_::world().taskq.add(self, &UnaryEvalImpl_::template eval_tile<>, @@ -175,12 +185,18 @@ class UnaryEvalImpl TensorImpl_::world().taskq.add(self, &UnaryEvalImpl_::eval_tile, target_index, arg_.get(index)); #endif - +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + arg_ntiles_used_++; +#endif ++task_count; } } // Wait for local tiles of argument to be evaluated +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(arg_.local_nnz() == arg_ntiles_used_); + TA_ASSERT(arg_.task_count() >= arg_ntiles_used_); +#endif // arg_.wait(); return task_count; @@ -188,7 +204,14 @@ class UnaryEvalImpl arg_type arg_; ///< Argument op_type op_; ///< The unary tile operation -}; // class UnaryEvalImpl + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // artifacts of tracing/debugging + mutable ordinal_type arg_ntiles_used_; // # of tiles used from arg_ ; N.B. no + // tiles are discarded! 
+#endif + +}; // class UnaryEvalImpl } // namespace detail } // namespace TiledArray From 78e8ad3d7df467b9a283ff7c7bd2dfa8608e7d77 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:43:47 -0500 Subject: [PATCH 87/88] DistributedStorage::get() can use (2 types of) caching if requested by user if configured with TA_TRACE_GLOBAL_COMM_STATS will collect stats of DistributedStorage comms --- src/TiledArray/array_impl.h | 13 +- src/TiledArray/distributed_storage.h | 224 ++++++++++++++++++++++++--- src/TiledArray/expressions/expr.h | 4 + 3 files changed, 222 insertions(+), 19 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index beb8ba3e09..e5ad9d5db9 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -636,7 +636,18 @@ class ArrayImpl : public TensorImpl { /// DistributedStorage /// @return const reference to the atomic counter of live DelayedSet requests - const madness::AtomicInt& num_live_ds() const { return data_.num_live_ds(); } + const std::atomic& num_live_ds() const { + return data_.num_live_ds(); + } + + /// Reports the number of live DelayedForward requests for this object's + /// DistributedStorage + + /// @return const reference to the atomic counter of live DelayedForward + /// requests + const std::atomic& num_live_df() const { + return data_.num_live_df(); + } }; // class ArrayImpl diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h index 47c52ead2a..60eb715c34 100644 --- a/src/TiledArray/distributed_storage.h +++ b/src/TiledArray/distributed_storage.h @@ -23,6 +23,17 @@ #include namespace TiledArray { + +/// Describes how to get remote data +enum class RemoteDataGetPolicy { + /// no caching = each get will trigger data fetch + nocache, + /// aggregate gets until data arrives, subsequent gets will trigger new gets + aggregate, + /// get once, read forever + cache +}; + namespace detail { /// Distributed storage container. 
@@ -41,7 +52,7 @@ namespace detail { /// thread. DO NOT construct world objects within tasks where the order of /// execution is nondeterministic. template -class DistributedStorage : public madness::WorldObject > { +class DistributedStorage : public madness::WorldObject> { public: typedef DistributedStorage DistributedStorage_; ///< This object type typedef madness::WorldObject @@ -64,8 +75,22 @@ class DistributedStorage : public madness::WorldObject > { ///< stored by this container std::shared_ptr pmap_; ///< The process map that defines the element distribution - mutable container_type data_; ///< The local data container - madness::AtomicInt num_live_ds_; ///< Number of live DelayedSet objects + mutable container_type data_; ///< The local data container + + // tracing/defensive driving artifacts + mutable std::atomic + num_live_ds_; ///< Number of live DelayedSet objects + mutable std::atomic + num_live_df_; ///< Number of live DelayedForward objects +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + mutable std::vector> + ngets_served_per_rank_; ///< Counts # of gets served to remote ranks + mutable std::vector> + ngets_sent_per_rank_; ///< Counts # of gets sent to remote ranks + mutable std::vector> + ngets_received_per_rank_; ///< Counts # of gets received from remote + ///< ranks +#endif // not allowed DistributedStorage(const DistributedStorage_&); @@ -120,6 +145,124 @@ class DistributedStorage : public madness::WorldObject > { }; // struct DelayedSet friend struct DelayedSet; + /// Tile cache works just like madness::detail::DistCache (and in fact is + /// based on it) in that it implements a local cache for asynchronous data + /// pulls. Unlike madness::detail::DistCache: + /// - this is unidirectional, i.e. there is no need to manually push data into + /// the cache (a task sending data + /// will be posted). 
+ /// - depending on get policy data will either stay in the cache forever or
+ /// will be discarded upon arrival;
+ /// subsequent gets will need to fetch the data again (may make this
+ /// user-controllable in the future)
+ mutable container_type remote_data_cache_;
+
+ /// Get the cache value associated with \c key
+
+ /// This will return a future for the value associated with \c key. If
+ /// the cache element does not exist, a task requesting the data will be sent
+ /// to the owner, a future referring to the result will be inserted in the
+ /// cache so that the subsequent gets will receive the same data. After data
+ /// arrival the future will be removed from the cache, thus subsequent gets
+ /// will need to fetch the data again. \param[in] key The target key \return A
+ /// future that holds/will hold the cache value
+ future get_cached(const key_type& key, bool keep_in_cache = false) const {
+ // Retrieve the cached future
+ typename container_type::const_accessor acc;
+ if (remote_data_cache_.insert(
+ acc, key)) { // no future in cache yet, create a task
+ static_assert(std::is_signed_v);
+ const ProcessID rank = this->get_world().rank();
+ ProcessID rank_w_persistence = keep_in_cache ? 
rank : -(rank + 1); + WorldObject_::task(owner(key), &DistributedStorage_::get_cached_handler, + key, rank_w_persistence, + madness::TaskAttributes::hipri()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_sent_per_rank_.at(owner(key))++; +#endif + } + return acc->second; + } + + /// used to forward data that were unassigned at the time of request arrival + struct DelayedForward : public madness::CallbackInterface { + public: + DelayedForward(const DistributedStorage_& ds, key_type key, + ProcessID destination_rank, bool keep_in_cache) + : ds(ds), + key(key), + destination_rank(destination_rank), + keep_in_cache(keep_in_cache) {} + + void notify() override { + auto& data_fut = ds.get_local(key); + TA_ASSERT( + data_fut.probe()); // must be ready, otherwise why is this invoked? + if (keep_in_cache) { + ds.task(destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } else { + ds.task(destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } + delete this; + } + + private: + const DistributedStorage_& ds; + key_type key; + ProcessID destination_rank; + bool keep_in_cache; + }; + + void get_cached_handler(const size_type key, + ProcessID destination_rank_w_persistence) const { + const bool keep_in_cache = destination_rank_w_persistence >= 0; + const ProcessID destination_rank = + destination_rank_w_persistence < 0 + ? 
(-destination_rank_w_persistence - 1) + : destination_rank_w_persistence; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_served_per_rank_.at(destination_rank)++; +#endif + auto& data_fut = get_local(key); + if (data_fut.probe()) { + if (keep_in_cache) { + WorldObject_::task( + destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } else { + WorldObject_::task( + destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } + } else { // data not ready yet, defer send to a callback (maybe task??) + const_cast(data_fut).register_callback( + new DelayedForward(*this, key, destination_rank, keep_in_cache)); + } + } + + template + void set_cached_handler(const size_type key, const value_type& datum) const { + // assign the future first, then remove from the cache + typename container_type::accessor acc; + [[maybe_unused]] const bool inserted = remote_data_cache_.insert(acc, key); + // future must be in cache + TA_ASSERT(!inserted); + // assign it + acc->second.set(datum); + // remove it from the cache + if constexpr (!KeepInCache) remote_data_cache_.erase(acc); + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_received_per_rank_.at(this->owner(key))++; +#endif + } + public: /// Makes an initialized, empty container with default data distribution (no /// communication) @@ -136,23 +279,47 @@ class DistributedStorage : public madness::WorldObject > { : WorldObject_(world), max_size_(max_size), pmap_(pmap), - data_((max_size / world.size()) + 11) { + data_((max_size / world.size()) + 11) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ngets_served_per_rank_(world.size()), + ngets_sent_per_rank_(world.size()), + ngets_received_per_rank_(world.size()) +#endif + { // Check that the process map is appropriate for this storage object TA_ASSERT(pmap_); TA_ASSERT(pmap_->size() == max_size); TA_ASSERT(pmap_->rank() == 
pmap_interface::size_type(world.rank())); TA_ASSERT(pmap_->procs() == pmap_interface::size_type(world.size())); num_live_ds_ = 0; + num_live_df_ = 0; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ngets_served_per_rank_[rank] = 0; + ngets_sent_per_rank_[rank] = 0; + ngets_received_per_rank_[rank] = 0; + } +#endif WorldObject_::process_pending(); } virtual ~DistributedStorage() { if (num_live_ds_ != 0) { - madness::print_error( - "DistributedStorage (object id=", this->id(), - ") destroyed while " - "outstanding tasks exist. Add a fence() to extend the lifetime of " - "this object."); + madness::print_error("DistributedStorage (object id=", this->id(), + ") destroyed while " + "pending tasks that set its data exist. Add a " + "fence() to extend the lifetime of " + "this object."); + abort(); + } + if (num_live_df_ != 0) { + madness::print_error("DistributedStorage (object id=", this->id(), + ") destroyed while " + "pending callbacks that forward its data to other " + "ranks exist. This may indicate a bug in your " + "program or you may need to extend the lifetime of " + "this object."); abort(); } } @@ -207,18 +374,21 @@ class DistributedStorage : public madness::WorldObject > { /// \return A future to element \c i /// \throw TiledArray::Exception If \c i is greater than or equal to \c /// max_size() . - future get(size_type i) const { + future get(size_type i, + RemoteDataGetPolicy policy = RemoteDataGetPolicy::nocache) const { TA_ASSERT(i < max_size_); if (is_local(i)) { return get_local(i); } else { - // Send a request to the owner of i for the element. - future result; - WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, - result.remote_ref(get_world()), - madness::TaskAttributes::hipri()); - - return result; + if (policy == RemoteDataGetPolicy::nocache) { + // Send a request to the owner of i for the element. 
+ future result; + WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, + result.remote_ref(get_world()), + madness::TaskAttributes::hipri()); + return result; + } else + return get_cached(i, policy == RemoteDataGetPolicy::cache); } } @@ -343,7 +513,25 @@ class DistributedStorage : public madness::WorldObject > { /// Reports the number of live DelayedSet requests /// @return const reference to the atomic counter of live DelayedSet requests - const madness::AtomicInt& num_live_ds() const { return num_live_ds_; } + const std::atomic& num_live_ds() const { return num_live_ds_; } + + /// Reports the number of live DelayedForward requests + + /// @return const reference to the atomic counter of live DelayedForward + /// requests + const std::atomic& num_live_df() const { return num_live_df_; } + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + const std::vector>& ngets_served_per_rank() const { + return ngets_served_per_rank_; + } + const std::vector>& ngets_sent_per_rank() const { + return ngets_sent_per_rank_; + } + const std::vector>& ngets_received_per_rank() const { + return ngets_received_per_rank_; + } +#endif }; // class DistributedStorage } // namespace detail diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 72ad9a42cd..f77d13dbad 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -420,6 +420,10 @@ class Expr { dist_eval.wait(); // Swap the new array with the result array object. 
result.swap(tsr.array()); + +#if 0 + std::cout << "array.id()=" << tsr.array().id() << " evaluated using dist_eval.id=" << dist_eval.id() << std::endl; +#endif } /// Evaluate this object and assign it to \c tsr From 989fd8e6549aaa2bb4e6017f991110c31567ba58 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:46:25 -0500 Subject: [PATCH 88/88] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/516 which fixes hangs in applications with large number of tasks --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c3b7b0659f..c48f0c19b6 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag cf3c98053453329f35b775c8b9f561301f6a997e . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag b1f1c39c497b86ab3ef4e560a686de63eb555cc4 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 9499354eba..5255df9780 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) +set(TA_TRACKED_MADNESS_TAG b1f1c39c497b86ab3ef4e560a686de63eb555cc4) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1)