From 2988db4aa10150163bf7a7bdfb254e7212fe5896 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 2 May 2022 15:56:48 -0400
Subject: [PATCH 01/30] blaspp seems to always define empty BLA_VENDOR in CACHE
 so only look at its value, not its presence

---
 cmake/modules/FindOrFetchBTAS.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/cmake/modules/FindOrFetchBTAS.cmake b/cmake/modules/FindOrFetchBTAS.cmake
index 775d1964b6..c484905f9d 100644
--- a/cmake/modules/FindOrFetchBTAS.cmake
+++ b/cmake/modules/FindOrFetchBTAS.cmake
@@ -13,9 +13,9 @@ if (NOT TARGET BTAS::BTAS)
   # BTAS will load BLAS++/LAPACK++ ... if those use CMake's FindBLAS/FindLAPACK (as indicated by defined BLA_VENDOR)
   # will need to specify Fortran linkage convention ... manually for now, switching to NWX's linear algebra discovery
   # is necessary to handle all the corner cases for automatic discovery
-  if (DEFINED BLA_VENDOR)
+  if (BLA_VENDOR)
     set(_linalgpp_use_standard_linalg_kits TRUE)
-  endif(DEFINED BLA_VENDOR)
+  endif(BLA_VENDOR)
 
   if (NOT TILEDARRAY_HAS_CUDA)
     # tell BLAS++/LAPACK++ to ignore CUDA

From 0afdbfe5267d277498f94946241e144612c9cb5f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 2 May 2022 16:35:39 -0400
Subject: [PATCH 02/30] update unit tests and BlkTsrExprBase to replace
 DistArray::range() with DistArray::tiles_range() per
 8cf3327ef1489847e65b0fd7d01c8eb3e773ea54

---
 src/TiledArray/expressions/blk_tsr_expr.h |  2 +-
 tests/dist_array.cpp                      | 46 +++++++--------
 tests/dist_eval_binary_eval.cpp           |  2 +-
 tests/dist_eval_contraction_eval.cpp      |  4 +-
 tests/eigen.cpp                           | 40 ++++++-------
 tests/expressions_cuda_um.cpp             | 69 +++++++++++++++--------
 tests/expressions_impl.h                  | 68 ++++++++++++++--------
 tests/tot_array_fixture.h                 |  2 +-
 tests/tot_dist_array_part2.cpp            | 34 +++++------
 tests/tot_expressions.cpp                 | 10 ++--
 10 files changed, 161 insertions(+), 116 deletions(-)

diff --git a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h
index 5604d71d63..00b19d453e 100644
--- a/src/TiledArray/expressions/blk_tsr_expr.h
+++ b/src/TiledArray/expressions/blk_tsr_expr.h
@@ -234,7 +234,7 @@ class BlkTsrExprBase : public Expr<Derived> {
   BlkTsrExprBase(reference array, const std::string& annotation,
                  const PairRange& bounds)
       : Expr_(), array_(array), annotation_(annotation) {
-    const auto rank = array.range().rank();
+    const auto rank = array.tiles_range().rank();
     lower_bound_.reserve(rank);
     upper_bound_.reserve(rank);
     int d = 0;
diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp
index 7005ac60d0..d65bf73f86 100644
--- a/tests/dist_array.cpp
+++ b/tests/dist_array.cpp
@@ -36,8 +36,8 @@ ArrayFixture::ArrayFixture()
     : shape_tensor(tr.tiles_range(), 0.0),
       world(*GlobalFixture::world),
       a(world, tr) {
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it)
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it)
     if (a.is_local(*it))
       a.set(*it, world.rank() + 1);  // Fill the tile at *it (the index)
 
@@ -46,8 +46,8 @@ ArrayFixture::ArrayFixture()
   }
 
   b = decltype(b)(world, tr, TiledArray::SparseShape<float>(shape_tensor, tr));
-  for (SpArrayN::range_type::const_iterator it = b.range().begin();
-       it != b.range().end(); ++it)
+  for (SpArrayN::range_type::const_iterator it = b.tiles_range().begin();
+       it != b.tiles_range().end(); ++it)
     if (!b.is_zero(*it) && b.is_local(*it))
       b.set(*it, world.rank() + 1);  // Fill the tile at *it (the index)
 
@@ -305,8 +305,8 @@ BOOST_AUTO_TEST_CASE(owner) {
                                          std::default_delete<ProcessID[]>());
 
   ordinal_type o = 0;
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it, ++o) {
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it, ++o) {
     // Check that local ownership agrees
     const int owner = a.owner(*it);
     BOOST_CHECK_EQUAL(a.owner(o), owner);
@@ -334,8 +334,8 @@ BOOST_AUTO_TEST_CASE(is_local) {
   // Test to make sure everyone agrees who owns which tiles.
 
   ordinal_type o = 0;
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it, ++o) {
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it, ++o) {
     // Check that local ownership agrees
     const bool local_tile = a.owner(o) == world.rank();
     BOOST_CHECK_EQUAL(a.is_local(*it), local_tile);
@@ -352,8 +352,8 @@ BOOST_AUTO_TEST_CASE(is_local) {
 }
 
 BOOST_AUTO_TEST_CASE(find_local) {
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it) {
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it) {
     if (a.is_local(*it)) {
       Future<ArrayN::value_type> tile = a.find(*it);
 
@@ -366,7 +366,7 @@ BOOST_AUTO_TEST_CASE(find_local) {
     }
   }
 
-  for (auto&& tile_idx : a.range()) {
+  for (auto&& tile_idx : a.tiles_range()) {
     if (a.is_local(tile_idx)) {
       const Future<ArrayN::value_type>& const_tile_fut = a.find_local(tile_idx);
       Future<ArrayN::value_type>& nonconst_tile_fut = a.find_local(tile_idx);
@@ -393,8 +393,8 @@ BOOST_AUTO_TEST_CASE(find_local) {
 }
 
 BOOST_AUTO_TEST_CASE(find_remote) {
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it) {
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it) {
     if (!a.is_local(*it)) {
       Future<ArrayN::value_type> tile = a.find(*it);
 
@@ -409,8 +409,8 @@ BOOST_AUTO_TEST_CASE(find_remote) {
 BOOST_AUTO_TEST_CASE(fill_tiles) {
   ArrayN a(world, tr);
 
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it) {
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it) {
     if (a.is_local(*it)) {
       a.set(*it, 0);  // Fill the tile at *it (the index) with 0
 
@@ -430,8 +430,8 @@ BOOST_AUTO_TEST_CASE(assign_tiles) {
   std::vector<int> data;
   ArrayN a(world, tr);
 
-  for (ArrayN::range_type::const_iterator it = a.range().begin();
-       it != a.range().end(); ++it) {
+  for (ArrayN::range_type::const_iterator it = a.tiles_range().begin();
+       it != a.tiles_range().end(); ++it) {
     ArrayN::trange_type::range_type range = a.trange().make_tile_range(*it);
     if (a.is_local(*it)) {
       if (data.size() < range.volume()) data.resize(range.volume(), 1);
@@ -500,8 +500,8 @@ BOOST_AUTO_TEST_CASE(truncate) {
   BOOST_CHECK_NO_THROW(b_trunc0.truncate());
   auto b_trunc1 = b.clone();
   BOOST_CHECK_NO_THROW(
-      b_trunc1.truncate(std::numeric_limits<typename decltype(
-                            b)::shape_type::value_type>::max()));
+      b_trunc1.truncate(std::numeric_limits<
+                        typename decltype(b)::shape_type::value_type>::max()));
   BOOST_CHECK(std::distance(b_trunc1.begin(), b_trunc1.end()) == 0);
 }
 
@@ -623,12 +623,12 @@ BOOST_AUTO_TEST_CASE(parallel_serialization) {
   char archive_file_prefix_name[] = "tmp.XXXXXX";
   mktemp(archive_file_prefix_name);
   madness::archive::ParallelOutputArchive<> oar(world, archive_file_prefix_name,
-                                              nio);
+                                                nio);
   oar& a;
   oar.close();
 
   madness::archive::ParallelInputArchive<> iar(world, archive_file_prefix_name,
-                                             nio);
+                                               nio);
   decltype(a) aread;
   aread.load(world, iar);
 
@@ -647,12 +647,12 @@ BOOST_AUTO_TEST_CASE(parallel_sparse_serialization) {
   char archive_file_prefix_name[] = "tmp.XXXXXX";
   mktemp(archive_file_prefix_name);
   madness::archive::ParallelOutputArchive<> oar(world, archive_file_prefix_name,
-                                              nio);
+                                                nio);
   oar& b;
   oar.close();
 
   madness::archive::ParallelInputArchive<> iar(world, archive_file_prefix_name,
-                                             nio);
+                                               nio);
   decltype(b) bread;
   bread.load(world, iar);
 
diff --git a/tests/dist_eval_binary_eval.cpp b/tests/dist_eval_binary_eval.cpp
index a79ac1ef31..5e8368fd81 100644
--- a/tests/dist_eval_binary_eval.cpp
+++ b/tests/dist_eval_binary_eval.cpp
@@ -215,7 +215,7 @@ BOOST_AUTO_TEST_CASE(perm_eval) {
   for (auto index : *dist_eval.pmap()) {
     // Get the original tiles
     const std::size_t arg_index =
-        left.range().ordinal(inv_perm * dist_eval.range().idx(index));
+        left.tiles_range().ordinal(inv_perm * dist_eval.range().idx(index));
     const TArrayI::value_type left_tile = left.find(arg_index);
     const TArrayI::value_type right_tile = right.find(arg_index);
 
diff --git a/tests/dist_eval_contraction_eval.cpp b/tests/dist_eval_contraction_eval.cpp
index 1c31328359..6e59e2f93b 100644
--- a/tests/dist_eval_contraction_eval.cpp
+++ b/tests/dist_eval_contraction_eval.cpp
@@ -115,9 +115,9 @@ struct ContractionEvalFixture : public SparseShapeFixture {
                                     const int middle) {
     // Compute the number of rows and columns in the matrix, and a new weight
     // that is bisected the row and column dimensions.
-    std::vector<std::size_t> weight(array.range().rank(), 0ul);
+    std::vector<std::size_t> weight(array.tiles_range().rank(), 0ul);
     std::size_t MN[2] = {1ul, 1ul};
-    const int dim = array.range().rank();
+    const int dim = array.tiles_range().rank();
     int i = dim - 1;
     for (; i >= middle; --i) {
       weight[i] = MN[1];
diff --git a/tests/eigen.cpp b/tests/eigen.cpp
index 6196591713..bfa4f1a0db 100644
--- a/tests/eigen.cpp
+++ b/tests/eigen.cpp
@@ -172,8 +172,8 @@ BOOST_AUTO_TEST_CASE(matrix_to_array) {
       (array = eigen_to_array<TArrayI>(*GlobalFixture::world, trange, matrix)));
 
   // Check that the data in array is equal to that in matrix
-  for (Range::const_iterator it = array.range().begin();
-       it != array.range().end(); ++it) {
+  for (Range::const_iterator it = array.tiles_range().begin();
+       it != array.tiles_range().end(); ++it) {
     Future<TArrayI::value_type> tile = array.find(*it);
     for (Range::const_iterator tile_it = tile.get().range().begin();
          tile_it != tile.get().range().end(); ++tile_it) {
@@ -193,8 +193,8 @@ BOOST_AUTO_TEST_CASE(vector_to_array) {
                                                          trange1, vector)));
 
   // Check that the data in array matches the data in vector
-  for (Range::const_iterator it = array1.range().begin();
-       it != array1.range().end(); ++it) {
+  for (Range::const_iterator it = array1.tiles_range().begin();
+       it != array1.tiles_range().end(); ++it) {
     Future<TArrayI::value_type> tile = array1.find(*it);
     for (Range::const_iterator tile_it = tile.get().range().begin();
          tile_it != tile.get().range().end(); ++tile_it) {
@@ -211,8 +211,8 @@ BOOST_AUTO_TEST_CASE(array_to_matrix) {
   if (GlobalFixture::world->size() == 1) {
     // Fill the array with random data
     GlobalFixture::world->srand(27);
-    for (Range::const_iterator it = array.range().begin();
-         it != array.range().end(); ++it) {
+    for (Range::const_iterator it = array.tiles_range().begin();
+         it != array.tiles_range().end(); ++it) {
       TArrayI::value_type tile(array.trange().make_tile_range(*it));
       for (TArrayI::value_type::iterator tile_it = tile.begin();
            tile_it != tile.end(); ++tile_it) {
@@ -235,8 +235,8 @@ BOOST_AUTO_TEST_CASE(array_to_matrix) {
                       array.trange().elements_range().extent(1));
 
     // Check that the data in matrix matches the data in array
-    for (Range::const_iterator it = array.range().begin();
-         it != array.range().end(); ++it) {
+    for (Range::const_iterator it = array.tiles_range().begin();
+         it != array.tiles_range().end(); ++it) {
       Future<TArrayI::value_type> tile = array.find(*it);
       for (Range::const_iterator tile_it = tile.get().range().begin();
            tile_it != tile.get().range().end(); ++tile_it) {
@@ -281,8 +281,8 @@ BOOST_AUTO_TEST_CASE(array_to_matrix) {
                       array.trange().elements_range().extent(1));
 
     // Check that the data in vector matches the data in array
-    for (Range::const_iterator it = array.range().begin();
-         it != array.range().end(); ++it) {
+    for (Range::const_iterator it = array.tiles_range().begin();
+         it != array.tiles_range().end(); ++it) {
       BOOST_CHECK(array.is_local(*it));
 
       Future<TArrayI::value_type> tile = array.find(*it);
@@ -301,8 +301,8 @@ BOOST_AUTO_TEST_CASE(array_to_vector) {
   if (GlobalFixture::world->size() == 1) {
     // Fill the array with random data
     GlobalFixture::world->srand(27);
-    for (Range::const_iterator it = array1.range().begin();
-         it != array1.range().end(); ++it) {
+    for (Range::const_iterator it = array1.tiles_range().begin();
+         it != array1.tiles_range().end(); ++it) {
       TArrayI::value_type tile(array1.trange().make_tile_range(*it));
       for (TArrayI::value_type::iterator tile_it = tile.begin();
            tile_it != tile.end(); ++tile_it) {
@@ -320,8 +320,8 @@ BOOST_AUTO_TEST_CASE(array_to_vector) {
     BOOST_CHECK_EQUAL(vector.cols(), 1);
 
     // Check that the data in vector matches the data in array
-    for (Range::const_iterator it = array1.range().begin();
-         it != array1.range().end(); ++it) {
+    for (Range::const_iterator it = array1.tiles_range().begin();
+         it != array1.tiles_range().end(); ++it) {
       Future<TArrayI::value_type> tile = array1.find(*it);
       for (Range::const_iterator tile_it = tile.get().range().begin();
            tile_it != tile.get().range().end(); ++tile_it) {
@@ -359,8 +359,8 @@ BOOST_AUTO_TEST_CASE(array_to_vector) {
     BOOST_CHECK_EQUAL(vector.cols(), 1);
 
     // Check that the data in vector matches the data in array
-    for (Range::const_iterator it = array1.range().begin();
-         it != array1.range().end(); ++it) {
+    for (Range::const_iterator it = array1.tiles_range().begin();
+         it != array1.tiles_range().end(); ++it) {
       BOOST_CHECK(array1.is_local(*it));
 
       Future<TArrayI::value_type> tile = array1.find(*it);
@@ -430,8 +430,8 @@ BOOST_AUTO_TEST_CASE(tensor_to_array) {
                             *GlobalFixture::world, trangeN, tensor)));
 
   // Check that the data in array is equal to that in matrix
-  for (Range::const_iterator it = array.range().begin();
-       it != array.range().end(); ++it) {
+  for (Range::const_iterator it = array.tiles_range().begin();
+       it != array.tiles_range().end(); ++it) {
     Future<TArrayI::value_type> tile = array.find(*it);
     for (Range::const_iterator tile_it = tile.get().range().begin();
          tile_it != tile.get().range().end(); ++tile_it) {
@@ -497,8 +497,8 @@ BOOST_AUTO_TEST_CASE(array_to_tensor) {
       arrayN.trange().elements_range().extent().end());
 
   // Check that the data in vector matches the data in array
-  for (Range::const_iterator it = arrayN.range().begin();
-       it != arrayN.range().end(); ++it) {
+  for (Range::const_iterator it = arrayN.tiles_range().begin();
+       it != arrayN.tiles_range().end(); ++it) {
     BOOST_CHECK(arrayN.is_local(*it));
 
     Future<TArrayI::value_type> tile = arrayN.find(*it);
diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp
index b03ec0c994..15cdf2146d 100644
--- a/tests/expressions_cuda_um.cpp
+++ b/tests/expressions_cuda_um.cpp
@@ -305,7 +305,8 @@ BOOST_AUTO_TEST_CASE(permute) {
   BOOST_REQUIRE_NO_THROW(a("a,b,c") = b("c,b,a"));
 
   for (std::size_t i = 0ul; i < b.size(); ++i) {
-    const std::size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const std::size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (a.is_local(perm_index)) {
       TArrayUMD::value_type a_tile = a.find(perm_index).get();
       TArrayUMD::value_type perm_b_tile = permute_fn(b.find(i), perm);
@@ -333,7 +334,8 @@ BOOST_AUTO_TEST_CASE(permute) {
   BOOST_REQUIRE_NO_THROW(a("a,b,c") = b("b,c,a"));
 
   for (std::size_t i = 0ul; i < b.size(); ++i) {
-    const std::size_t perm_index = a.range().ordinal(perm2 * b.range().idx(i));
+    const std::size_t perm_index =
+        a.tiles_range().ordinal(perm2 * b.tiles_range().idx(i));
     if (a.is_local(perm_index)) {
       TArrayUMD::value_type a_tile = a.find(perm_index).get();
       TArrayUMD::value_type perm_b_tile = permute_fn(b.find(i), perm2);
@@ -350,7 +352,8 @@ BOOST_AUTO_TEST_CASE(scale_permute) {
   BOOST_REQUIRE_NO_THROW(a("a,b,c") = 2 * b("c,b,a"));
 
   for (std::size_t i = 0ul; i < b.size(); ++i) {
-    const std::size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const std::size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (a.is_local(perm_index)) {
       TArrayUMD::value_type a_tile = a.find(perm_index).get();
       TArrayUMD::value_type perm_b_tile = permute_fn(b.find(i), perm);
@@ -524,7 +527,8 @@ BOOST_AUTO_TEST_CASE(permute_block) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = a("c,b,a").block({3, 3, 3}, {5, 5, 5}));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index))) {
       auto arg_tile = permute_fn(a.find(block_range.ordinal(perm_index)), perm);
@@ -543,7 +547,8 @@ BOOST_AUTO_TEST_CASE(permute_block) {
                              2 * a("c,b,a").block({3, 3, 3}, {5, 5, 5}));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index))) {
       auto arg_tile = permute_fn(a.find(block_range.ordinal(perm_index)), perm);
@@ -563,7 +568,8 @@ BOOST_AUTO_TEST_CASE(permute_block) {
                                   4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5})));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index)) ||
         !b.is_zero(block_range.ordinal(index))) {
@@ -584,7 +590,8 @@ BOOST_AUTO_TEST_CASE(permute_block) {
                                   4 * b("c,b,a").block({3, 3, 3}, {5, 5, 5})));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index)) ||
         !b.is_zero(block_range.ordinal(perm_index))) {
@@ -867,7 +874,8 @@ BOOST_AUTO_TEST_CASE(add_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = b.find(i).get();
 
@@ -879,7 +887,8 @@ BOOST_AUTO_TEST_CASE(add_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
@@ -958,7 +967,8 @@ BOOST_AUTO_TEST_CASE(scale_add_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = b.find(i).get();
 
@@ -970,7 +980,8 @@ BOOST_AUTO_TEST_CASE(scale_add_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
@@ -1058,7 +1069,8 @@ BOOST_AUTO_TEST_CASE(subt_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = b.find(i).get();
 
@@ -1070,7 +1082,8 @@ BOOST_AUTO_TEST_CASE(subt_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
@@ -1133,7 +1146,8 @@ BOOST_AUTO_TEST_CASE(scale_subt_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = b.find(i).get();
 
@@ -1145,7 +1159,8 @@ BOOST_AUTO_TEST_CASE(scale_subt_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
@@ -1233,7 +1248,8 @@ BOOST_AUTO_TEST_CASE(mult_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = b.find(i).get();
 
@@ -1245,7 +1261,8 @@ BOOST_AUTO_TEST_CASE(mult_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
@@ -1308,7 +1325,8 @@ BOOST_AUTO_TEST_CASE(scale_mult_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = b.find(i).get();
 
@@ -1320,7 +1338,8 @@ BOOST_AUTO_TEST_CASE(scale_mult_permute) {
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm);
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
@@ -2459,7 +2478,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) {
   double expected = 0;
   for (std::size_t i = 0ul; i < a.size(); ++i) {
     TArrayUMD::value_type a_tile = a.find(i).get();
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm);
 
     for (std::size_t j = 0ul; j < a_tile.size(); ++j)
@@ -2476,7 +2496,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) {
 
   // Compute the expected value for the dot function.
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) && !b.is_zero(perm_index)) {
       auto a_tile = a.find(i).get();
       auto b_tile = perm * b.find(perm_index).get();
@@ -2495,7 +2516,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) {
 
   // Compute the expected value for the dot function.
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) && !b.is_zero(perm_index)) {
       auto a_tile = a.find(i).get();
       auto b_tile = perm * b.find(perm_index).get();
@@ -2516,7 +2538,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) {
 
   // Compute the expected value for the dot function.
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) && !b.is_zero(perm_index)) {
       auto a_tile = a.find(i).get();
       auto b_tile = perm * b.find(perm_index).get();
diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h
index 0ffbf4754e..76bb75a06c 100644
--- a/tests/expressions_impl.h
+++ b/tests/expressions_impl.h
@@ -237,7 +237,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(a("a,b,c") = b("c,b,a"));
 
   for (std::size_t i = 0ul; i < b.size(); ++i) {
-    const std::size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const std::size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (a.is_local(perm_index) && !a.is_zero(perm_index)) {
       auto a_tile = a.find(perm_index).get();
       auto perm_b_tile = perm * b.find(i).get();
@@ -258,7 +259,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(a("a,b,c") = 2 * b("c,b,a"));
 
   for (std::size_t i = 0ul; i < b.size(); ++i) {
-    const std::size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const std::size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (a.is_local(perm_index) && !a.is_zero(perm_index)) {
       auto a_tile = a.find(perm_index).get();
       auto perm_b_tile = perm * b.find(i).get();
@@ -501,7 +503,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(permute_block, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = a("c,b,a").block({3, 3, 3}, {5, 5, 5}));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index))) {
       auto arg_tile = perm * a.find(block_range.ordinal(perm_index)).get();
@@ -520,7 +523,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(permute_block, F, Fixtures, F) {
                              2 * a("c,b,a").block({3, 3, 3}, {5, 5, 5}));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index))) {
       auto arg_tile = perm * a.find(block_range.ordinal(perm_index)).get();
@@ -540,7 +544,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(permute_block, F, Fixtures, F) {
                                   4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5})));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index)) ||
         !b.is_zero(block_range.ordinal(index))) {
@@ -565,7 +570,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(permute_block, F, Fixtures, F) {
                                   4 * b("c,b,a").block({3, 3, 3}, {5, 5, 5})));
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
-    const size_t perm_index = c.range().ordinal(perm * c.range().idx(index));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * c.tiles_range().idx(index));
 
     if (!a.is_zero(block_range.ordinal(perm_index)) ||
         !b.is_zero(block_range.ordinal(perm_index))) {
@@ -681,7 +687,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_permute_block, F, Fixtures,
 
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
     //    const size_t perm_index = block_range.ordinal(perm *
-    //    c.range().idx(index));
+    //    c.tiles_range().idx(index));
     auto perm_index = perm * block_range.idx(index);
 
     if (!a.is_zero(block_range.ordinal(perm_index))) {
@@ -961,7 +967,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(add_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = (2 * a("c,b,a")) + (3 * b("a,b,c")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -979,7 +986,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(add_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = (2 * a("c,b,a")) + (3 * b("c,b,a")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1098,7 +1106,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_add_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) + (3 * b("a,b,c")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1116,7 +1125,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_add_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) + (3 * b("c,b,a")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1258,7 +1268,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(sub_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = (2 * a("c,b,a")) - (3 * b("a,b,c")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1276,7 +1287,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(sub_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = (2 * a("c,b,a")) - (3 * b("c,b,a")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1377,7 +1389,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_sub_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) - (3 * b("a,b,c")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1395,7 +1408,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_sub_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) - (3 * b("c,b,a")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1495,7 +1509,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(mult_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = (2 * a("c,b,a")) * (3 * b("a,b,c")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1513,7 +1528,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(mult_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = (2 * a("c,b,a")) * (3 * b("c,b,a")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1656,7 +1672,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_mult_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) * (3 * b("a,b,c")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -1674,7 +1691,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(scale_mult_permute, F, Fixtures, F) {
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) * (3 * b("c,b,a")));
 
   for (std::size_t i = 0ul; i < c.size(); ++i) {
-    const size_t perm_index = c.range().ordinal(perm * a.range().idx(i));
+    const size_t perm_index =
+        c.tiles_range().ordinal(perm * a.tiles_range().idx(i));
     if (!c.is_zero(i)) {
       auto c_tile = c.find(i).get();
       auto a_tile = a.is_zero(perm_index) ? F::make_zero_tile(c_tile.range())
@@ -2794,7 +2812,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(dot_permute, F, Fixtures, F) {
   // Compute the expected value for the dot function.
   typename F::element_type expected = 0;
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) && !b.is_zero(perm_index)) {
       auto a_tile = a.find(i).get();
       auto b_tile = perm * b.find(perm_index).get();
@@ -2814,7 +2833,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(dot_permute, F, Fixtures, F) {
 
   // Compute the expected value for the dot function.
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) || !b.is_zero(perm_index)) {
       auto a_tile = a.is_zero(i) ? F::make_zero_tile(a.trange().tile(i))
                                  : a.find(i).get();
@@ -2836,7 +2856,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(dot_permute, F, Fixtures, F) {
 
   // Compute the expected value for the dot function.
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) && !b.is_zero(perm_index)) {
       auto a_tile = a.find(i).get();
       auto b_tile = perm * b.find(perm_index).get();
@@ -2857,7 +2878,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(dot_permute, F, Fixtures, F) {
 
   // Compute the expected value for the dot function.
   for (std::size_t i = 0ul; i < a.size(); ++i) {
-    const size_t perm_index = a.range().ordinal(perm * b.range().idx(i));
+    const size_t perm_index =
+        a.tiles_range().ordinal(perm * b.tiles_range().idx(i));
     if (!a.is_zero(i) || !b.is_zero(perm_index)) {
       auto a_tile = a.is_zero(i) ? F::make_zero_tile(a.trange().tile(i))
                                  : a.find(i).get();
diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h
index 45a0ae0f8e..9d46fadcc7 100644
--- a/tests/tot_array_fixture.h
+++ b/tests/tot_array_fixture.h
@@ -264,7 +264,7 @@ struct ToTArrayFixture {
 
       // Same components? Here we make all ranks check all tiles
       bool are_same = true;
-      for (auto idx : lhs.range()) {
+      for (auto idx : lhs.tiles_range()) {
         const auto& lhs_tot = lhs.find(idx).get();
         const auto& rhs_tot = rhs.find(idx).get();
         if (lhs_tot != rhs_tot) {
diff --git a/tests/tot_dist_array_part2.cpp b/tests/tot_dist_array_part2.cpp
index 9473a5ef6a..b916812884 100644
--- a/tests/tot_dist_array_part2.cpp
+++ b/tests/tot_dist_array_part2.cpp
@@ -255,17 +255,17 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(trange, TestParam, test_params) {
   }
 }
 
-BOOST_AUTO_TEST_CASE_TEMPLATE(range, TestParam, test_params) {
+BOOST_AUTO_TEST_CASE_TEMPLATE(tiles_range, TestParam, test_params) {
   {
     tensor_type<TestParam> t;
     if (m_world.nproc() == 1)
-      BOOST_CHECK_THROW(t.range(), TiledArray::Exception);
+      BOOST_CHECK_THROW(t.tiles_range(), TiledArray::Exception);
   }
 
   for (auto tr_t : run_all<TestParam>()) {
     auto& tr = std::get<0>(tr_t);
     auto& corr = std::get<2>(tr_t);
-    bool are_same = corr.range() == tr.tiles_range();
+    bool are_same = corr.tiles_range() == tr.tiles_range();
     BOOST_TEST(are_same);
   }
 }
@@ -344,7 +344,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(call_operator, TestParam, test_params) {
   for (auto tr_t : run_all<TestParam>()) {
     auto inner_rank = std::get<1>(tr_t);
     auto& t = std::get<2>(tr_t);
-    auto outer_rank = t.range().rank();
+    auto outer_rank = t.tiles_range().rank();
     std::string outer_idx = (outer_rank == 1 ? "i" : "i,j");
     std::string inner_idx = (inner_rank == 1 ? "k" : "k,l");
 
@@ -367,7 +367,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(const_call_operator, TestParam, test_params) {
   for (auto tr_t : run_all<TestParam>()) {
     auto inner_rank = std::get<1>(tr_t);
     const auto& t = std::get<2>(tr_t);
-    auto outer_rank = t.range().rank();
+    auto outer_rank = t.tiles_range().rank();
     std::string outer_idx = (outer_rank == 1 ? "i" : "i,j");
     std::string inner_idx = (inner_rank == 1 ? "k" : "k,l");
 
@@ -433,8 +433,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner, TestParam, test_params) {
       BOOST_CHECK_THROW(corr.owner(bad_idx), TiledArray::Exception);
     }
 
-    for (auto idx : corr.range()) {
-      const auto ordinal = corr.range().ordinal(idx);
+    for (auto idx : corr.tiles_range()) {
+      const auto ordinal = corr.tiles_range().ordinal(idx);
       BOOST_TEST(corr.owner(idx) == corr.pmap()->owner(ordinal));
       BOOST_TEST(corr.owner(ordinal) == corr.pmap()->owner(ordinal));
     }
@@ -468,8 +468,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner_init_list, TestParam, test_params) {
       BOOST_CHECK_THROW(corr.owner(il2), except_t);
     }
 
-    for (auto idx : corr.range()) {
-      const auto ordinal = corr.range().ordinal(idx);
+    for (auto idx : corr.tiles_range()) {
+      const auto ordinal = corr.tiles_range().ordinal(idx);
       const auto owner = corr.pmap()->owner(ordinal);
       if (rank == 1) {
         BOOST_TEST(corr.owner({idx[0]}) == owner);
@@ -502,8 +502,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_local, TestParam, test_params) {
       BOOST_CHECK_THROW(corr.is_local(bad_idx), TiledArray::Exception);
     }
 
-    for (auto idx : corr.range()) {
-      const auto ordinal = corr.range().ordinal(idx);
+    for (auto idx : corr.tiles_range()) {
+      const auto ordinal = corr.tiles_range().ordinal(idx);
       BOOST_TEST(corr.is_local(idx) == corr.pmap()->is_local(ordinal));
       BOOST_TEST(corr.is_local(ordinal) == corr.pmap()->is_local(ordinal));
     }
@@ -537,8 +537,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_local_init_list, TestParam, test_params) {
       BOOST_CHECK_THROW(corr.is_local(il2), except_t);
     }
 
-    for (auto idx : corr.range()) {
-      const auto ordinal = corr.range().ordinal(idx);
+    for (auto idx : corr.tiles_range()) {
+      const auto ordinal = corr.tiles_range().ordinal(idx);
       const auto is_local = corr.pmap()->is_local(ordinal);
       if (rank == 1) {
         BOOST_TEST(corr.is_local({idx[0]}) == is_local);
@@ -571,8 +571,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_zero, TestParam, test_params) {
       BOOST_CHECK_THROW(corr.is_zero(bad_idx), TiledArray::Exception);
     }
 
-    for (auto idx : corr.range()) {
-      const auto ordinal = corr.range().ordinal(idx);
+    for (auto idx : corr.tiles_range()) {
+      const auto ordinal = corr.tiles_range().ordinal(idx);
       BOOST_TEST(corr.is_zero(idx) == corr.shape().is_zero(ordinal));
       BOOST_TEST(corr.owner(ordinal) == corr.pmap()->owner(ordinal));
     }
@@ -606,8 +606,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_zero_init_list, TestParam, test_params) {
       BOOST_CHECK_THROW(corr.is_zero(il2), except_t);
     }
 
-    for (auto idx : corr.range()) {
-      const auto ordinal = corr.range().ordinal(idx);
+    for (auto idx : corr.tiles_range()) {
+      const auto ordinal = corr.tiles_range().ordinal(idx);
       const auto is_zero = corr.shape().is_zero(ordinal);
       if (rank == 1) {
         BOOST_TEST(corr.is_zero({idx[0]}) == is_zero);
diff --git a/tests/tot_expressions.cpp b/tests/tot_expressions.cpp
index 1c7feb11b7..c834810065 100644
--- a/tests/tot_expressions.cpp
+++ b/tests/tot_expressions.cpp
@@ -27,7 +27,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(no_perm, TestParam, test_params) {
     auto& in_rank = std::get<1>(tr_t);
     auto& t = std::get<2>(tr_t);
 
-    std::string out_idx = t.range().rank() == 1 ? "i" : "i, j";
+    std::string out_idx = t.tiles_range().rank() == 1 ? "i" : "i, j";
     std::string in_idx = in_rank == 1 ? "k" : "k, l";
     std::string idx = out_idx + ";" + in_idx;
 
@@ -42,7 +42,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(permute_outer, TestParam, test_params) {
     auto& in_rank = std::get<1>(tr_t);
     auto& t = std::get<2>(tr_t);
 
-    if (t.range().rank() == 1) continue;
+    if (t.tiles_range().rank() == 1) continue;
 
     std::string rhs_out_idx = "i, j";
     std::string lhs_out_idx = "j, i";
@@ -52,7 +52,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(permute_outer, TestParam, test_params) {
     tensor_type<TestParam> result;
     result(lhs_idx) = t(rhs_idx);
 
-    for (auto tile_idx : t.range()) {
+    for (auto tile_idx : t.tiles_range()) {
       auto rtile = t.find(tile_idx).get();
       auto ltile = result.find({tile_idx[1], tile_idx[0]}).get();
       for (auto outer_idx : ltile.range()) {
@@ -79,13 +79,13 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(permute_inner, TestParam, test_params) {
 
     std::string rhs_in_idx = "i, j";
     std::string lhs_in_idx = "j, i";
-    std::string out_idx = t.range().rank() == 1 ? "k" : "k, l";
+    std::string out_idx = t.tiles_range().rank() == 1 ? "k" : "k, l";
     std::string rhs_idx = out_idx + ";" + rhs_in_idx;
     std::string lhs_idx = out_idx + ";" + lhs_in_idx;
     tensor_type<TestParam> result;
     result(lhs_idx) = t(rhs_idx);
 
-    for (auto tile_idx : t.range()) {
+    for (auto tile_idx : t.tiles_range()) {
       auto rtile = t.find(tile_idx).get();
       auto ltile = result.find(tile_idx).get();
       bool same_outer_range = ltile.range() == rtile.range();

From 760bfa3a4a17d31ead985a3106931b3764fc9cbe Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 May 2022 08:33:15 -0400
Subject: [PATCH 03/30] ta_test: UM expr tests need looser FP64 comparison
 tolerance

---
 tests/expressions_cuda_um.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp
index 15cdf2146d..0a9e057f6b 100644
--- a/tests/expressions_cuda_um.cpp
+++ b/tests/expressions_cuda_um.cpp
@@ -123,7 +123,7 @@ struct UMExpressionsFixture : public TiledRangeFixture {
   TArrayUMD u;
   TArrayUMD v;
   TArrayUMD w;
-  double tolerance = 1.0e-14;
+  static constexpr double tolerance = 5.0e-14;
 };  // UMExpressionsFixture
 
 // Instantiate static variables for fixture

From 4be43ff22b7a82604243755e744564237316ccd8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 May 2022 08:34:18 -0400
Subject: [PATCH 04/30] introduced check_serial{,-tiledarray} targets

---
 CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0a972c76b4..88020f8fab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -354,14 +354,18 @@ add_subdirectory(doc)
 ##########################
 include(CTest)
 if (BUILD_TESTING)
-  set(_ctest_args -V -R "tiledarray/unit")
+  set(_ctest_args -V -R "tiledarray/unit/run-np.*")
+  set(_ctest_args_serial -V -R "tiledarray/unit/run-np1")
   if (DEFINED TA_UT_CTEST_TIMEOUT)
     list(APPEND _ctest_args --timeout ${TA_UT_CTEST_TIMEOUT})
+    list(APPEND _ctest_args_serial --timeout ${TA_UT_CTEST_TIMEOUT})
   endif(DEFINED TA_UT_CTEST_TIMEOUT)
   add_custom_target_subproject(tiledarray check USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args})
+  add_custom_target_subproject(tiledarray check_serial USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args_serial})
   add_subdirectory(tests)
 else()
   add_custom_target_subproject(tiledarray check USES_TERMINAL COMMAND echo "WARNING: unit testing disabled. To enable, give -DBUILD_TESTING=ON to cmake")
+  add_custom_target_subproject(tiledarray check_serial USES_TERMINAL COMMAND echo "WARNING: unit testing disabled. To enable, give -DBUILD_TESTING=ON to cmake")
 endif()
 
 ##########################

From 862c0141723dcf399b1053ab4205e7ae18ed372f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 May 2022 08:35:08 -0400
Subject: [PATCH 05/30] gitlab ci: run serial unit tests with CUDA + qualify
 all targets with -tiledarray where possible

---
 .gitlab-ci.yml | 4 ++--
 CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 11f03acf70..ce49b4dc01 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -7,7 +7,7 @@ default:
 
 variables:
   MAD_NUM_THREADS : 2
-  TA_TARGETS : "tiledarray examples ta_test check-tiledarray"
+  TA_TARGETS : "tiledarray examples-tiledarray ta_test check-tiledarray"
   # Debug builds with ScaLAPACK=ON need increased TA_UT_CTEST_TIMEOUT
   TA_CONFIG : >
     CMAKE_BUILD_TYPE=${BUILD_TYPE}
@@ -70,4 +70,4 @@ ubuntu:
         CXX: [ g++ ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_CUDA : [ "ENABLE_CUDA=ON" ]
-        TA_TARGETS : [ "tiledarray examples" ]
+        TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88020f8fab..9a4b96c3e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,7 +355,7 @@ add_subdirectory(doc)
 include(CTest)
 if (BUILD_TESTING)
   set(_ctest_args -V -R "tiledarray/unit/run-np.*")
-  set(_ctest_args_serial -V -R "tiledarray/unit/run-np1")
+  set(_ctest_args_serial -V -R "tiledarray/unit/run-np-1")
   if (DEFINED TA_UT_CTEST_TIMEOUT)
     list(APPEND _ctest_args --timeout ${TA_UT_CTEST_TIMEOUT})
     list(APPEND _ctest_args_serial --timeout ${TA_UT_CTEST_TIMEOUT})

From e263f997764a72e82cac957540782141e7d4f6e9 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 May 2022 10:15:21 -0400
Subject: [PATCH 06/30] ccd example avoids using deprecated DistArray::range()

---
 examples/cc/ccd.cpp  | 15 ++++++++-------
 examples/cc/ccsd.cpp |  9 +++++----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/examples/cc/ccd.cpp b/examples/cc/ccd.cpp
index 18106f34c1..2560048d26 100644
--- a/examples/cc/ccd.cpp
+++ b/examples/cc/ccd.cpp
@@ -96,27 +96,28 @@ int main(int argc, char** argv) {
 
     TiledArray::TSpArrayD t_aa_vvoo(world, v_aa_vvoo.trange(),
                                     v_aa_vvoo.shape());
-    for (auto it = t_aa_vvoo.range().begin(); it != t_aa_vvoo.range().end();
-         ++it)
+    for (auto it = t_aa_vvoo.tiles_range().begin();
+         it != t_aa_vvoo.tiles_range().end(); ++it)
       if (t_aa_vvoo.is_local(*it) && (!t_aa_vvoo.is_zero(*it)))
         t_aa_vvoo.set(*it, 0.0);
 
     TiledArray::TSpArrayD t_ab_vvoo(world, v_ab_vvoo.trange(),
                                     v_ab_vvoo.shape());
-    for (auto it = t_ab_vvoo.range().begin(); it != t_ab_vvoo.range().end();
-         ++it)
+    for (auto it = t_ab_vvoo.tiles_range().begin();
+         it != t_ab_vvoo.tiles_range().end(); ++it)
       if (t_ab_vvoo.is_local(*it) && (!t_ab_vvoo.is_zero(*it)))
         t_ab_vvoo.set(*it, 0.0);
 
     TiledArray::TSpArrayD t_bb_vvoo(world, v_bb_vvoo.trange(),
                                     v_bb_vvoo.shape());
-    for (auto it = t_bb_vvoo.range().begin(); it != t_bb_vvoo.range().end();
-         ++it)
+    for (auto it = t_bb_vvoo.tiles_range().begin();
+         it != t_bb_vvoo.tiles_range().end(); ++it)
       if (t_bb_vvoo.is_local(*it) && (!t_bb_vvoo.is_zero(*it)))
         t_bb_vvoo.set(*it, 0.0);
 
     TiledArray::TSpArrayD D_vvoo(world, v_ab_vvoo.trange(), v_ab_vvoo.shape());
-    for (auto it = D_vvoo.range().begin(); it != D_vvoo.range().end(); ++it)
+    for (auto it = D_vvoo.tiles_range().begin();
+         it != D_vvoo.tiles_range().end(); ++it)
       if (D_vvoo.is_local(*it) && (!D_vvoo.is_zero(*it)))
         D_vvoo.set(*it, world.taskq.add(data, &InputData::make_D_vvoo_tile,
                                         D_vvoo.trange().make_tile_range(*it)));
diff --git a/examples/cc/ccsd.cpp b/examples/cc/ccsd.cpp
index 47a29686fa..f06b53edf1 100644
--- a/examples/cc/ccsd.cpp
+++ b/examples/cc/ccsd.cpp
@@ -128,15 +128,16 @@ int main(int argc, char** argv) {
     //
     //
     //    TArray2s D_vo(world, f_a_vo.trange(), f_a_vo.shape());
-    //    for(TArray2s::range_type::const_iterator it = D_vo.range().begin(); it
-    //    != D_vo.range().end(); ++it)
+    //    for(TArray2s::range_type::const_iterator it =
+    //    D_vo.tiles_range().begin(); it
+    //    != D_vo.tiles_range().end(); ++it)
     //      if(D_vo.is_local(*it) && (! D_vo.is_zero(*it)))
     //        D_vo.set(*it, world.taskq.add(data, & InputData::make_D_vo_tile,
     //        D_vo.trange().make_tile_range(*it)));
     //
     //    TArray4s D_vvoo(world, v_ab_vvoo.trange(), v_ab_vvoo.shape());
-    //    for(TArray4s::range_type::const_iterator it = D_vvoo.range().begin();
-    //    it != D_vvoo.range().end(); ++it)
+    //    for(TArray4s::range_type::const_iterator it =
+    //    D_vvoo.tiles_range().begin(); it != D_vvoo.tiles_range().end(); ++it)
     //      if(D_vvoo.is_local(*it) && (! D_vvoo.is_zero(*it)))
     //        D_vvoo.set(*it, world.taskq.add(data, &
     //        InputData::make_D_vvoo_tile,

From e0dfb455850000ff1c0c478a39a755ca2f321293 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 6 May 2022 13:35:55 -0400
Subject: [PATCH 07/30] print nvcc version info and output of nvidia-smi

---
 ci/.build-project | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/.build-project b/ci/.build-project
index 79a08d541b..a9c9f7582a 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -81,6 +81,8 @@ fi
 if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
   cmd "make -C /home/ValeevGroup install/cuda"
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
+  cmd "${CUDACXX} -V"
+  cmd "find / -name \"*nvidia-smi\""
 fi
 section_end preparing_system_section
 

From c172a9895c450b9b888591130f636f149a6d9f21 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sun, 8 May 2022 22:47:46 -0400
Subject: [PATCH 08/30] vector<T>::data() may not return null ptr if empty

---
 tests/tensor_um.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/tensor_um.cpp b/tests/tensor_um.cpp
index 310e04234f..33efbfd7d4 100644
--- a/tests/tensor_um.cpp
+++ b/tests/tensor_um.cpp
@@ -87,8 +87,7 @@ struct TensorUMFixture {
 
 const TensorUMFixture::range_type TensorUMFixture::r = make_range(81);
 
-BOOST_FIXTURE_TEST_SUITE(tensor_um_suite, TensorUMFixture,
-                         TA_UT_LABEL_SERIAL)
+BOOST_FIXTURE_TEST_SUITE(tensor_um_suite, TensorUMFixture, TA_UT_LABEL_SERIAL)
 
 BOOST_AUTO_TEST_CASE(default_constructor) {
   // check constructor
@@ -98,7 +97,6 @@ BOOST_AUTO_TEST_CASE(default_constructor) {
   BOOST_CHECK(x.empty());
 
   // Check that range data is correct
-  BOOST_CHECK_EQUAL(x.data(), static_cast<int*>(NULL));
   BOOST_CHECK_EQUAL(x.size(), 0ul);
   BOOST_CHECK_EQUAL(x.range().volume(), 0ul);
 

From 9c355bd1bba2423e450b48e800c96e8186b2dbf4 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 9 May 2022 10:42:14 -0400
Subject: [PATCH 09/30] bump up UT test timeout to accommodate CUDA builds

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ce49b4dc01..6ab502b527 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,7 +12,7 @@ variables:
   TA_CONFIG : >
     CMAKE_BUILD_TYPE=${BUILD_TYPE}
     TA_ASSERT_POLICY=TA_ASSERT_THROW
-    TA_UT_CTEST_TIMEOUT=2000
+    TA_UT_CTEST_TIMEOUT=3000
     ${TA_PYTHON}
     ${ENABLE_CUDA}
     ${BLA_VENDOR}

From 3c14fa48ec1f792e0dbc2c84c2be51c6ddd14a90 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 13 May 2022 14:42:23 -0400
Subject: [PATCH 10/30] added UT um_expressions_suite/scal_add_block

---
 tests/expressions_cuda_um.cpp | 37 +++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp
index 0a9e057f6b..e5e810e29d 100644
--- a/tests/expressions_cuda_um.cpp
+++ b/tests/expressions_cuda_um.cpp
@@ -520,6 +520,39 @@ BOOST_AUTO_TEST_CASE(scal_block) {
   }
 }
 
+// c = 2 * (3 * a_blk + 4 * b_blk) on tile block {3,3,3}-{5,5,5}, checked per element.
+BOOST_AUTO_TEST_CASE(scal_add_block) {
+  BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5});
+
+  BOOST_REQUIRE_NO_THROW(c("a,b,c") =
+                             2 * (3 * a("a,b,c").block({3, 3, 3}, {5, 5, 5}) +
+                                  4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5})));
+
+  std::cout << "expr tree for c(\"a,b,c\") =\n"
+               "                             2 * (3 * a(\"a,b,c\").block({3, "
+               "3, 3}, {5, 5, 5}) +\n"
+               "                                  4 * b(\"a,b,c\").block({3, "
+               "3, 3}, {5, 5, 5})):\n"
+            << c("a,b,c")
+            << 2 * (3 * a("a,b,c").block({3, 3, 3}, {5, 5, 5}) +
+                    4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5}));
+
+  for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
+    const auto ord = block_range.ordinal(index);  // tile ordinal in a and b
+    if (!a.is_zero(ord) && !b.is_zero(ord)) {
+      auto a_tile = a.find(ord).get();
+      auto b_tile = b.find(ord).get();
+      auto result_tile = c.find(index).get();
+
+      for (std::size_t j = 0ul; j < result_tile.range().volume(); ++j) {
+        BOOST_CHECK_EQUAL(result_tile[j], 2 * (3 * a_tile[j] + 4 * b_tile[j]));
+      }
+    } else {
+      BOOST_CHECK(c.is_zero(index));
+    }
+  }
+}
+
 BOOST_AUTO_TEST_CASE(permute_block) {
   Permutation perm({2, 1, 0});
   BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5});
@@ -965,6 +998,10 @@ BOOST_AUTO_TEST_CASE(scale_add_permute) {
 
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) + (3 * b("a,b,c")));
 
+  std::cout << "expr tree for c(\"a,b,c\") = 5 * (2 * a(\"c,b,a\")) + (3 * "
+               "b(\"a,b,c\")))"
+            << c("a,b,c") << (5 * (2 * a("c,b,a")) + (3 * b("a,b,c")));
+
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
     const size_t perm_index =

From 18a68fb21bde20ab6095b7463f5f407474389d83 Mon Sep 17 00:00:00 2001
From: Victor Anisimov <vanisimov@anl.gov>
Date: Tue, 5 Jul 2022 10:07:07 -0500
Subject: [PATCH 11/30] Integrate CUDA-HIP-SYCL version of LibreTT

---
 INSTALL.md                                    |   4 +-
 bin/admin/dependency-versions-update-hook.py  |  10 +-
 examples/cuda/CMakeLists.txt                  |   2 +-
 .../cuda/{cuda_cutt.cpp => cuda_librett.cpp}  |   2 +-
 external/cuda.cmake                           |   4 +-
 external/{cutt.cmake => librett.cmake}        |  97 ++++++------
 external/versions.cmake                       |   4 +-
 src/CMakeLists.txt                            |   4 +-
 src/TiledArray/cuda/btas_um_tensor.h          |   4 +-
 .../external/{cutt.h => ta-librett.h}         |  31 ++--
 src/TiledArray/tiledarray.cpp                 |   8 +-
 tests/CMakeLists.txt                          |   2 +-
 tests/{cutt.cpp => librett.cpp}               | 141 +++++++++---------
 13 files changed, 161 insertions(+), 152 deletions(-)
 rename examples/cuda/{cuda_cutt.cpp => cuda_librett.cpp} (98%)
 rename external/{cutt.cmake => librett.cmake} (53%)
 rename src/TiledArray/external/{cutt.h => ta-librett.h} (80%)
 rename tests/{cutt.cpp => librett.cpp} (81%)

diff --git a/INSTALL.md b/INSTALL.md
index 3606a2bd25..c06535172e 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -64,7 +64,7 @@ Compiling BTAS requires the following prerequisites:
 
 Optional prerequisites:
 - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing:
-  - [cuTT](github.com/ValeevGroup/cutt) -- CUDA transpose library; note that our fork of the [original cuTT repo](github.com/ap-hynninen/cutt) is required to provide thread-safety (tag 0e8685bf82910bc7435835f846e88f1b39f47f09).
+  - [LibreTT](https://github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](https://github.com/ap-hynninen/cutt) with our additional thread-safety improvements (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da).
   - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82).
 - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
 - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:
@@ -329,7 +329,7 @@ Support for execution on CUDA-enabled hardware is controlled by the following va
 * `ENABLE_CUDA`  -- Set to `ON` to turn on CUDA support. [Default=OFF].
 * `CMAKE_CUDA_HOST_COMPILER`  -- Set to the path to the host C++ compiler to be used by CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.).
 * `ENABLE_CUDA_ERROR_CHECK` -- Set to `ON` to turn on assertions for successful completion of calls to CUDA runtime and libraries. [Default=OFF].
-* `CUTT_INSTALL_DIR` -- the installation prefix of the pre-installed cuTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install cuTT.
+* `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install LibreTT.
 * `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should not be normally needed; it is strongly recommended to let TiledArray build and install Umpire.
 
 For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info.
diff --git a/bin/admin/dependency-versions-update-hook.py b/bin/admin/dependency-versions-update-hook.py
index 19b7123703..686b98b49a 100755
--- a/bin/admin/dependency-versions-update-hook.py
+++ b/bin/admin/dependency-versions-update-hook.py
@@ -106,11 +106,11 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = '
                 btas_old_tag = tokens[2]
             else:
                 btas_new_tag = tokens[2]
-        elif tokens[1].find('CUTT') != -1:
+        elif tokens[1].find('LIBRETT') != -1:
             if tokens[1].find('PREVIOUS') != -1:
-                cutt_old_tag = tokens[2]
+                librett_old_tag = tokens[2]
             else:
-                cutt_new_tag = tokens[2]
+                librett_new_tag = tokens[2]
         elif tokens[1].find('UMPIRE') != -1:
             if tokens[1].find('PREVIOUS') != -1:
                 umpire_old_tag = tokens[2]
@@ -146,8 +146,8 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = '
 # BTAS tag in INSTALL.md
 any_files_changed |= replace_dep_id(topsrc, 'md', 'BTAS', btas_old_tag, btas_new_tag, 'ValeevGroup/BTAS), tag ', '')
 
-# cuTT tag in INSTALL.md
-any_files_changed |= replace_dep_id(topsrc, 'md', 'cuTT', cutt_old_tag, cutt_new_tag, '', '')
+# LibreTT tag in INSTALL.md
+any_files_changed |= replace_dep_id(topsrc, 'md', 'LibreTT', librett_old_tag, librett_new_tag, '', '')
 
 # Umpire tag in INSTALL.md
 any_files_changed |= replace_dep_id(topsrc, 'md', 'Umpire', umpire_old_tag, umpire_new_tag, '', '')
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
index 2f6affe700..5d7f56c86e 100644
--- a/examples/cuda/CMakeLists.txt
+++ b/examples/cuda/CMakeLists.txt
@@ -25,7 +25,7 @@
 
 if(CUDA_FOUND)
 
-  foreach(_exec cuda_cutt cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda)
+  foreach(_exec cuda_librett cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda)
 
     # Add executable
     add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")
diff --git a/examples/cuda/cuda_cutt.cpp b/examples/cuda/cuda_librett.cpp
similarity index 98%
rename from examples/cuda/cuda_cutt.cpp
rename to examples/cuda/cuda_librett.cpp
index edaefc2597..a916bfc729 100644
--- a/examples/cuda/cuda_cutt.cpp
+++ b/examples/cuda/cuda_librett.cpp
@@ -29,7 +29,7 @@
 #include <iostream>
 
 /**
- *  Test cuTT
+ *  Test LibreTT
  */
 
 const std::size_t N = 100;
diff --git a/external/cuda.cmake b/external/cuda.cmake
index 1e5ebd8d60..3b2eb6ce37 100644
--- a/external/cuda.cmake
+++ b/external/cuda.cmake
@@ -42,6 +42,6 @@ message(STATUS "CMAKE Implicit Link Directories: ${CMAKE_CUDA_IMPLICIT_LINK_DIRE
 include(external/umpire.cmake)
 
 ##
-## cuTT
+## LibreTT
 ##
-include(external/cutt.cmake)
+include(external/librett.cmake)
diff --git a/external/cutt.cmake b/external/librett.cmake
similarity index 53%
rename from external/cutt.cmake
rename to external/librett.cmake
index dbf4e94f91..a238f3af92 100644
--- a/external/cutt.cmake
+++ b/external/librett.cmake
@@ -1,48 +1,48 @@
 ##
-## find cuTT
+## find LibreTT
 ##
 
-find_path(_CUTT_INSTALL_DIR NAMES include/cutt.h lib/libcutt.a HINTS ${CUTT_INSTALL_DIR})
+find_path(_LIBRETT_INSTALL_DIR NAMES include/librett.h lib/librett.a HINTS ${LIBRETT_INSTALL_DIR})
 
-if( _CUTT_INSTALL_DIR )
+if( _LIBRETT_INSTALL_DIR )
 
-    message(STATUS "cuTT found at ${_CUTT_INSTALL_DIR}")
+    message(STATUS "LibreTT found at ${_LIBRETT_INSTALL_DIR}")
 
 elseif(TA_EXPERT)
 
-    message("** cuTT was not found")
-    message(STATUS "** Downloading and building cuTT is explicitly disabled in EXPERT mode")
+    message("** LibreTT was not found")
+    message(STATUS "** Downloading and building LibreTT is explicitly disabled in EXPERT mode")
 
 else()
 
-    # TODO need to fix the auto installation of cuTT
+    # TODO need to fix the auto installation of LibreTT
 
     include(ExternalProject)
 
     # to pass CMAKE_C_* vars to external project
     enable_language(C)
 
-    # set source and build path for cuTT in the TiledArray project
-    set(EXTERNAL_SOURCE_DIR   ${FETCHCONTENT_BASE_DIR}/cutt-src)
-    # cutt only supports in source build
-    set(EXTERNAL_BUILD_DIR  ${FETCHCONTENT_BASE_DIR}/cutt-build)
+    # set source and build path for LibreTT in the TiledArray project
+    set(EXTERNAL_SOURCE_DIR   ${FETCHCONTENT_BASE_DIR}/librett-src)
+    # librett only supports in source build
+    set(EXTERNAL_BUILD_DIR  ${FETCHCONTENT_BASE_DIR}/librett-build)
     set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX})
 
-    if (NOT CUTT_URL)
-        set(CUTT_URL https://github.com/ValeevGroup/cutt.git)
-    endif (NOT CUTT_URL)
-    if (NOT CUTT_TAG)
-        set(CUTT_TAG ${TA_TRACKED_CUTT_TAG})
-    endif (NOT CUTT_TAG)
+    if (NOT LIBRETT_URL)
+        set(LIBRETT_URL https://github.com/victor-anisimov/librett.git)
+    endif (NOT LIBRETT_URL)
+    if (NOT LIBRETT_TAG)
+        set(LIBRETT_TAG ${TA_TRACKED_LIBRETT_TAG})
+    endif (NOT LIBRETT_TAG)
 
-    message("** Will clone cuTT from ${CUTT_URL}")
+    message("** Will clone LibreTT from ${LIBRETT_URL}")
 
     # need to change the separator of list to avoid issues with ExternalProject parsing
 #    set(CUDA_FLAGS "${CUDA_NVCC_FLAGS}")
 #    string(REPLACE ";" "::" CUDA_FLAGS "${CUDA_NVCC_FLAGS}")
     #message(STATUS "CUDA_FLAGS: " "${CUDA_FLAGS}")
 
-    set(CUTT_CMAKE_ARGS
+    set(LIBRETT_CMAKE_ARGS
         -DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR}
         -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
         -DCMAKE_POSITION_INDEPENDENT_CODE=${CMAKE_POSITION_INDEPENDENT_CODE}
@@ -66,87 +66,88 @@ else()
         -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD}
         -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS}
         -DENABLE_UMPIRE=OFF
-        -DCUTT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool
+        -DLIBRETT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool
         -DCMAKE_PREFIX_PATH=${_UMPIRE_INSTALL_DIR}
         -DENABLE_NO_ALIGNED_ALLOC=ON
         -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER}
         -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT}
+	-DENABLE_CUDA=ON
         )
     if (DEFINED CMAKE_CUDA_ARCHITECTURES)
-        list(APPEND CUTT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES})
+        list(APPEND LIBRETT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES})
     endif(DEFINED CMAKE_CUDA_ARCHITECTURES)
     if (CMAKE_TOOLCHAIN_FILE)
-        set(CUTT_CMAKE_ARGS "${CUTT_CMAKE_ARGS}"
+        set(LIBRETT_CMAKE_ARGS "${LIBRETT_CMAKE_ARGS}"
             "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
     endif(CMAKE_TOOLCHAIN_FILE)
 
     if (BUILD_SHARED_LIBS)
-        set(CUTT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+        set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
     else(BUILD_SHARED_LIBS)
-        set(CUTT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+        set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
     endif(BUILD_SHARED_LIBS)
 
     # N.B. Ninja needs spelling out the byproducts of custom targets, see https://cmake.org/cmake/help/v3.3/policy/CMP0058.html
-    set(CUTT_BUILD_BYPRODUCTS "${EXTERNAL_BUILD_DIR}/src/libcutt${CUTT_DEFAULT_LIBRARY_SUFFIX}")
-    message(STATUS "custom target cutt is expected to build these byproducts: ${CUTT_BUILD_BYPRODUCTS}")
+    set(LIBRETT_BUILD_BYPRODUCTS "${EXTERNAL_BUILD_DIR}/src/librett${LIBRETT_DEFAULT_LIBRARY_SUFFIX}")
+    message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}")
 
-    ExternalProject_Add(cutt
+    ExternalProject_Add(librett
             PREFIX ${CMAKE_INSTALL_PREFIX}
-            STAMP_DIR ${FETCHCONTENT_BASE_DIR}/cutt-ep-artifacts
-            TMP_DIR ${FETCHCONTENT_BASE_DIR}/cutt-ep-artifacts  # needed in case CMAKE_INSTALL_PREFIX is not writable
+            STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts
+            TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts  # needed in case CMAKE_INSTALL_PREFIX is not writable
             #--Download step--------------
             DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR}
-            GIT_REPOSITORY ${CUTT_URL}
-            GIT_TAG ${CUTT_TAG}
+            GIT_REPOSITORY ${LIBRETT_URL}
+            GIT_TAG ${LIBRETT_TAG}
             #--Configure step-------------
             SOURCE_DIR ${EXTERNAL_SOURCE_DIR}
             LIST_SEPARATOR ::
             UPDATE_DISCONNECTED 1
             CMAKE_ARGS
-            ${CUTT_CMAKE_ARGS}
+            ${LIBRETT_CMAKE_ARGS}
             	${EXTERNAL_SOURCE_DIR}
             #--Build step-----------------
             BINARY_DIR ${EXTERNAL_BUILD_DIR}
-            BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cutt -v
-            BUILD_BYPRODUCTS ${CUTT_BUILD_BYPRODUCTS}
+            BUILD_COMMAND ${CMAKE_COMMAND} --build . --target librett -v
+            BUILD_BYPRODUCTS ${LIBRETT_BUILD_BYPRODUCTS}
             #--Install step---------------
-            INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "cuTT will be installed during TiledArray's installation."
+            INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "LibreTT will be installed during TiledArray's installation."
             #--Custom targets-------------
             STEP_TARGETS build
             )
 
-    # TiledArray_CUTT target depends on existence of this directory to be usable from the build tree at configure time
+    # TiledArray_LIBRETT target depends on existence of this directory to be usable from the build tree at configure time
     execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src")
 
-    # do install of cuTT as part of building TiledArray's install target
+    # do install of LibreTT as part of building TiledArray's install target
     install(CODE
             "execute_process(
                COMMAND \"${CMAKE_COMMAND}\" \"--build\" \".\" \"--target\" \"install\"
                WORKING_DIRECTORY \"${EXTERNAL_BUILD_DIR}\"
                RESULT_VARIABLE error_code)
                if(error_code)
-                 message(FATAL_ERROR \"Failed to install cuTT\")
+                 message(FATAL_ERROR \"Failed to install LibreTT\")
                endif()
             ")
 
-    # Add cuTT dependency to External
-    add_dependencies(External-tiledarray cutt-build)
+    # Add LibreTT dependency to External
+    add_dependencies(External-tiledarray librett-build)
 
-    set(_CUTT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR})
+    set(_LIBRETT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR})
 
-endif(_CUTT_INSTALL_DIR)
+endif(_LIBRETT_INSTALL_DIR)
 
-add_library(TiledArray_CUTT INTERFACE)
+add_library(TiledArray_LIBRETT INTERFACE)
 
-set_target_properties(TiledArray_CUTT
+set_target_properties(TiledArray_LIBRETT
         PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES
-        "$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src>;$<INSTALL_INTERFACE:${_CUTT_INSTALL_DIR}/include>"
+        "$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src>;$<INSTALL_INTERFACE:${_LIBRETT_INSTALL_DIR}/include>"
         INTERFACE_LINK_LIBRARIES
-        "$<BUILD_INTERFACE:${CUTT_BUILD_BYPRODUCTS}>;$<INSTALL_INTERFACE:${_CUTT_INSTALL_DIR}/lib/libcutt.${CUTT_DEFAULT_LIBRARY_SUFFIX}>"
+        "$<BUILD_INTERFACE:${LIBRETT_BUILD_BYPRODUCTS}>;$<INSTALL_INTERFACE:${_LIBRETT_INSTALL_DIR}/lib/librett${LIBRETT_DEFAULT_LIBRARY_SUFFIX}>"
         )
 
-install(TARGETS TiledArray_CUTT EXPORT tiledarray COMPONENT tiledarray)
+install(TARGETS TiledArray_LIBRETT EXPORT tiledarray COMPONENT tiledarray)
 
 
-#TODO test cuTT
+#TODO test LibreTT
diff --git a/external/versions.cmake b/external/versions.cmake
index 4ac855e249..c1120147d9 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -27,8 +27,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1)
 set(TA_TRACKED_BTAS_TAG 242871710dabd5ef337e5253000d3e38c1d977ba)
 set(TA_TRACKED_BTAS_PREVIOUS_TAG db884b020b5c13c312c07df9d5c03cea2d65afb2)
 
-set(TA_TRACKED_CUTT_TAG 0e8685bf82910bc7435835f846e88f1b39f47f09)
-set(TA_TRACKED_CUTT_PREVIOUS_TAG 592198b93c93b7ca79e7900b9a9f2e79f9dafec3)
+set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da)
+set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796)
 
 set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82)
 set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f5ed90793b..d6f055df8f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -197,7 +197,7 @@ if(CUDA_FOUND)
 
   list(APPEND TILEDARRAY_HEADER_FILES
      TiledArray/external/cuda.h
-     TiledArray/external/cutt.h
+     TiledArray/external/ta-librett.h
      TiledArray/cuda/cublas.h
      TiledArray/cuda/btas_cublas.h
      TiledArray/cuda/btas_um_tensor.h
@@ -245,7 +245,7 @@ if(CUDA_FOUND)
           LANGUAGE CUDA)
 
   # the list of libraries on which TiledArray depends on
-  list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_CUTT)
+  list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT)
 
 endif(CUDA_FOUND)
 
diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/cuda/btas_um_tensor.h
index d6012f00f1..2ec1fb9a6d 100644
--- a/src/TiledArray/cuda/btas_um_tensor.h
+++ b/src/TiledArray/cuda/btas_um_tensor.h
@@ -32,7 +32,7 @@
 
 #include <TiledArray/cuda/btas_cublas.h>
 #include <TiledArray/cuda/um_storage.h>
-#include <TiledArray/external/cutt.h>
+#include <TiledArray/external/ta-librett.h>
 #include <TiledArray/tile.h>
 
 namespace TiledArray {
@@ -187,7 +187,7 @@ btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
                                       std::move(storage));
 
   // invoke the permute function
-  cutt_permute(const_cast<T *>(device_data(arg.storage())),
+  librett_permute(const_cast<T *>(device_data(arg.storage())),
                device_data(result.storage()), arg.range(), perm, stream);
 
   synchronize_stream(&stream);
diff --git a/src/TiledArray/external/cutt.h b/src/TiledArray/external/ta-librett.h
similarity index 80%
rename from src/TiledArray/external/cutt.h
rename to src/TiledArray/external/ta-librett.h
index a2a31ec20d..bc0da4de8a 100644
--- a/src/TiledArray/external/cutt.h
+++ b/src/TiledArray/external/ta-librett.h
@@ -21,8 +21,8 @@
  *
  */
 
-#ifndef TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED
-#define TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED
+#ifndef TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED
+#define TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED
 
 #include <TiledArray/config.h>
 
@@ -31,7 +31,7 @@
 #include <algorithm>
 #include <vector>
 
-#include <cutt.h>
+#include <librett.h>
 
 #include <TiledArray/permutation.h>
 #include <TiledArray/range.h>
@@ -77,38 +77,39 @@ inline void permutation_to_col_major(std::vector<int>& perm) {
  * @param stream  the CUDA stream this permutation will be submitted to
  */
 template <typename T>
-void cutt_permute(T* inData, T* outData, const TiledArray::Range& range,
+void librett_permute(T* inData, T* outData, const TiledArray::Range& range,
                   const TiledArray::Permutation& perm, cudaStream_t stream) {
   auto extent = range.extent();
   std::vector<int> extent_int(extent.begin(), extent.end());
 
-  // cuTT uses FROM notation
+  // LibreTT uses FROM notation
   auto perm_inv = perm.inv();
   std::vector<int> perm_int(perm_inv.begin(), perm_inv.end());
 
-  // cuTT uses ColMajor
+  // LibreTT uses ColMajor
   TiledArray::extent_to_col_major(extent_int);
   TiledArray::permutation_to_col_major(perm_int);
 
-  cuttResult_t status;
+  //librettResult_t status;
+  librettResult status;
 
-  cuttHandle plan;
-  status = cuttPlan(&plan, range.rank(), extent_int.data(), perm_int.data(),
+  librettHandle plan;
+  status = librettPlan(&plan, range.rank(), extent_int.data(), perm_int.data(),
                     sizeof(T), stream);
 
-  TA_ASSERT(status == CUTT_SUCCESS);
+  TA_ASSERT(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, inData, outData);
+  status = librettExecute(plan, inData, outData);
 
-  TA_ASSERT(status == CUTT_SUCCESS);
+  TA_ASSERT(status == LIBRETT_SUCCESS);
 
-  status = cuttDestroy(plan);
+  status = librettDestroy(plan);
 
-  TA_ASSERT(status == CUTT_SUCCESS);
+  TA_ASSERT(status == LIBRETT_SUCCESS);
 }
 
 }  // namespace TiledArray
 
 #endif  //  TILEDARRAY_HAS_CUDA
 
-#endif  // TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED
+#endif  // TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp
index 29b60a61d6..226d2365ac 100644
--- a/src/TiledArray/tiledarray.cpp
+++ b/src/TiledArray/tiledarray.cpp
@@ -7,7 +7,7 @@
 #ifdef TILEDARRAY_HAS_CUDA
 #include <TiledArray/cuda/cublas.h>
 #include <TiledArray/external/cuda.h>
-#include <cutt.h>
+#include <librett.h>
 #endif
 
 namespace TiledArray {
@@ -20,14 +20,14 @@ inline void cuda_initialize() {
   cudaEnv::instance();
   //
   cuBLASHandlePool::handle();
-  // initialize cuTT
-  cuttInitialize();
+  // initialize LibreTT
+  librettInitialize();
 }
 
 /// finalize cuda environment
 inline void cuda_finalize() {
   CudaSafeCall(cudaDeviceSynchronize());
-  cuttFinalize();
+  librettFinalize();
   cublasDestroy(cuBLASHandlePool::handle());
   delete &cuBLASHandlePool::handle();
   cudaEnv::instance().reset(nullptr);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0fccf921b5..1ac73df189 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -116,7 +116,7 @@ set(ta_test_src_files  ta_test.cpp
 )
 
 if(CUDA_FOUND)
-    list(APPEND ta_test_src_files cutt.cpp expressions_cuda_um.cpp tensor_um.cpp)
+    list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp)
 endif()
 
 # if tiledarray library was compiled without exceptions, use TA header-only (see below)
diff --git a/tests/cutt.cpp b/tests/librett.cpp
similarity index 81%
rename from tests/cutt.cpp
rename to tests/librett.cpp
index 8a6b1af539..91c5b5b8ad 100644
--- a/tests/cutt.cpp
+++ b/tests/librett.cpp
@@ -27,8 +27,8 @@
 #include <TiledArray/cuda/btas_um_tensor.h>
 #include "unit_test_config.h"
 
-struct cuTTFixture {
-  //  cuTTFixture()
+struct LibreTTFixture {
+  //  LibreTTFixture()
   //      : A(100),
   //        B(50),
   //        C(20),
@@ -36,16 +36,16 @@ struct cuTTFixture {
   //        extent({100, 100}),
   //        extent_nonsym({100, 50}),
   //        perm({1, 0}) {}
-  cuTTFixture() : A(10), B(5), C(2) {}
+  LibreTTFixture() : A(10), B(5), C(2) {}
 
   int A;
   int B;
   int C;
 };
 
-BOOST_FIXTURE_TEST_SUITE(cutt_suite, cuTTFixture, TA_UT_LABEL_SERIAL);
+BOOST_FIXTURE_TEST_SUITE(librett_suite, LibreTTFixture, TA_UT_LABEL_SERIAL);
 
-BOOST_AUTO_TEST_CASE(cutt_gpu_mem) {
+BOOST_AUTO_TEST_CASE(librett_gpu_mem) {
   int* a_host = (int*)std::malloc(A * A * sizeof(int));
   int* b_host = (int*)std::malloc(A * A * sizeof(int));
   int iter = 0;
@@ -68,17 +68,18 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem) {
   std::vector<int> perm({1, 0});
   TiledArray::permutation_to_col_major(perm);
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
-  status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
+  status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_device, b_device);
+  status = librettExecute(plan, a_device, b_device);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
-  cuttDestroy(plan);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
+  librettDestroy(plan);
 
   cudaMemcpy(b_host, b_device, A * A * sizeof(int), cudaMemcpyDeviceToHost);
 
@@ -97,7 +98,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem) {
   cudaFree(b_device);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) {
+BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) {
   int* a_host = (int*)std::malloc(A * B * sizeof(int));
   int* b_host = (int*)std::malloc(A * B * sizeof(int));
   int iter = 0;
@@ -115,8 +116,9 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) {
 
   cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice);
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
   std::vector<int> extent({B, A});
   TiledArray::extent_to_col_major(extent);
@@ -124,14 +126,14 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) {
   std::vector<int> perm({1, 0});
   TiledArray::permutation_to_col_major(perm);
 
-  status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
+  status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_device, b_device);
+  status = librettExecute(plan, a_device, b_device);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
-  cuttDestroy(plan);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
+  librettDestroy(plan);
 
   cudaMemcpy(b_host, b_device, A * B * sizeof(int), cudaMemcpyDeviceToHost);
 
@@ -150,7 +152,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) {
   cudaFree(b_device);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) {
+BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) {
   int* a_host = (int*)std::malloc(A * B * C * sizeof(int));
   int* b_host = (int*)std::malloc(A * B * C * sizeof(int));
   int iter = 0;
@@ -172,28 +174,29 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) {
 
   // b(j,i,k) = a(i,j,k)
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
   std::vector<int> extent3{int(A), int(B), int(C)};
 
   std::vector<int> perm3{1, 0, 2};
   //  std::vector<int> perm3{0, 2, 1};
 
-  status = cuttPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int),
+  status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int),
                            0, a_device, b_device);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_device, b_device);
+  status = librettExecute(plan, a_device, b_device);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
   cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost);
 
-  status = cuttDestroy(plan);
+  status = librettDestroy(plan);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
   iter = 0;
   for (std::size_t k = 0; k < C; k++) {
@@ -212,7 +215,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) {
   cudaFree(b_device);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) {
+BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) {
   int* a_host = (int*)std::malloc(A * B * C * sizeof(int));
   int* b_host = (int*)std::malloc(A * B * C * sizeof(int));
   int iter = 0;
@@ -234,8 +237,9 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) {
 
   // b(j,i,k) = a(i,j,k)
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
   std::vector<int> extent({A, B, C});
   TiledArray::extent_to_col_major(extent);
@@ -243,20 +247,20 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) {
   std::vector<int> perm({1, 0, 2});
   TiledArray::permutation_to_col_major(perm);
 
-  status = cuttPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0,
+  status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0,
                            a_device, b_device);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_device, b_device);
+  status = librettExecute(plan, a_device, b_device);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
   cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost);
 
-  status = cuttDestroy(plan);
+  status = librettDestroy(plan);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
   iter = 0;
   for (std::size_t i = 0; i < A; i++) {
@@ -275,7 +279,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) {
   cudaFree(b_device);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_unified_mem) {
+BOOST_AUTO_TEST_CASE(librett_unified_mem) {
   int* a_um;
   cudaMallocManaged(&a_um, A * A * sizeof(int));
 
@@ -290,8 +294,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) {
     }
   }
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
   std::vector<int> extent({A, A});
   TiledArray::extent_to_col_major(extent);
@@ -299,15 +304,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) {
   std::vector<int> perm({1, 0});
   TiledArray::permutation_to_col_major(perm);
 
-  status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
+  status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_um, b_um);
+  status = librettExecute(plan, a_um, b_um);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  cuttDestroy(plan);
+  librettDestroy(plan);
 
   cudaDeviceSynchronize();
 
@@ -323,7 +328,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) {
   cudaFree(b_um);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) {
+BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) {
   int* a_um;
   cudaMallocManaged(&a_um, A * B * sizeof(int));
 
@@ -338,8 +343,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) {
     }
   }
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
   std::vector<int> extent({B, A});
   TiledArray::extent_to_col_major(extent);
@@ -347,15 +353,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) {
   std::vector<int> perm({1, 0});
   TiledArray::permutation_to_col_major(perm);
 
-  status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
+  status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_um, b_um);
+  status = librettExecute(plan, a_um, b_um);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  cuttDestroy(plan);
+  librettDestroy(plan);
   cudaDeviceSynchronize();
 
   iter = 0;
@@ -369,7 +375,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) {
   cudaFree(b_um);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) {
+BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) {
   int* a_um;
   cudaMallocManaged(&a_um, A * B * C * sizeof(int));
 
@@ -386,8 +392,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) {
     }
   }
 
-  cuttHandle plan;
-  cuttResult_t status;
+  librettHandle plan;
+  //librettResult_t status;
+  librettResult status;
 
   // b(k,i,j) = a(i,j,k)
 
@@ -397,15 +404,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) {
   std::vector<int> perm({2, 0, 1});
   TiledArray::permutation_to_col_major(perm);
 
-  status = cuttPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0);
+  status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  status = cuttExecute(plan, a_um, b_um);
+  status = librettExecute(plan, a_um, b_um);
 
-  BOOST_CHECK(status == CUTT_SUCCESS);
+  BOOST_CHECK(status == LIBRETT_SUCCESS);
 
-  cuttDestroy(plan);
+  librettDestroy(plan);
   cudaDeviceSynchronize();
 
   iter = 0;
@@ -421,7 +428,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) {
   cudaFree(b_um);
 }
 
-BOOST_AUTO_TEST_CASE(cutt_um_tensor) {
+BOOST_AUTO_TEST_CASE(librett_um_tensor) {
   TiledArray::Range range{A, A};
 
   using Tile = TiledArray::btasUMTensorVarray<int, TiledArray::Range>;
@@ -453,7 +460,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor) {
   }
 }
 
-BOOST_AUTO_TEST_CASE(cutt_um_tensor_nonsym) {
+BOOST_AUTO_TEST_CASE(librett_um_tensor_nonsym) {
   TiledArray::Range range{B, A};
 
   using Tile = TiledArray::btasUMTensorVarray<int, TiledArray::Range>;
@@ -485,7 +492,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_nonsym) {
   }
 }
 
-BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_three) {
+BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) {
   TiledArray::Range range{A, B, C};
 
   using Tile = TiledArray::btasUMTensorVarray<int, TiledArray::Range>;
@@ -540,7 +547,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_three) {
   }
 }
 
-BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_four) {
+BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) {
   std::size_t a = 2;
   std::size_t b = 3;
   std::size_t c = 6;
@@ -609,7 +616,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_four) {
   }
 }
 
-BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_six) {
+BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) {
   std::size_t a = 2;
   std::size_t b = 3;
   std::size_t c = 6;

From ae1f4038475c6d6af755469b7f4d848e8e9cbb9e Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 13 Jul 2022 14:29:14 -0400
Subject: [PATCH 12/30] ta-librett.h -> librett.h

---
 src/CMakeLists.txt                            |   2 +-
 src/TiledArray/cuda/btas_um_tensor.h          | 119 +++++++++++-------
 .../external/{ta-librett.h => librett.h}      |   6 +-
 3 files changed, 79 insertions(+), 48 deletions(-)
 rename src/TiledArray/external/{ta-librett.h => librett.h} (95%)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d6f055df8f..04281d4926 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -197,7 +197,7 @@ if(CUDA_FOUND)
 
   list(APPEND TILEDARRAY_HEADER_FILES
      TiledArray/external/cuda.h
-     TiledArray/external/ta-librett.h
+     TiledArray/external/librett.h
      TiledArray/cuda/cublas.h
      TiledArray/cuda/btas_cublas.h
      TiledArray/cuda/btas_um_tensor.h
diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/cuda/btas_um_tensor.h
index 2ec1fb9a6d..7bddc4a178 100644
--- a/src/TiledArray/cuda/btas_um_tensor.h
+++ b/src/TiledArray/cuda/btas_um_tensor.h
@@ -32,7 +32,7 @@
 
 #include <TiledArray/cuda/btas_cublas.h>
 #include <TiledArray/cuda/um_storage.h>
-#include <TiledArray/external/ta-librett.h>
+#include <TiledArray/external/librett.h>
 #include <TiledArray/tile.h>
 
 namespace TiledArray {
@@ -95,7 +95,8 @@ namespace TiledArray {
 /// gemm
 ///
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> gemm(
     const btasUMTensorVarray<T, Range> &left,
     const btasUMTensorVarray<T, Range> &right, Scalar factor,
@@ -103,7 +104,8 @@ btasUMTensorVarray<T, Range> gemm(
   return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper);
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 void gemm(btasUMTensorVarray<T, Range> &result,
           const btasUMTensorVarray<T, Range> &left,
           const btasUMTensorVarray<T, Range> &right, Scalar factor,
@@ -159,8 +161,8 @@ btasUMTensorVarray<T, Range> shift(const btasUMTensorVarray<T, Range> &arg,
 /// shift to
 ///
 template <typename T, typename Range, typename Index>
-btasUMTensorVarray<T, Range>& shift_to(btasUMTensorVarray<T, Range> &arg,
-                                      const Index &range_shift) {
+btasUMTensorVarray<T, Range> &shift_to(btasUMTensorVarray<T, Range> &arg,
+                                       const Index &range_shift) {
   const_cast<Range &>(arg.range()).inplace_shift(range_shift);
   return arg;
 }
@@ -188,7 +190,7 @@ btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
 
   // invoke the permute function
   librett_permute(const_cast<T *>(device_data(arg.storage())),
-               device_data(result.storage()), arg.range(), perm, stream);
+                  device_data(result.storage()), arg.range(), perm, stream);
 
   synchronize_stream(&stream);
 
@@ -199,24 +201,29 @@ btasUMTensorVarray<T, Range> permute(const btasUMTensorVarray<T, Range> &arg,
 /// scale
 ///
 
-template <typename T, typename Range, typename Scalar, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Range, typename Scalar,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
                                    const Scalar factor) {
   detail::to_cuda(arg);
   return btas_tensor_scale_cuda_impl(arg, factor);
 }
 
-template <typename T, typename Range, typename Scalar, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
-btasUMTensorVarray<T, Range>& scale_to(btasUMTensorVarray<T, Range> &arg, const Scalar factor) {
+template <typename T, typename Range, typename Scalar,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+btasUMTensorVarray<T, Range> &scale_to(btasUMTensorVarray<T, Range> &arg,
+                                       const Scalar factor) {
   detail::to_cuda(arg);
   btas_tensor_scale_to_cuda_impl(arg, factor);
   return arg;
 }
 
-template <typename T, typename Range, typename Scalar, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Range, typename Scalar, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> &&
+                                TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> scale(const btasUMTensorVarray<T, Range> &arg,
-                                   const Scalar factor,
-                                   const Perm &perm) {
+                                   const Scalar factor, const Perm &perm) {
   auto result = scale(arg, factor);
 
   // wait to finish before switch stream
@@ -236,7 +243,9 @@ btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg) {
   return btas_tensor_scale_cuda_impl(arg, T(-1.0));
 }
 
-template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Range, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg,
                                  const Perm &perm) {
   auto result = neg(arg);
@@ -249,7 +258,7 @@ btasUMTensorVarray<T, Range> neg(const btasUMTensorVarray<T, Range> &arg,
 }
 
 template <typename T, typename Range>
-btasUMTensorVarray<T, Range>& neg_to(btasUMTensorVarray<T, Range> &arg) {
+btasUMTensorVarray<T, Range> &neg_to(btasUMTensorVarray<T, Range> &arg) {
   detail::to_cuda(arg);
   btas_tensor_scale_to_cuda_impl(arg, T(-1.0));
   return arg;
@@ -267,7 +276,8 @@ btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
   return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0));
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2,
                                   const Scalar factor) {
@@ -276,7 +286,9 @@ btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
   return result;
 }
 
-template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Range, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2,
                                   const Perm &perm) {
@@ -289,11 +301,13 @@ btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
   return permute(result, perm);
 }
 
-template <typename T, typename Scalar, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Scalar, typename Range, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> &&
+                                TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2,
-                                  const Scalar factor,
-                                  const Perm &perm) {
+                                  const Scalar factor, const Perm &perm) {
   auto result = subt(arg1, arg2, factor);
 
   // wait to finish before switch stream
@@ -308,17 +322,20 @@ btasUMTensorVarray<T, Range> subt(const btasUMTensorVarray<T, Range> &arg1,
 ///
 
 template <typename T, typename Range>
-btasUMTensorVarray<T, Range>& subt_to(btasUMTensorVarray<T, Range> &result,
-             const btasUMTensorVarray<T, Range> &arg1) {
+btasUMTensorVarray<T, Range> &subt_to(
+    btasUMTensorVarray<T, Range> &result,
+    const btasUMTensorVarray<T, Range> &arg1) {
   detail::to_cuda(result);
   detail::to_cuda(arg1);
   btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0));
   return result;
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
-btasUMTensorVarray<T, Range>& subt_to(btasUMTensorVarray<T, Range> &result,
-             const btasUMTensorVarray<T, Range> &arg1, const Scalar factor) {
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+btasUMTensorVarray<T, Range> &subt_to(btasUMTensorVarray<T, Range> &result,
+                                      const btasUMTensorVarray<T, Range> &arg1,
+                                      const Scalar factor) {
   subt_to(result, arg1);
   btas_tensor_scale_to_cuda_impl(result, factor);
   return result;
@@ -336,7 +353,8 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
   return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0));
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Scalar factor) {
@@ -345,11 +363,13 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
   return result;
 }
 
-template <typename T, typename Scalar, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Scalar, typename Range, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> &&
+                                TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
-                                 const Scalar factor,
-                                 const Perm &perm) {
+                                 const Scalar factor, const Perm &perm) {
   auto result = add(arg1, arg2, factor);
 
   // wait to finish before switch stream
@@ -359,7 +379,9 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
   return permute(result, perm);
 }
 
-template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Range, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
                                  const btasUMTensorVarray<T, Range> &arg2,
                                  const Perm &perm) {
@@ -377,17 +399,19 @@ btasUMTensorVarray<T, Range> add(const btasUMTensorVarray<T, Range> &arg1,
 ///
 
 template <typename T, typename Range>
-btasUMTensorVarray<T, Range>& add_to(btasUMTensorVarray<T, Range> &result,
-            const btasUMTensorVarray<T, Range> &arg) {
+btasUMTensorVarray<T, Range> &add_to(btasUMTensorVarray<T, Range> &result,
+                                     const btasUMTensorVarray<T, Range> &arg) {
   detail::to_cuda(result);
   detail::to_cuda(arg);
   btas_tensor_add_to_cuda_impl(result, arg, T(1.0));
   return result;
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
-btasUMTensorVarray<T, Range>& add_to(btasUMTensorVarray<T, Range> &result,
-            const btasUMTensorVarray<T, Range> &arg, const Scalar factor) {
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+btasUMTensorVarray<T, Range> &add_to(btasUMTensorVarray<T, Range> &result,
+                                     const btasUMTensorVarray<T, Range> &arg,
+                                     const Scalar factor) {
   add_to(result, arg);
   btas_tensor_scale_to_cuda_impl(result, factor);
   return result;
@@ -416,7 +440,8 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
   return btas_tensor_mult_cuda_impl(arg1, arg2);
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2,
                                   const Scalar factor) {
@@ -425,7 +450,9 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
   return result;
 }
 
-template <typename T, typename Range, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Range, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2,
                                   const Perm &perm) {
@@ -438,11 +465,13 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
   return permute(result, perm);
 }
 
-template <typename T, typename Range, typename Scalar, typename Perm, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> && TiledArray::detail::is_permutation_v<Perm>>>
+template <
+    typename T, typename Range, typename Scalar, typename Perm,
+    typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar> &&
+                                TiledArray::detail::is_permutation_v<Perm>>>
 btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
                                   const btasUMTensorVarray<T, Range> &arg2,
-                                  const Scalar factor,
-                                  const Perm &perm) {
+                                  const Scalar factor, const Perm &perm) {
   auto result = mult(arg1, arg2, factor);
 
   // wait to finish before switch stream
@@ -456,17 +485,19 @@ btasUMTensorVarray<T, Range> mult(const btasUMTensorVarray<T, Range> &arg1,
 /// mult to
 ///
 template <typename T, typename Range>
-btasUMTensorVarray<T, Range>& mult_to(btasUMTensorVarray<T, Range> &result,
-             const btasUMTensorVarray<T, Range> &arg) {
+btasUMTensorVarray<T, Range> &mult_to(btasUMTensorVarray<T, Range> &result,
+                                      const btasUMTensorVarray<T, Range> &arg) {
   detail::to_cuda(result);
   detail::to_cuda(arg);
   btas_tensor_mult_to_cuda_impl(result, arg);
   return result;
 }
 
-template <typename T, typename Scalar, typename Range, typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
-btasUMTensorVarray<T, Range>& mult_to(btasUMTensorVarray<T, Range> &result,
-             const btasUMTensorVarray<T, Range> &arg, const Scalar factor) {
+template <typename T, typename Scalar, typename Range,
+          typename = std::enable_if_t<TiledArray::detail::is_numeric_v<Scalar>>>
+btasUMTensorVarray<T, Range> &mult_to(btasUMTensorVarray<T, Range> &result,
+                                      const btasUMTensorVarray<T, Range> &arg,
+                                      const Scalar factor) {
   mult_to(result, arg);
   btas_tensor_scale_to_cuda_impl(result, factor);
   return result;
diff --git a/src/TiledArray/external/ta-librett.h b/src/TiledArray/external/librett.h
similarity index 95%
rename from src/TiledArray/external/ta-librett.h
rename to src/TiledArray/external/librett.h
index bc0da4de8a..46d116c45b 100644
--- a/src/TiledArray/external/ta-librett.h
+++ b/src/TiledArray/external/librett.h
@@ -78,7 +78,7 @@ inline void permutation_to_col_major(std::vector<int>& perm) {
  */
 template <typename T>
 void librett_permute(T* inData, T* outData, const TiledArray::Range& range,
-                  const TiledArray::Permutation& perm, cudaStream_t stream) {
+                     const TiledArray::Permutation& perm, cudaStream_t stream) {
   auto extent = range.extent();
   std::vector<int> extent_int(extent.begin(), extent.end());
 
@@ -90,12 +90,12 @@ void librett_permute(T* inData, T* outData, const TiledArray::Range& range,
   TiledArray::extent_to_col_major(extent_int);
   TiledArray::permutation_to_col_major(perm_int);
 
-  //librettResult_t status;
+  // librettResult_t status;
   librettResult status;
 
   librettHandle plan;
   status = librettPlan(&plan, range.rank(), extent_int.data(), perm_int.data(),
-                    sizeof(T), stream);
+                       sizeof(T), stream);
 
   TA_ASSERT(status == LIBRETT_SUCCESS);
 

From 25d456ede822dbd8c5a0f90775b79f7b06960ced Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 13 Jul 2022 14:31:45 -0400
Subject: [PATCH 13/30] minor verbiage fix

---
 INSTALL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/INSTALL.md b/INSTALL.md
index c06535172e..683a684f0b 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -64,7 +64,7 @@ Compiling BTAS requires the following prerequisites:
 
 Optional prerequisites:
 - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing:
-  - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) with our additional thread-safety improvements (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da).
+  - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da).
   - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82).
 - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
 - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:

From c94a08551749ed7db29557d9043053224f823ac7 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 13 Jul 2022 15:23:16 -0400
Subject: [PATCH 14/30] removed all Travis artifacts

---
 .travis.yml                                   | 111 -------------
 INSTALL.md                                    |   2 -
 README.md                                     |   1 -
 bin/build-boost-linux.sh                      |  41 -----
 bin/build-eigen3-linux.sh                     |  42 -----
 bin/build-linux.sh                            | 147 ------------------
 bin/build-madness-linux.sh                    |  85 ----------
 bin/build-mpich-linux.sh                      |  42 -----
 bin/build-scalapack-mpich-linux.sh            |  48 ------
 bin/deploy-linux.sh                           |  62 --------
 bin/docker-cuda.md                            |   2 +-
 bin/docker-travis-build.sh                    |  93 -----------
 bin/docker-travis.md                          |  33 ----
 bin/docker.md                                 |   2 +-
 .../contrib/Travis-CI-Administration-Notes.md |  14 +-
 15 files changed, 5 insertions(+), 720 deletions(-)
 delete mode 100644 .travis.yml
 delete mode 100755 bin/build-boost-linux.sh
 delete mode 100755 bin/build-eigen3-linux.sh
 delete mode 100755 bin/build-linux.sh
 delete mode 100755 bin/build-madness-linux.sh
 delete mode 100755 bin/build-mpich-linux.sh
 delete mode 100755 bin/build-scalapack-mpich-linux.sh
 delete mode 100755 bin/deploy-linux.sh
 delete mode 100755 bin/docker-travis-build.sh
 delete mode 100644 bin/docker-travis.md

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 0bf6535c4a..0000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,111 +0,0 @@
-# See http://about.travis-ci.org/docs/user/build-configuration/
-# To validate this file: http://lint.travis-ci.org/
-
-language: cpp
-dist: focal
-cache: ccache
-cache:
-  directories:
-  - /home/travis/_install
-os: linux
-
-addons:
-  apt:
-    packages: &base_packages
-    - libblas-dev
-    - liblapack-dev
-    - liblapacke-dev
-    - libtbb-dev
-    - lcov
-    - python3
-    - python3-pip
-    - python3-pytest
-    - python3-numpy
-
-env:
-  global:
-    - BUILD_PREFIX=/home/travis/_build
-    - INSTALL_PREFIX=/home/travis/_install
-
-matrix:
-  fast_finish: true
-  include:
-    - compiler: gcc
-      env: GCC_VERSION=7 BUILD_TYPE=Debug MADNESS_OVER_PARSEC=1
-      addons:
-        apt:
-          packages:
-           - *base_packages
-           - g++-7
-           - gfortran-7
-    - compiler: gcc
-      env: GCC_VERSION=7 BUILD_TYPE=Debug
-      addons:
-        apt:
-          packages:
-           - *base_packages
-           - g++-7
-           - gfortran-7
-    - compiler: gcc
-      env: GCC_VERSION=7 BUILD_TYPE=Release
-      addons:
-        apt:
-          packages:
-           - *base_packages
-           - g++-7
-           - gfortran-7
-    - compiler: gcc
-      env: GCC_VERSION=8 BUILD_TYPE=Debug COMPUTE_COVERAGE=1 MADNESS_OVER_PARSEC=1
-      addons:
-        apt:
-          packages:
-            - *base_packages
-            - g++-8
-            - gfortran-8
-    - compiler: gcc
-      env: GCC_VERSION=8 BUILD_TYPE=Release
-      addons:
-       apt:
-          packages:
-            - *base_packages
-            - g++-8
-            - gfortran-8
-    - compiler: gcc
-      env: GCC_VERSION=9 BUILD_TYPE=Debug MADNESS_OVER_PARSEC=1
-      addons:
-        apt:
-          sources:
-            - sourceline: 'ppa:ubuntu-toolchain-r/test'
-          packages:
-            - *base_packages
-            - g++-9
-            - gfortran-9
-
-before_install:
-  - env
-  - mkdir -p ${BUILD_PREFIX} && mkdir -p ${INSTALL_PREFIX}
-# use timeout to stop long-running (i.e. cache-rebuilding) jobs right before they get killed by Travis-CI
-# in case of timeout report success to Travis to force cache upload
-script:
-  - travis_wait 50 timeout 2850 ${TRAVIS_BUILD_DIR}/bin/build-$TRAVIS_OS_NAME.sh;  RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 124 ]; then true; else false; fi;
-after_failure:
-  - cat ${BUILD_PREFIX}/TA/external/madness-build/CMakeFiles/CMakeError.log
-  - cat ${BUILD_PREFIX}/TA/external/madness-build/CMakeFiles/CMakeOutput.log
-  - cat ${BUILD_PREFIX}/TA/CMakeFiles/CMakeError.log
-  - cat ${BUILD_PREFIX}/TA/CMakeFiles/CMakeOutput.log
-# codecov
-after_success:
-  # create report
-  - cd ${TRAVIS_BUILD_DIR}
-  - if [ "$COMPUTE_COVERAGE" = "1" ]; then lcov --gcov-tool gcov-${GCC_VERSION} --directory ${BUILD_PREFIX}/TA --capture --output-file coverage.info; fi; # capture coverage info
-  - if [ "$COMPUTE_COVERAGE" = "1" ]; then lcov --remove coverage.info '/usr/*' '*/madness/*' '*/btas/*' '*/tests/*' --output-file coverage.info; fi; # filter out non-project files
-  - if [ "$COMPUTE_COVERAGE" = "1" ]; then lcov --list coverage.info; fi; #debug info
-  - echo ${TRAVIS_CMD}
-  # upload report to CodeCov
-  - if [ "$COMPUTE_COVERAGE" = "1" ]; then bash <(curl -s https://codecov.io/bash) -t token; fi;
-  # deploy artifacts: currently only dox
-  - if [ "$DEPLOY" = "1" ]; then bash ${TRAVIS_BUILD_DIR}/bin/deploy-$TRAVIS_OS_NAME.sh; fi;
-
-notifications:
-  slack:
-    secure: aSmy6FmiEf+0gcbVpJs0GIrmpI1dF7/WFOXgUkM2wLxw5DBQxE4LW/yt01mvFqAMJLe0LzGujx/V/z98i0kA1S8DEMTqJ+IG2bbdmgb5CAw5LTP5Air1P2SeAyKW/eAAsnGsERaEnHj8nnZEa2dhbAFOPD5QDM7nwWG/xUkIGMU=
diff --git a/INSTALL.md b/INSTALL.md
index 683a684f0b..579f92f28d 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -30,8 +30,6 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
   - [Apple Clang](https://en.wikipedia.org/wiki/Xcode), version 9.3 or higher
   - [Intel C++ compiler](https://software.intel.com/en-us/c-compilers), version 19 or higher
 
-  See the current [Travis CI matrix](.travis.yml) for the most up-to-date list of compilers that are known to work.
-
 - [CMake](https://cmake.org/), version 3.15 or higher; if CUDA support is needed, CMake 3.18 or higher is required.
 - [Git](https://git-scm.com/) 1.8 or later (required to obtain TiledArray and MADNESS source code from GitHub)
 - [Eigen](http://eigen.tuxfamily.org/), version 3.3.5 or higher; if CUDA is enabled then 3.3.7 is required (will be downloaded automatically, if missing)
diff --git a/README.md b/README.md
index 853629526a..8742d1e774 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-[![Travis Build Status](https://travis-ci.com/ValeevGroup/tiledarray.svg?branch=master)](https://travis-ci.com/ValeevGroup/tiledarray)
 [![Gitlab Pipeline Status](https://gitlab.com/ValeevGroup/tiledarray/badges/master/pipeline.svg)](https://gitlab.com/ValeevGroup/tiledarray/-/pipelines?page=1&scope=all&ref=master)
 [![codecov](https://codecov.io/gh/ValeevGroup/tiledarray/branch/master/graph/badge.svg)](https://codecov.io/gh/ValeevGroup/tiledarray)
 
diff --git a/bin/build-boost-linux.sh b/bin/build-boost-linux.sh
deleted file mode 100755
index 7c4fca8bbf..0000000000
--- a/bin/build-boost-linux.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#! /bin/sh
-
-export BOOST_VERSION=1_74_0
-
-# Exit on error
-set -ev
-
-if [ "$CXX" = "g++" ]; then
-    export CXX=/usr/bin/g++-$GCC_VERSION
-    export CXXFLAGS="-mno-avx"
-    export BOOST_TOOLSET=gcc
-else
-    export CXX=/usr/bin/clang++-$CLANG_VERSION
-    export CXXFLAGS="-mno-avx -stdlib=libc++"
-    export BOOST_TOOLSET=clang
-fi
-
-if [ "X$BUILD_TYPE" = "XDebug" ]; then
-    export BOOST_VARIANT="debug"
-else
-    export BOOST_VARIANT="release"
-fi
-
-# download+unpack (but not build!) Boost unless previous install is cached ... must manually wipe cache on version bump or toolchain update
-export INSTALL_DIR=${INSTALL_PREFIX}/boost
-if [ ! -d "${INSTALL_DIR}" ]; then
-    rm -fr boost_${BOOST_VERSION}.tar.bz2
-    wget https://boostorg.jfrog.io/artifactory/main/release/1.74.0/source/boost_${BOOST_VERSION}.tar.bz2
-    tar -xjf boost_${BOOST_VERSION}.tar.bz2
-    cd boost_${BOOST_VERSION}
-    cat > user-config.jam << END
-using ${BOOST_TOOLSET} : : ${CXX} :
-      <cxxflags>"${CXXFLAGS}"
-      <linkflags>"${CXXFLAGS}" ;
-END
-    ./bootstrap.sh --prefix=${INSTALL_DIR} --with-libraries=serialization
-    ./b2 -d0 --user-config=`pwd`/user-config.jam toolset=${BOOST_TOOLSET} link=static variant=${BOOST_VARIANT}
-    ./b2 -d0 install
-else
-    echo "Boost already installed ..."
-fi
diff --git a/bin/build-eigen3-linux.sh b/bin/build-eigen3-linux.sh
deleted file mode 100755
index 5f2133111b..0000000000
--- a/bin/build-eigen3-linux.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#! /bin/sh
-
-# Exit on error
-set -ev
-
-# Install packages
-
-# Environment variables
-if [ "$CXX" = "g++" ]; then
-  export CC=/usr/bin/gcc-$GCC_VERSION
-  export CXX=/usr/bin/g++-$GCC_VERSION
-  export EXTRACXXFLAGS="-mno-avx"
-else
-  export CC=/usr/bin/clang-$CLANG_VERSION
-  export CXX=/usr/bin/clang++-$CLANG_VERSION
-  export EXTRACXXFLAGS="-mno-avx  -stdlib=libc++"
-fi
-
-# Print compiler information
-$CC --version
-$CXX --version
-
-# log the CMake version (need 3+)
-cmake --version
-
-# Install Eigen3 unless previous install is cached ... must manually wipe cache on version bump or toolchain update
-export INSTALL_DIR=${INSTALL_PREFIX}/eigen3
-if [ ! -d "${INSTALL_DIR}" ]; then
-    cd ${BUILD_PREFIX}
-    wget -q https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2
-    tar -xjf eigen-3.3.7.tar.bz2
-    cd eigen-*
-    mkdir build
-    cd build
-    cmake .. -DCMAKE_CXX_COMPILER=$CXX \
-      -DCMAKE_C_COMPILER=$CC \
-      -DCMAKE_CXX_FLAGS="${EXTRACXXFLAGS}" \
-      -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}
-    make install
-else
-    echo "Eigen3 already installed ..."
-fi
diff --git a/bin/build-linux.sh b/bin/build-linux.sh
deleted file mode 100755
index a6c55ed951..0000000000
--- a/bin/build-linux.sh
+++ /dev/null
@@ -1,147 +0,0 @@
-#! /bin/sh
-
-# get the most recent cmake available
-if [ ! -d "${INSTALL_PREFIX}/cmake" ]; then
-  CMAKE_VERSION=3.17.0
-  CMAKE_URL="https://cmake.org/files/v${CMAKE_VERSION%.[0-9]}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
-  mkdir ${INSTALL_PREFIX}/cmake && wget --no-check-certificate -O - ${CMAKE_URL} | tar --strip-components=1 -xz -C ${INSTALL_PREFIX}/cmake
-fi
-export PATH=${INSTALL_PREFIX}/cmake/bin:${PATH}
-cmake --version
-
-export PYTHON_EXECUTABLE=$(which python3)
-export TA_PYTHON=ON
-
-${TRAVIS_BUILD_DIR}/bin/build-mpich-linux.sh
-${TRAVIS_BUILD_DIR}/bin/build-scalapack-mpich-linux.sh
-${TRAVIS_BUILD_DIR}/bin/build-madness-linux.sh
-${TRAVIS_BUILD_DIR}/bin/build-boost-linux.sh
-${TRAVIS_BUILD_DIR}/bin/build-eigen3-linux.sh
-
-# Exit on error
-set -ev
-
-# download latest Doxygen
-if [ "$DEPLOY" = "1" ]; then
-  DOXYGEN_VERSION=1.8.20
-  if [ ! -d ${INSTALL_PREFIX}/doxygen-${DOXYGEN_VERSION} ]; then
-    cd ${BUILD_PREFIX} && wget https://downloads.sourceforge.net/project/doxygen/rel-${DOXYGEN_VERSION}/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
-    cd ${INSTALL_PREFIX} && tar xzf ${BUILD_PREFIX}/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
-  fi
-  export PATH=${INSTALL_PREFIX}/doxygen-${DOXYGEN_VERSION}/bin:$PATH
-  which doxygen
-  doxygen --version
-fi
-
-# Environment variables
-if [ "$CXX" = "g++" ]; then
-    export CC=/usr/bin/gcc-$GCC_VERSION
-    export CXX=/usr/bin/g++-$GCC_VERSION
-    export EXTRACXXFLAGS="-mno-avx"
-    # if linking statically will need fortran libs to detect liblapacke.a in BTAS
-    export F77=gfortran-$GCC_VERSION
-else
-    export CC=/usr/bin/clang-$CLANG_VERSION
-    export CXX=/usr/bin/clang++-$CLANG_VERSION
-    export EXTRACXXFLAGS="-mno-avx -stdlib=libc++"
-    # if linking statically will need fortran libs to detect liblapacke.a in BTAS
-    export F77=gfortran-$GCC_VERSION
-fi
-
-export MPI_HOME=${INSTALL_PREFIX}/mpich
-export MPICC=$MPI_HOME/bin/mpicc
-export MPICXX=$MPI_HOME/bin/mpicxx
-export LD_LIBRARY_PATH=/usr/lib/lapack:/usr/lib/libblas:${INSTALL_PREFIX}/scalapack/lib:$LD_LIBRARY_PATH
-
-# list the prebuilt prereqs
-ls -l ${INSTALL_PREFIX}
-
-# where to install TA (need for testing installed code)
-export INSTALL_DIR=${INSTALL_PREFIX}/TA
-
-# make build dir
-cd ${BUILD_PREFIX}
-mkdir -p TA
-cd TA
-
-# if have old installed copy of TA, make sure that BTAS tag matches the required tag, if not, remove INSTALL_DIR (will cause rebuild of TA)
-if [ -f "${INSTALL_DIR}/include/btas/version.h" ]; then
-  export INSTALLED_BTAS_TAG=`grep 'define BTAS_REVISION' ${INSTALL_DIR}/include/btas/version.h | awk '{print $3}' | sed s/\"//g`
-  echo "installed BTAS revision = ${INSTALLED_BTAS_TAG}"
-  # extract the tracked tag of BTAS
-  export BTAS_TAG=`grep 'set(TA_TRACKED_BTAS_TAG ' ${TRAVIS_BUILD_DIR}/external/versions.cmake | awk '{print $2}' | sed s/\)//g`
-  echo "required BTAS revision = ${BTAS_TAG}"
-  if [ "${BTAS_TAG}" != "${INSTALLED_BTAS_TAG}" ]; then
-    rm -rf "${INSTALL_DIR}"
-  fi
-fi
-
-# MADNESS are build separately if $BUILD_TYPE=Debug, otherwise built as part of TA
-if [ "$BUILD_TYPE" = "Debug" ]; then
-
-  if [ "$COMPUTE_COVERAGE" = "1" ]; then
-    export CODECOVCXXFLAGS="-O0 --coverage"
-  fi
-
-  cmake ${TRAVIS_BUILD_DIR} \
-    -DCMAKE_TOOLCHAIN_FILE=cmake/vg/toolchains/travis.cmake \
-    -DCMAKE_CXX_COMPILER=$CXX \
-    -DCMAKE_C_COMPILER=$CC \
-    -DCMAKE_Fortran_COMPILER=$F77 \
-    -DMPI_CXX_COMPILER=$MPICXX \
-    -DMPI_C_COMPILER=$MPICC \
-    -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
-    -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-    -DCMAKE_CXX_FLAGS="-ftemplate-depth=1024 -Wno-unused-command-line-argument ${EXTRACXXFLAGS} ${CODECOVCXXFLAGS}" \
-    -DCMAKE_PREFIX_PATH="${INSTALL_PREFIX}/madness;${INSTALL_PREFIX}/eigen3;${INSTALL_PREFIX}/boost" \
-    -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
-    -DTA_PYTHON="${TA_PYTHON}" \
-    -DENABLE_SCALAPACK=ON
-
-else
-
-  # if have old installed copy of TA, make sure that MADNESS tag matches the required tag, if not, remove INSTALL_DIR (will cause rebuild of MADNESS)
-  if [ -f "${INSTALL_DIR}/include/madness/config.h" ]; then
-    export INSTALLED_MADNESS_TAG=`grep 'define MADNESS_REVISION' ${INSTALL_DIR}/include/madness/config.h | awk '{print $3}' | sed s/\"//g`
-    echo "installed MADNESS revision = ${INSTALLED_MADNESS_TAG}"
-    # extract the tracked tag of MADNESS
-    export MADNESS_TAG=`grep 'set(TA_TRACKED_MADNESS_TAG ' ${TRAVIS_BUILD_DIR}/external/versions.cmake | awk '{print $2}' | sed s/\)//g`
-    echo "required MADNESS revision = ${MADNESS_TAG}"
-    if [ "${MADNESS_TAG}" != "${INSTALLED_MADNESS_TAG}" ]; then
-      rm -rf "${INSTALL_DIR}"
-    fi
-  fi
-
-  cmake ${TRAVIS_BUILD_DIR} \
-    -DCMAKE_TOOLCHAIN_FILE=cmake/vg/toolchains/travis.cmake \
-    -DCMAKE_CXX_COMPILER=$CXX \
-    -DCMAKE_C_COMPILER=$CC \
-    -DCMAKE_Fortran_COMPILER=$F77 \
-    -DMPI_CXX_COMPILER=$MPICXX \
-    -DMPI_C_COMPILER=$MPICC \
-    -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
-    -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-    -DCMAKE_CXX_FLAGS="-ftemplate-depth=1024 -Wno-unused-command-line-argument ${EXTRACXXFLAGS}" \
-    -DCMAKE_PREFIX_PATH="${INSTALL_PREFIX}/eigen3;${INSTALL_PREFIX}/boost" \
-    -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
-    -DTA_PYTHON="${TA_PYTHON}" \
-    -DTA_ASSERT_POLICY=TA_ASSERT_THROW \
-    -DENABLE_SCALAPACK=ON
-
-fi
-
-# Build all libraries, examples, and applications
-make -j2 all VERBOSE=1
-make install
-# remove install dir to avoid broken artifacts like BTAS polluting the next build via cached copy
-rm -rf $INSTALL_DIR
-
-# Validate
-make -j1 ta_test VERBOSE=1
-export MAD_NUM_THREADS=2
-# to find dep shared libs (do we need this since El is gone?)
-export LD_LIBRARY_PATH=${INSTALL_PREFIX}/TA/lib:${INSTALL_PREFIX}/madness/lib:${LD_LIBRARY_PATH}
-make check-tiledarray
-
-# Build examples
-make -j2 examples VERBOSE=1
diff --git a/bin/build-madness-linux.sh b/bin/build-madness-linux.sh
deleted file mode 100755
index d255bff92d..0000000000
--- a/bin/build-madness-linux.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#! /bin/sh
-
-# Exit on error
-set -ev
-
-# Will build MADNESS stand-alone for Debug builds only
-if [ "$BUILD_TYPE" = "Debug" ]; then
-
-  # Environment variables
-  if [ "$CXX" = "g++" ]; then
-    export CC=/usr/bin/gcc-$GCC_VERSION
-    export CXX=/usr/bin/g++-$GCC_VERSION
-    export EXTRACXXFLAGS="-mno-avx"
-    export F77=gfortran-$GCC_VERSION
-  else
-    export CC=/usr/bin/clang-$CLANG_VERSION
-    export CXX=/usr/bin/clang++-$CLANG_VERSION
-    export EXTRACXXFLAGS="-mno-avx -stdlib=libc++"
-    export F77=gfortran-$GCC_VERSION
-  fi
-
-  export MPI_HOME=${INSTALL_PREFIX}/mpich
-  export MPICC=$MPI_HOME/bin/mpicc
-  export MPICXX=$MPI_HOME/bin/mpicxx
-  export LD_LIBRARY_PATH=/usr/lib/lapack:/usr/lib/libblas:$LD_LIBRARY_PATH
-
-  # list the prebuilt prereqs
-  ls -l ${INSTALL_PREFIX}
-
-  # where to install MADNESS (need for testing installed code)
-  export INSTALL_DIR=${INSTALL_PREFIX}/madness
-
-  # extract the tracked tag of MADNESS
-  export MADNESS_TAG=`grep 'set(TA_TRACKED_MADNESS_TAG ' ${TRAVIS_BUILD_DIR}/external/versions.cmake | awk '{print $2}' | sed s/\)//g`
-  echo "required MADNESS revision = ${MADNESS_TAG}"
-
-  # make sure installed MADNESS tag matches the required tag, if not, remove INSTALL_DIR (will cause reinstall)
-  if [ -f "${INSTALL_DIR}/include/madness/config.h" ]; then
-    export INSTALLED_MADNESS_TAG=`grep 'define MADNESS_REVISION' ${INSTALL_DIR}/include/madness/config.h | awk '{print $3}' | sed s/\"//g`
-    echo "installed MADNESS revision = ${INSTALLED_MADNESS_TAG}"
-    if [ "${MADNESS_TAG}" != "${INSTALLED_MADNESS_TAG}" ]; then
-      rm -rf "${INSTALL_DIR}"
-    fi
-  fi
-
-  if [ ! -d "${INSTALL_DIR}" ]; then
-
-    # make build dir
-    cd ${BUILD_PREFIX}
-    mkdir -p madness
-    cd madness
-
-    if [ -n "${MADNESS_OVER_PARSEC}" ]; then
-	MADNESS_BACKEND_OPTION="-DMADNESS_TASK_BACKEND=PaRSEC"
-    fi
-
-    # check out the tracked tag of MADNESS
-    git clone https://github.com/TESSEorg/madness.git madness_src && cd madness_src && git checkout ${MADNESS_TAG} && cd ..
-
-    cmake madness_src \
-      -DCMAKE_TOOLCHAIN_FILE="${TRAVIS_BUILD_DIR}/cmake/toolchains/travis.cmake" \
-      -DCMAKE_CXX_COMPILER=$CXX \
-      -DCMAKE_C_COMPILER=$CC \
-      -DMPI_CXX_COMPILER=$MPICXX \
-      -DMPI_C_COMPILER=$MPICC \
-      -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
-      -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-      -DCMAKE_CXX_FLAGS="-ftemplate-depth=1024 -Wno-unused-command-line-argument ${EXTRACXXFLAGS}" \
-      -DMADNESS_BUILD_MADWORLD_ONLY=ON \
-      -DENABLE_MPI=ON \
-      -DMPI_THREAD=multiple \
-      -DENABLE_TBB=OFF \
-      -DTBB_ROOT_DIR=/usr \
-      -DFORTRAN_INTEGER_SIZE=4 \
-      -DENABLE_LIBXC=OFF \
-      -DENABLE_GPERFTOOLS=OFF \
-      -DASSERTION_TYPE=throw \
-      -DDISABLE_WORLD_GET_DEFAULT=ON \
-      ${MADNESS_BACKEND_OPTION}
-
-    # Build+install MADworld interface
-    make -j2 install VERBOSE=1
-  fi
-
-fi
diff --git a/bin/build-mpich-linux.sh b/bin/build-mpich-linux.sh
deleted file mode 100755
index 7e38ef3167..0000000000
--- a/bin/build-mpich-linux.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#! /bin/sh
-
-# Exit on error
-set -ev
-
-# Install packages
-
-# always use gcc to compile MPICH, there are unexplained issues with clang (e.g. MPI_Barrier aborts)
-export CC=/usr/bin/gcc-$GCC_VERSION
-export CXX=/usr/bin/g++-$GCC_VERSION
-export FC=/usr/bin/gfortran-$GCC_VERSION
-
-# Print compiler information
-$CC --version
-$CXX --version
-$FC --version
-
-# log the CMake version (need 3+)
-cmake --version
-
-# Install MPICH unless previous install is cached ... must manually wipe cache on version bump or toolchain update
-export INSTALL_DIR=${INSTALL_PREFIX}/mpich
-if [ ! -d "${INSTALL_DIR}" ]; then
-    cd ${BUILD_PREFIX}
-    export MPICH_VERSION=3.3
-    wget --no-check-certificate -q http://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz
-    tar -xzf mpich-${MPICH_VERSION}.tar.gz
-    cd mpich-${MPICH_VERSION}
-    ./configure FC=$FC CC=$CC CXX=$CXX --prefix=${INSTALL_DIR}
-    make -j2
-    make install
-    ${INSTALL_DIR}/bin/mpichversion
-    ${INSTALL_DIR}/bin/mpicc -show
-    ${INSTALL_DIR}/bin/mpicxx -show
-    ${INSTALL_DIR}/bin/mpifort -show
-else
-    echo "MPICH installed..."
-    find ${INSTALL_DIR} -name mpiexec
-    find ${INSTALL_DIR} -name mpicc
-    find ${INSTALL_DIR} -name mpicxx
-    find ${INSTALL_DIR} -name mpifort
-fi
diff --git a/bin/build-scalapack-mpich-linux.sh b/bin/build-scalapack-mpich-linux.sh
deleted file mode 100755
index 213d7bc5a7..0000000000
--- a/bin/build-scalapack-mpich-linux.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#! /bin/sh
-
-# Exit on error
-set -ev
-
-# always use gcc, just like mpich ... ?
-export CC=/usr/bin/gcc-$GCC_VERSION
-export CXX=/usr/bin/g++-$GCC_VERSION
-export FC=/usr/bin/gfortran-$GCC_VERSION
-
-# Print compiler information
-$CC --version
-$CXX --version
-$FC --version
-
-# log the CMake version (need 3+)
-cmake --version
-
-# Install MPICH unless previous install is cached ... must manually wipe cache on version bump or toolchain update
-export INSTALL_DIR=${INSTALL_PREFIX}/scalapack
-if [ ! -d "${INSTALL_DIR}" ]; then
-
-    # Make sure MPI is built
-    ${INSTALL_PREFIX}/mpich/bin/mpichversion
-    ${INSTALL_PREFIX}/mpich/bin/mpicc -show
-    ${INSTALL_PREFIX}/mpich/bin/mpicxx -show
-    ${INSTALL_PREFIX}/mpich/bin/mpif90 -show
-
-    cd ${BUILD_PREFIX}
-    git clone https://github.com/Reference-ScaLAPACK/scalapack.git
-    cd scalapack
-    git checkout 0efeeb6d2ec9faf0f2fd6108de5eda60773cdcf9 # checked revision
-    cmake -H. -Bbuild_scalapack \
-      -DCMAKE_C_COMPILER=$CC \
-      -DCMAKE_Fortran_COMPILER=$FC \
-      -DMPI_C_COMPILER=${INSTALL_PREFIX}/mpich/bin/mpicc \
-      -DMPI_Fortran_COMPILER=${INSTALL_PREFIX}/mpich/bin/mpif90 \
-      -DCMAKE_TOOLCHAIN_FILE="${TRAVIS_BUILD_DIR}/cmake/toolchains/travis.cmake" \
-      -DCMAKE_PREFIX_PATH=${INSTALL_DIR} \
-      -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}
-
-    cmake --build build_scalapack -j2
-    cmake --build build_scalapack --target install
-    find ${INSTALL_DIR} -name libscalapack.so
-else
-    echo "ScaLAPACK installed..."
-    find ${INSTALL_DIR} -name libscalapack.so
-fi
diff --git a/bin/deploy-linux.sh b/bin/deploy-linux.sh
deleted file mode 100755
index 279a8f69e8..0000000000
--- a/bin/deploy-linux.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#! /bin/sh
-
-# Exit on error
-set -ev
-
-git config --global user.email "travis@travis-ci.org"
-git config --global user.name "Travis CI"
-
-# only non-cron job deploys
-RUN=1
-if [ "$TRAVIS_EVENT_TYPE" = "cron" ] || [ "$TRAVIS_BRANCH" != "master" ]; then
-  RUN=0
-fi
-if [ "$RUN" = "0" ]; then
-  echo "Deployment skipped"
-  exit 0
-fi
-
-# deploy from the build area
-cd ${BUILD_PREFIX}/TA
-
-### deploy docs
-# see https://gist.github.com/willprice/e07efd73fb7f13f917ea
-
-# build docs
-export VERBOSE=1
-cmake --build . --target html
-if [ ! -f "${BUILD_PREFIX}/TA/doc/dox/html/index.html" ]; then
-  echo "Target html built successfully but did not produce index.html"
-  exit 1
-fi
-
-# check out current docs + template
-git clone --depth=1 https://github.com/ValeevGroup/tiledarray.git --branch gh-pages --single-branch tiledarray-docs-current
-git clone --depth=1 https://github.com/ValeevGroup/tiledarray.git --branch gh-pages-template --single-branch tiledarray-docs-template
-mkdir tiledarray-docs
-cp -rp tiledarray-docs-current/* tiledarray-docs
-rm -rf tiledarray-docs-current
-cp -rp tiledarray-docs-template/* tiledarray-docs
-rm -rf tiledarray-docs-template
-cd tiledarray-docs
-# copy TA's README.md into index.md
-cp ${TRAVIS_BUILD_DIR}/README.md index.md
-# update dox
-if [ -d dox-master ]; then
-  rm -rf dox-master
-fi
-mv ${BUILD_PREFIX}/TA/doc/dox/html dox-master
-# Jekyll does not allow files with "special" names, e.g. whose names start with underscore
-# must "include" such files explicitly
-# re: how file names must be formatted: see https://github.com/jekyll/jekyll/issues/1352
-echo "include:" >> _config.yml
-find dox-master -name "_*" | sed "s/dox-master\//  \- /g" >> _config.yml
-# make empty repo to ensure gh-pages contains no history
-git init
-git add *
-git commit -a -q -m "rebuilt TA master docs via Travis build: $TRAVIS_BUILD_NUMBER"
-git checkout -b gh-pages
-git remote add origin https://${GH_TILEDARRAY_TOKEN}@github.com/ValeevGroup/tiledarray.git > /dev/null 2>&1
-git push origin +gh-pages --force
-cd ..
-rm -rf tiledarray-docs
diff --git a/bin/docker-cuda.md b/bin/docker-cuda.md
index a525369070..0f39c0ac20 100644
--- a/bin/docker-cuda.md
+++ b/bin/docker-cuda.md
@@ -1,5 +1,5 @@
 # Intro
-These notes describe how to build TiledArray with CUDA support enabled within the latest nvidia/cuda Docker image (https://hub.docker.com/r/nvidia/cuda/). This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article). If you want to use Docker to run/debug Travis-CI jobs, see [docker-travis.md](docker-travis.md)
+These notes describe how to build TiledArray with CUDA support enabled within the latest nvidia/cuda Docker image (https://hub.docker.com/r/nvidia/cuda/). This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article).
 
 # Using
 These notes assume that Docker 19.03 and NVIDIA Container Toolkit (https://github.com/NVIDIA/nvidia-docker) are installed on your machine and that you start at the top of the TiledArray source tree.
diff --git a/bin/docker-travis-build.sh b/bin/docker-travis-build.sh
deleted file mode 100755
index 4209bad9ef..0000000000
--- a/bin/docker-travis-build.sh
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/bin/bash
-
-# this script builds a 'Bionic' env docker image used by Travis-CI for TiledArray project
-#
-# to run bash in the image: docker run -it tiledarray-travis-debug bash -l
-# see docker-travis.md for further instructions
-# N.B. relevant locations:
-#   - source dir: /home/travis/build/ValeevGroup/tiledarray (TRAVIS_BUILD_DIR env in Travis jobs)
-#   - build dir: /home/travis/_build
-#   - install dir: /home/travis/_install
-
-# this is where in the container file system Travis-CI "starts"
-export TRAVIS_BUILD_TOPDIR=/home/travis/build
-export DIRNAME=`dirname $0`
-export ABSDIRNAME=`pwd $DIRNAME`
-
-##############################################################
-# make a script to download all prereqs and clone TiledArray repo
-setup=setup.sh
-cat > $setup << END
-#!/bin/sh
-curl -sSL "http://apt.llvm.org/llvm-snapshot.gpg.key" | apt-key add -
-echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" | tee -a /etc/apt/sources.list > /dev/null
-apt-add-repository -y "ppa:ubuntu-toolchain-r/test"
-apt-get -yq update >> ~/apt-get-update.log
-apt-get -yq --no-install-suggests --no-install-recommends --force-yes install g++-7 g++-8 g++-9 gfortran-7 gfortran-8 gfortran-9 libblas-dev liblapack-dev liblapacke-dev libtbb-dev clang-8 clang-9 cmake cmake-data libclang1-9 graphviz fonts-liberation \
-python3 python3-pip python3-pytest python3-numpy
-mkdir -p ${TRAVIS_BUILD_TOPDIR}
-cd ${TRAVIS_BUILD_TOPDIR}
-git clone https://github.com/ValeevGroup/tiledarray.git ${TRAVIS_BUILD_TOPDIR}/ValeevGroup/tiledarray
-END
-chmod +x $setup
-
-##############################################################
-# make a script to build all extra prereqs once in the container
-build=build.sh
-cat > $build << END
-#!/bin/sh
-cd /home/travis/_build
-export BUILD_PREFIX=/home/travis/_build
-export INSTALL_PREFIX=/home/travis/_install
-export TRAVIS_BUILD_DIR=${TRAVIS_BUILD_TOPDIR}/ValeevGroup/tiledarray
-export TRAVIS_EVENT_TYPE=cron
-export TRAVIS_OS_NAME=linux
-\${TRAVIS_BUILD_DIR}/bin/build-\$TRAVIS_OS_NAME.sh
-END
-chmod +x $build
-
-##############################################################
-# make Dockerfile
-cat > Dockerfile << END
-# Travis default 'Focal' image
-FROM travisci/ci-ubuntu-2004:packer-1609444725-e5de6974
-
-# Use baseimage-docker's init system.
-CMD ["/sbin/my_init"]
-
-# create source, build, and install dirs
-RUN mkdir -p /home/travis/_build
-RUN mkdir -p /home/travis/_install
-
-# install prereqs
-ADD $setup /home/travis/_build/$setup
-RUN /home/travis/_build/$setup
-
-# Clean up APT when done.
-RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# copy travis scripts
-ADD $build /home/travis/_build/$build
-
-# for further info ...
-RUN echo "\e[92mDone! For info on how to use the image refer to $ABSDIRNAME/docker-travis.md\e[0m"
-
-END
-
-function clean_up {
-  rm -f $setup $build Dockerfile
-  exit
-}
-
-trap clean_up SIGHUP SIGINT SIGTERM
-
-##############################################################
-# build a dev image
-docker build -t tiledarray-travis-debug .
-
-##############################################################
-# extra admin tasks, uncomment as needed
-
-##############################################################
-# done
-clean_up
diff --git a/bin/docker-travis.md b/bin/docker-travis.md
deleted file mode 100644
index 65e43632df..0000000000
--- a/bin/docker-travis.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Intro
-These notes describe how to build TiledArray within the latest Travis-CI Docker image. This is useful for debugging Travis-CI jobs on your local machine.
-# Using
-These notes assume that Docker is installed on your machine and that you start at the top of the TiledArray source tree.
-
-## Create/build Docker Travis image
-1. Create a Travis-CI docker image: `cd bin; ./docker-travis-build.sh`
-2. Run a container using the newly created image: `docker run -it tiledarray-travis-debug bash -l`
-3. `cd /home/travis/_build`
-4. Configure the job to use the appropriate compiler, compiler version, and debug/release build type:
-  * `export BUILD_TYPE=B`, where `B` is `Debug` or `Release`.
-  * If want to use GNU C++ compiler (gcc):
-    * `export GCC_VERSION=VVV` where `VVV` should be the GCC version to be used. The currently valid values are `7`, `8` and `9`.
-    * `export CXX=g++`
-  * If want to use Clang C++ compiler (clang++):
-    * `export GCC_VERSION=8`
-    * `export CLANG_VERSION=VVV` where `VVV` should be the Clang version to be used. The currently valid values is `11`.
-    * `export CXX=clang++`
-    * `apt-get update && apt-get install libc++-${CLANG_VERSION}-dev libc++abi-${CLANG_VERSION}-dev`
-5. Build prerequisites (MPICH, MADNESS, ScaLAPACK), TiledArray, and run tests: `./build.sh`
-
-## Notes
-* According to [Travis-CI docs](https://docs.travis-ci.com/user/reference/overview/) you want to configure your Docker to run containers with 2 cores and 7.5 GB of RAM to best match the production environment.
-* If you plan to use this container multiple times it might make sense to take a snapshot at this point to avoid having to recompile the prerequisites each and every time. Store it as a separate image, e.g. `docker commit container_id tiledarray-travis-debug:clang-debug`, where `container_id` can be found in the output of `docker ps`. Next time to start debugging you will need to pull updates to the TiledArray source (do `cd /home/travis/build/ValeevGroup/tiledarray && git pull`), then execute step 2 with the new image name, execute step 3, and go directly to step 6.
-* To install `gdb` execute `apt-get update && apt-get install gdb`. Also, it appears that to be able to attach `gdb` or any other debugger to a running process you must run the Docker container in privileged mode as `docker run --privileged -it tiledarray-travis-debug:clang-debug bash -l`.
-* To debug parallel jobs you want to launch jobs in a gdb in an xterm. To run xterm you need to ssh into the container. To start an ssh server in the container do this:
-  * Connect sshd's port of the container (22) to an unprivileged port (say, 2222) of the host: `docker run -p 127.0.0.1:2222:22 --privileged -it tiledarray-travis-debug:clang-debug bash -l`
-  * Generate host keys: `ssh-keygen -A`
-  * Create a root password: `passwd` and follow prompts. No need to be fancy: security is not a concern here, but `passwd` will not accept an empty password. N.B. This is easier than setting up a pubkey login, so don't bother with that.
-  * Edit `/etc/ssh/sshd_config` and allow root to log in by ensuring that `PermitRootLogin` and `PasswordAuthentication` are set to `yes`.
-  * Start ssh server: `/etc/init.d/ssh start`
-  * (optional) To launch gdb in xterm windows: `apt-get update && apt-get install xterm`
-  * You should be able to log in from an xterm on the host side: `ssh -Y -p 2222 root@localhost`
diff --git a/bin/docker.md b/bin/docker.md
index fb558db6db..1826c95ef2 100644
--- a/bin/docker.md
+++ b/bin/docker.md
@@ -1,5 +1,5 @@
 # Intro
-These notes describe how to build TiledArray within the latest phusion (https://github.com/phusion/baseimage-docker) Docker image. This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article). If you want to use Docker to run/debug Travis-CI jobs, see [docker-travis.md](docker-travis.md)
+These notes describe how to build TiledArray within the latest phusion (https://github.com/phusion/baseimage-docker) Docker image. This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article).
 
 # Using
 These notes assume that Docker is installed on your machine and that you start at the top of the TiledArray source tree.
diff --git a/doc/dox/contrib/Travis-CI-Administration-Notes.md b/doc/dox/contrib/Travis-CI-Administration-Notes.md
index 0b626507cd..0284ebf0b9 100644
--- a/doc/dox/contrib/Travis-CI-Administration-Notes.md
+++ b/doc/dox/contrib/Travis-CI-Administration-Notes.md
@@ -1,13 +1,5 @@
-# Managing Travis Builds {#Travis-CI-Administration-Notes}
+# Managing CI Builds {#CI-Administration-Notes}
 
 ## Basic Facts
-* Travis CI configuration is in file `.travis.yml`, and build scripts are in `bin/build-*linux.sh`. Only Linux builds are currently supported.
-* `BUILD_TYPE=Debug` jobs build and install MADNESS separately, before building TiledArray' `BUILD_TYPE=Release` jobs build MADNESS as a step of the TiledArray build.
-* MPICH and (`BUILD_TYPE=Debug` only) MADNESS installation directories are _cached_. **Build scripts only verify the presence of installed directories, and do not update them if their configuration (e.g. static vs. shared, or code version) has changed. _Thus it is admin's responsibility to manually wipe out the cache on a per-branch basis_.** It is the easiest to do via the Travis-CI web interface (click on 'More Options' menu at the top right, select 'Caches', etc.).
-* Rebuilding cache of prerequisites may take more time than the job limit (50 mins at the moment), so rebuilding cache can take several attempts. Since Travis-CI does not support forced cache updates (see e.g. https://github.com/travis-ci/travis-ci/issues/6410) if the job looks like it's going to time out we report success to Travis just so that it will store cache. __Thus jobs that timed out will be falsely reported as successful (rather than errored)!__ When rebuilding cache it may be necessary to manually restart some build jobs to make sure that cache rebuild is complete (or, just to be sure, restart the whole __build__ one time just to be sure all caches have been rebuilt). Again: this is only relevant when rebuilding caches (i.e. <5% of the time), otherwise there should be no need to restart jobs manually.
-
-# Debugging Travis-CI jobs
-
-## Local debugging
-
-Follow the instructions contained in [docker-travis.md](https://github.com/ValeevGroup/tiledarray/blob/master/bin/docker-travis.md) .
+* TiledArray only uses GitLab CI at this point
+* CI configuration is in file `.gitlab-ci.yml`, and build metadata is in `ci/`. Only Linux builds are currently supported.

From 754a88069195236cd6b425a144dc918fd2b3d1c8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 13 Jul 2022 15:25:22 -0400
Subject: [PATCH 15/30] bump BTAS tag to pull in
 https://github.com/ValeevGroup/BTAS/pull/136

---
 INSTALL.md              | 2 +-
 external/versions.cmake | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 579f92f28d..f999a98747 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -38,7 +38,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
   - Boost.Container: header-only
   - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing*
   - Boost.Range: header-only, *only used for unit testing*
-- [BTAS](http://github.com/ValeevGroup/BTAS), tag 242871710dabd5ef337e5253000d3e38c1d977ba . If usable BTAS installation is not found, TiledArray will download and compile
+- [BTAS](http://github.com/ValeevGroup/BTAS), tag da2cb0ea3f10b0a88b1532e708c7358ca92bde6a . If usable BTAS installation is not found, TiledArray will download and compile
   BTAS from source. *This is the recommended way to compile BTAS for all users*.
 - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 997e8b458c4234fb6c8c2781a5df59cb14b7e700 .
   Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray.
diff --git a/external/versions.cmake b/external/versions.cmake
index c1120147d9..8419cba40f 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG fae8081179b9d074968b08e064a32e3ca07ab0f1)
 set(TA_TRACKED_MADNESS_VERSION 0.10.1)
 set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1)
 
-set(TA_TRACKED_BTAS_TAG 242871710dabd5ef337e5253000d3e38c1d977ba)
-set(TA_TRACKED_BTAS_PREVIOUS_TAG db884b020b5c13c312c07df9d5c03cea2d65afb2)
+set(TA_TRACKED_BTAS_TAG da2cb0ea3f10b0a88b1532e708c7358ca92bde6a)
+set(TA_TRACKED_BTAS_PREVIOUS_TAG 242871710dabd5ef337e5253000d3e38c1d977ba)
 
 set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da)
 set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796)

From 8d9b49a82be9a11ef497b6d27babd467d1ba4b0b Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 30 Sep 2022 16:05:30 -0400
Subject: [PATCH 16/30] introduced TA_SCOPED_INITIALIZE and details (scoped
 finalizer, etc.)

---
 examples/cuda/ta_cc_abcd_cuda.cpp |  5 +----
 examples/cuda/ta_dense_cuda.cpp   |  4 +---
 examples/cuda/ta_reduce_cuda.cpp  |  4 +---
 examples/cuda/ta_vector_cuda.cpp  |  4 +---
 src/TiledArray/initialize.h       | 24 ++++++++++++++++++++++++
 src/TiledArray/tiledarray.cpp     | 12 ++++++++++++
 6 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/cuda/ta_cc_abcd_cuda.cpp
index c67895f7dc..6a2ef26e5f 100644
--- a/examples/cuda/ta_cc_abcd_cuda.cpp
+++ b/examples/cuda/ta_cc_abcd_cuda.cpp
@@ -60,7 +60,7 @@ int main(int argc, char** argv) {
 
   try {
     // Initialize runtime
-    TA::World& world = TA::initialize(argc, argv);
+    TA::World& world = TA_SCOPED_INITIALIZE(argc, argv);
 
     // Get command line arguments
     if (argc < 5) {
@@ -136,9 +136,6 @@ int main(int argc, char** argv) {
     } else {
       cc_abcd<float>(world, trange_occ, trange_uocc, repeat);
     }
-
-    TA::finalize();
-
   } catch (TA::Exception& e) {
     std::cerr << "!! TiledArray exception: " << e.what() << "\n";
     rc = 1;
diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp
index 51ebc67b11..14f692329b 100644
--- a/examples/cuda/ta_dense_cuda.cpp
+++ b/examples/cuda/ta_dense_cuda.cpp
@@ -300,7 +300,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
 
 int try_main(int argc, char **argv) {
   // Initialize runtime
-  TiledArray::World &world = TiledArray::initialize(argc, argv);
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
 
   // Get command line arguments
   if (argc < 6) {
@@ -453,8 +453,6 @@ int try_main(int argc, char **argv) {
     throw std::runtime_error("Invalid storage type!\n");
   }
 
-  TiledArray::finalize();
-
   return 0;
 }
 
diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/cuda/ta_reduce_cuda.cpp
index 417fa2d72f..c275863519 100644
--- a/examples/cuda/ta_reduce_cuda.cpp
+++ b/examples/cuda/ta_reduce_cuda.cpp
@@ -239,7 +239,7 @@ using cudaTile = TiledArray::Tile<TiledArray::btasUMTensorVarray<T>>;
 
 int try_main(int argc, char **argv) {
   // Initialize runtime
-  TiledArray::World &world = TiledArray::initialize(argc, argv);
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
 
   // Get command line arguments
   if (argc < 4) {
@@ -365,8 +365,6 @@ int try_main(int argc, char **argv) {
     do_main_body<TiledArray::Tensor<float>>(world, Nm, Bm, Nn, Bn, nrepeat);
   }
 
-  TiledArray::finalize();
-
   return 0;
 }
 
diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/cuda/ta_vector_cuda.cpp
index f3c6265eb1..f5d2772ced 100644
--- a/examples/cuda/ta_vector_cuda.cpp
+++ b/examples/cuda/ta_vector_cuda.cpp
@@ -258,7 +258,7 @@ using cudaTile = TiledArray::Tile<TiledArray::btasUMTensorVarray<T>>;
 
 int try_main(int argc, char **argv) {
   // Initialize runtime
-  TiledArray::World &world = TiledArray::initialize(argc, argv);
+  auto &world = TA_SCOPED_INITIALIZE(argc, argv);
 
   // Get command line arguments
   if (argc < 4) {
@@ -384,8 +384,6 @@ int try_main(int argc, char **argv) {
     do_main_body<TiledArray::Tensor<float>>(world, Nm, Bm, Nn, Bn, nrepeat);
   }
 
-  TiledArray::finalize();
-
   return 0;
 }
 
diff --git a/src/TiledArray/initialize.h b/src/TiledArray/initialize.h
index c86fa1d151..324f772ccf 100644
--- a/src/TiledArray/initialize.h
+++ b/src/TiledArray/initialize.h
@@ -60,10 +60,34 @@ inline World& initialize(int& argc, char**& argv, const MPI_Comm& comm,
 
 /// @}
 
+#ifndef TA_SCOPED_INITIALIZE
+/// initializes TA; the local `finalizer` it declares finalizes TA at scope exit
+#define TA_SCOPED_INITIALIZE(...)      \
+  TiledArray::initialize(__VA_ARGS__); \
+  auto finalizer = TiledArray::scoped_finalizer();
+#endif
+
 /// Finalizes TiledArray (and MADWorld runtime, if it had not been initialized
 /// when TiledArray::initialize was called).
 void finalize();
 
+namespace detail {
+struct Finalizer {  // scope guard returned by TiledArray::scoped_finalizer()
+  ~Finalizer() noexcept;  // defined out-of-line in tiledarray.cpp
+};
+}  // namespace detail
+
+/// creates an object whose destruction upon leaving this scope will cause
+/// TiledArray::finalize to be called
+detail::Finalizer scoped_finalizer();
+
+#ifndef TA_FINALIZE_AFTER_LEAVING_THIS_SCOPE
+/// calling this will cause TiledArray::finalize() to be called (if needed)
+/// upon leaving this scope
+#define TA_FINALIZE_AFTER_LEAVING_THIS_SCOPE() \
+  auto finalizer = TiledArray::scoped_finalizer();
+#endif
+
 void taskq_wait_busy();
 void taskq_wait_yield();
 void taskq_wait_usleep(int);
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp
index 3840fe750a..b4700ddec9 100644
--- a/src/TiledArray/tiledarray.cpp
+++ b/src/TiledArray/tiledarray.cpp
@@ -39,6 +39,8 @@ inline void cuda_finalize() {
   librettFinalize();
   cublasDestroy(cuBLASHandlePool::handle());
   delete &cuBLASHandlePool::handle();
+  // although TA::cudaEnv is a singleton, must explicitly delete it so
+  // that CUDA runtime is not finalized before the cudaEnv dtor is called
   cudaEnv::instance().reset(nullptr);
 }
 #endif
@@ -173,6 +175,16 @@ void TiledArray::finalize() {
   finalized_accessor() = true;
 }
 
+TiledArray::detail::Finalizer::~Finalizer() noexcept {
+  static std::mutex mtx;       // one mutex shared by every Finalizer object
+  std::scoped_lock lock(mtx);  // serializes concurrently-running Finalizer dtors
+  if (TiledArray::initialized()) {  // no-op unless TA reports initialized
+    TiledArray::finalize();  // NOTE(review): a throw here terminates (noexcept)
+  }
+}
+
+TiledArray::detail::Finalizer TiledArray::scoped_finalizer() { return {}; }
+
 void TiledArray::ta_abort() { SafeMPI::COMM_WORLD.Abort(); }
 
 void TiledArray::ta_abort(const std::string& m) {

From 2196a11cfe455a3a5478fb7949f3a58024d8ff37 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 30 Sep 2022 16:06:28 -0400
Subject: [PATCH 17/30] fixed ta_vector_cuda for asymmetric matrices

---
 examples/cuda/ta_vector_cuda.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/cuda/ta_vector_cuda.cpp
index f5d2772ced..1593a68e8b 100644
--- a/examples/cuda/ta_vector_cuda.cpp
+++ b/examples/cuda/ta_vector_cuda.cpp
@@ -62,8 +62,9 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
   blocking.push_back(
       TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end()));
 
-  TiledArray::TiledRange  // TRange
-      trange(blocking.begin(), blocking.end());
+  TiledArray::TiledRange trange(blocking.begin(), blocking.end());
+  TiledArray::TiledRange trange_tr(blocking.rbegin(),
+                                   blocking.rend());  // transposed trange
 
   using value_type = typename Tile::value_type;
   using TArray = TA::DistArray<Tile, TA::DensePolicy>;
@@ -150,7 +151,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     }
 
     TArray a(world, trange);
-    TArray b(world, trange);
+    TArray b(world, trange_tr);
 
     a.fill(val_a);
     b.fill(val_b);
@@ -222,7 +223,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     }
 
     TArray a(world, trange);
-    TArray b(world, trange);
+    TArray b(world, trange_tr);
 
     a.fill(val_a);
     b.fill(val_b);

From c536b1b5a13feb46fb0c5c2dc07c18ff37705e15 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 30 Sep 2022 16:14:51 -0400
Subject: [PATCH 18/30] use single CUDA stream for unit tests for now

---
 tests/CMakeLists.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9ac9250463..88ea115334 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -154,9 +154,11 @@ if(ENABLE_MPI)
         $<TARGET_FILE:${executable}> --log_level=unit_scope ${${executable}_np_${p}_args}
         ${MPIEXEC_POSTFLAGS}
       )
+    # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now
     set_tests_properties(tiledarray/unit/run-np-${p}
             PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC
-            ENVIRONMENT MAD_NUM_THREADS=2)
+            ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1"
+            )
 
     if (p GREATER 1)
       set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES ENVIRONMENT TA_UT_DISTRIBUTED=1)
@@ -165,7 +167,9 @@ if(ENABLE_MPI)
 else()
   add_test(NAME tiledarray/unit/run-np-1
            COMMAND ${executable})
+  # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now
   set_tests_properties(tiledarray/unit/run-np-1
           PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC
-          ENVIRONMENT MAD_NUM_THREADS=2)
+          ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1"
+          )
 endif()

From 15f54ae42cb54e3350f48d23ac46284b175bac9f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sat, 1 Oct 2022 10:45:59 -0400
Subject: [PATCH 19/30] [ci] attempt to resolve
 https://gitlab.com/ValeevGroup/tiledarray/-/jobs/3112494437#L1284

---
 ci/.build-project | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ci/.build-project b/ci/.build-project
index a9c9f7582a..44b208242d 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -80,9 +80,14 @@ if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then
 fi
 if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
   cmd "make -C /home/ValeevGroup install/cuda"
+  cmd "rm -fr /usr/local/bin/nvcc"
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
   cmd "${CUDACXX} -V"
+  # this will be moved to image builder
+  cmd "sudo apt-get -yq update"
+  cmd "sudo apt-get -yq install nvidia-utils-510"
   cmd "find / -name \"*nvidia-smi\""
+  cmd "nvidia-smi"
 fi
 section_end preparing_system_section
 

From 8f2a363da06b51c10c516968fb76de88e29a29b0 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 7 Oct 2022 15:43:32 -0400
Subject: [PATCH 20/30] fixed ta_reduce_cuda for asymmetric matrices, resolves
 #366

---
 examples/cuda/ta_reduce_cuda.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/cuda/ta_reduce_cuda.cpp
index c275863519..e453069892 100644
--- a/examples/cuda/ta_reduce_cuda.cpp
+++ b/examples/cuda/ta_reduce_cuda.cpp
@@ -62,6 +62,8 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
 
   TiledArray::TiledRange  // TRange
       trange(blocking.begin(), blocking.end());
+  TiledArray::TiledRange trange_tr(blocking.rbegin(),
+                                   blocking.rend());  // transposed trange
 
   using value_type = typename Tile::value_type;
   using TArray = TA::DistArray<Tile, TA::DensePolicy>;
@@ -116,7 +118,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     }
 
     TArray a(world, trange);
-    TArray b(world, trange);
+    TArray b(world, trange_tr);
 
     a.fill(val_a);
     b.fill(val_b);
@@ -198,7 +200,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     }
 
     TArray a(world, trange);
-    TArray b(world, trange);
+    TArray b(world, trange_tr);
 
     a.fill(val_a);
     b.fill(val_b);

From 32fa553424f09ca9729fdd5d67cf943ca88519d5 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sat, 8 Oct 2022 19:56:38 -0400
Subject: [PATCH 21/30] try using valeevgroup/ubuntu:cuda image

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6ab502b527..7df07712c6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -66,7 +66,7 @@ ubuntu:
         CXX: [ g++, clang++-9 ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
-      - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ]
+      - IMAGE : [ "ubuntu:cuda" ]
         CXX: [ g++ ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_CUDA : [ "ENABLE_CUDA=ON" ]

From 0c0722930ed47aeff718a3b0295a6d3229a910c8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sun, 9 Oct 2022 11:08:10 -0400
Subject: [PATCH 22/30] no need to install cuda when using
 valeevgroup/ubuntu:cuda image

---
 ci/.build-project | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/ci/.build-project b/ci/.build-project
index 44b208242d..1b4e9dc749 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -79,15 +79,8 @@ if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then
   cmd "echo MKLROOT=\$MKLROOT"
 fi
 if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
-  cmd "make -C /home/ValeevGroup install/cuda"
-  cmd "rm -fr /usr/local/bin/nvcc"
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
   cmd "${CUDACXX} -V"
-  # this will be moved to image builder
-  cmd "sudo apt-get -yq update"
-  cmd "sudo apt-get -yq install nvidia-utils-510"
-  cmd "find / -name \"*nvidia-smi\""
-  cmd "nvidia-smi"
 fi
 section_end preparing_system_section
 

From 1d73c641a6d9879de4dc42303f1d45aa5d8a669e Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 10 Oct 2022 12:33:16 -0400
Subject: [PATCH 23/30] [ci] invoke nvidia-smi to dump driver + CUDA info to
 the log

---
 .gitlab-ci.yml    | 3 ++-
 ci/.build-project | 8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7df07712c6..e42ca6fa79 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,7 +6,8 @@ default:
   interruptible: true
 
 variables:
-  MAD_NUM_THREADS : 2
+  MAD_NUM_THREADS : "2"
+  DOCKER_GPUS : "all"
   TA_TARGETS : "tiledarray examples-tiledarray ta_test check-tiledarray"
   # Debug builds with ScaLAPACK=ON need increased TA_UT_CTEST_TIMEOUT
   TA_CONFIG : >
diff --git a/ci/.build-project b/ci/.build-project
index 1b4e9dc749..6159653d2e 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -71,8 +71,6 @@ cmd "cmake -P ci/host_system_info.cmake"
 section_end host_system_info
 
 section_start "preparing_system_section[collapsed=true]" "Preparing system"
-cmd "source ci/openmpi.env"
-cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile"
 if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then
   cmd "make -C /home/ValeevGroup install/intel-mkl"
   cmd "source /opt/intel/mkl/bin/mklvars.sh intel64"
@@ -81,7 +79,13 @@ fi
 if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
   cmd "${CUDACXX} -V"
+  cmd "lspci"
+  cmd "env"
+  cmd "ls -l /usr/bin"
+  cmd "/usr/bin/nvidia-smi"
 fi
+cmd "source ci/openmpi.env"
+cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile"
 section_end preparing_system_section
 
 section_start configure_section "Configure"

From 08df8697793fd1b314546d4ccd0a450784ac89a8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 11 Oct 2022 17:24:08 -0400
Subject: [PATCH 24/30] Revert "[ci] invoke nvidia-smi to dump driver + CUDA
 info to the log"

This reverts commit 1fc764c712c23405b828a45e94a19907b08da5ef.
---
 .gitlab-ci.yml    | 3 +--
 ci/.build-project | 8 ++------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e42ca6fa79..7df07712c6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,8 +6,7 @@ default:
   interruptible: true
 
 variables:
-  MAD_NUM_THREADS : "2"
-  DOCKER_GPUS : "all"
+  MAD_NUM_THREADS : 2
   TA_TARGETS : "tiledarray examples-tiledarray ta_test check-tiledarray"
   # Debug builds with ScaLAPACK=ON need increased TA_UT_CTEST_TIMEOUT
   TA_CONFIG : >
diff --git a/ci/.build-project b/ci/.build-project
index 6159653d2e..1b4e9dc749 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -71,6 +71,8 @@ cmd "cmake -P ci/host_system_info.cmake"
 section_end host_system_info
 
 section_start "preparing_system_section[collapsed=true]" "Preparing system"
+cmd "source ci/openmpi.env"
+cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile"
 if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then
   cmd "make -C /home/ValeevGroup install/intel-mkl"
   cmd "source /opt/intel/mkl/bin/mklvars.sh intel64"
@@ -79,13 +81,7 @@ fi
 if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
   cmd "${CUDACXX} -V"
-  cmd "lspci"
-  cmd "env"
-  cmd "ls -l /usr/bin"
-  cmd "/usr/bin/nvidia-smi"
 fi
-cmd "source ci/openmpi.env"
-cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile"
 section_end preparing_system_section
 
 section_start configure_section "Configure"

From 4e0ce094d4709f060250df9147e21a50b185603b Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 11 Oct 2022 17:24:08 -0400
Subject: [PATCH 25/30] Revert "no need to install cuda when using
 valeevgroup/ubuntu:cuda image"

This reverts commit 15e1d5bc3e694547bae6f8a3c671ce9cb89e9380.
---
 ci/.build-project | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ci/.build-project b/ci/.build-project
index 1b4e9dc749..44b208242d 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -79,8 +79,15 @@ if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then
   cmd "echo MKLROOT=\$MKLROOT"
 fi
 if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
+  cmd "make -C /home/ValeevGroup install/cuda"
+  cmd "rm -fr /usr/local/bin/nvcc"
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
   cmd "${CUDACXX} -V"
+  # this will be moved to image builder
+  cmd "sudo apt-get -yq update"
+  cmd "sudo apt-get -yq install nvidia-utils-510"
+  cmd "find / -name \"*nvidia-smi\""
+  cmd "nvidia-smi"
 fi
 section_end preparing_system_section
 

From dd1273255356f354f78ffa2026c5ee82998da849 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 11 Oct 2022 17:24:08 -0400
Subject: [PATCH 26/30] Revert "try using valeevgroup/ubuntu:cuda image"

This reverts commit 2223c8ed294d59a459ec20bc1313522b8c24862b.
---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7df07712c6..6ab502b527 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -66,7 +66,7 @@ ubuntu:
         CXX: [ g++, clang++-9 ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
-      - IMAGE : [ "ubuntu:cuda" ]
+      - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ]
         CXX: [ g++ ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_CUDA : [ "ENABLE_CUDA=ON" ]

From 7a7bc76f40f0683660774a1795e20916960bb61f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 11 Oct 2022 17:45:37 -0400
Subject: [PATCH 27/30] [ci] define RUNNER_TAGS so that only CUDA jobs end up
 on our local runner

---
 .gitlab-ci.yml    | 7 ++++++-
 ci/.build-project | 4 ----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6ab502b527..4708470e0f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -28,7 +28,9 @@ before_script:
 
 ubuntu:
   stage: build
-  tags: [ docker ]
+  tags:
+    - docker
+    - ${RUNNER_TAGS}
   timeout: 3h
   image: valeevgroup/${IMAGE}
   variables:
@@ -62,12 +64,15 @@ ubuntu:
         BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ]
         # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
         TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL
+        RUNNER_TAGS: [ linux ]
       - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ]
         CXX: [ g++, clang++-9 ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
+        RUNNER_TAGS: [ linux ]
       - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ]
         CXX: [ g++ ]
         BUILD_TYPE : [ "Release", "Debug" ]
         ENABLE_CUDA : [ "ENABLE_CUDA=ON" ]
         TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ]
+        RUNNER_TAGS: [ cuda ]
diff --git a/ci/.build-project b/ci/.build-project
index 44b208242d..1e1596a6f9 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -83,10 +83,6 @@ if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then
   cmd "rm -fr /usr/local/bin/nvcc"
   cmd "export CUDACXX=/usr/local/cuda/bin/nvcc"
   cmd "${CUDACXX} -V"
-  # this will be moved to image builder
-  cmd "sudo apt-get -yq update"
-  cmd "sudo apt-get -yq install nvidia-utils-510"
-  cmd "find / -name \"*nvidia-smi\""
   cmd "nvidia-smi"
 fi
 section_end preparing_system_section

From caba336489bea75ac2998462f814f11dc0a5f4ea Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 13 Oct 2022 13:33:23 -0400
Subject: [PATCH 28/30] cleanup

---
 tests/expressions_cuda_um.cpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp
index e5e810e29d..a17b749789 100644
--- a/tests/expressions_cuda_um.cpp
+++ b/tests/expressions_cuda_um.cpp
@@ -528,15 +528,6 @@ BOOST_AUTO_TEST_CASE(scal_add_block) {
                              2 * (3 * a("a,b,c").block({3, 3, 3}, {5, 5, 5}) +
                                   4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5})));
 
-  std::cout << "expr tree for c(\"a,b,c\") =\n"
-               "                             2 * (3 * a(\"a,b,c\").block({3, "
-               "3, 3}, {5, 5, 5}) +\n"
-               "                                  4 * b(\"a,b,c\").block({3, "
-               "3, 3}, {5, 5, 5})):\n"
-            << c("a,b,c")
-            << 2 * (3 * a("a,b,c").block({3, 3, 3}, {5, 5, 5}) +
-                    4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5}));
-
   for (std::size_t index = 0ul; index < block_range.volume(); ++index) {
     if (!a.is_zero(block_range.ordinal(index)) &&
         !b.is_zero(block_range.ordinal(index))) {
@@ -998,10 +989,6 @@ BOOST_AUTO_TEST_CASE(scale_add_permute) {
 
   BOOST_REQUIRE_NO_THROW(c("a,b,c") = 5 * (2 * a("c,b,a")) + (3 * b("a,b,c")));
 
-  std::cout << "expr tree for c(\"a,b,c\") = 5 * (2 * a(\"c,b,a\")) + (3 * "
-               "b(\"a,b,c\")))"
-            << c("a,b,c") << (5 * (2 * a("c,b,a")) + (3 * b("a,b,c")));
-
   for (std::size_t i = 0ul; i < c.size(); ++i) {
     TArrayUMD::value_type c_tile = c.find(i).get();
     const size_t perm_index =

From 77c95ff3b2000cc11e575acb1a0f25d3b5f9ecec Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 13 Oct 2022 13:34:36 -0400
Subject: [PATCH 29/30] hush warnings re use of Eigen's Tensor header when CUDA
 enabled

---
 src/TiledArray/external/eigen.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/TiledArray/external/eigen.h b/src/TiledArray/external/eigen.h
index 6ee0eaea3f..cd2c50b522 100644
--- a/src/TiledArray/external/eigen.h
+++ b/src/TiledArray/external/eigen.h
@@ -46,7 +46,14 @@ TILEDARRAY_PRAGMA_GCC(system_header)
 #endif
 
 #include <Eigen/Core>
+
+// disable warnings re: ignored attributes on template argument
+// Eigen::PacketType<int, Eigen::DefaultDevice>::type
+// {aka __vector(2) long long int}
+TILEDARRAY_PRAGMA_GCC(diagnostic push)
+TILEDARRAY_PRAGMA_GCC(diagnostic ignored "-Wignored-attributes")
 #include <unsupported/Eigen/CXX11/Tensor>
+TILEDARRAY_PRAGMA_GCC(diagnostic pop)
 
 #if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_LAPACKE_STRICT)
 #if !EIGEN_VERSION_AT_LEAST(3, 3, 7)

From 6bd84aa461d53d7ad266a1d3ed9b786549de0165 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 13 Oct 2022 16:37:02 -0400
Subject: [PATCH 30/30] [ci] try using Ninja and multiple cores

---
 .gitlab-ci.yml    | 5 ++---
 ci/.build-project | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4708470e0f..93850215f1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,9 +20,8 @@ variables:
     ${ENABLE_SCALAPACK}
 
 before_script:
-  # NB: below tag parsing is not robust
-  - echo "CI_RUNNER_TAGS=$CI_RUNNER_TAGS"
-  - CMAKE_BUILD_PARALLEL_LEVEL=$(echo $CI_RUNNER_TAGS | sed -n 's/CMAKE_BUILD_PARALLEL_LEVEL=\([0-9]\+\).*/\1/p')
+  # NB: if CMAKE_BUILD_PARALLEL_LEVEL is not set (i.e. using shared runner), use 1 to ensure we have enough memory
+  # TODO optimize ta_test build memory consumption
   - export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:=1}
   - echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL"
 
diff --git a/ci/.build-project b/ci/.build-project
index 1e1596a6f9..aeb7c73787 100755
--- a/ci/.build-project
+++ b/ci/.build-project
@@ -89,7 +89,7 @@ section_end preparing_system_section
 
 section_start configure_section "Configure"
 cmd mkdir -p ${build_dir}
-time_cmd configure "cmake -B${build_dir} $vars"
+time_cmd configure "cmake -GNinja -B${build_dir} $vars"
 section_end configure_section
 
 for target in ${targets}; do