From ca27eb0782861fe4e21678461e294594df269179 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 27 Mar 2023 19:45:14 -0400
Subject: [PATCH 1/8] extend device API for complex types, as illustrated by
 ta_dense_cuda

---
 examples/cuda/ta_dense_cuda.cpp               | 110 ++++++++++++------
 examples/dgemm/ta_dense_asymm.cpp             |  34 +++---
 src/TiledArray/cuda/btas_um_tensor.cpp        |  16 +++
 src/TiledArray/cuda/btas_um_tensor.h          |  16 +++
 src/TiledArray/cuda/cpu_cuda_vector.cu        |  21 +++-
 src/TiledArray/cuda/cpu_cuda_vector.h         |   2 +
 src/TiledArray/cuda/cublas.h                  |  61 ++++++++++
 src/TiledArray/cuda/kernel/mult_kernel.cu     |  19 +++
 src/TiledArray/cuda/kernel/mult_kernel.h      |  20 ++++
 src/TiledArray/cuda/kernel/reduce_kernel.cu   |  41 ++++++-
 src/TiledArray/cuda/kernel/reduce_kernel.h    |  34 ++++++
 .../cuda/kernel/reduce_kernel_impl.h          |  31 +++--
 12 files changed, 337 insertions(+), 68 deletions(-)
diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp
index 14f692329b..4a035f176b 100644
--- a/examples/cuda/ta_dense_cuda.cpp
+++ b/examples/cuda/ta_dense_cuda.cpp
@@ -137,23 +137,31 @@ template <typename Storage>
 void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
                   const long Nn, const long Bn, const long Nk, const long Bk,
                   const long nrepeat) {
-  using Real = typename Storage::value_type;
+  using T = TiledArray::detail::numeric_t<Storage>;
+  using RT = TiledArray::detail::scalar_t<Storage>;
+  constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;
 
   const std::size_t Tm = Nm / Bm;
   const std::size_t Tn = Nn / Bn;
   const std::size_t Tk = Nk / Bk;
 
+  const std::int64_t nflops =
+      (complex_T ? 8 : 2)  // 1 multiply takes 6/1 flops for complex/real
+                           // 1 add takes 2/1 flops for complex/real
+      * static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
+      static_cast<std::int64_t>(Nk);
+
   if (world.rank() == 0)
     std::cout << "TiledArray: dense matrix multiply test...\n"
               << "Number of nodes     = " << world.size()
               << "\nSize of A         = " << Nm << "x" << Nk << " ("
-              << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
+              << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
               << "\nSize of A block   = " << Bm << "x" << Bk
               << "\nSize of B         = " << Nk << "x" << Nn << " ("
-              << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)"
+              << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
               << "\nSize of B block   = " << Bk << "x" << Bn
               << "\nSize of C         = " << Nm << "x" << Nn << " ("
-              << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)"
+              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
               << "\nSize of C block   = " << Bm << "x" << Bn
               << "\n# of blocks of C  = " << Tm * Tn
               << "\nAverage # of blocks of C/node = "
@@ -205,14 +213,13 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
   TiledArray::TiledRange  // TRange for b
       trange_b(blocking_B.begin(), blocking_B.end());
 
-  using value_type = typename Storage::value_type;
-  using CUDATile = btas::Tensor<Real, TA::Range, Storage>;
+  using CUDATile = btas::Tensor<T, TA::Range, Storage>;
   using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;
-  using TAMatrix = TA::DistArray<TA::Tensor<value_type>>;
+  using TAMatrix = TA::DistArray<TA::Tensor<T>>;
 
   CUDAMatrix c(world, trange_c);
-  value_type val_a = 0.03;
-  value_type val_b = 0.02;
+  auto val_a = 0.03;
+  auto val_b = 0.02;
 
   {
     // Construct and initialize arrays
@@ -235,19 +242,26 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     // start profiler
     cudaProfilerStart();
 
-    // Start clock
-    const double wall_time_start = madness::wall_time();
+    double total_time = 0.0;
+    double total_gflop_rate = 0.0;
 
     // Do matrix multiplication
     for (int i = 0; i < nrepeat; ++i) {
       double iter_time_start = madness::wall_time();
       //      c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n");
       c("m,n") = a("m,k") * b("k,n");
+      c.world().gop.fence();  // fence since GEMM can return early
       double iter_time_stop = madness::wall_time();
+      const double iter_time = iter_time_stop - iter_time_start;
+      total_time += iter_time;
+      const double gflop_rate = double(nflops) / (iter_time * 1.e9);
+      total_gflop_rate += gflop_rate;
       if (world.rank() == 0)
-        std::cout << "Iteration " << i + 1
-                  << " wall time: " << (iter_time_stop - iter_time_start)
+        std::cout << "Iteration " << i + 1 << " wall time: " << iter_time
                   << "\n";
+      if (world.rank() == 0)
+        std::cout << "Iteration " << i + 1 << "   time=" << time
+                  << "   GFLOPS=" << gflop_rate << "\n";
     }
     // Stop clock
     const double wall_time_stop = madness::wall_time();
@@ -256,32 +270,43 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     cudaProfilerStop();
 
     if (world.rank() == 0)
-      std::cout << "Average wall time   = "
-                << (wall_time_stop - wall_time_start) / double(nrepeat)
+      std::cout << "Average wall time   = " << total_time / double(nrepeat)
                 << " sec\nAverage GFLOPS      = "
-                << double(nrepeat) * 2.0 * double(Nn * Nm * Nm) /
-                       (wall_time_stop - wall_time_start) / 1.0e9
-                << "\n";
+                << total_gflop_rate / double(nrepeat) << "\n";
   }
 
-  double threshold =
-      std::numeric_limits<typename Storage::value_type>::epsilon();
+  double threshold = std::numeric_limits<RT>::epsilon();
   auto dot_length = Nk;
   //  auto result = dot_length * val_a * val_b + val_a - val_b;
-  auto result = dot_length * val_a * val_b;
+  T result;
+  if constexpr (complex_T) {
+    result = T(dot_length * val_a * val_b, 0.);
+  } else
+    result = dot_length * val_a * val_b;
 
   auto verify = [&world, &threshold, &result,
                  &dot_length](TA::Tile<CUDATile> &tile) {
     auto n_elements = tile.size();
     for (std::size_t i = 0; i < n_elements; i++) {
-      double abs_err = fabs(tile[i] - result);
+      double abs_err = std::abs(tile[i] - result);
       //      double abs_val = fabs(tile[i]);
-      double rel_err = abs_err / result / dot_length;
+      double rel_err = abs_err / std::abs(result) / dot_length;
       if (rel_err > threshold) {
+        auto to_string = [](const auto &v) {
+          constexpr bool complex_T =
+              TiledArray::detail::is_complex_v<std::decay_t<decltype(v)>>;
+          if constexpr (complex_T) {
+            std::string result;
+            result = "{" + std::to_string(v.real()) + "," +
+                     std::to_string(v.imag()) + "}";
+            return result;
+          } else
+            return std::to_string(v);
+        };
         std::cout << "Node: " << world.rank() << " Tile: " << tile.range()
                   << " id: " << i
-                  << std::string(" gpu: " + std::to_string(tile[i]) +
-                                 " cpu: " + std::to_string(result) + "\n");
+                  << std::string(" gpu: " + to_string(tile[i]) +
+                                 " cpu: " + to_string(result) + "\n");
         break;
       }
     }
@@ -308,7 +333,7 @@ int try_main(int argc, char **argv) {
                  "blocked by Bm, Bn, and Bk, respectively"
               << std::endl
               << "Usage: " << argv[0]
-              << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [real = double] "
+              << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] "
                  "[storage type = cuda_um_btas_varray]\n";
     return 0;
   }
@@ -337,13 +362,13 @@ int try_main(int argc, char **argv) {
     return 1;
   }
 
-  const auto real_type_str =
-      (argc >= 9) ? std::string(argv[8]) : std::string("double");
-
-  if (real_type_str != "float" && real_type_str != "double") {
-    std::cerr << "Error: invalid real type: " << real_type_str
-              << "\n Valid option includes: float or "
-                 "double. \n";
+  const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
+  if (scalar_type_str != "double" && scalar_type_str != "float" &&
+      scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
+    std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
+    std::cerr << "       valid real types are \"double\", \"float\", "
+                 "\"zdouble\", and \"zfloat\".\n";
+    return 1;
   }
 
   const auto storage_type =
@@ -357,7 +382,7 @@ int try_main(int argc, char **argv) {
                  "cuda_um_btas_varray or cuda_um_thrust_vector "
                  "or cpu_cuda_vector. \n";
   }
-  std::cout << "Storage type: " << storage_type << "<" << real_type_str << ">"
+  std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">"
             << std::endl;
   //  auto to_bool = [](const std::string &str) {
   //    return (str == "true" || str == "True" || str == "TRUE" || str == "1" ||
@@ -424,7 +449,7 @@ int try_main(int argc, char **argv) {
   }  // print device properties
 
   //  if (storage_type == "cpu_cuda_vector") {
-  //    if (real_type_str == "double")
+  //    if (scalar_type_str == "double")
   //      do_main_body<TiledArray::cpu_cuda_vector<double>>(world, Nm, Bm, Nn,
   //      Bn,
   //                                                        Nk, Bk, nrepeat);
@@ -434,15 +459,24 @@ int try_main(int argc, char **argv) {
   //                                                       Nk, Bk, nrepeat);
   //  } else if (storage_type == "cuda_um_btas_varray") {
   if (storage_type == "cuda_um_btas_varray") {
-    if (real_type_str == "double")
+    if (scalar_type_str == "double")
       do_main_body<TiledArray::cuda_um_btas_varray<double>>(
           world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
-    else
+    else if (scalar_type_str == "float")
       do_main_body<TiledArray::cuda_um_btas_varray<float>>(world, Nm, Bm, Nn,
                                                            Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "zdouble")
+      do_main_body<TiledArray::cuda_um_btas_varray<std::complex<double>>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "zfloat")
+      do_main_body<TiledArray::cuda_um_btas_varray<std::complex<float>>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else {
+      abort();  // unreachable
+    }
   }
   // else if (storage_type == "cuda_um_thrust_vector") {
-  //    if (real_type_str == "double")
+  //    if (scalar_type_str == "double")
   //      do_main_body<TiledArray::cuda_um_thrust_vector<double>>(
   //          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
   //    else
diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/dgemm/ta_dense_asymm.cpp
index ac72a39209..40183603bb 100644
--- a/examples/dgemm/ta_dense_asymm.cpp
+++ b/examples/dgemm/ta_dense_asymm.cpp
@@ -75,23 +75,6 @@ int main(int argc, char** argv) {
   const std::size_t Tn = Nn / Bn;
   const std::size_t Tk = Nk / Bk;
 
-  if (world.rank() == 0)
-    std::cout << "TiledArray: dense matrix multiply test...\n"
-              << "Number of nodes     = " << world.size()
-              << "\nScalar type       = " << scalar_type_str
-              << "\nSize of A         = " << Nm << "x" << Nk << " ("
-              << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
-              << "\nSize of A block   = " << Bm << "x" << Bk
-              << "\nSize of B         = " << Nk << "x" << Nn << " ("
-              << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)"
-              << "\nSize of B block   = " << Bk << "x" << Bn
-              << "\nSize of C         = " << Nm << "x" << Nn << " ("
-              << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)"
-              << "\nSize of C block   = " << Bm << "x" << Bn
-              << "\n# of blocks of C  = " << Tm * Tn
-              << "\nAverage # of blocks of C/node = "
-              << double(Tm * Tn) / double(world.size()) << "\n";
-
   // Construct TiledRange
   std::vector<unsigned int> blocking_m;
   blocking_m.reserve(Tm + 1);
@@ -148,6 +131,23 @@ int main(int argc, char** argv) {
         * static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
         static_cast<std::int64_t>(Nk);
 
+    if (world.rank() == 0)
+      std::cout << "TiledArray: dense matrix multiply test...\n"
+                << "Number of nodes     = " << world.size()
+                << "\nScalar type       = " << scalar_type_str
+                << "\nSize of A         = " << Nm << "x" << Nk << " ("
+                << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
+                << "\nSize of A block   = " << Bm << "x" << Bk
+                << "\nSize of B         = " << Nk << "x" << Nn << " ("
+                << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
+                << "\nSize of B block   = " << Bk << "x" << Bn
+                << "\nSize of C         = " << Nm << "x" << Nn << " ("
+                << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
+                << "\nSize of C block   = " << Bm << "x" << Bn
+                << "\n# of blocks of C  = " << Tm * Tn
+                << "\nAverage # of blocks of C/node = "
+                << double(Tm * Tn) / double(world.size()) << "\n";
+
     auto memtrace = [do_memtrace, &world](const std::string& str) -> void {
       if (do_memtrace) {
         world.gop.fence();
diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp
index 58c3981f18..9423e7563d 100644
--- a/src/TiledArray/cuda/btas_um_tensor.cpp
+++ b/src/TiledArray/cuda/btas_um_tensor.cpp
@@ -11,6 +11,10 @@
 
 template class btas::varray<double, TiledArray::cuda_um_allocator<double>>;
 template class btas::varray<float, TiledArray::cuda_um_allocator<float>>;
+template class btas::varray<
+    std::complex<double>, TiledArray::cuda_um_allocator<std::complex<double>>>;
+template class btas::varray<std::complex<float>,
+                            TiledArray::cuda_um_allocator<std::complex<float>>>;
 template class btas::varray<int, TiledArray::cuda_um_allocator<int>>;
 template class btas::varray<long, TiledArray::cuda_um_allocator<long>>;
 
@@ -18,6 +22,12 @@ template class btas::Tensor<double, TiledArray::Range,
                             TiledArray::cuda_um_btas_varray<double>>;
 template class btas::Tensor<float, TiledArray::Range,
                             TiledArray::cuda_um_btas_varray<float>>;
+template class btas::Tensor<
+    std::complex<double>, TiledArray::Range,
+    TiledArray::cuda_um_btas_varray<std::complex<double>>>;
+template class btas::Tensor<
+    std::complex<float>, TiledArray::Range,
+    TiledArray::cuda_um_btas_varray<std::complex<float>>>;
 template class btas::Tensor<int, TiledArray::Range,
                             TiledArray::cuda_um_btas_varray<int>>;
 template class btas::Tensor<long, TiledArray::Range,
@@ -27,6 +37,12 @@ template class TiledArray::Tile<btas::Tensor<
     double, TiledArray::Range, TiledArray::cuda_um_btas_varray<double>>>;
 template class TiledArray::Tile<btas::Tensor<
     float, TiledArray::Range, TiledArray::cuda_um_btas_varray<float>>>;
+template class TiledArray::Tile<
+    btas::Tensor<std::complex<double>, TiledArray::Range,
+                 TiledArray::cuda_um_btas_varray<std::complex<double>>>>;
+template class TiledArray::Tile<
+    btas::Tensor<std::complex<float>, TiledArray::Range,
+                 TiledArray::cuda_um_btas_varray<std::complex<float>>>>;
 template class TiledArray::Tile<
     btas::Tensor<int, TiledArray::Range, TiledArray::cuda_um_btas_varray<int>>>;
 template class TiledArray::Tile<btas::Tensor<
diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/cuda/btas_um_tensor.h
index 7bddc4a178..6342c54771 100644
--- a/src/TiledArray/cuda/btas_um_tensor.h
+++ b/src/TiledArray/cuda/btas_um_tensor.h
@@ -780,6 +780,10 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<UMTensor, Policy> &array) {
 extern template class btas::varray<double,
                                    TiledArray::cuda_um_allocator<double>>;
 extern template class btas::varray<float, TiledArray::cuda_um_allocator<float>>;
+extern template class btas::varray<
+    std::complex<double>, TiledArray::cuda_um_allocator<std::complex<double>>>;
+extern template class btas::varray<
+    std::complex<float>, TiledArray::cuda_um_allocator<std::complex<float>>>;
 extern template class btas::varray<int, TiledArray::cuda_um_allocator<int>>;
 extern template class btas::varray<long, TiledArray::cuda_um_allocator<long>>;
 
@@ -787,6 +791,12 @@ extern template class btas::Tensor<double, TiledArray::Range,
                                    TiledArray::cuda_um_btas_varray<double>>;
 extern template class btas::Tensor<float, TiledArray::Range,
                                    TiledArray::cuda_um_btas_varray<float>>;
+extern template class btas::Tensor<
+    std::complex<double>, TiledArray::Range,
+    TiledArray::cuda_um_btas_varray<std::complex<double>>>;
+extern template class btas::Tensor<
+    std::complex<float>, TiledArray::Range,
+    TiledArray::cuda_um_btas_varray<std::complex<float>>>;
 extern template class btas::Tensor<int, TiledArray::Range,
                                    TiledArray::cuda_um_btas_varray<int>>;
 extern template class btas::Tensor<long, TiledArray::Range,
@@ -796,6 +806,12 @@ extern template class TiledArray::Tile<btas::Tensor<
     double, TiledArray::Range, TiledArray::cuda_um_btas_varray<double>>>;
 extern template class TiledArray::Tile<btas::Tensor<
     float, TiledArray::Range, TiledArray::cuda_um_btas_varray<float>>>;
+extern template class TiledArray::Tile<
+    btas::Tensor<std::complex<double>, TiledArray::Range,
+                 TiledArray::cuda_um_btas_varray<std::complex<double>>>>;
+extern template class TiledArray::Tile<
+    btas::Tensor<std::complex<float>, TiledArray::Range,
+                 TiledArray::cuda_um_btas_varray<std::complex<float>>>>;
 extern template class TiledArray::Tile<
     btas::Tensor<int, TiledArray::Range, TiledArray::cuda_um_btas_varray<int>>>;
 extern template class TiledArray::Tile<btas::Tensor<
diff --git a/src/TiledArray/cuda/cpu_cuda_vector.cu b/src/TiledArray/cuda/cpu_cuda_vector.cu
index 34dd405807..6c58fdd123 100644
--- a/src/TiledArray/cuda/cpu_cuda_vector.cu
+++ b/src/TiledArray/cuda/cpu_cuda_vector.cu
@@ -15,11 +15,25 @@ void resize<float,thrust::device_allocator<float>>(
     size_t size) {
     dev_vec.resize(size);
 }
+template<>
+void resize<std::complex<double>,thrust::device_allocator<std::complex<double>>>(
+    thrust::device_vector<std::complex<double>, thrust::device_allocator<std::complex<double>>>& dev_vec,
+    size_t size) {
+    dev_vec.resize(size);
+}
+template<>
+void resize<std::complex<float>,thrust::device_allocator<std::complex<float>>>(
+    thrust::device_vector<std::complex<float>, thrust::device_allocator<std::complex<float>>>& dev_vec,
+    size_t size) {
+    dev_vec.resize(size);
+}
 }
 
 namespace TiledArray {
 template class cpu_cuda_vector<double>;
 template class cpu_cuda_vector<float>;
+template class cpu_cuda_vector<std::complex<double>>;
+template class cpu_cuda_vector<std::complex<float>>;
 }
 
 // Thrust included in CUDA 9+ seems to generate uninstantiated CUB calls
@@ -35,6 +49,12 @@ auto force_missing_copy_instantiations_double() {
 auto force_missing_copy_instantiations_float() {
   return force_missing_copy_instantiations<float>();
 }
+auto force_missing_copy_instantiations_zdouble() {
+  return force_missing_copy_instantiations<std::complex<double>>();
+}
+auto force_missing_copy_instantiations_zfloat() {
+  return force_missing_copy_instantiations<std::complex<float>>();
+}
 auto force_missing_copy_instantiations_unsigned_long() {
   return force_missing_copy_instantiations<unsigned long>();
 }
@@ -65,4 +85,3 @@ auto force_missing_copy_n_instantiations_long_long(){
 }
 
 #endif  // __CUDACC_VER_MAJOR__  >= 9
-
diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/cuda/cpu_cuda_vector.h
index 7370eeaa2e..5a6e52beb5 100644
--- a/src/TiledArray/cuda/cpu_cuda_vector.h
+++ b/src/TiledArray/cuda/cpu_cuda_vector.h
@@ -158,6 +158,8 @@ class cpu_cuda_vector {
 
 extern template class cpu_cuda_vector<double>;
 extern template class cpu_cuda_vector<float>;
+extern template class cpu_cuda_vector<std::complex<double>>;
+extern template class cpu_cuda_vector<std::complex<float>>;
 
 template <MemorySpace Space, typename T, typename HostAlloc,
           typename DeviceAlloc>
diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/cuda/cublas.h
index a5d3da7afc..8d4085eabb 100644
--- a/src/TiledArray/cuda/cublas.h
+++ b/src/TiledArray/cuda/cublas.h
@@ -54,6 +54,25 @@ inline void __cublasSafeCall(cublasStatus_t err, const char *file,
 
 namespace TiledArray {
 
+namespace detail {
+
+template <typename T>
+auto cublasPointer(T *std_complex_ptr) {
+  using Scalar = TiledArray::detail::scalar_t<T>;
+  static_assert(std::is_same_v<Scalar, double> ||
+                std::is_same_v<Scalar, float>);
+  constexpr bool DP = std::is_same_v<Scalar, double>;
+  using cuT = std::conditional_t<std::is_same_v<Scalar, double>,
+                                 cuDoubleComplex, cuComplex>;
+  if constexpr (std::is_const_v<
+                    std::remove_pointer_t<decltype(std_complex_ptr)>>) {
+    return reinterpret_cast<const cuT *>(std_complex_ptr);
+  } else
+    return reinterpret_cast<cuT *>(std_complex_ptr);
+};
+
+}  // namespace detail
+
 /*
  * cuBLAS interface functions
  */
@@ -117,6 +136,29 @@ inline cublasStatus_t cublasGemm<double>(
   return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                      beta, C, ldc);
 }
+template <>
+inline cublasStatus_t cublasGemm<std::complex<float>>(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, const std::complex<float> *alpha,
+    const std::complex<float> *A, int lda, const std::complex<float> *B,
+    int ldb, const std::complex<float> *beta, std::complex<float> *C, int ldc) {
+  using detail::cublasPointer;
+  return cublasCgemm(handle, transa, transb, m, n, k, cublasPointer(alpha),
+                     cublasPointer(A), lda, cublasPointer(B), ldb,
+                     cublasPointer(beta), cublasPointer(C), ldc);
+}
+template <>
+inline cublasStatus_t cublasGemm<std::complex<double>>(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, const std::complex<double> *alpha,
+    const std::complex<double> *A, int lda, const std::complex<double> *B,
+    int ldb, const std::complex<double> *beta, std::complex<double> *C,
+    int ldc) {
+  using detail::cublasPointer;
+  return cublasZgemm(handle, transa, transb, m, n, k, cublasPointer(alpha),
+                     cublasPointer(A), lda, cublasPointer(B), ldb,
+                     cublasPointer(beta), cublasPointer(C), ldc);
+}
 
 /// AXPY interface functions
 
@@ -139,6 +181,25 @@ inline cublasStatus_t cublasAxpy<double, double>(cublasHandle_t handle, int n,
   return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
 }
 
+template <>
+inline cublasStatus_t cublasAxpy<std::complex<float>, std::complex<float>>(
+    cublasHandle_t handle, int n, const std::complex<float> *alpha,
+    const std::complex<float> *x, int incx, std::complex<float> *y, int incy) {
+  using detail::cublasPointer;
+  return cublasCaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx,
+                     cublasPointer(y), incy);
+}
+
+template <>
+inline cublasStatus_t cublasAxpy<std::complex<double>, std::complex<double>>(
+    cublasHandle_t handle, int n, const std::complex<double> *alpha,
+    const std::complex<double> *x, int incx, std::complex<double> *y,
+    int incy) {
+  using detail::cublasPointer;
+  return cublasZaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx,
+                     cublasPointer(y), incy);
+}
+
 template <>
 inline cublasStatus_t cublasAxpy<float, int>(cublasHandle_t handle, int n,
                                              const int *alpha, const float *x,
diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/cuda/kernel/mult_kernel.cu
index 8bbcae4927..aa3cadbc72 100644
--- a/src/TiledArray/cuda/kernel/mult_kernel.cu
+++ b/src/TiledArray/cuda/kernel/mult_kernel.cu
@@ -45,6 +45,16 @@ void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n,
   mult_to_cuda_kernel_impl(result, arg, n, stream, device_id);
 }
 
+void mult_to_cuda_kernel(std::complex<float> *result, const std::complex<float> *arg, std::size_t n,
+                         cudaStream_t stream, int device_id) {
+  mult_to_cuda_kernel_impl(result, arg, n, stream, device_id);
+}
+
+void mult_to_cuda_kernel(std::complex<double> *result, const std::complex<double> *arg, std::size_t n,
+                         cudaStream_t stream, int device_id) {
+  mult_to_cuda_kernel_impl(result, arg, n, stream, device_id);
+}
+
 /// result[i] = arg1[i] * arg2[i]
 void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n,
                       cudaStream_t stream, int device_id){
@@ -61,6 +71,15 @@ void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, st
   mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id);
 }
 
+void mult_cuda_kernel(std::complex<float> *result, const std::complex<float> *arg1, const std::complex<float> *arg2, std::size_t n,
+                      cudaStream_t stream, int device_id){
+  mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id);
+}
+
+void mult_cuda_kernel(std::complex<double> *result, const std::complex<double> *arg1, const std::complex<double> *arg2, std::size_t n,
+                      cudaStream_t stream, int device_id){
+  mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id);
+}
 
 }  // namespace TiledArray
 
diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/cuda/kernel/mult_kernel.h
index 7c333e879a..0c5c3f7822 100644
--- a/src/TiledArray/cuda/kernel/mult_kernel.h
+++ b/src/TiledArray/cuda/kernel/mult_kernel.h
@@ -28,6 +28,8 @@
 
 #ifdef TILEDARRAY_HAS_CUDA
 
+#include <complex>
+
 namespace TiledArray {
 
 /// result[i] = result[i] * arg[i]
@@ -40,6 +42,14 @@ void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n,
 void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n,
                          cudaStream_t stream, int device_id);
 
+void mult_to_cuda_kernel(std::complex<float> *result,
+                         const std::complex<float> *arg, std::size_t n,
+                         cudaStream_t stream, int device_id);
+
+void mult_to_cuda_kernel(std::complex<double> *result,
+                         const std::complex<double> *arg, std::size_t n,
+                         cudaStream_t stream, int device_id);
+
 /// result[i] = arg1[i] * arg2[i]
 void mult_cuda_kernel(int *result, const int *arg1, const int *arg2,
                       std::size_t n, cudaStream_t stream, int device_id);
@@ -50,6 +60,16 @@ void mult_cuda_kernel(float *result, const float *arg1, const float *arg2,
 void mult_cuda_kernel(double *result, const double *arg1, const double *arg2,
                       std::size_t n, cudaStream_t stream, int device_id);
 
+void mult_cuda_kernel(std::complex<float> *result,
+                      const std::complex<float> *arg1,
+                      const std::complex<float> *arg2, std::size_t n,
+                      cudaStream_t stream, int device_id);
+
+void mult_cuda_kernel(std::complex<double> *result,
+                      const std::complex<double> *arg1,
+                      const std::complex<double> *arg2, std::size_t n,
+                      cudaStream_t stream, int device_id);
+
 }  // namespace TiledArray
 
 #endif  // TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/cuda/kernel/reduce_kernel.cu
index 1e1550260f..d24669b920 100644
--- a/src/TiledArray/cuda/kernel/reduce_kernel.cu
+++ b/src/TiledArray/cuda/kernel/reduce_kernel.cu
@@ -33,7 +33,6 @@ namespace TiledArray {
 int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
                         int device_id){
   return product_reduce_cuda_kernel_impl(arg, n, stream, device_id);
-
 }
 
 float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream,
@@ -47,6 +46,16 @@ double product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream
   return product_reduce_cuda_kernel_impl(arg, n, stream, device_id);
 }
 
+std::complex<float> product_cuda_kernel(const std::complex<float> *arg, std::size_t n, cudaStream_t stream,
+                          int device_id){
+  return product_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
+std::complex<double> product_cuda_kernel(const std::complex<double> *arg, std::size_t n, cudaStream_t stream,
+                           int device_id){
+
+  return product_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
 
 // foreach(i) result += arg[i]
 int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
@@ -64,6 +73,16 @@ double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream,
   return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id);
 }
 
+std::complex<float> sum_cuda_kernel(const std::complex<float> *arg, std::size_t n, cudaStream_t stream,
+                      int device_id){
+  return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
+std::complex<double> sum_cuda_kernel(const std::complex<double> *arg, std::size_t n, cudaStream_t stream,
+                       int device_id){
+  return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
 // foreach(i) result = max(result, arg[i])
 int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
                     int device_id){
@@ -112,6 +131,16 @@ double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream,
   return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id);
 }
 
+std::complex<float> absmax_cuda_kernel(const std::complex<float> *arg, std::size_t n, cudaStream_t stream,
+                         int device_id){
+  return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
+std::complex<double> absmax_cuda_kernel(const std::complex<double> *arg, std::size_t n, cudaStream_t stream,
+                          int device_id){
+  return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
 // foreach(i) result = min(result, abs(arg[i]))
 int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
                        int device_id){
@@ -128,6 +157,16 @@ double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream,
   return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id);
 }
 
+std::complex<float> absmin_cuda_kernel(const std::complex<float> *arg, std::size_t n, cudaStream_t stream,
+                         int device_id){
+  return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
+std::complex<double> absmin_cuda_kernel(const std::complex<double> *arg, std::size_t n, cudaStream_t stream,
+                          int device_id){
+  return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id);
+}
+
 }  // namespace TiledArray
 
 #endif // TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/cuda/kernel/reduce_kernel.h
index 857cad6c0c..1bcf526ee4 100644
--- a/src/TiledArray/cuda/kernel/reduce_kernel.h
+++ b/src/TiledArray/cuda/kernel/reduce_kernel.h
@@ -28,6 +28,8 @@
 
 #ifdef TILEDARRAY_HAS_CUDA
 
+#include <complex>
+
 namespace TiledArray {
 
 // foreach(i) result *= arg[i]
@@ -40,6 +42,14 @@ float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream,
 double product_cuda_kernel(const double *arg, std::size_t n,
                            cudaStream_t stream, int device_id);
 
+std::complex<float> product_cuda_kernel(const std::complex<float> *arg,
+                                        std::size_t n, cudaStream_t stream,
+                                        int device_id);
+
+std::complex<double> product_cuda_kernel(const std::complex<double> *arg,
+                                         std::size_t n, cudaStream_t stream,
+                                         int device_id);
+
 // foreach(i) result += arg[i]
 int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
                     int device_id);
@@ -50,6 +60,14 @@ float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream,
 double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream,
                        int device_id);
 
+std::complex<float> sum_cuda_kernel(const std::complex<float> *arg,
+                                    std::size_t n, cudaStream_t stream,
+                                    int device_id);
+
+std::complex<double> sum_cuda_kernel(const std::complex<double> *arg,
+                                     std::size_t n, cudaStream_t stream,
+                                     int device_id);
+
 // foreach(i) result = max(result, arg[i])
 int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
                     int device_id);
@@ -80,6 +98,14 @@ float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream,
 double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream,
                           int device_id);
 
+std::complex<float> absmax_cuda_kernel(const std::complex<float> *arg,
+                                       std::size_t n, cudaStream_t stream,
+                                       int device_id);
+
+std::complex<double> absmax_cuda_kernel(const std::complex<double> *arg,
+                                        std::size_t n, cudaStream_t stream,
+                                        int device_id);
+
 // foreach(i) result = min(result, abs(arg[i]))
 int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream,
                        int device_id);
@@ -90,6 +116,14 @@ float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream,
 double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream,
                           int device_id);
 
+std::complex<float> absmin_cuda_kernel(const std::complex<float> *arg,
+                                       std::size_t n, cudaStream_t stream,
+                                       int device_id);
+
+std::complex<double> absmin_cuda_kernel(const std::complex<double> *arg,
+                                        std::size_t n, cudaStream_t stream,
+                                        int device_id);
+
 }  // namespace TiledArray
 
 #endif  // TILEDARRAY_HAS_CUDA
diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h
index 12a8aa1e19..9dc6507cca 100644
--- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h
+++ b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h
@@ -27,6 +27,7 @@
 #include <limits>
 
 #include <TiledArray/external/cuda.h>
+#include <TiledArray/type_traits.h>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
@@ -38,9 +39,15 @@ namespace TiledArray {
 namespace detail {
 
 template <typename T>
-struct absolute_value : public thrust::unary_function<T, T> {
-  __host__ __device__ T operator()(const T &x) const {
-    return x < T(0) ? -x : x;
+struct absolute_value
+    : public thrust::unary_function<T, TiledArray::detail::scalar_t<T>> {
+  __host__ __device__ TiledArray::detail::scalar_t<T> operator()(
+      const T &x) const {
+    using RT = TiledArray::detail::scalar_t<T>;
+    if constexpr (!TiledArray::detail::is_complex_v<T>) {
+      return x < RT(0) ? -x : x;
+    } else
+      return std::sqrt(x.real() * x.real() + x.imag() * x.imag());
   }
 };
 
@@ -93,10 +100,11 @@ T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream,
 }
 
 template <typename T>
-T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n,
-                                 cudaStream_t stream, int device_id) {
-  T init(0);
-  thrust::maximum<T> max_op;
+TiledArray::detail::scalar_t<T> absmax_reduce_cuda_kernel_impl(
+    const T *arg, std::size_t n, cudaStream_t stream, int device_id) {
+  using TR = TiledArray::detail::scalar_t<T>;
+  TR init(0);
+  thrust::maximum<TR> max_op;
   detail::absolute_value<T> abs_op;
 
   CudaSafeCall(cudaSetDevice(device_id));
@@ -110,10 +118,11 @@ T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n,
 }
 
 template <typename T>
-T absmin_reduce_cuda_kernel_impl(const T *arg, std::size_t n,
-                                 cudaStream_t stream, int device_id) {
-  T init(0);
-  thrust::minimum<T> min_op;
+TiledArray::detail::scalar_t<T> absmin_reduce_cuda_kernel_impl(
+    const T *arg, std::size_t n, cudaStream_t stream, int device_id) {
+  using TR = TiledArray::detail::scalar_t<T>;
+  TR init = std::numeric_limits<TR>::max();
+  thrust::minimum<TR> min_op;
   detail::absolute_value<T> abs_op;
 
   CudaSafeCall(cudaSetDevice(device_id));

From 67b2360a57635bfed8a6fe598960ef9bda0e9cae Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 27 Mar 2023 22:12:05 -0400
Subject: [PATCH 2/8] add --expt-relaxed-constexpr to CMAKE_CUDA_FLAGS to be
 able to handle std::complex

---
 external/cuda.cmake | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/external/cuda.cmake b/external/cuda.cmake
index 3b2eb6ce37..49f2cbc558 100644
--- a/external/cuda.cmake
+++ b/external/cuda.cmake
@@ -5,6 +5,13 @@ set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_EXTENSIONS OFF)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+# N.B. need relaxed constexpr for std::complex
+# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions%5B/url%5D:
+if (DEFINED CMAKE_CUDA_FLAGS)
+  set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr ${CMAKE_CUDA_FLAGS}")
+else()
+  set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
+endif()
 enable_language(CUDA)
 
 set(CUDA_FOUND TRUE)

From 661da1025f2f0e014113a6846a51b442afe91fdb Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 29 Mar 2023 18:55:06 -0400
Subject: [PATCH 3/8] a("i,j") = b("i,j") when a and b are same type is now
 deep copy of b into a to reduce surprise, since e.g. a("i,j") = b("j,i")
 makes new a

---
 src/TiledArray/expressions/tsr_expr.h | 10 ++++++++--
 tests/expressions_impl.h              | 12 ++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h
index a17fa65cbc..8430a3c852 100644
--- a/src/TiledArray/expressions/tsr_expr.h
+++ b/src/TiledArray/expressions/tsr_expr.h
@@ -112,8 +112,14 @@ class TsrExpr : public Expr<TsrExpr<Array, Alias>> {
   /// Expression assignment operator
 
   /// \param other The expression that will be assigned to the array
-  array_type& operator=(TsrExpr_& other) {
-    other.eval_to(*this);
+  array_type& operator=(const TsrExpr_& other) {
+    // N.B. corner case: whether A("i,j") = B("i,j") is deep or shallow copy
+    // depends on whether the copy semantics of tiles ... to be sure use clone
+    if (IndexList(this->annotation()) == IndexList(other.annotation())) {
+      array_ = other.array().clone();
+    } else {
+      other.eval_to(*this);
+    }
     return array_;
   }
 
diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h
index 388cdd8e5d..91bcb10cc4 100644
--- a/tests/expressions_impl.h
+++ b/tests/expressions_impl.h
@@ -64,6 +64,18 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) {
                            ca("a,b,c").block(boost::combine(lobound, upbound)));
   BOOST_CHECK_NO_THROW(c("a,b,c") =
                            ca("a,b,c").block(iv(3, 3, 3), iv(5, 5, 5)));
+
+  // make sure that c("abc") = a("abc") does a deep copy
+  {
+    BOOST_CHECK_NO_THROW(c("a,b,c") = a("a,   b, c"));
+    for (auto&& idx : c.tiles_range()) {
+      if (c.is_local(idx) && !c.is_local(idx) && a.is_local(idx) &&
+          !a.is_zero(idx)) {
+        BOOST_CHECK(c.find_local(idx).get().data() !=
+                    a.find_local(idx).get().data());
+      }
+    }
+  }
 }
 
 BOOST_FIXTURE_TEST_CASE_TEMPLATE(block_tensor_factories, F, Fixtures, F) {

From 9fab62564e90e590c8df58aec1b3ffd39bc03b80 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 31 Mar 2023 06:56:33 -0400
Subject: [PATCH 4/8] SizeArray can be compared to a sized range

---
 src/TiledArray/size_array.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h
index 6edbecb222..bd52139ce5 100644
--- a/src/TiledArray/size_array.h
+++ b/src/TiledArray/size_array.h
@@ -42,6 +42,15 @@ class SizeArray {
   T* first_ = nullptr;  ///< First element of the array
   T* last_ = nullptr;   ///< Last element of the array
 
+  // can compare to any sized range
+  template <typename U, typename SizedRange>
+  friend std::enable_if_t<
+      is_sized_range_v<std::remove_reference_t<SizedRange>> &&
+          !std::is_same_v<SizeArray<U>, std::remove_reference_t<SizedRange>> &&
+          !std::is_base_of_v<SizeArray<U>, std::remove_reference_t<SizedRange>>,
+      bool>
+  operator==(const SizeArray<U>&, SizedRange&&);
+
  public:
   // type definitions
   typedef T value_type;
@@ -436,6 +445,19 @@ class SizeArray {
 
 };  // class SizeArray
 
+template <typename U, typename SizedRange>
+std::enable_if_t<
+    is_sized_range_v<std::remove_reference_t<SizedRange>> &&
+        !std::is_same_v<SizeArray<U>, std::remove_reference_t<SizedRange>> &&
+        !std::is_base_of_v<SizeArray<U>, std::remove_reference_t<SizedRange>>,
+    bool>
+operator==(const SizeArray<U>& idx1, SizedRange&& idx2) {
+  if (idx1.size() == idx2.size())
+    return std::equal(idx1.begin(), idx1.end(), idx2.begin());
+  else
+    return false;
+}
+
 template <typename T>
 inline std::vector<T> operator*(const Permutation& perm,
                                 const SizeArray<T>& orig) {

From 07fb1e51c90d88530821e8b57a45de2ababbea73 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 31 Mar 2023 06:58:10 -0400
Subject: [PATCH 5/8] introduce {Index,IndexView} aliases to
 Range::{index_type,index_view_type}

---
 src/TiledArray/range.h |  4 ++++
 tests/range.cpp        | 24 +++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h
index 8108ecf227..b7a38d38b0 100644
--- a/src/TiledArray/range.h
+++ b/src/TiledArray/range.h
@@ -1245,6 +1245,10 @@ class Range {
 
 };  // class Range
 
+// lift Range::index_type and Range::index_view_type into user-land
+using Index = Range::index_type;
+using IndexView = Range::index_view_type;
+
 inline Range& Range::operator*=(const Permutation& perm) {
   TA_ASSERT(perm.size() == rank_);
   if (rank_ > 1ul) {
diff --git a/tests/range.cpp b/tests/range.cpp
index 1ad294363f..a71a0629d0 100644
--- a/tests/range.cpp
+++ b/tests/range.cpp
@@ -65,15 +65,37 @@ BOOST_FIXTURE_TEST_SUITE(range_suite, RangeFixture, TA_UT_LABEL_SERIAL)
 BOOST_AUTO_TEST_CASE(dimension_accessor) {
   BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound_data(), r.lobound_data() + r.rank(),
                                 start.begin(), start.end());  // check start()
+  BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound().begin(), r.lobound().end(),
+                                start.begin(), start.end());  // check start()
+  BOOST_CHECK_EQUAL(r.lobound(), start);                      // check start()
+  BOOST_CHECK_EQUAL(r.lobound(),
+                    (Index{start.begin(), start.end()}));  // check finish()
   BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound_data(), r.upbound_data() + r.rank(),
                                 finish.begin(),
                                 finish.end());  // check finish()
+  BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound().begin(), r.upbound().end(),
+                                finish.begin(),
+                                finish.end());  // check finish()
+  BOOST_CHECK_EQUAL(r.upbound(), finish);       // check finish()
+  BOOST_CHECK_EQUAL(r.upbound(),
+                    (Index{finish.begin(), finish.end()}));  // check finish()
   BOOST_CHECK_EQUAL_COLLECTIONS(r.extent_data(), r.extent_data() + r.rank(),
                                 size.begin(), size.end());  // check size()
+  BOOST_CHECK_EQUAL_COLLECTIONS(r.extent().begin(), r.extent().end(),
+                                size.begin(), size.end());  // check size()
+  BOOST_CHECK_EQUAL(r.extent(), size);                      // check size()
+  BOOST_CHECK_EQUAL(r.extent(),
+                    (Index{size.begin(), size.end()}));  // check size()
   BOOST_CHECK_EQUAL_COLLECTIONS(r.stride_data(), r.stride_data() + r.rank(),
                                 weight.begin(),
                                 weight.end());  // check weight()
-  BOOST_CHECK_EQUAL(r.volume(), volume);        // check volume()
+  BOOST_CHECK_EQUAL_COLLECTIONS(r.stride().begin(), r.stride().end(),
+                                weight.begin(),
+                                weight.end());  // check weight()
+  BOOST_CHECK_EQUAL(r.stride(), weight);        // check weight()
+  BOOST_CHECK_EQUAL(r.stride(),
+                    (Index{weight.begin(), weight.end()}));  // check weight()
+  BOOST_CHECK_EQUAL(r.volume(), volume);                     // check volume()
   for (size_t d = 0; d != r.rank(); ++d) {
     auto range_d = r.dim(d);
     BOOST_CHECK_EQUAL(range_d.first, start[d]);

From d8ddc210b5ecf838dcfb66fd7acf38e869ff1810 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 31 Mar 2023 06:59:36 -0400
Subject: [PATCH 6/8] added demo2 exec for the paper code snippets

---
 examples/demo/CMakeLists.txt |  8 ++--
 examples/demo/demo2.cpp      | 93 ++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 3 deletions(-)
 create mode 100644 examples/demo/demo2.cpp

diff --git a/examples/demo/CMakeLists.txt b/examples/demo/CMakeLists.txt
index c4c533cbf0..c2da9cb36e 100644
--- a/examples/demo/CMakeLists.txt
+++ b/examples/demo/CMakeLists.txt
@@ -16,8 +16,10 @@
 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-# Create the ta_fock_build executable
-
-# Add the demo executable
+# Standard TA demo to accompany the keynote slides
 add_ta_executable(demo "demo.cpp" "tiledarray")
 add_dependencies(examples-tiledarray demo)
+
+# TA demo snippets for the paper
+add_ta_executable(demo2 "demo2.cpp" "tiledarray")
+add_dependencies(examples-tiledarray demo2)
diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp
new file mode 100644
index 0000000000..64144eab70
--- /dev/null
+++ b/examples/demo/demo2.cpp
@@ -0,0 +1,93 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2023  Virginia Tech
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef EXAMPLES_DEMO_DEMO2_CPP_
+#define EXAMPLES_DEMO_DEMO2_CPP_
+
+#include <tiledarray.h>
+#include <random>
+
+#include <TiledArray/expressions/einsum.h>
+#include <TiledArray/math/solvers/cp.h>
+
+int main(int argc, char* argv[]) {
+  using namespace std;
+
+  TA::srand(2017);
+  TA::World& world = TA::initialize(argc, argv);
+
+  using namespace TA;
+
+  // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  Range ρ{{1, 11}, {-1, 9}};
+  assert((ρ.lobound() == Index{1, -1}));
+  assert((ρ.upbound() == Index{11, 9}));
+  assert((ρ.extent() == Index{10, 10}));
+  assert(ρ.volume() == 100);
+  assert((ρ.stride() == Index{10, 1}));
+  assert((ρ.ordinal({1, -1}) == 0));
+  assert((ρ.ordinal({10, 8}) + 1 == ρ.volume()));
+  // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] "
+  for (auto&& idx : ρ) cout << idx << " ";
+
+  // $\mathbb{Z}_{1,11}$ tiled into $\mathbb{Z}_{1,5}$, $\mathbb{Z}_{5,8}$, and
+  // $\mathbb{Z}_{8,11}$
+  TiledRange1 τ0{1, 5, 8, 11};  // hashmarks
+  assert(τ0.extent() == 10);    // there are 10 elements in τ0
+  assert((τ0.elements_range() ==
+          Range1{1, 11}));        // elements indexed by $\mathbb{Z}_{1,11}$
+  assert(τ0.tile_extent() == 3);  // there are 3 tiles in τ0
+  assert((τ0.tiles_range() ==
+          Range1{0, 3}));                // tiles indexed by $\mathbb{Z}_{0,3}$
+  assert((τ0.tile(1) == Range1{5, 8}));  // 1st tile of τ0 is $\mathbb{Z}_{5,8}$
+
+  // $\mathbb{Z}_{-1,9}$ tiled into $\mathbb{Z}_{-1,5}$ and $\mathbb{Z}_{5,9}$
+  TiledRange1 τ1{-1, 5, 9};
+
+  // 2nd tile of $\code{tau0}$ is $\mathbb{Z}_{5,8}$
+  assert((τ0.tile(1) == Range1{5, 8}));
+  // 1st tile of $\code{tau1}$ is $\mathbb{Z}_{-1,5}$
+  assert((τ1.tile(0) == Range1{-1, 5}));
+
+  // prints "-1 0 1 2 3 4 "
+  for (auto&& i : τ1.tile(0)) cout << i << " ";
+
+  // tiling of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ by tensor product
+  // of
+  // $\code{τ0}$ and $\code{τ1}$
+  TiledRange τ{τ0, τ1};
+  // shortcut
+  TiledRange same_as_τ{{1, 5, 8, 11}, {-1, 5, 9}};
+
+  // tile index {0,0} refers to tile $\mathbb{Z}_{1,5} \otimes
+  // \mathbb{Z}_{-1,5}$
+  auto tile_0_0 = τ.tile({0, 0});
+  assert((tile_0_0 == Range{{1, 5}, {-1, 5}}));
+
+  // default instance of $\code{DistArray}$ is a dense array of $\code{double}$s
+  DistArray array0(world, τ);
+  array0.fill(1.0);  // fill $\code{array0}$ with 1's
+
+  // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2.
+  auto t00 = array0.find({0, 0});
+
+  return 0;
+}
+
+#endif /* EXAMPLES_DEMO_DEMO2_CPP_ */

From 94a8fd9e71cd4bcc3f639bb05726314034e21841 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 31 Mar 2023 07:00:10 -0400
Subject: [PATCH 7/8] demo: use TA::srand

---
 examples/demo/demo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/demo/demo.cpp b/examples/demo/demo.cpp
index d6c1612d95..05f9a25bf5 100644
--- a/examples/demo/demo.cpp
+++ b/examples/demo/demo.cpp
@@ -39,7 +39,7 @@ auto make_tile(const TA::Range &range) {
 int main(int argc, char *argv[]) {
   using namespace std;
 
-  std::srand(2017);
+  TA::srand(2017);
   TA::World &world = TA::initialize(argc, argv);
 
   using namespace TA;
@@ -88,7 +88,6 @@ int main(int argc, char *argv[]) {
   SparseShape<float> shape(shape_tensor, TR);
   TSpArrayD a1(world, TR, shape);
   a1.fill_random();  // for deterministic fill:
-                     // TA::srand(seed);
                      // a1.fill_random<HostExecutor::Thread>();
   cout << "a1:\n" << a1 << endl;
   world.gop.fence();

From 6a55cdceb1a9808bb7135f9b25a5ccadd73f3787 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Sat, 1 Apr 2023 13:21:29 -0500
Subject: [PATCH 8/8] demo2: comment out most code unless compiler can support
 unicode chars in variable names

---
 examples/demo/demo2.cpp | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp
index 64144eab70..5818fae22d 100644
--- a/examples/demo/demo2.cpp
+++ b/examples/demo/demo2.cpp
@@ -34,14 +34,34 @@ int main(int argc, char* argv[]) {
 
   using namespace TA;
 
+  // requires compiler new enough to support unicode characters in variable
+  // names
+#ifndef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
+#ifdef TILEDARRAY_CXX_COMPILER_IS_GCC
+#if __GNUC__ >= 10
+#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
+#endif
+#elif !defined(TILEDARRAY_CXX_COMPILER_IS_ICC)
+#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
+#endif
+#endif  // !defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)
+
+#ifdef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
+
   // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   Range ρ{{1, 11}, {-1, 9}};
+  // lower bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   assert((ρ.lobound() == Index{1, -1}));
+  // upper bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   assert((ρ.upbound() == Index{11, 9}));
+  // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   assert((ρ.extent() == Index{10, 10}));
+  // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
   assert(ρ.volume() == 100);
+  // row-major order
   assert((ρ.stride() == Index{10, 1}));
   assert((ρ.ordinal({1, -1}) == 0));
+  assert((ρ.ordinal({1, 0}) == 1));
   assert((ρ.ordinal({10, 8}) + 1 == ρ.volume()));
   // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] "
   for (auto&& idx : ρ) cout << idx << " ";
@@ -87,6 +107,8 @@ int main(int argc, char* argv[]) {
   // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2.
   auto t00 = array0.find({0, 0});
 
+#endif  // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)
+
   return 0;
 }