ValeevGroup · evaleev · Apr 2, 2023 · Mar 27, 2023 · Mar 28, 2023 · Mar 29, 2023
diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp
@@ -137,23 +137,31 @@ template <typename Storage>
 void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
                   const long Nn, const long Bn, const long Nk, const long Bk,
                   const long nrepeat) {
-  using Real = typename Storage::value_type;
+  using T = TiledArray::detail::numeric_t<Storage>;
+  using RT = TiledArray::detail::scalar_t<Storage>;
+  constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;
 
   const std::size_t Tm = Nm / Bm;
   const std::size_t Tn = Nn / Bn;
   const std::size_t Tk = Nk / Bk;
 
+  const std::int64_t nflops =
+      (complex_T ? 8 : 2)  // 1 multiply takes 6/1 flops for complex/real
+                           // 1 add takes 2/1 flops for complex/real
+      * static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
+      static_cast<std::int64_t>(Nk);
+
   if (world.rank() == 0)
     std::cout << "TiledArray: dense matrix multiply test...\n"
               << "Number of nodes     = " << world.size()
               << "\nSize of A         = " << Nm << "x" << Nk << " ("
-              << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
+              << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
               << "\nSize of A block   = " << Bm << "x" << Bk
               << "\nSize of B         = " << Nk << "x" << Nn << " ("
-              << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)"
+              << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
               << "\nSize of B block   = " << Bk << "x" << Bn
               << "\nSize of C         = " << Nm << "x" << Nn << " ("
-              << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)"
+              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
               << "\nSize of C block   = " << Bm << "x" << Bn
               << "\n# of blocks of C  = " << Tm * Tn
               << "\nAverage # of blocks of C/node = "
@@ -205,14 +213,13 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
   TiledArray::TiledRange  // TRange for b
       trange_b(blocking_B.begin(), blocking_B.end());
 
-  using value_type = typename Storage::value_type;
-  using CUDATile = btas::Tensor<Real, TA::Range, Storage>;
+  using CUDATile = btas::Tensor<T, TA::Range, Storage>;
   using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;
-  using TAMatrix = TA::DistArray<TA::Tensor<value_type>>;
+  using TAMatrix = TA::DistArray<TA::Tensor<T>>;
 
   CUDAMatrix c(world, trange_c);
-  value_type val_a = 0.03;
-  value_type val_b = 0.02;
+  auto val_a = 0.03;
+  auto val_b = 0.02;
 
   {
     // Construct and initialize arrays
@@ -235,19 +242,26 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     // start profiler
     cudaProfilerStart();
 
-    // Start clock
-    const double wall_time_start = madness::wall_time();
+    double total_time = 0.0;
+    double total_gflop_rate = 0.0;
 
     // Do matrix multiplication
     for (int i = 0; i < nrepeat; ++i) {
       double iter_time_start = madness::wall_time();
       //      c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n");
       c("m,n") = a("m,k") * b("k,n");
+      c.world().gop.fence();  // fence since GEMM can return early
       double iter_time_stop = madness::wall_time();
+      const double iter_time = iter_time_stop - iter_time_start;
+      total_time += iter_time;
+      const double gflop_rate = double(nflops) / (iter_time * 1.e9);
+      total_gflop_rate += gflop_rate;
       if (world.rank() == 0)
-        std::cout << "Iteration " << i + 1
-                  << " wall time: " << (iter_time_stop - iter_time_start)
+        std::cout << "Iteration " << i + 1 << " wall time: " << iter_time
                   << "\n";
+      if (world.rank() == 0)
+        std::cout << "Iteration " << i + 1 << "   time=" << time
+                  << "   GFLOPS=" << gflop_rate << "\n";
     }
     // Stop clock
     const double wall_time_stop = madness::wall_time();
@@ -256,32 +270,43 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
     cudaProfilerStop();
 
     if (world.rank() == 0)
-      std::cout << "Average wall time   = "
-                << (wall_time_stop - wall_time_start) / double(nrepeat)
+      std::cout << "Average wall time   = " << total_time / double(nrepeat)
                 << " sec\nAverage GFLOPS      = "
-                << double(nrepeat) * 2.0 * double(Nn * Nm * Nm) /
-                       (wall_time_stop - wall_time_start) / 1.0e9
-                << "\n";
+                << total_gflop_rate / double(nrepeat) << "\n";
   }
 
-  double threshold =
-      std::numeric_limits<typename Storage::value_type>::epsilon();
+  double threshold = std::numeric_limits<RT>::epsilon();
   auto dot_length = Nk;
   //  auto result = dot_length * val_a * val_b + val_a - val_b;
-  auto result = dot_length * val_a * val_b;
+  T result;
+  if constexpr (complex_T) {
+    result = T(dot_length * val_a * val_b, 0.);
+  } else
+    result = dot_length * val_a * val_b;
 
   auto verify = [&world, &threshold, &result,
                  &dot_length](TA::Tile<CUDATile> &tile) {
     auto n_elements = tile.size();
     for (std::size_t i = 0; i < n_elements; i++) {
-      double abs_err = fabs(tile[i] - result);
+      double abs_err = std::abs(tile[i] - result);
       //      double abs_val = fabs(tile[i]);
-      double rel_err = abs_err / result / dot_length;
+      double rel_err = abs_err / std::abs(result) / dot_length;
       if (rel_err > threshold) {
+        auto to_string = [](const auto &v) {
+          constexpr bool complex_T =
+              TiledArray::detail::is_complex_v<std::decay_t<decltype(v)>>;
+          if constexpr (complex_T) {
+            std::string result;
+            result = "{" + std::to_string(v.real()) + "," +
+                     std::to_string(v.imag()) + "}";
+            return result;
+          } else
+            return std::to_string(v);
+        };
         std::cout << "Node: " << world.rank() << " Tile: " << tile.range()
                   << " id: " << i
-                  << std::string(" gpu: " + std::to_string(tile[i]) +
-                                 " cpu: " + std::to_string(result) + "\n");
+                  << std::string(" gpu: " + to_string(tile[i]) +
+                                 " cpu: " + to_string(result) + "\n");
         break;
       }
     }
@@ -308,7 +333,7 @@ int try_main(int argc, char **argv) {
                  "blocked by Bm, Bn, and Bk, respectively"
               << std::endl
               << "Usage: " << argv[0]
-              << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [real = double] "
+              << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] "
                  "[storage type = cuda_um_btas_varray]\n";
     return 0;
   }
@@ -337,13 +362,13 @@ int try_main(int argc, char **argv) {
     return 1;
   }
 
-  const auto real_type_str =
-      (argc >= 9) ? std::string(argv[8]) : std::string("double");
-
-  if (real_type_str != "float" && real_type_str != "double") {
-    std::cerr << "Error: invalid real type: " << real_type_str
-              << "\n Valid option includes: float or "
-                 "double. \n";
+  const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
+  if (scalar_type_str != "double" && scalar_type_str != "float" &&
+      scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
+    std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
+    std::cerr << "       valid real types are \"double\", \"float\", "
+                 "\"zdouble\", and \"zfloat\".\n";
+    return 1;
   }
 
   const auto storage_type =
@@ -357,7 +382,7 @@ int try_main(int argc, char **argv) {
                  "cuda_um_btas_varray or cuda_um_thrust_vector "
                  "or cpu_cuda_vector. \n";
   }
-  std::cout << "Storage type: " << storage_type << "<" << real_type_str << ">"
+  std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">"
             << std::endl;
   //  auto to_bool = [](const std::string &str) {
   //    return (str == "true" || str == "True" || str == "TRUE" || str == "1" ||
@@ -424,7 +449,7 @@ int try_main(int argc, char **argv) {
   }  // print device properties
 
   //  if (storage_type == "cpu_cuda_vector") {
-  //    if (real_type_str == "double")
+  //    if (scalar_type_str == "double")
   //      do_main_body<TiledArray::cpu_cuda_vector<double>>(world, Nm, Bm, Nn,
   //      Bn,
   //                                                        Nk, Bk, nrepeat);
@@ -434,15 +459,24 @@ int try_main(int argc, char **argv) {
   //                                                       Nk, Bk, nrepeat);
   //  } else if (storage_type == "cuda_um_btas_varray") {
   if (storage_type == "cuda_um_btas_varray") {
-    if (real_type_str == "double")
+    if (scalar_type_str == "double")
       do_main_body<TiledArray::cuda_um_btas_varray<double>>(
           world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
-    else
+    else if (scalar_type_str == "float")
       do_main_body<TiledArray::cuda_um_btas_varray<float>>(world, Nm, Bm, Nn,
                                                            Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "zdouble")
+      do_main_body<TiledArray::cuda_um_btas_varray<std::complex<double>>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "zfloat")
+      do_main_body<TiledArray::cuda_um_btas_varray<std::complex<float>>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else {
+      abort();  // unreachable
+    }
   }
   // else if (storage_type == "cuda_um_thrust_vector") {
-  //    if (real_type_str == "double")
+  //    if (scalar_type_str == "double")
   //      do_main_body<TiledArray::cuda_um_thrust_vector<double>>(
   //          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
   //    else

diff --git a/examples/demo/CMakeLists.txt b/examples/demo/CMakeLists.txt
@@ -16,8 +16,10 @@
 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-# Create the ta_fock_build executable
-
-# Add the demo executable
+# Standard TA demo to accompany the keynote slides
 add_ta_executable(demo "demo.cpp" "tiledarray")
 add_dependencies(examples-tiledarray demo)
+
+# TA demo snippets for the paper
+add_ta_executable(demo2 "demo2.cpp" "tiledarray")
+add_dependencies(examples-tiledarray demo2)
diff --git a/examples/demo/demo.cpp b/examples/demo/demo.cpp
@@ -39,7 +39,7 @@ auto make_tile(const TA::Range &range) {
 int main(int argc, char *argv[]) {
   using namespace std;
 
-  std::srand(2017);
+  TA::srand(2017);
   TA::World &world = TA::initialize(argc, argv);
 
   using namespace TA;
@@ -88,7 +88,6 @@ int main(int argc, char *argv[]) {
   SparseShape<float> shape(shape_tensor, TR);
   TSpArrayD a1(world, TR, shape);
   a1.fill_random();  // for deterministic fill:
-                     // TA::srand(seed);
                      // a1.fill_random<HostExecutor::Thread>();
   cout << "a1:\n" << a1 << endl;
   world.gop.fence();

diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp
@@ -0,0 +1,115 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2023  Virginia Tech
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef EXAMPLES_DEMO_DEMO2_CPP_
+#define EXAMPLES_DEMO_DEMO2_CPP_
+
+#include <tiledarray.h>
+#include <random>
+
+#include <TiledArray/expressions/einsum.h>
+#include <TiledArray/math/solvers/cp.h>
+
+int main(int argc, char* argv[]) {
+  using namespace std;
+
+  TA::srand(2017);
+  TA::World& world = TA::initialize(argc, argv);
+
+  using namespace TA;
+
+  // requires compiler new enough to support unicode characters in variable
+  // names
+#ifndef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
+#ifdef TILEDARRAY_CXX_COMPILER_IS_GCC
+#if __GNUC__ >= 10
+#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
+#endif
+#elif !defined(TILEDARRAY_CXX_COMPILER_IS_ICC)
+#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
+#endif
+#endif  // !defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)
+
+#ifdef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
+
+  // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  Range ρ{{1, 11}, {-1, 9}};
+  // lower bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert((ρ.lobound() == Index{1, -1}));
+  // upper bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert((ρ.upbound() == Index{11, 9}));
+  // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert((ρ.extent() == Index{10, 10}));
+  // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert(ρ.volume() == 100);
+  // row-major order
+  assert((ρ.stride() == Index{10, 1}));
+  assert((ρ.ordinal({1, -1}) == 0));
+  assert((ρ.ordinal({1, 0}) == 1));
+  assert((ρ.ordinal({10, 8}) + 1 == ρ.volume()));
+  // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] "
+  for (auto&& idx : ρ) cout << idx << " ";
+
+  // $\mathbb{Z}_{1,11}$ tiled into $\mathbb{Z}_{1,5}$, $\mathbb{Z}_{5,8}$, and
+  // $\mathbb{Z}_{8,11}$
+  TiledRange1 τ0{1, 5, 8, 11};  // hashmarks
+  assert(τ0.extent() == 10);    // there are 10 elements in τ0
+  assert((τ0.elements_range() ==
+          Range1{1, 11}));        // elements indexed by $\mathbb{Z}_{1,11}$
+  assert(τ0.tile_extent() == 3);  // there are 3 tiles in τ0
+  assert((τ0.tiles_range() ==
+          Range1{0, 3}));                // tiles indexed by $\mathbb{Z}_{0,3}$
+  assert((τ0.tile(1) == Range1{5, 8}));  // 1st tile of τ0 is $\mathbb{Z}_{5,8}$
+
+  // $\mathbb{Z}_{-1,9}$ tiled into $\mathbb{Z}_{-1,5}$ and $\mathbb{Z}_{5,9}$
+  TiledRange1 τ1{-1, 5, 9};
+
+  // 2nd tile of $\code{tau0}$ is $\mathbb{Z}_{5,8}$
+  assert((τ0.tile(1) == Range1{5, 8}));
+  // 1st tile of $\code{tau1}$ is $\mathbb{Z}_{-1,5}$
+  assert((τ1.tile(0) == Range1{-1, 5}));
+
+  // prints "-1 0 1 2 3 4 "
+  for (auto&& i : τ1.tile(0)) cout << i << " ";
+
+  // tiling of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ by tensor product
+  // of
+  // $\code{τ0}$ and $\code{τ1}$
+  TiledRange τ{τ0, τ1};
+  // shortcut
+  TiledRange same_as_τ{{1, 5, 8, 11}, {-1, 5, 9}};
+
+  // tile index {0,0} refers to tile $\mathbb{Z}_{1,5} \otimes
+  // \mathbb{Z}_{-1,5}$
+  auto tile_0_0 = τ.tile({0, 0});
+  assert((tile_0_0 == Range{{1, 5}, {-1, 5}}));
+
+  // default instance of $\code{DistArray}$ is a dense array of $\code{double}$s
+  DistArray array0(world, τ);
+  array0.fill(1.0);  // fill $\code{array0}$ with 1's
+
+  // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2.
+  auto t00 = array0.find({0, 0});
+
+#endif  // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)
+
+  return 0;
+}
+
+#endif /* EXAMPLES_DEMO_DEMO2_CPP_ */