From ca27eb0782861fe4e21678461e294594df269179 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Mar 2023 19:45:14 -0400 Subject: [PATCH 1/8] extend device API for complex types, as illustrated by ta_dense_cuda --- examples/cuda/ta_dense_cuda.cpp | 110 ++++++++++++------ examples/dgemm/ta_dense_asymm.cpp | 34 +++--- src/TiledArray/cuda/btas_um_tensor.cpp | 16 +++ src/TiledArray/cuda/btas_um_tensor.h | 16 +++ src/TiledArray/cuda/cpu_cuda_vector.cu | 21 +++- src/TiledArray/cuda/cpu_cuda_vector.h | 2 + src/TiledArray/cuda/cublas.h | 61 ++++++++++ src/TiledArray/cuda/kernel/mult_kernel.cu | 19 +++ src/TiledArray/cuda/kernel/mult_kernel.h | 20 ++++ src/TiledArray/cuda/kernel/reduce_kernel.cu | 41 ++++++- src/TiledArray/cuda/kernel/reduce_kernel.h | 34 ++++++ .../cuda/kernel/reduce_kernel_impl.h | 31 +++-- 12 files changed, 337 insertions(+), 68 deletions(-) diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp index 14f692329b..4a035f176b 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/cuda/ta_dense_cuda.cpp @@ -137,23 +137,31 @@ template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, const long Nn, const long Bn, const long Nk, const long Bk, const long nrepeat) { - using Real = typename Storage::value_type; + using T = TiledArray::detail::numeric_t; + using RT = TiledArray::detail::scalar_t; + constexpr auto complex_T = TiledArray::detail::is_complex_v; const std::size_t Tm = Nm / Bm; const std::size_t Tn = Nn / Bn; const std::size_t Tk = Nk / Bk; + const std::int64_t nflops = + (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real + // 1 add takes 2/1 flops for complex/real + * static_cast(Nn) * static_cast(Nm) * + static_cast(Nk); + if (world.rank() == 0) std::cout << "TiledArray: dense matrix multiply test...\n" << "Number of nodes = " << world.size() << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)" + << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)" << "\nSize of A block = " << Bm << "x" << Bk << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)" + << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" << "\nSize of B block = " << Bk << "x" << Bn << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)" + << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" << "\nSize of C block = " << Bm << "x" << Bn << "\n# of blocks of C = " << Tm * Tn << "\nAverage # of blocks of C/node = " @@ -205,14 +213,13 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, TiledArray::TiledRange // TRange for b trange_b(blocking_B.begin(), blocking_B.end()); - using value_type = typename Storage::value_type; - using CUDATile = btas::Tensor; + using CUDATile = btas::Tensor; using CUDAMatrix = TA::DistArray>; - using TAMatrix = TA::DistArray>; + using TAMatrix = TA::DistArray>; CUDAMatrix c(world, trange_c); - value_type val_a = 0.03; - value_type val_b = 0.02; + auto val_a = 0.03; + auto val_b = 0.02; { // Construct and initialize arrays @@ -235,19 +242,26 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, // start profiler cudaProfilerStart(); - // Start clock - const double wall_time_start = madness::wall_time(); + double total_time = 0.0; + double total_gflop_rate = 0.0; // Do matrix multiplication for (int i = 0; i < nrepeat; ++i) { double iter_time_start = madness::wall_time(); // c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n"); c("m,n") = a("m,k") * b("k,n"); + c.world().gop.fence(); // fence since GEMM can return early double iter_time_stop = madness::wall_time(); + const double iter_time = iter_time_stop - iter_time_start; + total_time += iter_time; + const double gflop_rate = double(nflops) / (iter_time * 1.e9); + total_gflop_rate += gflop_rate; if (world.rank() == 0) - std::cout << "Iteration " << i + 1 - << " wall time: " << (iter_time_stop - iter_time_start) + std::cout << "Iteration " << i + 1 << " wall time: " << iter_time << "\n"; + if (world.rank() == 0) + std::cout << "Iteration " << i + 1 << " time=" << time + << " GFLOPS=" << gflop_rate << "\n"; } // Stop clock const double wall_time_stop = madness::wall_time(); @@ -256,32 +270,43 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, cudaProfilerStop(); if (world.rank() == 0) - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(nrepeat) + std::cout << "Average wall time = " << total_time / double(nrepeat) << " sec\nAverage GFLOPS = " - << double(nrepeat) * 2.0 * double(Nn * Nm * Nm) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + << total_gflop_rate / double(nrepeat) << "\n"; } - double threshold = - std::numeric_limits::epsilon(); + double threshold = std::numeric_limits::epsilon(); auto dot_length = Nk; // auto result = dot_length * val_a * val_b + val_a - val_b; - auto result = dot_length * val_a * val_b; + T result; + if constexpr (complex_T) { + result = T(dot_length * val_a * val_b, 0.); + } else + result = dot_length * val_a * val_b; auto verify = [&world, &threshold, &result, &dot_length](TA::Tile &tile) { auto n_elements = tile.size(); for (std::size_t i = 0; i < n_elements; i++) { - double abs_err = fabs(tile[i] - result); + double abs_err = std::abs(tile[i] - result); // double abs_val = fabs(tile[i]); - double rel_err = abs_err / result / dot_length; + double rel_err = abs_err / std::abs(result) / dot_length; if (rel_err > threshold) { + auto to_string = [](const auto &v) { + constexpr bool complex_T = + TiledArray::detail::is_complex_v>; + if constexpr (complex_T) { + std::string result; + result = "{" + std::to_string(v.real()) + "," + + std::to_string(v.imag()) + "}"; + return result; + } else + return std::to_string(v); + }; std::cout << "Node: " << world.rank() << " Tile: " << tile.range() << " id: " << i - << std::string(" gpu: " + std::to_string(tile[i]) + - " cpu: " + std::to_string(result) + "\n"); + << std::string(" gpu: " + to_string(tile[i]) + + " cpu: " + to_string(result) + "\n"); break; } } @@ -308,7 +333,7 @@ int try_main(int argc, char **argv) { "blocked by Bm, Bn, and Bk, respectively" << std::endl << "Usage: " << argv[0] - << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [real = double] " + << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] " "[storage type = cuda_um_btas_varray]\n"; return 0; } @@ -337,13 +362,13 @@ int try_main(int argc, char **argv) { return 1; } - const auto real_type_str = - (argc >= 9) ? std::string(argv[8]) : std::string("double"); - - if (real_type_str != "float" && real_type_str != "double") { - std::cerr << "Error: invalid real type: " << real_type_str - << "\n Valid option includes: float or " - "double. \n"; + const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double"); + if (scalar_type_str != "double" && scalar_type_str != "float" && + scalar_type_str != "zdouble" && scalar_type_str != "zfloat") { + std::cerr << "Error: invalid real type " << scalar_type_str << ".\n"; + std::cerr << " valid real types are \"double\", \"float\", " + "\"zdouble\", and \"zfloat\".\n"; + return 1; } const auto storage_type = @@ -357,7 +382,7 @@ int try_main(int argc, char **argv) { "cuda_um_btas_varray or cuda_um_thrust_vector " "or cpu_cuda_vector. \n"; } - std::cout << "Storage type: " << storage_type << "<" << real_type_str << ">" + std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">" << std::endl; // auto to_bool = [](const std::string &str) { // return (str == "true" || str == "True" || str == "TRUE" || str == "1" || @@ -424,7 +449,7 @@ int try_main(int argc, char **argv) { } // print device properties // if (storage_type == "cpu_cuda_vector") { - // if (real_type_str == "double") + // if (scalar_type_str == "double") // do_main_body>(world, Nm, Bm, Nn, // Bn, // Nk, Bk, nrepeat); @@ -434,15 +459,24 @@ int try_main(int argc, char **argv) { // Nk, Bk, nrepeat); // } else if (storage_type == "cuda_um_btas_varray") { if (storage_type == "cuda_um_btas_varray") { - if (real_type_str == "double") + if (scalar_type_str == "double") do_main_body>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - else + else if (scalar_type_str == "float") do_main_body>(world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); + else if (scalar_type_str == "zdouble") + do_main_body>>( + world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); + else if (scalar_type_str == "zfloat") + do_main_body>>( + world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); + else { + abort(); // unreachable + } } // else if (storage_type == "cuda_um_thrust_vector") { - // if (real_type_str == "double") + // if (scalar_type_str == "double") // do_main_body>( // world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); // else diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/dgemm/ta_dense_asymm.cpp index ac72a39209..40183603bb 100644 --- a/examples/dgemm/ta_dense_asymm.cpp +++ b/examples/dgemm/ta_dense_asymm.cpp @@ -75,23 +75,6 @@ int main(int argc, char** argv) { const std::size_t Tn = Nn / Bn; const std::size_t Tk = Nk / Bk; - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test...\n" - << "Number of nodes = " << world.size() - << "\nScalar type = " << scalar_type_str - << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of A block = " << Bm << "x" << Bk - << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of B block = " << Bk << "x" << Bn - << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of C block = " << Bm << "x" << Bn - << "\n# of blocks of C = " << Tm * Tn - << "\nAverage # of blocks of C/node = " - << double(Tm * Tn) / double(world.size()) << "\n"; - // Construct TiledRange std::vector blocking_m; blocking_m.reserve(Tm + 1); @@ -148,6 +131,23 @@ int main(int argc, char** argv) { * static_cast(Nn) * static_cast(Nm) * static_cast(Nk); + if (world.rank() == 0) + std::cout << "TiledArray: dense matrix multiply test...\n" + << "Number of nodes = " << world.size() + << "\nScalar type = " << scalar_type_str + << "\nSize of A = " << Nm << "x" << Nk << " (" + << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of A block = " << Bm << "x" << Bk + << "\nSize of B = " << Nk << "x" << Nn << " (" + << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of B block = " << Bk << "x" << Bn + << "\nSize of C = " << Nm << "x" << Nn << " (" + << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of C block = " << Bm << "x" << Bn + << "\n# of blocks of C = " << Tm * Tn + << "\nAverage # of blocks of C/node = " + << double(Tm * Tn) / double(world.size()) << "\n"; + auto memtrace = [do_memtrace, &world](const std::string& str) -> void { if (do_memtrace) { world.gop.fence(); diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp index 58c3981f18..9423e7563d 100644 --- a/src/TiledArray/cuda/btas_um_tensor.cpp +++ b/src/TiledArray/cuda/btas_um_tensor.cpp @@ -11,6 +11,10 @@ template class btas::varray>; template class btas::varray>; +template class btas::varray< + std::complex, TiledArray::cuda_um_allocator>>; +template class btas::varray, + TiledArray::cuda_um_allocator>>; template class btas::varray>; template class btas::varray>; @@ -18,6 +22,12 @@ template class btas::Tensor>; template class btas::Tensor>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; template class btas::Tensor>; template class btas::Tensor>>; template class TiledArray::Tile>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; template class TiledArray::Tile< btas::Tensor>>; template class TiledArray::Tile &array) { extern template class btas::varray>; extern template class btas::varray>; +extern template class btas::varray< + std::complex, TiledArray::cuda_um_allocator>>; +extern template class btas::varray< + std::complex, TiledArray::cuda_um_allocator>>; extern template class btas::varray>; extern template class btas::varray>; @@ -787,6 +791,12 @@ extern template class btas::Tensor>; extern template class btas::Tensor>; +extern template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; +extern template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; extern template class btas::Tensor>; extern template class btas::Tensor>>; extern template class TiledArray::Tile>>; +extern template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; +extern template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; extern template class TiledArray::Tile< btas::Tensor>>; extern template class TiledArray::Tile>( size_t size) { dev_vec.resize(size); } +template<> +void resize,thrust::device_allocator>>( + thrust::device_vector, thrust::device_allocator>>& dev_vec, + size_t size) { + dev_vec.resize(size); +} +template<> +void resize,thrust::device_allocator>>( + thrust::device_vector, thrust::device_allocator>>& dev_vec, + size_t size) { + dev_vec.resize(size); +} } namespace TiledArray { template class cpu_cuda_vector; template class cpu_cuda_vector; +template class cpu_cuda_vector>; +template class cpu_cuda_vector>; } // Thrust included in CUDA 9+ seems to generate uninstantiated CUB calls @@ -35,6 +49,12 @@ auto force_missing_copy_instantiations_double() { auto force_missing_copy_instantiations_float() { return force_missing_copy_instantiations(); } +auto force_missing_copy_instantiations_zdouble() { + return force_missing_copy_instantiations>(); +} +auto force_missing_copy_instantiations_zfloat() { + return force_missing_copy_instantiations>(); +} auto force_missing_copy_instantiations_unsigned_long() { return force_missing_copy_instantiations(); } @@ -65,4 +85,3 @@ auto force_missing_copy_n_instantiations_long_long(){ } #endif // __CUDACC_VER_MAJOR__ >= 9 - diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/cuda/cpu_cuda_vector.h index 7370eeaa2e..5a6e52beb5 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.h +++ b/src/TiledArray/cuda/cpu_cuda_vector.h @@ -158,6 +158,8 @@ class cpu_cuda_vector { extern template class cpu_cuda_vector; extern template class cpu_cuda_vector; +extern template class cpu_cuda_vector>; +extern template class cpu_cuda_vector>; template diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/cuda/cublas.h index a5d3da7afc..8d4085eabb 100644 --- a/src/TiledArray/cuda/cublas.h +++ b/src/TiledArray/cuda/cublas.h @@ -54,6 +54,25 @@ inline void __cublasSafeCall(cublasStatus_t err, const char *file, namespace TiledArray { +namespace detail { + +template +auto cublasPointer(T *std_complex_ptr) { + using Scalar = TiledArray::detail::scalar_t; + static_assert(std::is_same_v || + std::is_same_v); + constexpr bool DP = std::is_same_v; + using cuT = std::conditional_t, + cuDoubleComplex, cuComplex>; + if constexpr (std::is_const_v< + std::remove_pointer_t>) { + return reinterpret_cast(std_complex_ptr); + } else + return reinterpret_cast(std_complex_ptr); +}; + +} // namespace detail + /* * cuBLAS interface functions */ @@ -117,6 +136,29 @@ inline cublasStatus_t cublasGemm( return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } +template <> +inline cublasStatus_t cublasGemm>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const std::complex *alpha, + const std::complex *A, int lda, const std::complex *B, + int ldb, const std::complex *beta, std::complex *C, int ldc) { + using detail::cublasPointer; + return cublasCgemm(handle, transa, transb, m, n, k, cublasPointer(alpha), + cublasPointer(A), lda, cublasPointer(B), ldb, + cublasPointer(beta), cublasPointer(C), ldc); +} +template <> +inline cublasStatus_t cublasGemm>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const std::complex *alpha, + const std::complex *A, int lda, const std::complex *B, + int ldb, const std::complex *beta, std::complex *C, + int ldc) { + using detail::cublasPointer; + return cublasZgemm(handle, transa, transb, m, n, k, cublasPointer(alpha), + cublasPointer(A), lda, cublasPointer(B), ldb, + cublasPointer(beta), cublasPointer(C), ldc); +} /// AXPY interface functions @@ -139,6 +181,25 @@ inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } +template <> +inline cublasStatus_t cublasAxpy, std::complex>( + cublasHandle_t handle, int n, const std::complex *alpha, + const std::complex *x, int incx, std::complex *y, int incy) { + using detail::cublasPointer; + return cublasCaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx, + cublasPointer(y), incy); +} + +template <> +inline cublasStatus_t cublasAxpy, std::complex>( + cublasHandle_t handle, int n, const std::complex *alpha, + const std::complex *x, int incx, std::complex *y, + int incy) { + using detail::cublasPointer; + return cublasZaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx, + cublasPointer(y), incy); +} + template <> inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, const int *alpha, const float *x, diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/cuda/kernel/mult_kernel.cu index 8bbcae4927..aa3cadbc72 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel.cu +++ b/src/TiledArray/cuda/kernel/mult_kernel.cu @@ -45,6 +45,16 @@ void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); } +void mult_to_cuda_kernel(std::complex *result, const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id) { + mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); +} + +void mult_to_cuda_kernel(std::complex *result, const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id) { + mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); +} + /// result[i] = arg1[i] * arg2[i] void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, cudaStream_t stream, int device_id){ @@ -61,6 +71,15 @@ void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, st mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); } +void mult_cuda_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id){ + mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); +} + +void mult_cuda_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id){ + mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); +} } // namespace TiledArray diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/cuda/kernel/mult_kernel.h index 7c333e879a..0c5c3f7822 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel.h +++ b/src/TiledArray/cuda/kernel/mult_kernel.h @@ -28,6 +28,8 @@ #ifdef TILEDARRAY_HAS_CUDA +#include + namespace TiledArray { /// result[i] = result[i] * arg[i] @@ -40,6 +42,14 @@ void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n, void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, cudaStream_t stream, int device_id); +void mult_to_cuda_kernel(std::complex *result, + const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id); + +void mult_to_cuda_kernel(std::complex *result, + const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id); + /// result[i] = arg1[i] * arg2[i] void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, cudaStream_t stream, int device_id); @@ -50,6 +60,16 @@ void mult_cuda_kernel(float *result, const float *arg1, const float *arg2, void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, cudaStream_t stream, int device_id); +void mult_cuda_kernel(std::complex *result, + const std::complex *arg1, + const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id); + +void mult_cuda_kernel(std::complex *result, + const std::complex *arg1, + const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id); + } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/cuda/kernel/reduce_kernel.cu index 1e1550260f..d24669b920 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel.cu +++ b/src/TiledArray/cuda/kernel/reduce_kernel.cu @@ -33,7 +33,6 @@ namespace TiledArray { int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id){ return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); - } float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, @@ -47,6 +46,16 @@ double product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex product_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex product_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + + return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} // foreach(i) result += arg[i] int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, @@ -64,6 +73,16 @@ double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex sum_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex sum_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + // foreach(i) result = max(result, arg[i]) int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id){ @@ -112,6 +131,16 @@ double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex absmax_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex absmax_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + // foreach(i) result = min(result, abs(arg[i])) int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id){ @@ -128,6 +157,16 @@ double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex absmin_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex absmin_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/cuda/kernel/reduce_kernel.h index 857cad6c0c..1bcf526ee4 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel.h +++ b/src/TiledArray/cuda/kernel/reduce_kernel.h @@ -28,6 +28,8 @@ #ifdef TILEDARRAY_HAS_CUDA +#include + namespace TiledArray { // foreach(i) result *= arg[i] @@ -40,6 +42,14 @@ float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex product_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex product_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + // foreach(i) result += arg[i] int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id); @@ -50,6 +60,14 @@ float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex sum_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex sum_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + // foreach(i) result = max(result, arg[i]) int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id); @@ -80,6 +98,14 @@ float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex absmax_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex absmax_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + // foreach(i) result = min(result, abs(arg[i])) int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id); @@ -90,6 +116,14 @@ float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex absmin_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex absmin_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h index 12a8aa1e19..9dc6507cca 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h +++ b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -38,9 +39,15 @@ namespace TiledArray { namespace detail { template -struct absolute_value : public thrust::unary_function { - __host__ __device__ T operator()(const T &x) const { - return x < T(0) ? -x : x; +struct absolute_value + : public thrust::unary_function> { + __host__ __device__ TiledArray::detail::scalar_t operator()( + const T &x) const { + using RT = TiledArray::detail::scalar_t; + if constexpr (!TiledArray::detail::is_complex_v) { + return x < RT(0) ? -x : x; + } else + return std::sqrt(x.real() * x.real() + x.imag() * x.imag()); } }; @@ -93,10 +100,11 @@ T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, } template -T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(0); - thrust::maximum max_op; +TiledArray::detail::scalar_t absmax_reduce_cuda_kernel_impl( + const T *arg, std::size_t n, cudaStream_t stream, int device_id) { + using TR = TiledArray::detail::scalar_t; + TR init(0); + thrust::maximum max_op; detail::absolute_value abs_op; CudaSafeCall(cudaSetDevice(device_id)); @@ -110,10 +118,11 @@ T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n, } template -T absmin_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(0); - thrust::minimum min_op; +TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( + const T *arg, std::size_t n, cudaStream_t stream, int device_id) { + using TR = TiledArray::detail::scalar_t; + TR init = std::numeric_limits::max(); + thrust::minimum min_op; detail::absolute_value abs_op; CudaSafeCall(cudaSetDevice(device_id)); From 67b2360a57635bfed8a6fe598960ef9bda0e9cae Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Mar 2023 22:12:05 -0400 Subject: [PATCH 2/8] add --expt-relaxed-constexpr to CMAKE_CUDA_FLAGS to be able to handle std::complex --- external/cuda.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/external/cuda.cmake b/external/cuda.cmake index 3b2eb6ce37..49f2cbc558 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -5,6 +5,13 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_EXTENSIONS OFF) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +# N.B. need relaxed constexpr for std::complex +# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions%5B/url%5D: +if (DEFINED CMAKE_CUDA_FLAGS) + set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr ${CMAKE_CUDA_FLAGS}") +else() + set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr") +endif() enable_language(CUDA) set(CUDA_FOUND TRUE) From 661da1025f2f0e014113a6846a51b442afe91fdb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Mar 2023 18:55:06 -0400 Subject: [PATCH 3/8] a("i,j") = b("i,j") when a and b are same type is now deep copy of b into a to reduce surprise, since e.g. a("i,j") = b("j,i") makes new a --- src/TiledArray/expressions/tsr_expr.h | 10 ++++++++-- tests/expressions_impl.h | 12 ++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index a17fa65cbc..8430a3c852 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -112,8 +112,14 @@ class TsrExpr : public Expr> { /// Expression assignment operator /// \param other The expression that will be assigned to the array - array_type& operator=(TsrExpr_& other) { - other.eval_to(*this); + array_type& operator=(const TsrExpr_& other) { + // N.B. corner case: whether A("i,j") = B("i,j") is deep or shallow copy + // depends on whether the copy semantics of tiles ... to be sure use clone + if (IndexList(this->annotation()) == IndexList(other.annotation())) { + array_ = other.array().clone(); + } else { + other.eval_to(*this); + } return array_; } diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 388cdd8e5d..91bcb10cc4 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -64,6 +64,18 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { ca("a,b,c").block(boost::combine(lobound, upbound))); BOOST_CHECK_NO_THROW(c("a,b,c") = ca("a,b,c").block(iv(3, 3, 3), iv(5, 5, 5))); + + // make sure that c("abc") = a("abc") does a deep copy + { + BOOST_CHECK_NO_THROW(c("a,b,c") = a("a, b, c")); + for (auto&& idx : c.tiles_range()) { + if (c.is_local(idx) && !c.is_local(idx) && a.is_local(idx) && + !a.is_zero(idx)) { + BOOST_CHECK(c.find_local(idx).get().data() != + a.find_local(idx).get().data()); + } + } + } } BOOST_FIXTURE_TEST_CASE_TEMPLATE(block_tensor_factories, F, Fixtures, F) { From 9fab62564e90e590c8df58aec1b3ffd39bc03b80 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 06:56:33 -0400 Subject: [PATCH 4/8] SizeArray can be compared to a sized range --- src/TiledArray/size_array.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h index 6edbecb222..bd52139ce5 100644 --- a/src/TiledArray/size_array.h +++ b/src/TiledArray/size_array.h @@ -42,6 +42,15 @@ class SizeArray { T* first_ = nullptr; ///< First element of the array T* last_ = nullptr; ///< Last element of the array + // can compare to any sized range + template + friend std::enable_if_t< + is_sized_range_v> && + !std::is_same_v, std::remove_reference_t> && + !std::is_base_of_v, std::remove_reference_t>, + bool> + operator==(const SizeArray&, SizedRange&&); + public: // type definitions typedef T value_type; @@ -436,6 +445,19 @@ class SizeArray { }; // class SizeArray +template +std::enable_if_t< + is_sized_range_v> && + !std::is_same_v, std::remove_reference_t> && + !std::is_base_of_v, std::remove_reference_t>, + bool> +operator==(const SizeArray& idx1, SizedRange&& idx2) { + if (idx1.size() == idx2.size()) + return std::equal(idx1.begin(), idx1.end(), idx2.begin()); + else + return false; +} + template inline std::vector operator*(const Permutation& perm, const SizeArray& orig) { From 07fb1e51c90d88530821e8b57a45de2ababbea73 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 06:58:10 -0400 Subject: [PATCH 5/8] introduce {Index,IndexView} aliases to Range::{index_type,index_view_type} --- src/TiledArray/range.h | 4 ++++ tests/range.cpp | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 8108ecf227..b7a38d38b0 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1245,6 +1245,10 @@ class Range { }; // class Range +// lift Range::index_type and Range::index_view_type into user-land +using Index = Range::index_type; +using IndexView = Range::index_view_type; + inline Range& Range::operator*=(const Permutation& perm) { TA_ASSERT(perm.size() == rank_); if (rank_ > 1ul) { diff --git a/tests/range.cpp b/tests/range.cpp index 1ad294363f..a71a0629d0 100644 --- a/tests/range.cpp +++ b/tests/range.cpp @@ -65,15 +65,37 @@ BOOST_FIXTURE_TEST_SUITE(range_suite, RangeFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(dimension_accessor) { BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound_data(), r.lobound_data() + r.rank(), start.begin(), start.end()); // check start() + BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound().begin(), r.lobound().end(), + start.begin(), start.end()); // check start() + BOOST_CHECK_EQUAL(r.lobound(), start); // check start() + BOOST_CHECK_EQUAL(r.lobound(), + (Index{start.begin(), start.end()})); // check finish() BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound_data(), r.upbound_data() + r.rank(), finish.begin(), finish.end()); // check finish() + BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound().begin(), r.upbound().end(), + finish.begin(), + finish.end()); // check finish() + BOOST_CHECK_EQUAL(r.upbound(), finish); // check finish() + BOOST_CHECK_EQUAL(r.upbound(), + (Index{finish.begin(), finish.end()})); // check finish() BOOST_CHECK_EQUAL_COLLECTIONS(r.extent_data(), r.extent_data() + r.rank(), size.begin(), size.end()); // check size() + BOOST_CHECK_EQUAL_COLLECTIONS(r.extent().begin(), r.extent().end(), + size.begin(), size.end()); // check size() + BOOST_CHECK_EQUAL(r.extent(), size); // check size() + BOOST_CHECK_EQUAL(r.extent(), + (Index{size.begin(), size.end()})); // check size() BOOST_CHECK_EQUAL_COLLECTIONS(r.stride_data(), r.stride_data() + r.rank(), weight.begin(), weight.end()); // check weight() - BOOST_CHECK_EQUAL(r.volume(), volume); // check volume() + BOOST_CHECK_EQUAL_COLLECTIONS(r.stride().begin(), r.stride().end(), + weight.begin(), + weight.end()); // check weight() + BOOST_CHECK_EQUAL(r.stride(), weight); // check weight() + BOOST_CHECK_EQUAL(r.stride(), + (Index{weight.begin(), weight.end()})); // check weight() + BOOST_CHECK_EQUAL(r.volume(), volume); // check volume() for (size_t d = 0; d != r.rank(); ++d) { auto range_d = r.dim(d); BOOST_CHECK_EQUAL(range_d.first, start[d]); From d8ddc210b5ecf838dcfb66fd7acf38e869ff1810 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 06:59:36 -0400 Subject: [PATCH 6/8] added demo2 exec for the paper code snippets --- examples/demo/CMakeLists.txt | 8 ++-- examples/demo/demo2.cpp | 93 ++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 examples/demo/demo2.cpp diff --git a/examples/demo/CMakeLists.txt b/examples/demo/CMakeLists.txt index c4c533cbf0..c2da9cb36e 100644 --- a/examples/demo/CMakeLists.txt +++ b/examples/demo/CMakeLists.txt @@ -16,8 +16,10 @@ # along with this program. If not, see . # -# Create the ta_fock_build executable - -# Add the demo executable +# Standard TA demo to accompany the keynote slides add_ta_executable(demo "demo.cpp" "tiledarray") add_dependencies(examples-tiledarray demo) + +# TA demo snippets for the paper +add_ta_executable(demo2 "demo2.cpp" "tiledarray") +add_dependencies(examples-tiledarray demo2) diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp new file mode 100644 index 0000000000..64144eab70 --- /dev/null +++ b/examples/demo/demo2.cpp @@ -0,0 +1,93 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2023 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#ifndef EXAMPLES_DEMO_DEMO2_CPP_ +#define EXAMPLES_DEMO_DEMO2_CPP_ + +#include +#include + +#include +#include + +int main(int argc, char* argv[]) { + using namespace std; + + TA::srand(2017); + TA::World& world = TA::initialize(argc, argv); + + using namespace TA; + + // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ + Range ρ{{1, 11}, {-1, 9}}; + assert((ρ.lobound() == Index{1, -1})); + assert((ρ.upbound() == Index{11, 9})); + assert((ρ.extent() == Index{10, 10})); + assert(ρ.volume() == 100); + assert((ρ.stride() == Index{10, 1})); + assert((ρ.ordinal({1, -1}) == 0)); + assert((ρ.ordinal({10, 8}) + 1 == ρ.volume())); + // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] " + for (auto&& idx : ρ) cout << idx << " "; + + // $\mathbb{Z}_{1,11}$ tiled into $\mathbb{Z}_{1,5}$, $\mathbb{Z}_{5,8}$, and + // $\mathbb{Z}_{8,11}$ + TiledRange1 τ0{1, 5, 8, 11}; // hashmarks + assert(τ0.extent() == 10); // there are 10 elements in τ0 + assert((τ0.elements_range() == + Range1{1, 11})); // elements indexed by $\mathbb{Z}_{1,11}$ + assert(τ0.tile_extent() == 3); // there are 3 tiles in τ0 + assert((τ0.tiles_range() == + Range1{0, 3})); // tiles indexed by $\mathbb{Z}_{0,3}$ + assert((τ0.tile(1) == Range1{5, 8})); // 1st tile of τ0 is $\mathbb{Z}_{5,8}$ + + // $\mathbb{Z}_{-1,9}$ tiled into $\mathbb{Z}_{-1,5}$ and $\mathbb{Z}_{5,9}$ + TiledRange1 τ1{-1, 5, 9}; + + // 2nd tile of $\code{tau0}$ is $\mathbb{Z}_{5,8}$ + assert((τ0.tile(1) == Range1{5, 8})); + // 1st tile of $\code{tau1}$ is $\mathbb{Z}_{-1,5}$ + assert((τ1.tile(0) == Range1{-1, 5})); + + // prints "-1 0 1 2 3 4 " + for (auto&& i : τ1.tile(0)) cout << i << " "; + + // tiling of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ by tensor product + // of + // $\code{τ0}$ and $\code{τ1}$ + TiledRange τ{τ0, τ1}; + // shortcut + TiledRange same_as_τ{{1, 5, 8, 11}, {-1, 5, 9}}; + + // tile index {0,0} refers to tile $\mathbb{Z}_{1,5} \otimes + // \mathbb{Z}_{-1,5}$ + auto tile_0_0 = τ.tile({0, 0}); + assert((tile_0_0 == Range{{1, 5}, {-1, 5}})); + + // default instance of $\code{DistArray}$ is a dense array of $\code{double}$s + DistArray array0(world, τ); + array0.fill(1.0); // fill $\code{array0}$ with 1's + + // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2. + auto t00 = array0.find({0, 0}); + + return 0; +} + +#endif /* EXAMPLES_DEMO_DEMO2_CPP_ */ From 94a8fd9e71cd4bcc3f639bb05726314034e21841 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 07:00:10 -0400 Subject: [PATCH 7/8] demo: use TA::srand --- examples/demo/demo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/demo/demo.cpp b/examples/demo/demo.cpp index d6c1612d95..05f9a25bf5 100644 --- a/examples/demo/demo.cpp +++ b/examples/demo/demo.cpp @@ -39,7 +39,7 @@ auto make_tile(const TA::Range &range) { int main(int argc, char *argv[]) { using namespace std; - std::srand(2017); + TA::srand(2017); TA::World &world = TA::initialize(argc, argv); using namespace TA; @@ -88,7 +88,6 @@ int main(int argc, char *argv[]) { SparseShape shape(shape_tensor, TR); TSpArrayD a1(world, TR, shape); a1.fill_random(); // for deterministic fill: - // TA::srand(seed); // a1.fill_random(); cout << "a1:\n" << a1 << endl; world.gop.fence(); From 6a55cdceb1a9808bb7135f9b25a5ccadd73f3787 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 1 Apr 2023 13:21:29 -0500 Subject: [PATCH 8/8] demo2: comment out most code unless compiler can support unicode chars in variable names --- examples/demo/demo2.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp index 64144eab70..5818fae22d 100644 --- a/examples/demo/demo2.cpp +++ b/examples/demo/demo2.cpp @@ -34,14 +34,34 @@ int main(int argc, char* argv[]) { using namespace TA; + // requires compiler new enough to support unicode characters in variable + // names +#ifndef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES +#ifdef TILEDARRAY_CXX_COMPILER_IS_GCC +#if __GNUC__ >= 10 +#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1 +#endif +#elif !defined(TILEDARRAY_CXX_COMPILER_IS_ICC) +#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1 +#endif +#endif // !defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) + +#ifdef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES + // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ Range ρ{{1, 11}, {-1, 9}}; + // lower bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.lobound() == Index{1, -1})); + // upper bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.upbound() == Index{11, 9})); + // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.extent() == Index{10, 10})); + // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert(ρ.volume() == 100); + // row-major order assert((ρ.stride() == Index{10, 1})); assert((ρ.ordinal({1, -1}) == 0)); + assert((ρ.ordinal({1, 0}) == 1)); assert((ρ.ordinal({10, 8}) + 1 == ρ.volume())); // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] " for (auto&& idx : ρ) cout << idx << " "; @@ -87,6 +107,8 @@ int main(int argc, char* argv[]) { // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2. auto t00 = array0.find({0, 0}); +#endif // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) + return 0; }