From 08854201787720bda96783b7d9378d6411135af1 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 24 Jun 2024 13:49:46 -0400
Subject: [PATCH 1/4] amend computation of the root dir of CUDAToolkit:
 CUDAToolkit_INCLUDE_DIR is not defined, but CUDAToolkit_LIBRARY_DIR is

---
 external/cuda.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/cuda.cmake b/external/cuda.cmake
index 00a8b17477..2e757b60c4 100644
--- a/external/cuda.cmake
+++ b/external/cuda.cmake
@@ -28,7 +28,7 @@ foreach (library cublas;nvToolsExt)
 endforeach()
 
 if (NOT DEFINED CUDAToolkit_ROOT)
-  get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_INCLUDE_DIR}/../" ABSOLUTE CACHE)
+  get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_LIBRARY_DIR}/../" ABSOLUTE CACHE)
 endif(NOT DEFINED CUDAToolkit_ROOT)
 
 # sanitize implicit dirs if CUDA host compiler != C++ compiler

From b9b663a8bf920d6eaccc0bd29236cbf35f883828 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 24 Jun 2024 13:57:29 -0400
Subject: [PATCH 2/4] bump Umpire tag to v2024.02.1

---
 INSTALL.md              | 2 +-
 external/umpire.cmake   | 2 +-
 external/versions.cmake | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index b0705e6f1e..96e7259ed5 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -69,7 +69,7 @@ Optional prerequisites:
     - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required.
     - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably.
   - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece).
-  - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 20839b2e8e8972070dd8f75c7f00d50d6c399716).
+  - [Umpire](https://github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag v2024.02.1).
 - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
 - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:
   - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite
diff --git a/external/umpire.cmake b/external/umpire.cmake
index 57675ca189..37152e98d2 100644
--- a/external/umpire.cmake
+++ b/external/umpire.cmake
@@ -215,7 +215,7 @@ else()
             TiledArray_UMPIRE
             PROPERTIES
             INTERFACE_INCLUDE_DIRECTORIES
-            "$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/include>;$<INSTALL_INTERFACE:${_UMPIRE_INSTALL_DIR}/include>"
+            "$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/fmt/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/include>;$<INSTALL_INTERFACE:${_UMPIRE_INSTALL_DIR}/include>"
             INTERFACE_LINK_LIBRARIES
             "$<BUILD_INTERFACE:${UMPIRE_BUILD_BYPRODUCTS}>;$<INSTALL_INTERFACE:${_UMPIRE_INSTALL_DIR}/lib/libumpire${UMPIRE_DEFAULT_LIBRARY_SUFFIX}>"
             )
diff --git a/external/versions.cmake b/external/versions.cmake
index 8443052d37..e0680a6d48 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -22,8 +22,8 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14)
 set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece)
 set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83)
 
-set(TA_TRACKED_UMPIRE_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716)
-set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2023.06.0)
+set(TA_TRACKED_UMPIRE_TAG v2024.02.1)
+set(TA_TRACKED_UMPIRE_PREVIOUS_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716)
 
 set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81)
 set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf )

From d2dd697528172327cead2c5146dbe1bcd61529e7 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 25 Jun 2024 16:06:48 -0400
Subject: [PATCH 3/4] if CMAKE_CUDA_HOST_COMPILER is not set, set it to
 CMAKE_CXX_COMPILER in case it's not in PATH

---
 external/cuda.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/external/cuda.cmake b/external/cuda.cmake
index 2e757b60c4..aa1e51e53e 100644
--- a/external/cuda.cmake
+++ b/external/cuda.cmake
@@ -12,6 +12,11 @@ if (DEFINED CMAKE_CUDA_FLAGS)
 else()
   set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
 endif()
+# if CMAKE_CUDA_HOST_COMPILER is not set, set it to CMAKE_CXX_COMPILER; otherwise NVCC will pick up whatever host compiler it finds in PATH
+if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
+  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "The host C++ compiler to be used by the CUDA compiler")
+endif()
+
 enable_language(CUDA)
 
 set(CUDA_FOUND TRUE)

From 9decfc25800db7d86d0132525925d8ea52bf9537 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 26 Jun 2024 07:29:13 -0400
Subject: [PATCH 4/4] amends 577fda29 to always use the old implementation of
 ta_tensor_to_um_tensor when element conversion is needed

---
 src/TiledArray/device/btas_um_tensor.h | 70 +++++++++++++++++---------
 1 file changed, 45 insertions(+), 25 deletions(-)

diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h
index 45f9b63731..dec80dcaf1 100644
--- a/src/TiledArray/device/btas_um_tensor.h
+++ b/src/TiledArray/device/btas_um_tensor.h
@@ -67,7 +67,7 @@ struct ArchiveLoadImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
                           TiledArray::btasUMTensorVarray<T> &t) {
     TiledArray::Range range{};
     TiledArray::device_um_btas_varray<T> store{};
-    ar &range &store;
+    ar & range & store;
     t = TiledArray::btasUMTensorVarray<T>(std::move(range), std::move(store));
     // device::setDevice(TiledArray::deviceEnv::instance()->default_device_id());
     // auto &stream = device::stream_for(range);
@@ -83,7 +83,7 @@ struct ArchiveStoreImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
     auto stream = TiledArray::device::stream_for(t.range());
     TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
         t.storage(), stream);
-    ar &t.range() & t.storage();
+    ar & t.range() & t.storage();
   }
 };
 
@@ -674,25 +674,12 @@ template <typename UMTensor, typename TATensor, typename Policy>
 typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
                         TiledArray::DistArray<UMTensor, Policy>>::type
 ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
-  auto convert_tile_memcpy = [](const TATensor &tile) {
-    /// UMTensor must be wrapped into TA::Tile
-
-    using Tensor = typename UMTensor::tensor_type;
-
-    auto stream = device::stream_for(tile.range());
-    typename Tensor::storage_type storage;
-    make_device_storage(storage, tile.range().area(), stream);
-    Tensor result(tile.range(), std::move(storage));
-
-    DeviceSafeCall(
-        device::memcpyAsync(result.data(), tile.data(),
-                            tile.size() * sizeof(typename Tensor::value_type),
-                            device::MemcpyDefault, stream));
-
-    device::sync_madness_task_with(stream);
-    return TiledArray::Tile<Tensor>(std::move(result));
-  };
+  using inT = typename TATensor::value_type;
+  using outT = typename UMTensor::value_type;
+  // check if element conversion is necessary
+  constexpr bool T_conversion = !std::is_same_v<inT, outT>;
 
+  // this is safe even when element types must be converted, but less efficient
   auto convert_tile_um = [](const TATensor &tile) {
     /// UMTensor must be wrapped into TA::Tile
 
@@ -711,14 +698,47 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
     TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
         result.storage(), stream);
 
+    // N.B. move! without it we get a D-to-H transfer due to calling the UM
+    // allocator's construct() on the host
     return TiledArray::Tile<Tensor>(std::move(result));
   };
 
-  const char *use_legacy_conversion =
-      std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION");
-  auto um_array = use_legacy_conversion
-                      ? to_new_tile_type(array, convert_tile_um)
-                      : to_new_tile_type(array, convert_tile_memcpy);
+  TiledArray::DistArray<UMTensor, Policy> um_array;
+  if constexpr (T_conversion) {
+    um_array = to_new_tile_type(array, convert_tile_um);
+  } else {
+    // This is more efficient for copying:
+    // - avoids copy on host followed by UM transfer, instead uses direct copy
+    // - replaces the unneeded copy (which also caused a D-to-H transfer due to
+    //   calling the UM allocator's construct() on the host) with a move
+    // This eliminates all spurious UM traffic in (T) W3 contractions.
+    auto convert_tile_memcpy = [](const TATensor &tile) {
+      /// UMTensor must be wrapped into TA::Tile
+
+      using Tensor = typename UMTensor::tensor_type;
+
+      auto stream = device::stream_for(tile.range());
+      typename Tensor::storage_type storage;
+      make_device_storage(storage, tile.range().area(), stream);
+      Tensor result(tile.range(), std::move(storage));
+
+      DeviceSafeCall(
+          device::memcpyAsync(result.data(), tile.data(),
+                              tile.size() * sizeof(typename Tensor::value_type),
+                              device::MemcpyDefault, stream));
+
+      device::sync_madness_task_with(stream);
+      // N.B. move! without it we get a D-to-H transfer due to calling the UM
+      // allocator's construct() on the host
+      return TiledArray::Tile<Tensor>(std::move(result));
+    };
+
+    const char *use_legacy_conversion =
+        std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION");
+    um_array = use_legacy_conversion
+                   ? to_new_tile_type(array, convert_tile_um)
+                   : to_new_tile_type(array, convert_tile_memcpy);
+  }
 
   array.world().gop.fence();
   return um_array;