From 08854201787720bda96783b7d9378d6411135af1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 24 Jun 2024 13:49:46 -0400 Subject: [PATCH 1/4] amend computation of the root dir of CUDAToolkit: CUDAToolkit_INCLUDE_DIR is not defined, but CUDAToolkit_LIBRARY_DIR is --- external/cuda.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/cuda.cmake b/external/cuda.cmake index 00a8b17477..2e757b60c4 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -28,7 +28,7 @@ foreach (library cublas;nvToolsExt) endforeach() if (NOT DEFINED CUDAToolkit_ROOT) - get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_INCLUDE_DIR}/../" ABSOLUTE CACHE) + get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_LIBRARY_DIR}/../" ABSOLUTE CACHE) endif(NOT DEFINED CUDAToolkit_ROOT) # sanitize implicit dirs if CUDA host compiler != C++ compiler From b9b663a8bf920d6eaccc0bd29236cbf35f883828 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 24 Jun 2024 13:57:29 -0400 Subject: [PATCH 2/4] bump Umpire tag to v2024.02.1 --- INSTALL.md | 2 +- external/umpire.cmake | 2 +- external/versions.cmake | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index b0705e6f1e..96e7259ed5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -69,7 +69,7 @@ Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). - - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 20839b2e8e8972070dd8f75c7f00d50d6c399716). + - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag v2024.02.1). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite diff --git a/external/umpire.cmake b/external/umpire.cmake index 57675ca189..37152e98d2 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -215,7 +215,7 @@ else() TiledArray_UMPIRE PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "$;$;$;$;$;$" + "$;$;$;$;$;$;$" INTERFACE_LINK_LIBRARIES "$;$" ) diff --git a/external/versions.cmake b/external/versions.cmake index 8443052d37..e0680a6d48 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -22,8 +22,8 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_UMPIRE_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) -set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2023.06.0) +set(TA_TRACKED_UMPIRE_TAG v2024.02.1) +set(TA_TRACKED_UMPIRE_PREVIOUS_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) From d2dd697528172327cead2c5146dbe1bcd61529e7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 25 Jun 2024 16:06:48 -0400 Subject: [PATCH 3/4] if CMAKE_CUDA_HOST_COMPILER is not set, set it to CMAKE_CXX_COMPILER in case it's not in PATH --- external/cuda.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/external/cuda.cmake b/external/cuda.cmake index 2e757b60c4..aa1e51e53e 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -12,6 +12,11 @@ if (DEFINED CMAKE_CUDA_FLAGS) else() set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr") endif() +# if CMAKE_CUDA_HOST_COMPILER not set, set it to CMAKE_CXX_COMPILER, else NVCC will grab something from PATH +if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER) + set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "The host C++ compiler to be used by the CUDA compiler") +endif() + enable_language(CUDA) set(CUDA_FOUND TRUE) From 9decfc25800db7d86d0132525925d8ea52bf9537 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jun 2024 07:29:13 -0400 Subject: [PATCH 4/4] amends 577fda29 to always use the old implementation of ta_tensor_to_um_tensor when element conversion is needed --- src/TiledArray/device/btas_um_tensor.h | 70 +++++++++++++++++--------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h index 45f9b63731..dec80dcaf1 100644 --- a/src/TiledArray/device/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -67,7 +67,7 @@ struct ArchiveLoadImpl> { TiledArray::btasUMTensorVarray &t) { TiledArray::Range range{}; TiledArray::device_um_btas_varray store{}; - ar &range &store; + ar & range & store; t = TiledArray::btasUMTensorVarray(std::move(range), std::move(store)); // device::setDevice(TiledArray::deviceEnv::instance()->default_device_id()); // auto &stream = device::stream_for(range); @@ -83,7 +83,7 @@ struct ArchiveStoreImpl> { auto stream = TiledArray::device::stream_for(t.range()); TiledArray::to_execution_space( t.storage(), stream); - ar &t.range() & t.storage(); + ar & t.range() & t.storage(); } }; @@ -674,25 +674,12 @@ template typename std::enable_if::value, TiledArray::DistArray>::type ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { - auto convert_tile_memcpy = [](const TATensor &tile) { - /// UMTensor must be wrapped into TA::Tile - - using Tensor = typename UMTensor::tensor_type; - - auto stream = device::stream_for(tile.range()); - typename Tensor::storage_type storage; - make_device_storage(storage, tile.range().area(), stream); - Tensor result(tile.range(), std::move(storage)); - - DeviceSafeCall( - device::memcpyAsync(result.data(), tile.data(), - tile.size() * sizeof(typename Tensor::value_type), - device::MemcpyDefault, stream)); - - device::sync_madness_task_with(stream); - return TiledArray::Tile(std::move(result)); - }; + using inT = typename TATensor::value_type; + using outT = typename UMTensor::value_type; + // check if element conversion is necessary + constexpr bool T_conversion = !std::is_same_v; + // this is safe even when need to convert element types, but less efficient auto convert_tile_um = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile @@ -711,14 +698,47 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { TiledArray::to_execution_space( result.storage(), stream); + // N.B. move! without it have D-to-H transfer due to calling UM + // allocator construct() on the host return TiledArray::Tile(std::move(result)); }; - const char *use_legacy_conversion = - std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); - auto um_array = use_legacy_conversion - ? to_new_tile_type(array, convert_tile_um) - : to_new_tile_type(array, convert_tile_memcpy); + TiledArray::DistArray um_array; + if constexpr (T_conversion) { + um_array = to_new_tile_type(array, convert_tile_um); + } else { + // this is more efficient for copying: + // - avoids copy on host followed by UM transfer, instead uses direct copy + // - replaced unneeded copy (which also caused D-to-H transfer due to + // calling UM allocator construct() on the host) by move + // This eliminates all spurious UM traffic in (T) W3 contractions + auto convert_tile_memcpy = [](const TATensor &tile) { + /// UMTensor must be wrapped into TA::Tile + + using Tensor = typename UMTensor::tensor_type; + + auto stream = device::stream_for(tile.range()); + typename Tensor::storage_type storage; + make_device_storage(storage, tile.range().area(), stream); + Tensor result(tile.range(), std::move(storage)); + + DeviceSafeCall( + device::memcpyAsync(result.data(), tile.data(), + tile.size() * sizeof(typename Tensor::value_type), + device::MemcpyDefault, stream)); + + device::sync_madness_task_with(stream); + // N.B. move! without it have D-to-H transfer due to calling UM + // allocator construct() on the host + return TiledArray::Tile(std::move(result)); + }; + + const char *use_legacy_conversion = + std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); + um_array = use_legacy_conversion + ? to_new_tile_type(array, convert_tile_um) + : to_new_tile_type(array, convert_tile_memcpy); + } array.world().gop.fence(); return um_array;