INSTALL.md (2 changes: 1 addition & 1 deletion)
@@ -69,7 +69,7 @@ Optional prerequisites:
- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required.
- [HIP/ROCm compiler and runtime](https://rocm.docs.amd.com/) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly, only its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably.
- [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece).
-- [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 20839b2e8e8972070dd8f75c7f00d50d6c399716).
+- [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag v2024.02.1).
- [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
- [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:
- [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite
Expand Down
external/cuda.cmake (7 changes: 6 additions & 1 deletion)
@@ -12,6 +12,11 @@ if (DEFINED CMAKE_CUDA_FLAGS)
else()
set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
endif()
+# if CMAKE_CUDA_HOST_COMPILER is not set, default it to CMAKE_CXX_COMPILER; otherwise NVCC will grab whatever host compiler it finds in PATH
+if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
+  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "The host C++ compiler to be used by the CUDA compiler")
+endif()
+
enable_language(CUDA)

set(CUDA_FOUND TRUE)
@@ -28,7 +33,7 @@ foreach (library cublas;nvToolsExt)
endforeach()

if (NOT DEFINED CUDAToolkit_ROOT)
-  get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_INCLUDE_DIR}/../" ABSOLUTE CACHE)
+  get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_LIBRARY_DIR}/../" ABSOLUTE CACHE)
endif(NOT DEFINED CUDAToolkit_ROOT)

# sanitize implicit dirs if CUDA host compiler != C++ compiler
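Aside on the new host-compiler default above: a `CACHE` assignment guarded by `if (NOT DEFINED ...)` only fires when the user has not already chosen a value, so an explicit `-DCMAKE_CUDA_HOST_COMPILER=...` on the configure line still wins. A minimal standalone sketch of the pattern (project name and message are illustrative, and configuring it requires a working CUDA toolchain):

```cmake
cmake_minimum_required(VERSION 3.18)
project(host_compiler_demo CXX)

# Default the CUDA host compiler to the C++ compiler unless the user chose one;
# without this, nvcc silently uses whatever host compiler it finds in PATH,
# which may be ABI-incompatible with the project's C++ compiler.
if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING
      "The host C++ compiler to be used by the CUDA compiler")
endif()

enable_language(CUDA)
message(STATUS "CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
```

For example, `cmake -DCMAKE_CUDA_HOST_COMPILER=/usr/bin/g++-12 .` (path hypothetical) would bypass the default.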
external/umpire.cmake (2 changes: 1 addition & 1 deletion)
@@ -215,7 +215,7 @@ else()
TiledArray_UMPIRE
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES
"$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/include>;$<INSTALL_INTERFACE:${_UMPIRE_INSTALL_DIR}/include>"
"$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/fmt/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/src/tpl/umpire/camp/include>;$<BUILD_INTERFACE:${EXTERNAL_BUILD_DIR}/include>;$<INSTALL_INTERFACE:${_UMPIRE_INSTALL_DIR}/include>"
INTERFACE_LINK_LIBRARIES
"$<BUILD_INTERFACE:${UMPIRE_BUILD_BYPRODUCTS}>;$<INSTALL_INTERFACE:${_UMPIRE_INSTALL_DIR}/lib/libumpire${UMPIRE_DEFAULT_LIBRARY_SUFFIX}>"
)
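Context for the one-line change above: the new Umpire vendors the {fmt} library under `src/tpl/umpire/fmt` (as the added path suggests), so its headers must be visible to consumers of the build tree, while installed trees already gather headers under a single `include/`. A minimal sketch of how the two generator expressions split a consumer's include path (target and directory names are hypothetical):

```cmake
add_library(demo_dep INTERFACE)
target_include_directories(demo_dep INTERFACE
  # seen only by targets consuming demo_dep from within this build tree
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/tpl/fmt/include>
  # resolved relative to the install prefix after `cmake --install`
  $<INSTALL_INTERFACE:include>)
```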
external/versions.cmake (4 changes: 2 additions & 2 deletions)
@@ -22,8 +22,8 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14)
set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece)
set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83)

-set(TA_TRACKED_UMPIRE_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716)
-set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2023.06.0)
+set(TA_TRACKED_UMPIRE_TAG v2024.02.1)
+set(TA_TRACKED_UMPIRE_PREVIOUS_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716)

set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81)
set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf )
src/TiledArray/device/btas_um_tensor.h (70 changes: 45 additions & 25 deletions)
@@ -67,7 +67,7 @@ struct ArchiveLoadImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
TiledArray::btasUMTensorVarray<T> &t) {
TiledArray::Range range{};
TiledArray::device_um_btas_varray<T> store{};
-    ar &range &store;
+    ar & range & store;
t = TiledArray::btasUMTensorVarray<T>(std::move(range), std::move(store));
// device::setDevice(TiledArray::deviceEnv::instance()->default_device_id());
// auto &stream = device::stream_for(range);
@@ -83,7 +83,7 @@ struct ArchiveStoreImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
auto stream = TiledArray::device::stream_for(t.range());
TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
t.storage(), stream);
-    ar &t.range() & t.storage();
+    ar & t.range() & t.storage();
}
};
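For readers unfamiliar with the idiom being reformatted above: in MADNESS-style archives, `operator&` is the symmetric serialize operator, and `ar & a & b` chains left to right because each call returns the archive. A self-contained mock of the chaining pattern (not the MADNESS API; a byte-stream toy for trivially copyable types only):

```cpp
#include <iostream>
#include <vector>

// Toy output archive: operator& appends raw bytes and returns *this so that
// `ar & a & b` chains, just like `ar & range & store` in the diff above.
struct MockOutputArchive {
  std::vector<unsigned char> bytes;
  template <typename T>
  MockOutputArchive& operator&(const T& t) {
    const auto* p = reinterpret_cast<const unsigned char*>(&t);
    bytes.insert(bytes.end(), p, p + sizeof(T));
    return *this;
  }
};

int main() {
  MockOutputArchive ar;
  int rank = 2;
  double volume = 16.0;
  ar & rank & volume;  // serializes rank, then volume
  std::cout << ar.bytes.size() << " bytes serialized\n";  // 12 on typical ABIs
}
```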

@@ -674,25 +674,12 @@ template <typename UMTensor, typename TATensor, typename Policy>
typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
TiledArray::DistArray<UMTensor, Policy>>::type
ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
-  auto convert_tile_memcpy = [](const TATensor &tile) {
-    /// UMTensor must be wrapped into TA::Tile
-
-    using Tensor = typename UMTensor::tensor_type;
-
-    auto stream = device::stream_for(tile.range());
-    typename Tensor::storage_type storage;
-    make_device_storage(storage, tile.range().area(), stream);
-    Tensor result(tile.range(), std::move(storage));
-
-    DeviceSafeCall(
-        device::memcpyAsync(result.data(), tile.data(),
-                            tile.size() * sizeof(typename Tensor::value_type),
-                            device::MemcpyDefault, stream));
-
-    device::sync_madness_task_with(stream);
-    return TiledArray::Tile<Tensor>(std::move(result));
-  };
+  using inT = typename TATensor::value_type;
+  using outT = typename UMTensor::value_type;
+  // check if element conversion is necessary
+  constexpr bool T_conversion = !std::is_same_v<inT, outT>;

+  // this is safe even when element types need to be converted, but is less efficient
  auto convert_tile_um = [](const TATensor &tile) {
    /// UMTensor must be wrapped into TA::Tile

@@ -711,14 +698,47 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
    TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
        result.storage(), stream);

-    // N.B. move! without it have D-to-H transfer due to calling UM
-    // allocator construct() on the host
    return TiledArray::Tile<Tensor>(std::move(result));
  };

-  const char *use_legacy_conversion =
-      std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION");
-  auto um_array = use_legacy_conversion
-                      ? to_new_tile_type(array, convert_tile_um)
-                      : to_new_tile_type(array, convert_tile_memcpy);
+  TiledArray::DistArray<UMTensor, Policy> um_array;
+  if constexpr (T_conversion) {
+    um_array = to_new_tile_type(array, convert_tile_um);
+  } else {
+    // this is more efficient for copying:
+    // - avoids a copy on the host followed by a UM transfer; uses a direct
+    //   copy instead
+    // - replaces an unneeded copy (which also caused a D-to-H transfer due to
+    //   the UM allocator's construct() being called on the host) with a move
+    // This eliminates all spurious UM traffic in (T) W3 contractions
+    auto convert_tile_memcpy = [](const TATensor &tile) {
+      /// UMTensor must be wrapped into TA::Tile
+
+      using Tensor = typename UMTensor::tensor_type;
+
+      auto stream = device::stream_for(tile.range());
+      typename Tensor::storage_type storage;
+      make_device_storage(storage, tile.range().area(), stream);
+      Tensor result(tile.range(), std::move(storage));
+
+      DeviceSafeCall(
+          device::memcpyAsync(result.data(), tile.data(),
+                              tile.size() * sizeof(typename Tensor::value_type),
+                              device::MemcpyDefault, stream));
+
+      device::sync_madness_task_with(stream);
+      // N.B. the move matters! without it there is a D-to-H transfer due to
+      // the UM allocator's construct() being called on the host
+      return TiledArray::Tile<Tensor>(std::move(result));
+    };
+
+    const char *use_legacy_conversion =
+        std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION");
+    um_array = use_legacy_conversion
+                   ? to_new_tile_type(array, convert_tile_um)
+                   : to_new_tile_type(array, convert_tile_memcpy);
+  }

array.world().gop.fence();
return um_array;
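The core of the new logic is the compile-time split on element-type equality: identical types take a raw asynchronous copy, while differing types fall back to the element-wise converting path. A standalone sketch of the same dispatch, with `std::vector` and `std::memcpy` standing in for device storage and `device::memcpyAsync` (the `convert` helper here is illustrative, not TiledArray's API):

```cpp
#include <algorithm>
#include <cstring>
#include <type_traits>
#include <vector>

// Dispatch at compile time: bitwise copy when In == Out (the
// convert_tile_memcpy analogue), element-wise conversion otherwise
// (the convert_tile_um analogue).
template <typename Out, typename In>
std::vector<Out> convert(const std::vector<In>& in) {
  std::vector<Out> out(in.size());
  if constexpr (std::is_same_v<In, Out>) {
    std::memcpy(out.data(), in.data(), in.size() * sizeof(In));
  } else {
    std::transform(in.begin(), in.end(), out.begin(),
                   [](const In& x) { return static_cast<Out>(x); });
  }
  return out;
}

int main() {
  std::vector<double> d{1.0, 2.0, 3.0};
  auto same = convert<double>(d);     // memcpy path
  auto narrowed = convert<float>(d);  // converting path
  return (same.size() == 3 && narrowed.size() == 3) ? 0 : 1;
}
```

Note that even when the element types match, the committed code still honors the `TA_DEVICE_LEGACY_UM_CONVERSION` environment variable: when set, the legacy UM-based conversion is used instead of the direct memcpy path.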