diff --git a/INSTALL.md b/INSTALL.md index 3606a2bd25..c06535172e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -64,7 +64,7 @@ Compiling BTAS requires the following prerequisites: Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [cuTT](github.com/ValeevGroup/cutt) -- CUDA transpose library; note that our fork of the [original cuTT repo](github.com/ap-hynninen/cutt) is required to provide thread-safety (tag 0e8685bf82910bc7435835f846e88f1b39f47f09). + - [LibreTT](https://github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](https://github.com/ap-hynninen/cutt) with our additional thread-safety improvements (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: @@ -329,7 +329,7 @@ Support for execution on CUDA-enabled hardware is controlled by the following va * `ENABLE_CUDA` -- Set to `ON` to turn on CUDA support. [Default=OFF]. * `CMAKE_CUDA_HOST_COMPILER` -- Set to the path to the host C++ compiler to be used by CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.). 
* `ENABLE_CUDA_ERROR_CHECK` -- Set to `ON` to turn on assertions for successful completion of calls to CUDA runtime and libraries. [Default=OFF]. -* `CUTT_INSTALL_DIR` -- the installation prefix of the pre-installed cuTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install cuTT. +* `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install LibreTT. * `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should not be normally needed; it is strongly recommended to let TiledArray build and install Umpire. For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info. diff --git a/bin/admin/dependency-versions-update-hook.py b/bin/admin/dependency-versions-update-hook.py index 19b7123703..686b98b49a 100755 --- a/bin/admin/dependency-versions-update-hook.py +++ b/bin/admin/dependency-versions-update-hook.py @@ -106,11 +106,11 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' btas_old_tag = tokens[2] else: btas_new_tag = tokens[2] - elif tokens[1].find('CUTT') != -1: + elif tokens[1].find('LIBRETT') != -1: if tokens[1].find('PREVIOUS') != -1: - cutt_old_tag = tokens[2] + librett_old_tag = tokens[2] else: - cutt_new_tag = tokens[2] + librett_new_tag = tokens[2] elif tokens[1].find('UMPIRE') != -1: if tokens[1].find('PREVIOUS') != -1: umpire_old_tag = tokens[2] @@ -146,8 +146,8 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' # BTAS tag in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'BTAS', btas_old_tag, btas_new_tag, 'ValeevGroup/BTAS), tag ', '') -# cuTT tag in INSTALL.md -any_files_changed |= 
replace_dep_id(topsrc, 'md', 'cuTT', cutt_old_tag, cutt_new_tag, '', '') +# LibreTT tag in INSTALL.md +any_files_changed |= replace_dep_id(topsrc, 'md', 'LibreTT', librett_old_tag, librett_new_tag, '', '') # Umpire tag in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'Umpire', umpire_old_tag, umpire_new_tag, '', '') diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index 2f6affe700..5d7f56c86e 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -25,7 +25,7 @@ if(CUDA_FOUND) - foreach(_exec cuda_cutt cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) + foreach(_exec cuda_librett cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") diff --git a/examples/cuda/cuda_cutt.cpp b/examples/cuda/cuda_librett.cpp similarity index 98% rename from examples/cuda/cuda_cutt.cpp rename to examples/cuda/cuda_librett.cpp index edaefc2597..a916bfc729 100644 --- a/examples/cuda/cuda_cutt.cpp +++ b/examples/cuda/cuda_librett.cpp @@ -29,7 +29,7 @@ #include /** - * Test cuTT + * Test LibreTT */ const std::size_t N = 100; diff --git a/external/cuda.cmake b/external/cuda.cmake index 1e5ebd8d60..3b2eb6ce37 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -42,6 +42,6 @@ message(STATUS "CMAKE Implicit Link Directories: ${CMAKE_CUDA_IMPLICIT_LINK_DIRE include(external/umpire.cmake) ## -## cuTT +## LibreTT ## -include(external/cutt.cmake) +include(external/librett.cmake) diff --git a/external/cutt.cmake b/external/librett.cmake similarity index 53% rename from external/cutt.cmake rename to external/librett.cmake index dbf4e94f91..a238f3af92 100644 --- a/external/cutt.cmake +++ b/external/librett.cmake @@ -1,48 +1,48 @@ ## -## find cuTT +## find LibreTT ## -find_path(_CUTT_INSTALL_DIR NAMES include/cutt.h lib/libcutt.a HINTS ${CUTT_INSTALL_DIR}) +find_path(_LIBRETT_INSTALL_DIR NAMES include/librett.h 
lib/librett.a HINTS ${LIBRETT_INSTALL_DIR}) -if( _CUTT_INSTALL_DIR ) +if( _LIBRETT_INSTALL_DIR ) - message(STATUS "cuTT found at ${_CUTT_INSTALL_DIR}") + message(STATUS "LibreTT found at ${_LIBRETT_INSTALL_DIR}") elseif(TA_EXPERT) - message("** cuTT was not found") - message(STATUS "** Downloading and building cuTT is explicitly disabled in EXPERT mode") + message("** LibreTT was not found") + message(STATUS "** Downloading and building LibreTT is explicitly disabled in EXPERT mode") else() - # TODO need to fix the auto installation of cuTT + # TODO need to fix the auto installation of LibreTT include(ExternalProject) # to pass CMAKE_C_* vars to external project enable_language(C) - # set source and build path for cuTT in the TiledArray project - set(EXTERNAL_SOURCE_DIR ${FETCHCONTENT_BASE_DIR}/cutt-src) - # cutt only supports in source build - set(EXTERNAL_BUILD_DIR ${FETCHCONTENT_BASE_DIR}/cutt-build) + # set source and build path for LibreTT in the TiledArray project + set(EXTERNAL_SOURCE_DIR ${FETCHCONTENT_BASE_DIR}/librett-src) + # librett only supports in source build + set(EXTERNAL_BUILD_DIR ${FETCHCONTENT_BASE_DIR}/librett-build) set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}) - if (NOT CUTT_URL) - set(CUTT_URL https://github.com/ValeevGroup/cutt.git) - endif (NOT CUTT_URL) - if (NOT CUTT_TAG) - set(CUTT_TAG ${TA_TRACKED_CUTT_TAG}) - endif (NOT CUTT_TAG) + if (NOT LIBRETT_URL) + set(LIBRETT_URL https://github.com/victor-anisimov/librett.git) + endif (NOT LIBRETT_URL) + if (NOT LIBRETT_TAG) + set(LIBRETT_TAG ${TA_TRACKED_LIBRETT_TAG}) + endif (NOT LIBRETT_TAG) - message("** Will clone cuTT from ${CUTT_URL}") + message("** Will clone LibreTT from ${LIBRETT_URL}") # need to change the separator of list to avoid issues with ExternalProject parsing # set(CUDA_FLAGS "${CUDA_NVCC_FLAGS}") # string(REPLACE ";" "::" CUDA_FLAGS "${CUDA_NVCC_FLAGS}") #message(STATUS "CUDA_FLAGS: " "${CUDA_FLAGS}") - set(CUTT_CMAKE_ARGS + set(LIBRETT_CMAKE_ARGS 
-DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DCMAKE_POSITION_INDEPENDENT_CODE=${CMAKE_POSITION_INDEPENDENT_CODE} @@ -66,87 +66,88 @@ else() -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} -DENABLE_UMPIRE=OFF - -DCUTT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool + -DLIBRETT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool -DCMAKE_PREFIX_PATH=${_UMPIRE_INSTALL_DIR} -DENABLE_NO_ALIGNED_ALLOC=ON -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} + -DENABLE_CUDA=ON ) if (DEFINED CMAKE_CUDA_ARCHITECTURES) - list(APPEND CUTT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) + list(APPEND LIBRETT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) endif(DEFINED CMAKE_CUDA_ARCHITECTURES) if (CMAKE_TOOLCHAIN_FILE) - set(CUTT_CMAKE_ARGS "${CUTT_CMAKE_ARGS}" + set(LIBRETT_CMAKE_ARGS "${LIBRETT_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") endif(CMAKE_TOOLCHAIN_FILE) if (BUILD_SHARED_LIBS) - set(CUTT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) - set(CUTT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) endif(BUILD_SHARED_LIBS) # N.B. 
Ninja needs spelling out the byproducts of custom targets, see https://cmake.org/cmake/help/v3.3/policy/CMP0058.html - set(CUTT_BUILD_BYPRODUCTS "${EXTERNAL_BUILD_DIR}/src/libcutt${CUTT_DEFAULT_LIBRARY_SUFFIX}") - message(STATUS "custom target cutt is expected to build these byproducts: ${CUTT_BUILD_BYPRODUCTS}") + set(LIBRETT_BUILD_BYPRODUCTS "${EXTERNAL_BUILD_DIR}/src/librett${LIBRETT_DEFAULT_LIBRARY_SUFFIX}") + message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}") - ExternalProject_Add(cutt + ExternalProject_Add(librett PREFIX ${CMAKE_INSTALL_PREFIX} - STAMP_DIR ${FETCHCONTENT_BASE_DIR}/cutt-ep-artifacts - TMP_DIR ${FETCHCONTENT_BASE_DIR}/cutt-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable + STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts + TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} - GIT_REPOSITORY ${CUTT_URL} - GIT_TAG ${CUTT_TAG} + GIT_REPOSITORY ${LIBRETT_URL} + GIT_TAG ${LIBRETT_TAG} #--Configure step------------- SOURCE_DIR ${EXTERNAL_SOURCE_DIR} LIST_SEPARATOR :: UPDATE_DISCONNECTED 1 CMAKE_ARGS - ${CUTT_CMAKE_ARGS} + ${LIBRETT_CMAKE_ARGS} ${EXTERNAL_SOURCE_DIR} #--Build step----------------- BINARY_DIR ${EXTERNAL_BUILD_DIR} - BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cutt -v - BUILD_BYPRODUCTS ${CUTT_BUILD_BYPRODUCTS} + BUILD_COMMAND ${CMAKE_COMMAND} --build . --target librett -v + BUILD_BYPRODUCTS ${LIBRETT_BUILD_BYPRODUCTS} #--Install step--------------- - INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "cuTT will be installed during TiledArray's installation." + INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "LibreTT will be installed during TiledArray's installation." 
#--Custom targets------------- STEP_TARGETS build ) - # TiledArray_CUTT target depends on existence of this directory to be usable from the build tree at configure time + # TiledArray_LIBRETT target depends on existence of this directory to be usable from the build tree at configure time execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src") - # do install of cuTT as part of building TiledArray's install target + # do install of LibreTT as part of building TiledArray's install target install(CODE "execute_process( COMMAND \"${CMAKE_COMMAND}\" \"--build\" \".\" \"--target\" \"install\" WORKING_DIRECTORY \"${EXTERNAL_BUILD_DIR}\" RESULT_VARIABLE error_code) if(error_code) - message(FATAL_ERROR \"Failed to install cuTT\") + message(FATAL_ERROR \"Failed to install LibreTT\") endif() ") - # Add cuTT dependency to External - add_dependencies(External-tiledarray cutt-build) + # Add LibreTT dependency to External + add_dependencies(External-tiledarray librett-build) - set(_CUTT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) + set(_LIBRETT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) -endif(_CUTT_INSTALL_DIR) +endif(_LIBRETT_INSTALL_DIR) -add_library(TiledArray_CUTT INTERFACE) +add_library(TiledArray_LIBRETT INTERFACE) -set_target_properties(TiledArray_CUTT +set_target_properties(TiledArray_LIBRETT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "$;$" + "$;$" INTERFACE_LINK_LIBRARIES - "$;$" + "$;$" ) -install(TARGETS TiledArray_CUTT EXPORT tiledarray COMPONENT tiledarray) +install(TARGETS TiledArray_LIBRETT EXPORT tiledarray COMPONENT tiledarray) -#TODO test cuTT +#TODO test LibreTT diff --git a/external/versions.cmake b/external/versions.cmake index 4ac855e249..c1120147d9 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -27,8 +27,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) set(TA_TRACKED_BTAS_TAG 242871710dabd5ef337e5253000d3e38c1d977ba) set(TA_TRACKED_BTAS_PREVIOUS_TAG db884b020b5c13c312c07df9d5c03cea2d65afb2) 
-set(TA_TRACKED_CUTT_TAG 0e8685bf82910bc7435835f846e88f1b39f47f09) -set(TA_TRACKED_CUTT_PREVIOUS_TAG 592198b93c93b7ca79e7900b9a9f2e79f9dafec3) +set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) +set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f5ed90793b..d6f055df8f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -197,7 +197,7 @@ if(CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h - TiledArray/external/cutt.h + TiledArray/external/ta-librett.h TiledArray/cuda/cublas.h TiledArray/cuda/btas_cublas.h TiledArray/cuda/btas_um_tensor.h @@ -245,7 +245,7 @@ if(CUDA_FOUND) LANGUAGE CUDA) # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_CUTT) + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT) endif(CUDA_FOUND) diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/cuda/btas_um_tensor.h index d6012f00f1..2ec1fb9a6d 100644 --- a/src/TiledArray/cuda/btas_um_tensor.h +++ b/src/TiledArray/cuda/btas_um_tensor.h @@ -32,7 +32,7 @@ #include #include -#include +#include #include namespace TiledArray { @@ -187,7 +187,7 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, std::move(storage)); // invoke the permute function - cutt_permute(const_cast(device_data(arg.storage())), + librett_permute(const_cast(device_data(arg.storage())), device_data(result.storage()), arg.range(), perm, stream); synchronize_stream(&stream); diff --git a/src/TiledArray/external/cutt.h b/src/TiledArray/external/ta-librett.h similarity index 80% rename from src/TiledArray/external/cutt.h rename to src/TiledArray/external/ta-librett.h index a2a31ec20d..bc0da4de8a 100644 --- a/src/TiledArray/external/cutt.h +++ 
b/src/TiledArray/external/ta-librett.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED -#define TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED +#ifndef TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED +#define TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED #include @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include @@ -77,38 +77,39 @@ inline void permutation_to_col_major(std::vector& perm) { * @param stream the CUDA stream this permutation will be submitted to */ template -void cutt_permute(T* inData, T* outData, const TiledArray::Range& range, +void librett_permute(T* inData, T* outData, const TiledArray::Range& range, const TiledArray::Permutation& perm, cudaStream_t stream) { auto extent = range.extent(); std::vector extent_int(extent.begin(), extent.end()); - // cuTT uses FROM notation + // LibreTT uses FROM notation auto perm_inv = perm.inv(); std::vector perm_int(perm_inv.begin(), perm_inv.end()); - // cuTT uses ColMajor + // LibreTT uses ColMajor TiledArray::extent_to_col_major(extent_int); TiledArray::permutation_to_col_major(perm_int); - cuttResult_t status; + //librettResult_t status; + librettResult status; - cuttHandle plan; - status = cuttPlan(&plan, range.rank(), extent_int.data(), perm_int.data(), + librettHandle plan; + status = librettPlan(&plan, range.rank(), extent_int.data(), perm_int.data(), sizeof(T), stream); - TA_ASSERT(status == CUTT_SUCCESS); + TA_ASSERT(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, inData, outData); + status = librettExecute(plan, inData, outData); - TA_ASSERT(status == CUTT_SUCCESS); + TA_ASSERT(status == LIBRETT_SUCCESS); - status = cuttDestroy(plan); + status = librettDestroy(plan); - TA_ASSERT(status == CUTT_SUCCESS); + TA_ASSERT(status == LIBRETT_SUCCESS); } } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA -#endif // TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED +#endif // TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp 
index 29b60a61d6..226d2365ac 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -7,7 +7,7 @@ #ifdef TILEDARRAY_HAS_CUDA #include #include -#include +#include #endif namespace TiledArray { @@ -20,14 +20,14 @@ inline void cuda_initialize() { cudaEnv::instance(); // cuBLASHandlePool::handle(); - // initialize cuTT - cuttInitialize(); + // initialize LibreTT + librettInitialize(); } /// finalize cuda environment inline void cuda_finalize() { CudaSafeCall(cudaDeviceSynchronize()); - cuttFinalize(); + librettFinalize(); cublasDestroy(cuBLASHandlePool::handle()); delete &cuBLASHandlePool::handle(); cudaEnv::instance().reset(nullptr); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0fccf921b5..1ac73df189 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -116,7 +116,7 @@ set(ta_test_src_files ta_test.cpp ) if(CUDA_FOUND) - list(APPEND ta_test_src_files cutt.cpp expressions_cuda_um.cpp tensor_um.cpp) + list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp) endif() # if tiledarray library was compiled without exceptions, use TA header-only (see below) diff --git a/tests/cutt.cpp b/tests/librett.cpp similarity index 81% rename from tests/cutt.cpp rename to tests/librett.cpp index 8a6b1af539..91c5b5b8ad 100644 --- a/tests/cutt.cpp +++ b/tests/librett.cpp @@ -27,8 +27,8 @@ #include #include "unit_test_config.h" -struct cuTTFixture { - // cuTTFixture() +struct LibreTTFixture { + // LibreTTFixture() // : A(100), // B(50), // C(20), @@ -36,16 +36,16 @@ struct cuTTFixture { // extent({100, 100}), // extent_nonsym({100, 50}), // perm({1, 0}) {} - cuTTFixture() : A(10), B(5), C(2) {} + LibreTTFixture() : A(10), B(5), C(2) {} int A; int B; int C; }; -BOOST_FIXTURE_TEST_SUITE(cutt_suite, cuTTFixture, TA_UT_LABEL_SERIAL); +BOOST_FIXTURE_TEST_SUITE(librett_suite, LibreTTFixture, TA_UT_LABEL_SERIAL); -BOOST_AUTO_TEST_CASE(cutt_gpu_mem) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem) { int* a_host = 
(int*)std::malloc(A * A * sizeof(int)); int* b_host = (int*)std::malloc(A * A * sizeof(int)); int iter = 0; @@ -68,17 +68,18 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); - cuttDestroy(plan); + BOOST_CHECK(status == LIBRETT_SUCCESS); + librettDestroy(plan); cudaMemcpy(b_host, b_device, A * A * sizeof(int), cudaMemcpyDeviceToHost); @@ -97,7 +98,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { int* a_host = (int*)std::malloc(A * B * sizeof(int)); int* b_host = (int*)std::malloc(A * B * sizeof(int)); int iter = 0; @@ -115,8 +116,9 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({B, A}); TiledArray::extent_to_col_major(extent); @@ -124,14 +126,14 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status 
== CUTT_SUCCESS); - cuttDestroy(plan); + BOOST_CHECK(status == LIBRETT_SUCCESS); + librettDestroy(plan); cudaMemcpy(b_host, b_device, A * B * sizeof(int), cudaMemcpyDeviceToHost); @@ -150,7 +152,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { int* a_host = (int*)std::malloc(A * B * C * sizeof(int)); int* b_host = (int*)std::malloc(A * B * C * sizeof(int)); int iter = 0; @@ -172,28 +174,29 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent3{int(A), int(B), int(C)}; std::vector perm3{1, 0, 2}; // std::vector perm3{0, 2, 1}; - status = cuttPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int), + status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int), 0, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); - status = cuttDestroy(plan); + status = librettDestroy(plan); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); iter = 0; for (std::size_t k = 0; k < C; k++) { @@ -212,7 +215,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { int* a_host = (int*)std::malloc(A * B * C * sizeof(int)); int* b_host = (int*)std::malloc(A * B * C * sizeof(int)); int iter = 0; @@ -234,8 +237,9 @@ 
BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({A, B, C}); TiledArray::extent_to_col_major(extent); @@ -243,20 +247,20 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { std::vector perm({1, 0, 2}); TiledArray::permutation_to_col_major(perm); - status = cuttPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0, + status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); - status = cuttDestroy(plan); + status = librettDestroy(plan); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -275,7 +279,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_unified_mem) { +BOOST_AUTO_TEST_CASE(librett_unified_mem) { int* a_um; cudaMallocManaged(&a_um, A * A * sizeof(int)); @@ -290,8 +294,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) { } } - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({A, A}); TiledArray::extent_to_col_major(extent); @@ -299,15 +304,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + 
BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_um, b_um); + status = librettExecute(plan, a_um, b_um); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - cuttDestroy(plan); + librettDestroy(plan); cudaDeviceSynchronize(); @@ -323,7 +328,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) { cudaFree(b_um); } -BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { +BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { int* a_um; cudaMallocManaged(&a_um, A * B * sizeof(int)); @@ -338,8 +343,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { } } - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({B, A}); TiledArray::extent_to_col_major(extent); @@ -347,15 +353,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_um, b_um); + status = librettExecute(plan, a_um, b_um); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - cuttDestroy(plan); + librettDestroy(plan); cudaDeviceSynchronize(); iter = 0; @@ -369,7 +375,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { cudaFree(b_um); } -BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { +BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { int* a_um; cudaMallocManaged(&a_um, A * B * C * sizeof(int)); @@ -386,8 +392,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { } } - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; // b(k,i,j) = a(i,j,k) @@ -397,15 +404,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { std::vector perm({2, 0, 1}); 
TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_um, b_um); + status = librettExecute(plan, a_um, b_um); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - cuttDestroy(plan); + librettDestroy(plan); cudaDeviceSynchronize(); iter = 0; @@ -421,7 +428,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { cudaFree(b_um); } -BOOST_AUTO_TEST_CASE(cutt_um_tensor) { +BOOST_AUTO_TEST_CASE(librett_um_tensor) { TiledArray::Range range{A, A}; using Tile = TiledArray::btasUMTensorVarray; @@ -453,7 +460,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_nonsym) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_nonsym) { TiledArray::Range range{B, A}; using Tile = TiledArray::btasUMTensorVarray; @@ -485,7 +492,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_nonsym) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_three) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) { TiledArray::Range range{A, B, C}; using Tile = TiledArray::btasUMTensorVarray; @@ -540,7 +547,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_three) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_four) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) { std::size_t a = 2; std::size_t b = 3; std::size_t c = 6; @@ -609,7 +616,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_four) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_six) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { std::size_t a = 2; std::size_t b = 3; std::size_t c = 6;