diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 11f03acf70..93850215f1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,12 +7,12 @@ default: variables: MAD_NUM_THREADS : 2 - TA_TARGETS : "tiledarray examples ta_test check-tiledarray" + TA_TARGETS : "tiledarray examples-tiledarray ta_test check-tiledarray" # Debug builds with ScaLAPACK=ON need increased TA_UT_CTEST_TIMEOUT TA_CONFIG : > CMAKE_BUILD_TYPE=${BUILD_TYPE} TA_ASSERT_POLICY=TA_ASSERT_THROW - TA_UT_CTEST_TIMEOUT=2000 + TA_UT_CTEST_TIMEOUT=3000 ${TA_PYTHON} ${ENABLE_CUDA} ${BLA_VENDOR} @@ -20,15 +20,16 @@ variables: ${ENABLE_SCALAPACK} before_script: - # NB: below tag parsing is not robust - - echo "CI_RUNNER_TAGS=$CI_RUNNER_TAGS" - - CMAKE_BUILD_PARALLEL_LEVEL=$(echo $CI_RUNNER_TAGS | sed -n 's/CMAKE_BUILD_PARALLEL_LEVEL=\([0-9]\+\).*/\1/p') + # NB: if CMAKE_BUILD_PARALLEL_LEVEL is not set (i.e. using shared runner), use 1 to ensure we have enough memory + # TODO optimize ta_test build memory consumption - export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:=1} - echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL" ubuntu: stage: build - tags: [ docker ] + tags: + - docker + - ${RUNNER_TAGS} timeout: 3h image: valeevgroup/${IMAGE} variables: @@ -62,12 +63,15 @@ ubuntu: BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL + RUNNER_TAGS: [ linux ] - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] CXX: [ g++, clang++-9 ] BUILD_TYPE : [ "Release", "Debug" ] ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] + RUNNER_TAGS: [ linux ] - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] CXX: [ g++ ] BUILD_TYPE : [ "Release", "Debug" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] - TA_TARGETS : [ "tiledarray examples" ] + TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] + RUNNER_TAGS: [ cuda ] diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0bf6535c4a..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,111 +0,0 @@ -# See http://about.travis-ci.org/docs/user/build-configuration/ -# To validate this file: http://lint.travis-ci.org/ - -language: cpp -dist: focal -cache: ccache -cache: - directories: - - /home/travis/_install -os: linux - -addons: - apt: - packages: &base_packages - - libblas-dev - - liblapack-dev - - liblapacke-dev - - libtbb-dev - - lcov - - python3 - - python3-pip - - python3-pytest - - python3-numpy - -env: - global: - - BUILD_PREFIX=/home/travis/_build - - INSTALL_PREFIX=/home/travis/_install - -matrix: - fast_finish: true - include: - - compiler: gcc - env: GCC_VERSION=7 BUILD_TYPE=Debug MADNESS_OVER_PARSEC=1 - addons: - apt: - packages: - - *base_packages - - g++-7 - - gfortran-7 - - compiler: gcc - env: GCC_VERSION=7 BUILD_TYPE=Debug - addons: - apt: - packages: - - *base_packages - - g++-7 - - gfortran-7 - - compiler: gcc - env: GCC_VERSION=7 BUILD_TYPE=Release - addons: - apt: - packages: - - *base_packages - - g++-7 - - gfortran-7 - - compiler: gcc - env: GCC_VERSION=8 BUILD_TYPE=Debug COMPUTE_COVERAGE=1 MADNESS_OVER_PARSEC=1 - addons: - apt: - packages: - - *base_packages - - g++-8 - - gfortran-8 - - compiler: gcc - env: GCC_VERSION=8 BUILD_TYPE=Release - addons: - apt: - packages: - - *base_packages - - g++-8 - - gfortran-8 - - compiler: gcc - env: GCC_VERSION=9 BUILD_TYPE=Debug MADNESS_OVER_PARSEC=1 - addons: - apt: - sources: - - sourceline: 'ppa:ubuntu-toolchain-r/test' - packages: - - *base_packages - - g++-9 - - 
gfortran-9 - -before_install: - - env - - mkdir -p ${BUILD_PREFIX} && mkdir -p ${INSTALL_PREFIX} -# use timeout to stop long-running (i.e. cache-rebuilding) jobs right before they get killed by Travis-CI -# in case of timeout report success to Travis to force cache upload -script: - - travis_wait 50 timeout 2850 ${TRAVIS_BUILD_DIR}/bin/build-$TRAVIS_OS_NAME.sh; RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 124 ]; then true; else false; fi; -after_failure: - - cat ${BUILD_PREFIX}/TA/external/madness-build/CMakeFiles/CMakeError.log - - cat ${BUILD_PREFIX}/TA/external/madness-build/CMakeFiles/CMakeOutput.log - - cat ${BUILD_PREFIX}/TA/CMakeFiles/CMakeError.log - - cat ${BUILD_PREFIX}/TA/CMakeFiles/CMakeOutput.log -# codecov -after_success: - # create report - - cd ${TRAVIS_BUILD_DIR} - - if [ "$COMPUTE_COVERAGE" = "1" ]; then lcov --gcov-tool gcov-${GCC_VERSION} --directory ${BUILD_PREFIX}/TA --capture --output-file coverage.info; fi; # capture coverage info - - if [ "$COMPUTE_COVERAGE" = "1" ]; then lcov --remove coverage.info '/usr/*' '*/madness/*' '*/btas/*' '*/tests/*' --output-file coverage.info; fi; # filter out non-project files - - if [ "$COMPUTE_COVERAGE" = "1" ]; then lcov --list coverage.info; fi; #debug info - - echo ${TRAVIS_CMD} - # upload report to CodeCov - - if [ "$COMPUTE_COVERAGE" = "1" ]; then bash <(curl -s https://codecov.io/bash) -t token; fi; - # deploy artifacts: currently only dox - - if [ "$DEPLOY" = "1" ]; then bash ${TRAVIS_BUILD_DIR}/bin/deploy-$TRAVIS_OS_NAME.sh; fi; - -notifications: - slack: - secure: aSmy6FmiEf+0gcbVpJs0GIrmpI1dF7/WFOXgUkM2wLxw5DBQxE4LW/yt01mvFqAMJLe0LzGujx/V/z98i0kA1S8DEMTqJ+IG2bbdmgb5CAw5LTP5Air1P2SeAyKW/eAAsnGsERaEnHj8nnZEa2dhbAFOPD5QDM7nwWG/xUkIGMU= diff --git a/CMakeLists.txt b/CMakeLists.txt index 382f4abb56..ed419d8f20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -374,14 +374,18 @@ add_subdirectory(doc) ########################## include(CTest) if (BUILD_TESTING) - set(_ctest_args -V -R "tiledarray/unit") + set(_ctest_args -V -R "tiledarray/unit/run-np.*") + set(_ctest_args_serial -V -R "tiledarray/unit/run-np-1") if (DEFINED TA_UT_CTEST_TIMEOUT) list(APPEND _ctest_args --timeout ${TA_UT_CTEST_TIMEOUT}) + list(APPEND _ctest_args_serial --timeout ${TA_UT_CTEST_TIMEOUT}) endif(DEFINED TA_UT_CTEST_TIMEOUT) add_custom_target_subproject(tiledarray check USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args}) + add_custom_target_subproject(tiledarray check_serial USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args_serial}) add_subdirectory(tests) else() add_custom_target_subproject(tiledarray check USES_TERMINAL COMMAND echo "WARNING: unit testing disabled. To enable, give -DBUILD_TESTING=ON to cmake") + add_custom_target_subproject(tiledarray check_serial USES_TERMINAL COMMAND echo "WARNING: unit testing disabled. To enable, give -DBUILD_TESTING=ON to cmake") endif() ########################## diff --git a/INSTALL.md b/INSTALL.md index 0c3b843d3e..229f76d7d0 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -64,11 +64,11 @@ Compiling BTAS requires the following prerequisites: Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. 
Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [cuTT](github.com/ValeevGroup/cutt) -- CUDA transpose library; note that our fork of the [original cuTT repo](github.com/ap-hynninen/cutt) is required to provide thread-safety (tag 0e8685bf82910bc7435835f846e88f1b39f47f09). + - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: - - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 28433942197aee141cd9e96ed1d00f6ec7b902cb); pulls and builds the following additional prerequisite + - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 711ef363479a90c88788036f9c6c8adb70736cbf); pulls and builds the following additional prerequisite - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test (optionally-built) Python bindings - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. @@ -330,7 +330,7 @@ Support for execution on CUDA-enabled hardware is controlled by the following va * `ENABLE_CUDA` -- Set to `ON` to turn on CUDA support. [Default=OFF]. * `CMAKE_CUDA_HOST_COMPILER` -- Set to the path to the host C++ compiler to be used by CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.). * `ENABLE_CUDA_ERROR_CHECK` -- Set to `ON` to turn on assertions for successful completion of calls to CUDA runtime and libraries. [Default=OFF]. -* `CUTT_INSTALL_DIR` -- the installation prefix of the pre-installed cuTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install cuTT. +* `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install LibreTT. * `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should not be normally needed; it is strongly recommended to let TiledArray build and install Umpire. For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info. 
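A minimal sketch of a CUDA-enabled configure line using the cache variables documented above; the build directory, host-compiler path, and install prefixes are hypothetical placeholders, and the two *_INSTALL_DIR hints are only needed when pointing at pre-installed copies of LibreTT/Umpire (by default TiledArray builds and installs both itself):

    # nvcc must be discoverable via PATH (see the FindCUDAToolkit note above)
    cmake -S . -B build \
      -DENABLE_CUDA=ON \
      -DENABLE_CUDA_ERROR_CHECK=ON \
      -DCMAKE_CUDA_HOST_COMPILER=/usr/bin/g++-9 \
      -DLIBRETT_INSTALL_DIR=/opt/librett \
      -DUMPIRE_INSTALL_DIR=/opt/umpire
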
diff --git a/README.md b/README.md index 853629526a..8742d1e774 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -[![Travis Build Status](https://travis-ci.com/ValeevGroup/tiledarray.svg?branch=master)](https://travis-ci.com/ValeevGroup/tiledarray) [![Gitlab Pipeline Status](https://gitlab.com/ValeevGroup/tiledarray/badges/master/pipeline.svg)](https://gitlab.com/ValeevGroup/tiledarray/-/pipelines?page=1&scope=all&ref=master) [![codecov](https://codecov.io/gh/ValeevGroup/tiledarray/branch/master/graph/badge.svg)](https://codecov.io/gh/ValeevGroup/tiledarray) diff --git a/bin/admin/dependency-versions-update-hook.py b/bin/admin/dependency-versions-update-hook.py index 19b7123703..686b98b49a 100755 --- a/bin/admin/dependency-versions-update-hook.py +++ b/bin/admin/dependency-versions-update-hook.py @@ -106,11 +106,11 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' btas_old_tag = tokens[2] else: btas_new_tag = tokens[2] - elif tokens[1].find('CUTT') != -1: + elif tokens[1].find('LIBRETT') != -1: if tokens[1].find('PREVIOUS') != -1: - cutt_old_tag = tokens[2] + librett_old_tag = tokens[2] else: - cutt_new_tag = tokens[2] + librett_new_tag = tokens[2] elif tokens[1].find('UMPIRE') != -1: if tokens[1].find('PREVIOUS') != -1: umpire_old_tag = tokens[2] @@ -146,8 +146,8 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' # BTAS tag in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'BTAS', btas_old_tag, btas_new_tag, 'ValeevGroup/BTAS), tag ', '') -# cuTT tag in INSTALL.md -any_files_changed |= replace_dep_id(topsrc, 'md', 'cuTT', cutt_old_tag, cutt_new_tag, '', '') +# LibreTT tag in INSTALL.md +any_files_changed |= replace_dep_id(topsrc, 'md', 'LibreTT', librett_old_tag, librett_new_tag, '', '') # Umpire tag in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'Umpire', umpire_old_tag, umpire_new_tag, '', '') diff --git a/bin/build-boost-linux.sh b/bin/build-boost-linux.sh deleted file mode 100755 index 7c4fca8bbf..0000000000 --- a/bin/build-boost-linux.sh +++ /dev/null @@ -1,41 +0,0 @@ -#! /bin/sh - -export BOOST_VERSION=1_74_0 - -# Exit on error -set -ev - -if [ "$CXX" = "g++" ]; then - export CXX=/usr/bin/g++-$GCC_VERSION - export CXXFLAGS="-mno-avx" - export BOOST_TOOLSET=gcc -else - export CXX=/usr/bin/clang++-$CLANG_VERSION - export CXXFLAGS="-mno-avx -stdlib=libc++" - export BOOST_TOOLSET=clang -fi - -if [ "X$BUILD_TYPE" = "XDebug" ]; then - export BOOST_VARIANT="debug" -else - export BOOST_VARIANT="release" -fi - -# download+unpack (but not build!) Boost unless previous install is cached ... must manually wipe cache on version bump or toolchain update -export INSTALL_DIR=${INSTALL_PREFIX}/boost -if [ ! -d "${INSTALL_DIR}" ]; then - rm -fr boost_${BOOST_VERSION}.tar.bz2 - wget https://boostorg.jfrog.io/artifactory/main/release/1.74.0/source/boost_${BOOST_VERSION}.tar.bz2 - tar -xjf boost_${BOOST_VERSION}.tar.bz2 - cd boost_${BOOST_VERSION} - cat > user-config.jam << END -using ${BOOST_TOOLSET} : : ${CXX} : - "${CXXFLAGS}" - "${CXXFLAGS}" ; -END - ./bootstrap.sh --prefix=${INSTALL_DIR} --with-libraries=serialization - ./b2 -d0 --user-config=`pwd`/user-config.jam toolset=${BOOST_TOOLSET} link=static variant=${BOOST_VARIANT} - ./b2 -d0 install -else - echo "Boost already installed ..." -fi diff --git a/bin/build-eigen3-linux.sh b/bin/build-eigen3-linux.sh deleted file mode 100755 index 5f2133111b..0000000000 --- a/bin/build-eigen3-linux.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! 
/bin/sh - -# Exit on error -set -ev - -# Install packages - -# Environment variables -if [ "$CXX" = "g++" ]; then - export CC=/usr/bin/gcc-$GCC_VERSION - export CXX=/usr/bin/g++-$GCC_VERSION - export EXTRACXXFLAGS="-mno-avx" -else - export CC=/usr/bin/clang-$CLANG_VERSION - export CXX=/usr/bin/clang++-$CLANG_VERSION - export EXTRACXXFLAGS="-mno-avx -stdlib=libc++" -fi - -# Print compiler information -$CC --version -$CXX --version - -# log the CMake version (need 3+) -cmake --version - -# Install Eigen3 unless previous install is cached ... must manually wipe cache on version bump or toolchain update -export INSTALL_DIR=${INSTALL_PREFIX}/eigen3 -if [ ! -d "${INSTALL_DIR}" ]; then - cd ${BUILD_PREFIX} - wget -q https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2 - tar -xjf eigen-3.3.7.tar.bz2 - cd eigen-* - mkdir build - cd build - cmake .. -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_FLAGS="${EXTRACXXFLAGS}" \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} - make install -else - echo "Eigen3 already installed ..." -fi diff --git a/bin/build-linux.sh b/bin/build-linux.sh deleted file mode 100755 index a6c55ed951..0000000000 --- a/bin/build-linux.sh +++ /dev/null @@ -1,147 +0,0 @@ -#! /bin/sh - -# get the most recent cmake available -if [ ! -d "${INSTALL_PREFIX}/cmake" ]; then - CMAKE_VERSION=3.17.0 - CMAKE_URL="https://cmake.org/files/v${CMAKE_VERSION%.[0-9]}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" - mkdir ${INSTALL_PREFIX}/cmake && wget --no-check-certificate -O - ${CMAKE_URL} | tar --strip-components=1 -xz -C ${INSTALL_PREFIX}/cmake -fi -export PATH=${INSTALL_PREFIX}/cmake/bin:${PATH} -cmake --version - -export PYTHON_EXECUTABLE=$(which python3) -export TA_PYTHON=ON - -${TRAVIS_BUILD_DIR}/bin/build-mpich-linux.sh -${TRAVIS_BUILD_DIR}/bin/build-scalapack-mpich-linux.sh -${TRAVIS_BUILD_DIR}/bin/build-madness-linux.sh -${TRAVIS_BUILD_DIR}/bin/build-boost-linux.sh -${TRAVIS_BUILD_DIR}/bin/build-eigen3-linux.sh - -# Exit on error -set -ev - -# download latest Doxygen -if [ "$DEPLOY" = "1" ]; then - DOXYGEN_VERSION=1.8.20 - if [ ! 
-d ${INSTALL_PREFIX}/doxygen-${DOXYGEN_VERSION} ]; then - cd ${BUILD_PREFIX} && wget https://downloads.sourceforge.net/project/doxygen/rel-${DOXYGEN_VERSION}/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz - cd ${INSTALL_PREFIX} && tar xzf ${BUILD_PREFIX}/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz - fi - export PATH=${INSTALL_PREFIX}/doxygen-${DOXYGEN_VERSION}/bin:$PATH - which doxygen - doxygen --version -fi - -# Environment variables -if [ "$CXX" = "g++" ]; then - export CC=/usr/bin/gcc-$GCC_VERSION - export CXX=/usr/bin/g++-$GCC_VERSION - export EXTRACXXFLAGS="-mno-avx" - # if linking statically will need fortran libs to detect liblapacke.a in BTAS - export F77=gfortran-$GCC_VERSION -else - export CC=/usr/bin/clang-$CLANG_VERSION - export CXX=/usr/bin/clang++-$CLANG_VERSION - export EXTRACXXFLAGS="-mno-avx -stdlib=libc++" - # if linking statically will need fortran libs to detect liblapacke.a in BTAS - export F77=gfortran-$GCC_VERSION -fi - -export MPI_HOME=${INSTALL_PREFIX}/mpich -export MPICC=$MPI_HOME/bin/mpicc -export MPICXX=$MPI_HOME/bin/mpicxx -export LD_LIBRARY_PATH=/usr/lib/lapack:/usr/lib/libblas:${INSTALL_PREFIX}/scalapack/lib:$LD_LIBRARY_PATH - -# list the prebuilt prereqs -ls -l ${INSTALL_PREFIX} - -# where to install TA (need for testing installed code) -export INSTALL_DIR=${INSTALL_PREFIX}/TA - -# make build dir -cd ${BUILD_PREFIX} -mkdir -p TA -cd TA - -# if have old installed copy of TA, make sure that BTAS tag matches the required tag, if not, remove INSTALL_DIR (will cause rebuild of TA) -if [ -f "${INSTALL_DIR}/include/btas/version.h" ]; then - export INSTALLED_BTAS_TAG=`grep 'define BTAS_REVISION' ${INSTALL_DIR}/include/btas/version.h | awk '{print $3}' | sed s/\"//g` - echo "installed BTAS revision = ${INSTALLED_BTAS_TAG}" - # extract the tracked tag of BTAS - export BTAS_TAG=`grep 'set(TA_TRACKED_BTAS_TAG ' ${TRAVIS_BUILD_DIR}/external/versions.cmake | awk '{print $2}' | sed s/\)//g` - echo "required BTAS revision = ${BTAS_TAG}" - if [ "${BTAS_TAG}" != "${INSTALLED_BTAS_TAG}" ]; then - rm -rf "${INSTALL_DIR}" - fi -fi - -# MADNESS are build separately if $BUILD_TYPE=Debug, otherwise built as part of TA -if [ "$BUILD_TYPE" = "Debug" ]; then - - if [ "$COMPUTE_COVERAGE" = "1" ]; then - export CODECOVCXXFLAGS="-O0 --coverage" - fi - - cmake ${TRAVIS_BUILD_DIR} \ - -DCMAKE_TOOLCHAIN_FILE=cmake/vg/toolchains/travis.cmake \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_Fortran_COMPILER=$F77 \ - -DMPI_CXX_COMPILER=$MPICXX \ - -DMPI_C_COMPILER=$MPICC \ - -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_CXX_FLAGS="-ftemplate-depth=1024 -Wno-unused-command-line-argument ${EXTRACXXFLAGS} ${CODECOVCXXFLAGS}" \ - -DCMAKE_PREFIX_PATH="${INSTALL_PREFIX}/madness;${INSTALL_PREFIX}/eigen3;${INSTALL_PREFIX}/boost" \ - -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ - -DTA_PYTHON="${TA_PYTHON}" \ - -DENABLE_SCALAPACK=ON - -else - - # if have old installed copy of TA, make sure that MADNESS tag matches the required tag, if not, remove INSTALL_DIR (will cause rebuild of MADNESS) - if [ -f "${INSTALL_DIR}/include/madness/config.h" ]; then - export INSTALLED_MADNESS_TAG=`grep 'define MADNESS_REVISION' ${INSTALL_DIR}/include/madness/config.h | awk '{print $3}' | sed s/\"//g` - echo "installed MADNESS revision = ${INSTALLED_MADNESS_TAG}" - # extract the tracked tag of MADNESS - export MADNESS_TAG=`grep 'set(TA_TRACKED_MADNESS_TAG ' ${TRAVIS_BUILD_DIR}/external/versions.cmake | awk '{print $2}' | sed s/\)//g` - echo "required MADNESS 
revision = ${MADNESS_TAG}" - if [ "${MADNESS_TAG}" != "${INSTALLED_MADNESS_TAG}" ]; then - rm -rf "${INSTALL_DIR}" - fi - fi - - cmake ${TRAVIS_BUILD_DIR} \ - -DCMAKE_TOOLCHAIN_FILE=cmake/vg/toolchains/travis.cmake \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_Fortran_COMPILER=$F77 \ - -DMPI_CXX_COMPILER=$MPICXX \ - -DMPI_C_COMPILER=$MPICC \ - -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_CXX_FLAGS="-ftemplate-depth=1024 -Wno-unused-command-line-argument ${EXTRACXXFLAGS}" \ - -DCMAKE_PREFIX_PATH="${INSTALL_PREFIX}/eigen3;${INSTALL_PREFIX}/boost" \ - -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ - -DTA_PYTHON="${TA_PYTHON}" \ - -DTA_ASSERT_POLICY=TA_ASSERT_THROW \ - -DENABLE_SCALAPACK=ON - -fi - -# Build all libraries, examples, and applications -make -j2 all VERBOSE=1 -make install -# remove install dir to avoid broken artifacts like BTAS polluting the next build via cached copy -rm -rf $INSTALL_DIR - -# Validate -make -j1 ta_test VERBOSE=1 -export MAD_NUM_THREADS=2 -# to find dep shared libs (do we need this since El is gone?) -export LD_LIBRARY_PATH=${INSTALL_PREFIX}/TA/lib:${INSTALL_PREFIX}/madness/lib:${LD_LIBRARY_PATH} -make check-tiledarray - -# Build examples -make -j2 examples VERBOSE=1 diff --git a/bin/build-madness-linux.sh b/bin/build-madness-linux.sh deleted file mode 100755 index d255bff92d..0000000000 --- a/bin/build-madness-linux.sh +++ /dev/null @@ -1,85 +0,0 @@ -#! /bin/sh - -# Exit on error -set -ev - -# Will build MADNESS stand-alone for Debug builds only -if [ "$BUILD_TYPE" = "Debug" ]; then - - # Environment variables - if [ "$CXX" = "g++" ]; then - export CC=/usr/bin/gcc-$GCC_VERSION - export CXX=/usr/bin/g++-$GCC_VERSION - export EXTRACXXFLAGS="-mno-avx" - export F77=gfortran-$GCC_VERSION - else - export CC=/usr/bin/clang-$CLANG_VERSION - export CXX=/usr/bin/clang++-$CLANG_VERSION - export EXTRACXXFLAGS="-mno-avx -stdlib=libc++" - export F77=gfortran-$GCC_VERSION - fi - - export MPI_HOME=${INSTALL_PREFIX}/mpich - export MPICC=$MPI_HOME/bin/mpicc - export MPICXX=$MPI_HOME/bin/mpicxx - export LD_LIBRARY_PATH=/usr/lib/lapack:/usr/lib/libblas:$LD_LIBRARY_PATH - - # list the prebuilt prereqs - ls -l ${INSTALL_PREFIX} - - # where to install MADNESS (need for testing installed code) - export INSTALL_DIR=${INSTALL_PREFIX}/madness - - # extract the tracked tag of MADNESS - export MADNESS_TAG=`grep 'set(TA_TRACKED_MADNESS_TAG ' ${TRAVIS_BUILD_DIR}/external/versions.cmake | awk '{print $2}' | sed s/\)//g` - echo "required MADNESS revision = ${MADNESS_TAG}" - - # make sure installed MADNESS tag matches the required tag, if not, remove INSTALL_DIR (will cause reinstall) - if [ -f "${INSTALL_DIR}/include/madness/config.h" ]; then - export INSTALLED_MADNESS_TAG=`grep 'define MADNESS_REVISION' ${INSTALL_DIR}/include/madness/config.h | awk '{print $3}' | sed s/\"//g` - echo "installed MADNESS revision = ${INSTALLED_MADNESS_TAG}" - if [ "${MADNESS_TAG}" != "${INSTALLED_MADNESS_TAG}" ]; then - rm -rf "${INSTALL_DIR}" - fi - fi - - if [ ! -d "${INSTALL_DIR}" ]; then - - # make build dir - cd ${BUILD_PREFIX} - mkdir -p madness - cd madness - - if [ -n "${MADNESS_OVER_PARSEC}" ]; then - MADNESS_BACKEND_OPTION="-DMADNESS_TASK_BACKEND=PaRSEC" - fi - - # check out the tracked tag of MADNESS - git clone https://github.com/TESSEorg/madness.git madness_src && cd madness_src && git checkout ${MADNESS_TAG} && cd .. 
- - cmake madness_src \ - -DCMAKE_TOOLCHAIN_FILE="${TRAVIS_BUILD_DIR}/cmake/toolchains/travis.cmake" \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_C_COMPILER=$CC \ - -DMPI_CXX_COMPILER=$MPICXX \ - -DMPI_C_COMPILER=$MPICC \ - -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_CXX_FLAGS="-ftemplate-depth=1024 -Wno-unused-command-line-argument ${EXTRACXXFLAGS}" \ - -DMADNESS_BUILD_MADWORLD_ONLY=ON \ - -DENABLE_MPI=ON \ - -DMPI_THREAD=multiple \ - -DENABLE_TBB=OFF \ - -DTBB_ROOT_DIR=/usr \ - -DFORTRAN_INTEGER_SIZE=4 \ - -DENABLE_LIBXC=OFF \ - -DENABLE_GPERFTOOLS=OFF \ - -DASSERTION_TYPE=throw \ - -DDISABLE_WORLD_GET_DEFAULT=ON \ - ${MADNESS_BACKEND_OPTION} - - # Build+install MADworld interface - make -j2 install VERBOSE=1 - fi - -fi diff --git a/bin/build-mpich-linux.sh b/bin/build-mpich-linux.sh deleted file mode 100755 index 7e38ef3167..0000000000 --- a/bin/build-mpich-linux.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! /bin/sh - -# Exit on error -set -ev - -# Install packages - -# always use gcc to compile MPICH, there are unexplained issues with clang (e.g. MPI_Barrier aborts) -export CC=/usr/bin/gcc-$GCC_VERSION -export CXX=/usr/bin/g++-$GCC_VERSION -export FC=/usr/bin/gfortran-$GCC_VERSION - -# Print compiler information -$CC --version -$CXX --version -$FC --version - -# log the CMake version (need 3+) -cmake --version - -# Install MPICH unless previous install is cached ... must manually wipe cache on version bump or toolchain update -export INSTALL_DIR=${INSTALL_PREFIX}/mpich -if [ ! -d "${INSTALL_DIR}" ]; then - cd ${BUILD_PREFIX} - export MPICH_VERSION=3.3 - wget --no-check-certificate -q http://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz - tar -xzf mpich-${MPICH_VERSION}.tar.gz - cd mpich-${MPICH_VERSION} - ./configure FC=$FC CC=$CC CXX=$CXX --prefix=${INSTALL_DIR} - make -j2 - make install - ${INSTALL_DIR}/bin/mpichversion - ${INSTALL_DIR}/bin/mpicc -show - ${INSTALL_DIR}/bin/mpicxx -show - ${INSTALL_DIR}/bin/mpifort -show -else - echo "MPICH installed..." - find ${INSTALL_DIR} -name mpiexec - find ${INSTALL_DIR} -name mpicc - find ${INSTALL_DIR} -name mpicxx - find ${INSTALL_DIR} -name mpifort -fi diff --git a/bin/build-scalapack-mpich-linux.sh b/bin/build-scalapack-mpich-linux.sh deleted file mode 100755 index 213d7bc5a7..0000000000 --- a/bin/build-scalapack-mpich-linux.sh +++ /dev/null @@ -1,48 +0,0 @@ -#! /bin/sh - -# Exit on error -set -ev - -# always use gcc, just like mpich ... ? -export CC=/usr/bin/gcc-$GCC_VERSION -export CXX=/usr/bin/g++-$GCC_VERSION -export FC=/usr/bin/gfortran-$GCC_VERSION - -# Print compiler information -$CC --version -$CXX --version -$FC --version - -# log the CMake version (need 3+) -cmake --version - -# Install MPICH unless previous install is cached ... must manually wipe cache on version bump or toolchain update -export INSTALL_DIR=${INSTALL_PREFIX}/scalapack -if [ ! -d "${INSTALL_DIR}" ]; then - - # Make sure MPI is built - ${INSTALL_PREFIX}/mpich/bin/mpichversion - ${INSTALL_PREFIX}/mpich/bin/mpicc -show - ${INSTALL_PREFIX}/mpich/bin/mpicxx -show - ${INSTALL_PREFIX}/mpich/bin/mpif90 -show - - cd ${BUILD_PREFIX} - git clone https://github.com/Reference-ScaLAPACK/scalapack.git - cd scalapack - git checkout 0efeeb6d2ec9faf0f2fd6108de5eda60773cdcf9 # checked revision - cmake -H. 
-Bbuild_scalapack \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_Fortran_COMPILER=$FC \ - -DMPI_C_COMPILER=${INSTALL_PREFIX}/mpich/bin/mpicc \ - -DMPI_Fortran_COMPILER=${INSTALL_PREFIX}/mpich/bin/mpif90 \ - -DCMAKE_TOOLCHAIN_FILE="${TRAVIS_BUILD_DIR}/cmake/toolchains/travis.cmake" \ - -DCMAKE_PREFIX_PATH=${INSTALL_DIR} \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} - - cmake --build build_scalapack -j2 - cmake --build build_scalapack --target install - find ${INSTALL_DIR} -name libscalapack.so -else - echo "ScaLAPACK installed..." - find ${INSTALL_DIR} -name libscalapack.so -fi diff --git a/bin/deploy-linux.sh b/bin/deploy-linux.sh deleted file mode 100755 index 279a8f69e8..0000000000 --- a/bin/deploy-linux.sh +++ /dev/null @@ -1,62 +0,0 @@ -#! /bin/sh - -# Exit on error -set -ev - -git config --global user.email "travis@travis-ci.org" -git config --global user.name "Travis CI" - -# only non-cron job deploys -RUN=1 -if [ "$TRAVIS_EVENT_TYPE" = "cron" ] || [ "$TRAVIS_BRANCH" != "master" ]; then - RUN=0 -fi -if [ "$RUN" = "0" ]; then - echo "Deployment skipped" - exit 0 -fi - -# deploy from the build area -cd ${BUILD_PREFIX}/TA - -### deploy docs -# see https://gist.github.com/willprice/e07efd73fb7f13f917ea - -# build docs -export VERBOSE=1 -cmake --build . --target html -if [ ! -f "${BUILD_PREFIX}/TA/doc/dox/html/index.html" ]; then - echo "Target html built successfully but did not produce index.html" - exit 1 -fi - -# check out current docs + template -git clone --depth=1 https://github.com/ValeevGroup/tiledarray.git --branch gh-pages --single-branch tiledarray-docs-current -git clone --depth=1 https://github.com/ValeevGroup/tiledarray.git --branch gh-pages-template --single-branch tiledarray-docs-template -mkdir tiledarray-docs -cp -rp tiledarray-docs-current/* tiledarray-docs -rm -rf tiledarray-docs-current -cp -rp tiledarray-docs-template/* tiledarray-docs -rm -rf tiledarray-docs-template -cd tiledarray-docs -# copy TA's README.md into index.md -cp ${TRAVIS_BUILD_DIR}/README.md index.md -# update dox -if [ -d dox-master ]; then - rm -rf dox-master -fi -mv ${BUILD_PREFIX}/TA/doc/dox/html dox-master -# Jekyll does not allow files with "special" names, e.g. whose names start with underscore -# must "include" such files explicitly -# re: how file names must be formatted: see https://github.com/jekyll/jekyll/issues/1352 -echo "include:" >> _config.yml -find dox-master -name "_*" | sed "s/dox-master\// \- /g" >> _config.yml -# make empty repo to ensure gh-pages contains no history -git init -git add * -git commit -a -q -m "rebuilt TA master docs via Travis build: $TRAVIS_BUILD_NUMBER" -git checkout -b gh-pages -git remote add origin https://${GH_TILEDARRAY_TOKEN}@github.com/ValeevGroup/tiledarray.git > /dev/null 2>&1 -git push origin +gh-pages --force -cd .. -rm -rf tiledarray-docs diff --git a/bin/docker-cuda.md b/bin/docker-cuda.md index a525369070..0f39c0ac20 100644 --- a/bin/docker-cuda.md +++ b/bin/docker-cuda.md @@ -1,5 +1,5 @@ # Intro -These notes describe how to build TiledArray with CUDA support enabled within the latest nvidia/cuda Docker image (https://hub.docker.com/r/nvidia/cuda/). This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article). If you want to use Docker to run/debug Travis-CI jobs, see [docker-travis.md](docker-travis.md) +These notes describe how to build TiledArray with CUDA support enabled within the latest nvidia/cuda Docker image (https://hub.docker.com/r/nvidia/cuda/). 
This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article). # Using These notes assume that Docker 19.03 and NVIDIA Container Toolkit (https://github.com/NVIDIA/nvidia-docker) are installed on your machine and that you start at the top of the TiledArray source tree. diff --git a/bin/docker-travis-build.sh b/bin/docker-travis-build.sh deleted file mode 100755 index 4209bad9ef..0000000000 --- a/bin/docker-travis-build.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -# this script builds a 'Bionic' env docker image used by Travis-CI for TiledArray project -# -# to run bash in the image: docker run -it tiledarray-travis-debug bash -l -# see docker-travis.md for further instructions -# N.B. relevant locations: -# - source dir: /home/travis/build/ValeevGroup/tiledarray (TRAVIS_BUILD_DIR env in Travis jobs) -# - build dir: /home/travis/_build -# - install dir: /home/travis/_install - -# this is where in the container file system Travis-CI "starts" -export TRAVIS_BUILD_TOPDIR=/home/travis/build -export DIRNAME=`dirname $0` -export ABSDIRNAME=`pwd $DIRNAME` - -############################################################## -# make a script to download all prereqs and clone TiledArray repo -setup=setup.sh -cat > $setup << END -#!/bin/sh -curl -sSL "http://apt.llvm.org/llvm-snapshot.gpg.key" | apt-key add - -echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" | tee -a /etc/apt/sources.list > /dev/null -apt-add-repository -y "ppa:ubuntu-toolchain-r/test" -apt-get -yq update >> ~/apt-get-update.log -apt-get -yq --no-install-suggests --no-install-recommends --force-yes install g++-7 g++-8 g++-9 gfortran-7 gfortran-8 gfortran-9 libblas-dev liblapack-dev liblapacke-dev libtbb-dev clang-8 clang-9 cmake cmake-data libclang1-9 graphviz fonts-liberation \ -python3 python3-pip python3-pytest python3-numpy -mkdir -p ${TRAVIS_BUILD_TOPDIR} -cd ${TRAVIS_BUILD_TOPDIR} -git clone https://github.com/ValeevGroup/tiledarray.git ${TRAVIS_BUILD_TOPDIR}/ValeevGroup/tiledarray -END -chmod +x $setup - -############################################################## -# make a script to build all extra prereqs once in the container -build=build.sh -cat > $build << END -#!/bin/sh -cd /home/travis/_build -export BUILD_PREFIX=/home/travis/_build -export INSTALL_PREFIX=/home/travis/_install -export TRAVIS_BUILD_DIR=${TRAVIS_BUILD_TOPDIR}/ValeevGroup/tiledarray -export TRAVIS_EVENT_TYPE=cron -export TRAVIS_OS_NAME=linux -\${TRAVIS_BUILD_DIR}/bin/build-\$TRAVIS_OS_NAME.sh -END -chmod +x $build - -############################################################## -# make Dockerfile -cat > Dockerfile << END -# Travis default 'Focal' image -FROM travisci/ci-ubuntu-2004:packer-1609444725-e5de6974 - -# Use baseimage-docker's init system. -CMD ["/sbin/my_init"] - -# create source, build, and install dirs -RUN mkdir -p /home/travis/_build -RUN mkdir -p /home/travis/_install - -# install prereqs -ADD $setup /home/travis/_build/$setup -RUN /home/travis/_build/$setup - -# Clean up APT when done. -RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# copy travis scripts -ADD $build /home/travis/_build/$build - -# for further info ... -RUN echo "\e[92mDone! 
For info on how to use the image refer to $ABSDIRNAME/docker-travis.md\e[0m" - -END - -function clean_up { - rm -f $setup $build Dockerfile - exit -} - -trap clean_up SIGHUP SIGINT SIGTERM - -############################################################## -# build a dev image -docker build -t tiledarray-travis-debug . - -############################################################## -# extra admin tasks, uncomment as needed - -############################################################## -# done -clean_up diff --git a/bin/docker-travis.md b/bin/docker-travis.md deleted file mode 100644 index 65e43632df..0000000000 --- a/bin/docker-travis.md +++ /dev/null @@ -1,33 +0,0 @@ -# Intro -These notes describe how to build TiledArray within the latest Travis-CI Docker image. This is useful for debugging Travis-CI jobs on your local machine. -# Using -These notes assume that Docker is installed on your machine and that you start at the top of the TiledArray source tree. - -## Create/build Docker Travis image -1. Create a Travis-CI docker image: `cd bin; ./docker-travis-build.sh` -2. Run a container using the newly created image: `docker run -it tiledarray-travis-debug bash -l` -3. `cd /home/travis/_build` -4. Configure the job to use the appropriate compiler, compiler version, and debug/release build type: - * `export BUILD_TYPE=B`, where `B` is `Debug` or `Release`. - * If want to use GNU C++ compiler (gcc): - * `export GCC_VERSION=VVV` where `VVV` should be the GCC version to be used. The currently valid values are `7`, `8` and `9`. - * `export CXX=g++` - * If want to use Clang C++ compiler (clang++): - * `export GCC_VERSION=8` - * `export CLANG_VERSION=VVV` where `VVV` should be the Clang version to be used. The currently valid values is `11`. - * `export CXX=clang++` - * `apt-get update && apt-get install libc++-${CLANG_VERSION}-dev libc++abi-${CLANG_VERSION}-dev` -5. Build prerequisites (MPICH, MADNESS, ScaLAPACK), TiledArray, and run tests: `./build.sh` - -## Notes -* According to [Travis-CI docs](https://docs.travis-ci.com/user/reference/overview/) you want to configure your Docker to run containers with 2 cores and 7.5 GB of RAM to best match the production environment. -* If you plan to use this container multiple times it might make sense to take a snapshot at this point to avoid having to recompile the prerequisites each and every time. Store it as a separate image, e.g. `docker commit container_id tiledarray-travis-debug:clang-debug`, where `container_id` can be found in the output of `docker ps`. Next time to start debugging you will need to pull updates to the TiledArray source (do `cd /home/travis/build/ValeevGroup/tiledarray && git pull`), then execute step 2 with the new image name, execute step 3, and go directly to step 6. -* To install `gdb` execute `apt-get update && apt-get install gdb`. Also, it appears that to be able to attach `gdb` or any other debugger to a running process you must run the Docker container in privileged mode as `docker run --privileged -it tiledarray-travis-debug:clang-debug bash -l`. -* To debug parallel jobs you want to launch jobs in a gdb in an xterm. To run xterm you need to ssh into the container. To start an ssh server in the container do this: - * Connect sshd's port of the container (22) to an unprivileged port (say, 2222) of the host: `docker run -p 127.0.0.1:2222:22 --privileged -it tiledarray-travis-debug:clang-debug bash -l` - * Generate host keys: `ssh-keygen -A` - * Create a root password: `passwd` and follow prompts. 
No need to be fancy: security is not a concern here, but `passwd` will not accept an empty password. N.B. This is easier than setting up a pubkey login, so don't bother with that. - * Edit `/etc/ssh/sshd_config` and allow root to log in by ensuring that `PermitRootLogin` and `PasswordAuthentication` are set to `yes`. - * Start ssh server: `/etc/init.d/ssh start` - * (optional) To launch gdb in xterm windows: `apt-get update && apt-get install xterm` - * You should be able to log in from an xterm on the host side: `ssh -Y -p 2222 root@localhost` diff --git a/bin/docker.md b/bin/docker.md index fb558db6db..1826c95ef2 100644 --- a/bin/docker.md +++ b/bin/docker.md @@ -1,5 +1,5 @@ # Intro -These notes describe how to build TiledArray within the latest phusion (https://github.com/phusion/baseimage-docker) Docker image. This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article). If you want to use Docker to run/debug Travis-CI jobs, see [docker-travis.md](docker-travis.md) +These notes describe how to build TiledArray within the latest phusion (https://github.com/phusion/baseimage-docker) Docker image. This is useful for experimentation and/or provisioning computational results (e.g. for creating supplementary info for a journal article). # Using These notes assume that Docker is installed on your machine and that you start at the top of the TiledArray source tree. diff --git a/ci/.build-project b/ci/.build-project index 79a08d541b..aeb7c73787 100755 --- a/ci/.build-project +++ b/ci/.build-project @@ -80,13 +80,16 @@ if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then fi if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then cmd "make -C /home/ValeevGroup install/cuda" + cmd "rm -fr /usr/local/bin/nvcc" cmd "export CUDACXX=/usr/local/cuda/bin/nvcc" + cmd "${CUDACXX} -V" + cmd "nvidia-smi" fi section_end preparing_system_section section_start configure_section "Configure" cmd mkdir -p ${build_dir} -time_cmd configure "cmake -B${build_dir} $vars" +time_cmd configure "cmake -GNinja -B${build_dir} $vars" section_end configure_section for target in ${targets}; do diff --git a/cmake/modules/FindOrFetchBTAS.cmake b/cmake/modules/FindOrFetchBTAS.cmake index 57a4b94ac0..764ec7046e 100644 --- a/cmake/modules/FindOrFetchBTAS.cmake +++ b/cmake/modules/FindOrFetchBTAS.cmake @@ -13,9 +13,9 @@ if (NOT TARGET BTAS::BTAS) # BTAS will load BLAS++/LAPACK++ ... if those use CMake's FindBLAS/FindLAPACK (as indicated by defined BLA_VENDOR) # will need to specify Fortran linkage convention ... manually for now, switching to NWX's linear algebra discovery # is necessary to handle all the corner cases for automatic discovery - if (DEFINED BLA_VENDOR) + if (BLA_VENDOR) set(_linalgpp_use_standard_linalg_kits TRUE) - endif(DEFINED BLA_VENDOR) + endif(BLA_VENDOR) if (NOT TILEDARRAY_HAS_CUDA) # tell BLAS++/LAPACK++ to ignore CUDA diff --git a/doc/dox/contrib/Travis-CI-Administration-Notes.md b/doc/dox/contrib/Travis-CI-Administration-Notes.md index 0b626507cd..0284ebf0b9 100644 --- a/doc/dox/contrib/Travis-CI-Administration-Notes.md +++ b/doc/dox/contrib/Travis-CI-Administration-Notes.md @@ -1,13 +1,5 @@ -# Managing Travis Builds {#Travis-CI-Administration-Notes} +# Managing CI Builds {#CI-Administration-Notes} ## Basic Facts -* Travis CI configuration is in file `.travis.yml`, and build scripts are in `bin/build-*linux.sh`. Only Linux builds are currently supported. 
-* `BUILD_TYPE=Debug` jobs build and install MADNESS separately, before building TiledArray' `BUILD_TYPE=Release` jobs build MADNESS as a step of the TiledArray build. -* MPICH and (`BUILD_TYPE=Debug` only) MADNESS installation directories are _cached_. **Build scripts only verify the presence of installed directories, and do not update them if their configuration (e.g. static vs. shared, or code version) has changed. _Thus it is admin's responsibility to manually wipe out the cache on a per-branch basis_.** It is the easiest to do via the Travis-CI web interface (click on 'More Options' menu at the top right, select 'Caches', etc.). -* Rebuilding cache of prerequisites may take more time than the job limit (50 mins at the moment), so rebuilding cache can take several attempts. Since Travis-CI does not support forced cache updates (see e.g. https://github.com/travis-ci/travis-ci/issues/6410) if the job looks like it's going to time out we report success to Travis just so that it will store cache. __Thus jobs that timed out will be falsely reported as successful (rather than errored)!__ When rebuilding cache it may be necessary to manually restart some build jobs to make sure that cache rebuild is complete (or, just to be sure, restart the whole __build__ one time just to be sure all caches have been rebuilt). Again: this is only relevant when rebuilding caches (i.e. <5% of the time), otherwise there should be no need to restart jobs manually. - -# Debugging Travis-CI jobs - -## Local debugging - -Follow the instructions contained in [docker-travis.md](https://github.com/ValeevGroup/tiledarray/blob/master/bin/docker-travis.md) . +* TiledArray only uses GitLab CI at this point +* CI configuration is in file `.gitlab-ci.yml`, and build metadata is in `ci/`. Only Linux builds are currently supported. 
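The `check_serial-tiledarray` entry added to TA_TARGETS in `.gitlab-ci.yml` above corresponds to the `check_serial` custom target added in CMakeLists.txt. A minimal sketch of reproducing that CI step locally, assuming an already-configured build tree in a hypothetical `build` directory:

    cmake --build build --target check_serial-tiledarray
    # or run only the single-process unit tests directly, mirroring
    # _ctest_args_serial and the CI value of TA_UT_CTEST_TIMEOUT
    cd build && ctest -V -R "tiledarray/unit/run-np-1" --timeout 3000
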
diff --git a/examples/cc/ccd.cpp b/examples/cc/ccd.cpp index 18106f34c1..2560048d26 100644 --- a/examples/cc/ccd.cpp +++ b/examples/cc/ccd.cpp @@ -96,27 +96,28 @@ int main(int argc, char** argv) { TiledArray::TSpArrayD t_aa_vvoo(world, v_aa_vvoo.trange(), v_aa_vvoo.shape()); - for (auto it = t_aa_vvoo.range().begin(); it != t_aa_vvoo.range().end(); - ++it) + for (auto it = t_aa_vvoo.tiles_range().begin(); + it != t_aa_vvoo.tiles_range().end(); ++it) if (t_aa_vvoo.is_local(*it) && (!t_aa_vvoo.is_zero(*it))) t_aa_vvoo.set(*it, 0.0); TiledArray::TSpArrayD t_ab_vvoo(world, v_ab_vvoo.trange(), v_ab_vvoo.shape()); - for (auto it = t_ab_vvoo.range().begin(); it != t_ab_vvoo.range().end(); - ++it) + for (auto it = t_ab_vvoo.tiles_range().begin(); + it != t_ab_vvoo.tiles_range().end(); ++it) if (t_ab_vvoo.is_local(*it) && (!t_ab_vvoo.is_zero(*it))) t_ab_vvoo.set(*it, 0.0); TiledArray::TSpArrayD t_bb_vvoo(world, v_bb_vvoo.trange(), v_bb_vvoo.shape()); - for (auto it = t_bb_vvoo.range().begin(); it != t_bb_vvoo.range().end(); - ++it) + for (auto it = t_bb_vvoo.tiles_range().begin(); + it != t_bb_vvoo.tiles_range().end(); ++it) if (t_bb_vvoo.is_local(*it) && (!t_bb_vvoo.is_zero(*it))) t_bb_vvoo.set(*it, 0.0); TiledArray::TSpArrayD D_vvoo(world, v_ab_vvoo.trange(), v_ab_vvoo.shape()); - for (auto it = D_vvoo.range().begin(); it != D_vvoo.range().end(); ++it) + for (auto it = D_vvoo.tiles_range().begin(); + it != D_vvoo.tiles_range().end(); ++it) if (D_vvoo.is_local(*it) && (!D_vvoo.is_zero(*it))) D_vvoo.set(*it, world.taskq.add(data, &InputData::make_D_vvoo_tile, D_vvoo.trange().make_tile_range(*it))); diff --git a/examples/cc/ccsd.cpp b/examples/cc/ccsd.cpp index 47a29686fa..f06b53edf1 100644 --- a/examples/cc/ccsd.cpp +++ b/examples/cc/ccsd.cpp @@ -128,15 +128,16 @@ int main(int argc, char** argv) { // // // TArray2s D_vo(world, f_a_vo.trange(), f_a_vo.shape()); - // for(TArray2s::range_type::const_iterator it = D_vo.range().begin(); it - // != D_vo.range().end(); ++it) + // for(TArray2s::range_type::const_iterator it = + // D_vo.tiles_range().begin(); it + // != D_vo.tiles_range().end(); ++it) // if(D_vo.is_local(*it) && (! D_vo.is_zero(*it))) // D_vo.set(*it, world.taskq.add(data, & InputData::make_D_vo_tile, // D_vo.trange().make_tile_range(*it))); // // TArray4s D_vvoo(world, v_ab_vvoo.trange(), v_ab_vvoo.shape()); - // for(TArray4s::range_type::const_iterator it = D_vvoo.range().begin(); - // it != D_vvoo.range().end(); ++it) + // for(TArray4s::range_type::const_iterator it = + // D_vvoo.tiles_range().begin(); it != D_vvoo.tiles_range().end(); ++it) // if(D_vvoo.is_local(*it) && (! 
D_vvoo.is_zero(*it))) // D_vvoo.set(*it, world.taskq.add(data, & // InputData::make_D_vvoo_tile, diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index 2f6affe700..5d7f56c86e 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -25,7 +25,7 @@ if(CUDA_FOUND) - foreach(_exec cuda_cutt cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) + foreach(_exec cuda_librett cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") diff --git a/examples/cuda/cuda_cutt.cpp b/examples/cuda/cuda_librett.cpp similarity index 98% rename from examples/cuda/cuda_cutt.cpp rename to examples/cuda/cuda_librett.cpp index edaefc2597..a916bfc729 100644 --- a/examples/cuda/cuda_cutt.cpp +++ b/examples/cuda/cuda_librett.cpp @@ -29,7 +29,7 @@ #include /** - * Test cuTT + * Test LibreTT */ const std::size_t N = 100; diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/cuda/ta_cc_abcd_cuda.cpp index c67895f7dc..6a2ef26e5f 100644 --- a/examples/cuda/ta_cc_abcd_cuda.cpp +++ b/examples/cuda/ta_cc_abcd_cuda.cpp @@ -60,7 +60,7 @@ int main(int argc, char** argv) { try { // Initialize runtime - TA::World& world = TA::initialize(argc, argv); + TA::World& world = TA_SCOPED_INITIALIZE(argc, argv); // Get command line arguments if (argc < 5) { @@ -136,9 +136,6 @@ int main(int argc, char** argv) { } else { cc_abcd(world, trange_occ, trange_uocc, repeat); } - - TA::finalize(); - } catch (TA::Exception& e) { std::cerr << "!! TiledArray exception: " << e.what() << "\n"; rc = 1; diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp index 51ebc67b11..14f692329b 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/cuda/ta_dense_cuda.cpp @@ -300,7 +300,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, int try_main(int argc, char **argv) { // Initialize runtime - TiledArray::World &world = TiledArray::initialize(argc, argv); + TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv); // Get command line arguments if (argc < 6) { @@ -453,8 +453,6 @@ int try_main(int argc, char **argv) { throw std::runtime_error("Invalid storage type!\n"); } - TiledArray::finalize(); - return 0; } diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/cuda/ta_reduce_cuda.cpp index 417fa2d72f..e453069892 100644 --- a/examples/cuda/ta_reduce_cuda.cpp +++ b/examples/cuda/ta_reduce_cuda.cpp @@ -62,6 +62,8 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, TiledArray::TiledRange // TRange trange(blocking.begin(), blocking.end()); + TiledArray::TiledRange trange_tr(blocking.rbegin(), + blocking.rend()); // transposed trange using value_type = typename Tile::value_type; using TArray = TA::DistArray; @@ -116,7 +118,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } TArray a(world, trange); - TArray b(world, trange); + TArray b(world, trange_tr); a.fill(val_a); b.fill(val_b); @@ -198,7 +200,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } TArray a(world, trange); - TArray b(world, trange); + TArray b(world, trange_tr); a.fill(val_a); b.fill(val_b); @@ -239,7 +241,7 @@ using cudaTile = TiledArray::Tile>; int try_main(int argc, char **argv) { // Initialize runtime - TiledArray::World &world = TiledArray::initialize(argc, argv); + TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv); // Get command line arguments if (argc < 4) { @@ -365,8 +367,6 @@ int 
try_main(int argc, char **argv) { do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); } - TiledArray::finalize(); - return 0; } diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/cuda/ta_vector_cuda.cpp index f3c6265eb1..1593a68e8b 100644 --- a/examples/cuda/ta_vector_cuda.cpp +++ b/examples/cuda/ta_vector_cuda.cpp @@ -62,8 +62,9 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, blocking.push_back( TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end())); - TiledArray::TiledRange // TRange - trange(blocking.begin(), blocking.end()); + TiledArray::TiledRange trange(blocking.begin(), blocking.end()); + TiledArray::TiledRange trange_tr(blocking.rbegin(), + blocking.rend()); // transposed trange using value_type = typename Tile::value_type; using TArray = TA::DistArray; @@ -150,7 +151,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } TArray a(world, trange); - TArray b(world, trange); + TArray b(world, trange_tr); a.fill(val_a); b.fill(val_b); @@ -222,7 +223,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } TArray a(world, trange); - TArray b(world, trange); + TArray b(world, trange_tr); a.fill(val_a); b.fill(val_b); @@ -258,7 +259,7 @@ using cudaTile = TiledArray::Tile>; int try_main(int argc, char **argv) { // Initialize runtime - TiledArray::World &world = TiledArray::initialize(argc, argv); + auto &world = TA_SCOPED_INITIALIZE(argc, argv); // Get command line arguments if (argc < 4) { @@ -384,8 +385,6 @@ int try_main(int argc, char **argv) { do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); } - TiledArray::finalize(); - return 0; } diff --git a/external/cuda.cmake b/external/cuda.cmake index 1e5ebd8d60..3b2eb6ce37 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -42,6 +42,6 @@ message(STATUS "CMAKE Implicit Link Directories: ${CMAKE_CUDA_IMPLICIT_LINK_DIRE include(external/umpire.cmake) ## -## cuTT +## LibreTT ## -include(external/cutt.cmake) +include(external/librett.cmake) diff --git a/external/cutt.cmake b/external/librett.cmake similarity index 53% rename from external/cutt.cmake rename to external/librett.cmake index dbf4e94f91..a238f3af92 100644 --- a/external/cutt.cmake +++ b/external/librett.cmake @@ -1,48 +1,48 @@ ## -## find cuTT +## find LibreTT ## -find_path(_CUTT_INSTALL_DIR NAMES include/cutt.h lib/libcutt.a HINTS ${CUTT_INSTALL_DIR}) +find_path(_LIBRETT_INSTALL_DIR NAMES include/librett.h lib/librett.a HINTS ${LIBRETT_INSTALL_DIR}) -if( _CUTT_INSTALL_DIR ) +if( _LIBRETT_INSTALL_DIR ) - message(STATUS "cuTT found at ${_CUTT_INSTALL_DIR}") + message(STATUS "LibreTT found at ${_LIBRETT_INSTALL_DIR}") elseif(TA_EXPERT) - message("** cuTT was not found") - message(STATUS "** Downloading and building cuTT is explicitly disabled in EXPERT mode") + message("** LibreTT was not found") + message(STATUS "** Downloading and building LibreTT is explicitly disabled in EXPERT mode") else() - # TODO need to fix the auto installation of cuTT + # TODO need to fix the auto installation of LibreTT include(ExternalProject) # to pass CMAKE_C_* vars to external project enable_language(C) - # set source and build path for cuTT in the TiledArray project - set(EXTERNAL_SOURCE_DIR ${FETCHCONTENT_BASE_DIR}/cutt-src) - # cutt only supports in source build - set(EXTERNAL_BUILD_DIR ${FETCHCONTENT_BASE_DIR}/cutt-build) + # set source and build path for LibreTT in the TiledArray project + set(EXTERNAL_SOURCE_DIR ${FETCHCONTENT_BASE_DIR}/librett-src) + # librett only supports in source build + 
set(EXTERNAL_BUILD_DIR ${FETCHCONTENT_BASE_DIR}/librett-build) set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}) - if (NOT CUTT_URL) - set(CUTT_URL https://github.com/ValeevGroup/cutt.git) - endif (NOT CUTT_URL) - if (NOT CUTT_TAG) - set(CUTT_TAG ${TA_TRACKED_CUTT_TAG}) - endif (NOT CUTT_TAG) + if (NOT LIBRETT_URL) + set(LIBRETT_URL https://github.com/victor-anisimov/librett.git) + endif (NOT LIBRETT_URL) + if (NOT LIBRETT_TAG) + set(LIBRETT_TAG ${TA_TRACKED_LIBRETT_TAG}) + endif (NOT LIBRETT_TAG) - message("** Will clone cuTT from ${CUTT_URL}") + message("** Will clone LibreTT from ${LIBRETT_URL}") # need to change the separator of list to avoid issues with ExternalProject parsing # set(CUDA_FLAGS "${CUDA_NVCC_FLAGS}") # string(REPLACE ";" "::" CUDA_FLAGS "${CUDA_NVCC_FLAGS}") #message(STATUS "CUDA_FLAGS: " "${CUDA_FLAGS}") - set(CUTT_CMAKE_ARGS + set(LIBRETT_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DCMAKE_POSITION_INDEPENDENT_CODE=${CMAKE_POSITION_INDEPENDENT_CODE} @@ -66,87 +66,88 @@ else() -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} -DENABLE_UMPIRE=OFF - -DCUTT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool + -DLIBRETT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool -DCMAKE_PREFIX_PATH=${_UMPIRE_INSTALL_DIR} -DENABLE_NO_ALIGNED_ALLOC=ON -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} + -DENABLE_CUDA=ON ) if (DEFINED CMAKE_CUDA_ARCHITECTURES) - list(APPEND CUTT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) + list(APPEND LIBRETT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) endif(DEFINED CMAKE_CUDA_ARCHITECTURES) if (CMAKE_TOOLCHAIN_FILE) - set(CUTT_CMAKE_ARGS "${CUTT_CMAKE_ARGS}" + set(LIBRETT_CMAKE_ARGS "${LIBRETT_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") endif(CMAKE_TOOLCHAIN_FILE) if (BUILD_SHARED_LIBS) - set(CUTT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) - set(CUTT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) endif(BUILD_SHARED_LIBS) # N.B. 
Ninja needs spelling out the byproducts of custom targets, see https://cmake.org/cmake/help/v3.3/policy/CMP0058.html - set(CUTT_BUILD_BYPRODUCTS "${EXTERNAL_BUILD_DIR}/src/libcutt${CUTT_DEFAULT_LIBRARY_SUFFIX}") - message(STATUS "custom target cutt is expected to build these byproducts: ${CUTT_BUILD_BYPRODUCTS}") + set(LIBRETT_BUILD_BYPRODUCTS "${EXTERNAL_BUILD_DIR}/src/librett${LIBRETT_DEFAULT_LIBRARY_SUFFIX}") + message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}") - ExternalProject_Add(cutt + ExternalProject_Add(librett PREFIX ${CMAKE_INSTALL_PREFIX} - STAMP_DIR ${FETCHCONTENT_BASE_DIR}/cutt-ep-artifacts - TMP_DIR ${FETCHCONTENT_BASE_DIR}/cutt-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable + STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts + TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} - GIT_REPOSITORY ${CUTT_URL} - GIT_TAG ${CUTT_TAG} + GIT_REPOSITORY ${LIBRETT_URL} + GIT_TAG ${LIBRETT_TAG} #--Configure step------------- SOURCE_DIR ${EXTERNAL_SOURCE_DIR} LIST_SEPARATOR :: UPDATE_DISCONNECTED 1 CMAKE_ARGS - ${CUTT_CMAKE_ARGS} + ${LIBRETT_CMAKE_ARGS} ${EXTERNAL_SOURCE_DIR} #--Build step----------------- BINARY_DIR ${EXTERNAL_BUILD_DIR} - BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cutt -v - BUILD_BYPRODUCTS ${CUTT_BUILD_BYPRODUCTS} + BUILD_COMMAND ${CMAKE_COMMAND} --build . --target librett -v + BUILD_BYPRODUCTS ${LIBRETT_BUILD_BYPRODUCTS} #--Install step--------------- - INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "cuTT will be installed during TiledArray's installation." + INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "LibreTT will be installed during TiledArray's installation." 
#--Custom targets------------- STEP_TARGETS build ) - # TiledArray_CUTT target depends on existence of this directory to be usable from the build tree at configure time + # TiledArray_LIBRETT target depends on existence of this directory to be usable from the build tree at configure time execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src") - # do install of cuTT as part of building TiledArray's install target + # do install of LibreTT as part of building TiledArray's install target install(CODE "execute_process( COMMAND \"${CMAKE_COMMAND}\" \"--build\" \".\" \"--target\" \"install\" WORKING_DIRECTORY \"${EXTERNAL_BUILD_DIR}\" RESULT_VARIABLE error_code) if(error_code) - message(FATAL_ERROR \"Failed to install cuTT\") + message(FATAL_ERROR \"Failed to install LibreTT\") endif() ") - # Add cuTT dependency to External - add_dependencies(External-tiledarray cutt-build) + # Add LibreTT dependency to External + add_dependencies(External-tiledarray librett-build) - set(_CUTT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) + set(_LIBRETT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) -endif(_CUTT_INSTALL_DIR) +endif(_LIBRETT_INSTALL_DIR) -add_library(TiledArray_CUTT INTERFACE) +add_library(TiledArray_LIBRETT INTERFACE) -set_target_properties(TiledArray_CUTT +set_target_properties(TiledArray_LIBRETT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "$;$" + "$;$" INTERFACE_LINK_LIBRARIES - "$;$" + "$;$" ) -install(TARGETS TiledArray_CUTT EXPORT tiledarray COMPONENT tiledarray) +install(TARGETS TiledArray_LIBRETT EXPORT tiledarray COMPONENT tiledarray) -#TODO test cuTT +#TODO test LibreTT diff --git a/external/versions.cmake b/external/versions.cmake index 12b2746796..b2eb0b3719 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -27,8 +27,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) set(TA_TRACKED_BTAS_TAG f1d9eaeaf8f88f54defec991d34c7790c6c45bb2) set(TA_TRACKED_BTAS_PREVIOUS_TAG 240b49b033864b34d74f2b8d6dd55f2ab524eae3) -set(TA_TRACKED_CUTT_TAG 0e8685bf82910bc7435835f846e88f1b39f47f09) -set(TA_TRACKED_CUTT_PREVIOUS_TAG 592198b93c93b7ca79e7900b9a9f2e79f9dafec3) +set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) +set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2578b4eb1a..c0b69b9b32 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -201,7 +201,7 @@ if(CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h - TiledArray/external/cutt.h + TiledArray/external/librett.h TiledArray/cuda/cublas.h TiledArray/cuda/btas_cublas.h TiledArray/cuda/btas_um_tensor.h @@ -250,7 +250,7 @@ if(CUDA_FOUND) LANGUAGE CUDA) # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_CUTT) + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT) endif(CUDA_FOUND) diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/cuda/btas_um_tensor.h index d6012f00f1..7bddc4a178 100644 --- a/src/TiledArray/cuda/btas_um_tensor.h +++ b/src/TiledArray/cuda/btas_um_tensor.h @@ -32,7 +32,7 @@ #include #include -#include +#include #include namespace TiledArray { @@ -95,7 +95,8 @@ namespace TiledArray { /// gemm /// -template >> +template >> btasUMTensorVarray gemm( const btasUMTensorVarray &left, const btasUMTensorVarray &right, Scalar factor, @@ 
-103,7 +104,8 @@ btasUMTensorVarray gemm( return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper); } -template >> +template >> void gemm(btasUMTensorVarray &result, const btasUMTensorVarray &left, const btasUMTensorVarray &right, Scalar factor, @@ -159,8 +161,8 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, /// shift to /// template -btasUMTensorVarray& shift_to(btasUMTensorVarray &arg, - const Index &range_shift) { +btasUMTensorVarray &shift_to(btasUMTensorVarray &arg, + const Index &range_shift) { const_cast(arg.range()).inplace_shift(range_shift); return arg; } @@ -187,8 +189,8 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, std::move(storage)); // invoke the permute function - cutt_permute(const_cast(device_data(arg.storage())), - device_data(result.storage()), arg.range(), perm, stream); + librett_permute(const_cast(device_data(arg.storage())), + device_data(result.storage()), arg.range(), perm, stream); synchronize_stream(&stream); @@ -199,24 +201,29 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, /// scale /// -template >> +template >> btasUMTensorVarray scale(const btasUMTensorVarray &arg, const Scalar factor) { detail::to_cuda(arg); return btas_tensor_scale_cuda_impl(arg, factor); } -template >> -btasUMTensorVarray& scale_to(btasUMTensorVarray &arg, const Scalar factor) { +template >> +btasUMTensorVarray &scale_to(btasUMTensorVarray &arg, + const Scalar factor) { detail::to_cuda(arg); btas_tensor_scale_to_cuda_impl(arg, factor); return arg; } -template && TiledArray::detail::is_permutation_v>> +template < + typename T, typename Range, typename Scalar, typename Perm, + typename = std::enable_if_t && + TiledArray::detail::is_permutation_v>> btasUMTensorVarray scale(const btasUMTensorVarray &arg, - const Scalar factor, - const Perm &perm) { + const Scalar factor, const Perm &perm) { auto result = scale(arg, factor); // wait to finish before switch stream @@ -236,7 +243,9 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg) { return btas_tensor_scale_cuda_impl(arg, T(-1.0)); } -template >> +template < + typename T, typename Range, typename Perm, + typename = std::enable_if_t>> btasUMTensorVarray neg(const btasUMTensorVarray &arg, const Perm &perm) { auto result = neg(arg); @@ -249,7 +258,7 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg, } template -btasUMTensorVarray& neg_to(btasUMTensorVarray &arg) { +btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { detail::to_cuda(arg); btas_tensor_scale_to_cuda_impl(arg, T(-1.0)); return arg; @@ -267,7 +276,8 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0)); } -template >> +template >> btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { @@ -276,7 +286,9 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, return result; } -template >> +template < + typename T, typename Range, typename Perm, + typename = std::enable_if_t>> btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { @@ -289,11 +301,13 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, return permute(result, perm); } -template && TiledArray::detail::is_permutation_v>> +template < + typename T, typename Scalar, typename Range, typename Perm, + typename = std::enable_if_t && + TiledArray::detail::is_permutation_v>> btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, - const Scalar factor, - const Perm 
&perm) { + const Scalar factor, const Perm &perm) { auto result = subt(arg1, arg2, factor); // wait to finish before switch stream @@ -308,17 +322,20 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, /// template -btasUMTensorVarray& subt_to(btasUMTensorVarray &result, - const btasUMTensorVarray &arg1) { +btasUMTensorVarray &subt_to( + btasUMTensorVarray &result, + const btasUMTensorVarray &arg1) { detail::to_cuda(result); detail::to_cuda(arg1); btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0)); return result; } -template >> -btasUMTensorVarray& subt_to(btasUMTensorVarray &result, - const btasUMTensorVarray &arg1, const Scalar factor) { +template >> +btasUMTensorVarray &subt_to(btasUMTensorVarray &result, + const btasUMTensorVarray &arg1, + const Scalar factor) { subt_to(result, arg1); btas_tensor_scale_to_cuda_impl(result, factor); return result; @@ -336,7 +353,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0)); } -template >> +template >> btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { @@ -345,11 +363,13 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, return result; } -template && TiledArray::detail::is_permutation_v>> +template < + typename T, typename Scalar, typename Range, typename Perm, + typename = std::enable_if_t && + TiledArray::detail::is_permutation_v>> btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, - const Scalar factor, - const Perm &perm) { + const Scalar factor, const Perm &perm) { auto result = add(arg1, arg2, factor); // wait to finish before switch stream @@ -359,7 +379,9 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, return permute(result, perm); } -template >> +template < + typename T, typename Range, typename Perm, + typename = std::enable_if_t>> btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { @@ -377,17 +399,19 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, /// template -btasUMTensorVarray& add_to(btasUMTensorVarray &result, - const btasUMTensorVarray &arg) { +btasUMTensorVarray &add_to(btasUMTensorVarray &result, + const btasUMTensorVarray &arg) { detail::to_cuda(result); detail::to_cuda(arg); btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); return result; } -template >> -btasUMTensorVarray& add_to(btasUMTensorVarray &result, - const btasUMTensorVarray &arg, const Scalar factor) { +template >> +btasUMTensorVarray &add_to(btasUMTensorVarray &result, + const btasUMTensorVarray &arg, + const Scalar factor) { add_to(result, arg); btas_tensor_scale_to_cuda_impl(result, factor); return result; @@ -416,7 +440,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, return btas_tensor_mult_cuda_impl(arg1, arg2); } -template >> +template >> btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { @@ -425,7 +450,9 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, return result; } -template >> +template < + typename T, typename Range, typename Perm, + typename = std::enable_if_t>> btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { @@ -438,11 +465,13 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, return permute(result, perm); } -template && TiledArray::detail::is_permutation_v>> +template < + typename T, typename Range, typename Scalar, typename Perm, + typename = std::enable_if_t 
&& + TiledArray::detail::is_permutation_v>> btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, - const Scalar factor, - const Perm &perm) { + const Scalar factor, const Perm &perm) { auto result = mult(arg1, arg2, factor); // wait to finish before switch stream @@ -456,17 +485,19 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, /// mult to /// template -btasUMTensorVarray& mult_to(btasUMTensorVarray &result, - const btasUMTensorVarray &arg) { +btasUMTensorVarray &mult_to(btasUMTensorVarray &result, + const btasUMTensorVarray &arg) { detail::to_cuda(result); detail::to_cuda(arg); btas_tensor_mult_to_cuda_impl(result, arg); return result; } -template >> -btasUMTensorVarray& mult_to(btasUMTensorVarray &result, - const btasUMTensorVarray &arg, const Scalar factor) { +template >> +btasUMTensorVarray &mult_to(btasUMTensorVarray &result, + const btasUMTensorVarray &arg, + const Scalar factor) { mult_to(result, arg); btas_tensor_scale_to_cuda_impl(result, factor); return result; diff --git a/src/TiledArray/external/eigen.h b/src/TiledArray/external/eigen.h index 6ee0eaea3f..cd2c50b522 100644 --- a/src/TiledArray/external/eigen.h +++ b/src/TiledArray/external/eigen.h @@ -46,7 +46,14 @@ TILEDARRAY_PRAGMA_GCC(system_header) #endif #include + +// disable warnings re: ignored attributes on template argument +// Eigen::PacketType::type +// {aka __vector(2) long long int} +TILEDARRAY_PRAGMA_GCC(diagnostic push) +TILEDARRAY_PRAGMA_GCC(diagnostic ignored "-Wignored-attributes") #include +TILEDARRAY_PRAGMA_GCC(diagnostic pop) #if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_LAPACKE_STRICT) #if !EIGEN_VERSION_AT_LEAST(3, 3, 7) diff --git a/src/TiledArray/external/cutt.h b/src/TiledArray/external/librett.h similarity index 76% rename from src/TiledArray/external/cutt.h rename to src/TiledArray/external/librett.h index a2a31ec20d..46d116c45b 100644 --- a/src/TiledArray/external/cutt.h +++ b/src/TiledArray/external/librett.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED -#define TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED +#ifndef TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED +#define TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED #include @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include @@ -77,38 +77,39 @@ inline void permutation_to_col_major(std::vector& perm) { * @param stream the CUDA stream this permutation will be submitted to */ template -void cutt_permute(T* inData, T* outData, const TiledArray::Range& range, - const TiledArray::Permutation& perm, cudaStream_t stream) { +void librett_permute(T* inData, T* outData, const TiledArray::Range& range, + const TiledArray::Permutation& perm, cudaStream_t stream) { auto extent = range.extent(); std::vector extent_int(extent.begin(), extent.end()); - // cuTT uses FROM notation + // LibreTT uses FROM notation auto perm_inv = perm.inv(); std::vector perm_int(perm_inv.begin(), perm_inv.end()); - // cuTT uses ColMajor + // LibreTT uses ColMajor TiledArray::extent_to_col_major(extent_int); TiledArray::permutation_to_col_major(perm_int); - cuttResult_t status; + // librettResult_t status; + librettResult status; - cuttHandle plan; - status = cuttPlan(&plan, range.rank(), extent_int.data(), perm_int.data(), - sizeof(T), stream); + librettHandle plan; + status = librettPlan(&plan, range.rank(), extent_int.data(), perm_int.data(), + sizeof(T), stream); - TA_ASSERT(status == CUTT_SUCCESS); + TA_ASSERT(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, inData, outData); + status = 
librettExecute(plan, inData, outData); - TA_ASSERT(status == CUTT_SUCCESS); + TA_ASSERT(status == LIBRETT_SUCCESS); - status = cuttDestroy(plan); + status = librettDestroy(plan); - TA_ASSERT(status == CUTT_SUCCESS); + TA_ASSERT(status == LIBRETT_SUCCESS); } } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA -#endif // TILEDARRAY_EXTERNAL_CUTT_H__INCLUDED +#endif // TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED diff --git a/src/TiledArray/initialize.h b/src/TiledArray/initialize.h index c86fa1d151..324f772ccf 100644 --- a/src/TiledArray/initialize.h +++ b/src/TiledArray/initialize.h @@ -60,10 +60,34 @@ inline World& initialize(int& argc, char**& argv, const MPI_Comm& comm, /// @} +#ifndef TA_SCOPED_INITIALIZE +/// calling this will initialize TA and then finalize it when leaving this scope +#define TA_SCOPED_INITIALIZE(args...) \ + TiledArray::initialize(args); \ + auto finalizer = TiledArray::scoped_finalizer(); +#endif + /// Finalizes TiledArray (and MADWorld runtime, if it had not been initialized /// when TiledArray::initialize was called). void finalize(); +namespace detail { +struct Finalizer { + ~Finalizer() noexcept; +}; +} // namespace detail + +/// creates an object whose destruction upon leaving this scope will cause +/// TiledArray::finalize to be called +detail::Finalizer scoped_finalizer(); + +#ifndef TA_FINALIZE_AFTER_LEAVING_THIS_SCOPE +/// calling this will cause TiledArray::finalize() to be called (if needed) +/// upon leaving this scope +#define TA_FINALIZE_AFTER_LEAVING_THIS_SCOPE() \ + auto finalizer = TiledArray::scoped_finalizer(); +#endif + void taskq_wait_busy(); void taskq_wait_yield(); void taskq_wait_usleep(int); diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 088e4b9210..b4700ddec9 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -9,7 +9,7 @@ #ifdef TILEDARRAY_HAS_CUDA #include #include -#include +#include #endif #if TILEDARRAY_HAS_TTG @@ -29,16 +29,18 @@ inline void cuda_initialize() { cudaEnv::instance(); // cuBLASHandlePool::handle(); - // initialize cuTT - cuttInitialize(); + // initialize LibreTT + librettInitialize(); } /// finalize cuda environment inline void cuda_finalize() { CudaSafeCall(cudaDeviceSynchronize()); - cuttFinalize(); + librettFinalize(); cublasDestroy(cuBLASHandlePool::handle()); delete &cuBLASHandlePool::handle(); + // although TA::cudaEnv is a singleton, must explicitly delete it so + // that CUDA runtime is not finalized before the cudaEnv dtor is called cudaEnv::instance().reset(nullptr); } #endif @@ -173,6 +175,16 @@ void TiledArray::finalize() { finalized_accessor() = true; } +TiledArray::detail::Finalizer::~Finalizer() noexcept { + static std::mutex mtx; + std::scoped_lock lock(mtx); + if (TiledArray::initialized()) { + TiledArray::finalize(); + } +} + +TiledArray::detail::Finalizer TiledArray::scoped_finalizer() { return {}; } + void TiledArray::ta_abort() { SafeMPI::COMM_WORLD.Abort(); } void TiledArray::ta_abort(const std::string& m) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3335f9acb8..88ea115334 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,7 +101,7 @@ set(ta_test_src_files ta_test.cpp ) if(CUDA_FOUND) - list(APPEND ta_test_src_files cutt.cpp expressions_cuda_um.cpp tensor_um.cpp) + list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp) endif() # if tiledarray library was compiled without exceptions, use TA header-only (see below) @@ -154,9 +154,11 @@ if(ENABLE_MPI) $ --log_level=unit_scope 
${${executable}_np_${p}_args} ${MPIEXEC_POSTFLAGS} ) + # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT MAD_NUM_THREADS=2) + ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ) if (p GREATER 1) set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES ENVIRONMENT TA_UT_DISTRIBUTED=1) @@ -165,7 +167,9 @@ if(ENABLE_MPI) else() add_test(NAME tiledarray/unit/run-np-1 COMMAND ${executable}) + # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-1 PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT MAD_NUM_THREADS=2) + ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ) endif() diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp index b03ec0c994..a17b749789 100644 --- a/tests/expressions_cuda_um.cpp +++ b/tests/expressions_cuda_um.cpp @@ -123,7 +123,7 @@ struct UMExpressionsFixture : public TiledRangeFixture { TArrayUMD u; TArrayUMD v; TArrayUMD w; - double tolerance = 1.0e-14; + static constexpr double tolerance = 5.0e-14; }; // UMExpressionsFixture // Instantiate static variables for fixture @@ -305,7 +305,8 @@ BOOST_AUTO_TEST_CASE(permute) { BOOST_REQUIRE_NO_THROW(a("a,b,c") = b("c,b,a")); for (std::size_t i = 0ul; i < b.size(); ++i) { - const std::size_t perm_index = a.range().ordinal(perm * b.range().idx(i)); + const std::size_t perm_index = + a.tiles_range().ordinal(perm * b.tiles_range().idx(i)); if (a.is_local(perm_index)) { TArrayUMD::value_type a_tile = a.find(perm_index).get(); TArrayUMD::value_type perm_b_tile = permute_fn(b.find(i), perm); @@ -333,7 +334,8 @@ BOOST_AUTO_TEST_CASE(permute) { BOOST_REQUIRE_NO_THROW(a("a,b,c") = b("b,c,a")); for (std::size_t i = 0ul; i < b.size(); ++i) { - const std::size_t perm_index = a.range().ordinal(perm2 * b.range().idx(i)); + const std::size_t perm_index = + a.tiles_range().ordinal(perm2 * b.tiles_range().idx(i)); if (a.is_local(perm_index)) { TArrayUMD::value_type a_tile = a.find(perm_index).get(); TArrayUMD::value_type perm_b_tile = permute_fn(b.find(i), perm2); @@ -350,7 +352,8 @@ BOOST_AUTO_TEST_CASE(scale_permute) { BOOST_REQUIRE_NO_THROW(a("a,b,c") = 2 * b("c,b,a")); for (std::size_t i = 0ul; i < b.size(); ++i) { - const std::size_t perm_index = a.range().ordinal(perm * b.range().idx(i)); + const std::size_t perm_index = + a.tiles_range().ordinal(perm * b.tiles_range().idx(i)); if (a.is_local(perm_index)) { TArrayUMD::value_type a_tile = a.find(perm_index).get(); TArrayUMD::value_type perm_b_tile = permute_fn(b.find(i), perm); @@ -517,6 +520,30 @@ BOOST_AUTO_TEST_CASE(scal_block) { } } +BOOST_AUTO_TEST_CASE(scal_add_block) { + Permutation perm({2, 1, 0}); + BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); + + BOOST_REQUIRE_NO_THROW(c("a,b,c") = + 2 * (3 * a("a,b,c").block({3, 3, 3}, {5, 5, 5}) + + 4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5}))); + + for (std::size_t index = 0ul; index < block_range.volume(); ++index) { + if (!a.is_zero(block_range.ordinal(index)) && + !b.is_zero(block_range.ordinal(index))) { + auto a_tile = a.find(block_range.ordinal(index)).get(); + auto b_tile = b.find(block_range.ordinal(index)).get(); + auto result_tile = c.find(index).get(); + + for (std::size_t j = 0ul; j < result_tile.range().volume(); ++j) { + BOOST_CHECK_EQUAL(result_tile[j], 2 * (3 * a_tile[j] + 4 * b_tile[j])); + } + } else { + BOOST_CHECK(c.is_zero(index)); + } + } +} + 
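N.B. the recurring change from c.range().ordinal(...) to c.tiles_range().ordinal(...) in these unit tests addresses tile indices: each loop runs over tile ordinals, permutes the corresponding tile coordinates, and maps them back to ordinals via the array's tile-index range. A minimal sketch of that idiom, using hypothetical names arr and perm that are not part of this patch:

    // Sketch only: mirrors the updated test loops; arr/perm are illustrative.
    #include <tiledarray.h>

    void visit_permuted_tiles(TiledArray::TArrayD& arr,
                              const TiledArray::Permutation& perm) {
      const auto& tiles = arr.tiles_range();  // range over tile indices
      for (std::size_t ord = 0ul; ord < arr.size(); ++ord) {
        // tile ordinal -> tile coordinate, apply the permutation, back to an ordinal
        const std::size_t perm_ord = tiles.ordinal(perm * tiles.idx(ord));
        if (arr.is_local(perm_ord) && !arr.is_zero(perm_ord)) {
          auto tile = arr.find(perm_ord).get();  // fetch the (possibly remote) tile
          (void)tile;  // the real tests compare tile contents element-wise here
        }
      }
    }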
BOOST_AUTO_TEST_CASE(permute_block) { Permutation perm({2, 1, 0}); BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -524,7 +551,8 @@ BOOST_AUTO_TEST_CASE(permute_block) { BOOST_REQUIRE_NO_THROW(c("a,b,c") = a("c,b,a").block({3, 3, 3}, {5, 5, 5})); for (std::size_t index = 0ul; index < block_range.volume(); ++index) { - const size_t perm_index = c.range().ordinal(perm * c.range().idx(index)); + const size_t perm_index = + c.tiles_range().ordinal(perm * c.tiles_range().idx(index)); if (!a.is_zero(block_range.ordinal(perm_index))) { auto arg_tile = permute_fn(a.find(block_range.ordinal(perm_index)), perm); @@ -543,7 +571,8 @@ BOOST_AUTO_TEST_CASE(permute_block) { 2 * a("c,b,a").block({3, 3, 3}, {5, 5, 5})); for (std::size_t index = 0ul; index < block_range.volume(); ++index) { - const size_t perm_index = c.range().ordinal(perm * c.range().idx(index)); + const size_t perm_index = + c.tiles_range().ordinal(perm * c.tiles_range().idx(index)); if (!a.is_zero(block_range.ordinal(perm_index))) { auto arg_tile = permute_fn(a.find(block_range.ordinal(perm_index)), perm); @@ -563,7 +592,8 @@ BOOST_AUTO_TEST_CASE(permute_block) { 4 * b("a,b,c").block({3, 3, 3}, {5, 5, 5}))); for (std::size_t index = 0ul; index < block_range.volume(); ++index) { - const size_t perm_index = c.range().ordinal(perm * c.range().idx(index)); + const size_t perm_index = + c.tiles_range().ordinal(perm * c.tiles_range().idx(index)); if (!a.is_zero(block_range.ordinal(perm_index)) || !b.is_zero(block_range.ordinal(index))) { @@ -584,7 +614,8 @@ BOOST_AUTO_TEST_CASE(permute_block) { 4 * b("c,b,a").block({3, 3, 3}, {5, 5, 5}))); for (std::size_t index = 0ul; index < block_range.volume(); ++index) { - const size_t perm_index = c.range().ordinal(perm * c.range().idx(index)); + const size_t perm_index = + c.tiles_range().ordinal(perm * c.tiles_range().idx(index)); if (!a.is_zero(block_range.ordinal(perm_index)) || !b.is_zero(block_range.ordinal(perm_index))) { @@ -867,7 +898,8 @@ BOOST_AUTO_TEST_CASE(add_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = b.find(i).get(); @@ -879,7 +911,8 @@ BOOST_AUTO_TEST_CASE(add_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); @@ -958,7 +991,8 @@ BOOST_AUTO_TEST_CASE(scale_add_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = b.find(i).get(); @@ -970,7 +1004,8 @@ BOOST_AUTO_TEST_CASE(scale_add_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + 
c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); @@ -1058,7 +1093,8 @@ BOOST_AUTO_TEST_CASE(subt_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = b.find(i).get(); @@ -1070,7 +1106,8 @@ BOOST_AUTO_TEST_CASE(subt_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); @@ -1133,7 +1170,8 @@ BOOST_AUTO_TEST_CASE(scale_subt_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = b.find(i).get(); @@ -1145,7 +1183,8 @@ BOOST_AUTO_TEST_CASE(scale_subt_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); @@ -1233,7 +1272,8 @@ BOOST_AUTO_TEST_CASE(mult_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = b.find(i).get(); @@ -1245,7 +1285,8 @@ BOOST_AUTO_TEST_CASE(mult_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); @@ -1308,7 +1349,8 @@ BOOST_AUTO_TEST_CASE(scale_mult_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = b.find(i).get(); @@ -1320,7 +1362,8 @@ BOOST_AUTO_TEST_CASE(scale_mult_permute) { for (std::size_t i = 0ul; i < c.size(); ++i) { TArrayUMD::value_type c_tile = c.find(i).get(); - const size_t perm_index = c.range().ordinal(perm * a.range().idx(i)); + const size_t perm_index = + 
c.tiles_range().ordinal(perm * a.tiles_range().idx(i)); TArrayUMD::value_type a_tile = permute_fn(a.find(perm_index), perm); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); @@ -2459,7 +2502,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) { double expected = 0; for (std::size_t i = 0ul; i < a.size(); ++i) { TArrayUMD::value_type a_tile = a.find(i).get(); - const size_t perm_index = a.range().ordinal(perm * b.range().idx(i)); + const size_t perm_index = + a.tiles_range().ordinal(perm * b.tiles_range().idx(i)); TArrayUMD::value_type b_tile = permute_fn(b.find(perm_index), perm); for (std::size_t j = 0ul; j < a_tile.size(); ++j) @@ -2476,7 +2520,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) { // Compute the expected value for the dot function. for (std::size_t i = 0ul; i < a.size(); ++i) { - const size_t perm_index = a.range().ordinal(perm * b.range().idx(i)); + const size_t perm_index = + a.tiles_range().ordinal(perm * b.tiles_range().idx(i)); if (!a.is_zero(i) && !b.is_zero(perm_index)) { auto a_tile = a.find(i).get(); auto b_tile = perm * b.find(perm_index).get(); @@ -2495,7 +2540,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) { // Compute the expected value for the dot function. for (std::size_t i = 0ul; i < a.size(); ++i) { - const size_t perm_index = a.range().ordinal(perm * b.range().idx(i)); + const size_t perm_index = + a.tiles_range().ordinal(perm * b.tiles_range().idx(i)); if (!a.is_zero(i) && !b.is_zero(perm_index)) { auto a_tile = a.find(i).get(); auto b_tile = perm * b.find(perm_index).get(); @@ -2516,7 +2562,8 @@ BOOST_AUTO_TEST_CASE(dot_permute) { // Compute the expected value for the dot function. for (std::size_t i = 0ul; i < a.size(); ++i) { - const size_t perm_index = a.range().ordinal(perm * b.range().idx(i)); + const size_t perm_index = + a.tiles_range().ordinal(perm * b.tiles_range().idx(i)); if (!a.is_zero(i) && !b.is_zero(perm_index)) { auto a_tile = a.find(i).get(); auto b_tile = perm * b.find(perm_index).get(); diff --git a/tests/cutt.cpp b/tests/librett.cpp similarity index 81% rename from tests/cutt.cpp rename to tests/librett.cpp index 8a6b1af539..91c5b5b8ad 100644 --- a/tests/cutt.cpp +++ b/tests/librett.cpp @@ -27,8 +27,8 @@ #include #include "unit_test_config.h" -struct cuTTFixture { - // cuTTFixture() +struct LibreTTFixture { + // LibreTTFixture() // : A(100), // B(50), // C(20), @@ -36,16 +36,16 @@ struct cuTTFixture { // extent({100, 100}), // extent_nonsym({100, 50}), // perm({1, 0}) {} - cuTTFixture() : A(10), B(5), C(2) {} + LibreTTFixture() : A(10), B(5), C(2) {} int A; int B; int C; }; -BOOST_FIXTURE_TEST_SUITE(cutt_suite, cuTTFixture, TA_UT_LABEL_SERIAL); +BOOST_FIXTURE_TEST_SUITE(librett_suite, LibreTTFixture, TA_UT_LABEL_SERIAL); -BOOST_AUTO_TEST_CASE(cutt_gpu_mem) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem) { int* a_host = (int*)std::malloc(A * A * sizeof(int)); int* b_host = (int*)std::malloc(A * A * sizeof(int)); int iter = 0; @@ -68,17 +68,18 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); - 
cuttDestroy(plan); + BOOST_CHECK(status == LIBRETT_SUCCESS); + librettDestroy(plan); cudaMemcpy(b_host, b_device, A * A * sizeof(int), cudaMemcpyDeviceToHost); @@ -97,7 +98,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { int* a_host = (int*)std::malloc(A * B * sizeof(int)); int* b_host = (int*)std::malloc(A * B * sizeof(int)); int iter = 0; @@ -115,8 +116,9 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({B, A}); TiledArray::extent_to_col_major(extent); @@ -124,14 +126,14 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); - cuttDestroy(plan); + BOOST_CHECK(status == LIBRETT_SUCCESS); + librettDestroy(plan); cudaMemcpy(b_host, b_device, A * B * sizeof(int), cudaMemcpyDeviceToHost); @@ -150,7 +152,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { int* a_host = (int*)std::malloc(A * B * C * sizeof(int)); int* b_host = (int*)std::malloc(A * B * C * sizeof(int)); int iter = 0; @@ -172,28 +174,29 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent3{int(A), int(B), int(C)}; std::vector perm3{1, 0, 2}; // std::vector perm3{0, 2, 1}; - status = cuttPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int), + status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int), 0, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); - status = cuttDestroy(plan); + status = librettDestroy(plan); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); iter = 0; for (std::size_t k = 0; k < C; k++) { @@ -212,7 +215,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_column_major) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { +BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { int* a_host = (int*)std::malloc(A * B * C * sizeof(int)); int* b_host = (int*)std::malloc(A * B * C * sizeof(int)); int iter = 0; @@ -234,8 +237,9 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({A, B, C}); TiledArray::extent_to_col_major(extent); @@ -243,20 +247,20 
@@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { std::vector perm({1, 0, 2}); TiledArray::permutation_to_col_major(perm); - status = cuttPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0, + status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_device, b_device); + status = librettExecute(plan, a_device, b_device); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); - status = cuttDestroy(plan); + status = librettDestroy(plan); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -275,7 +279,7 @@ BOOST_AUTO_TEST_CASE(cutt_gpu_mem_nonsym_rank_three_row_major) { cudaFree(b_device); } -BOOST_AUTO_TEST_CASE(cutt_unified_mem) { +BOOST_AUTO_TEST_CASE(librett_unified_mem) { int* a_um; cudaMallocManaged(&a_um, A * A * sizeof(int)); @@ -290,8 +294,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) { } } - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({A, A}); TiledArray::extent_to_col_major(extent); @@ -299,15 +304,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_um, b_um); + status = librettExecute(plan, a_um, b_um); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - cuttDestroy(plan); + librettDestroy(plan); cudaDeviceSynchronize(); @@ -323,7 +328,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem) { cudaFree(b_um); } -BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { +BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { int* a_um; cudaMallocManaged(&a_um, A * B * sizeof(int)); @@ -338,8 +343,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { } } - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; std::vector extent({B, A}); TiledArray::extent_to_col_major(extent); @@ -347,15 +353,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_um, b_um); + status = librettExecute(plan, a_um, b_um); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - cuttDestroy(plan); + librettDestroy(plan); cudaDeviceSynchronize(); iter = 0; @@ -369,7 +375,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_nonsym) { cudaFree(b_um); } -BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { +BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { int* a_um; cudaMallocManaged(&a_um, A * B * C * sizeof(int)); @@ -386,8 +392,9 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { } } - cuttHandle plan; - cuttResult_t status; + librettHandle plan; + //librettResult_t status; + librettResult status; // b(k,i,j) = 
a(i,j,k) @@ -397,15 +404,15 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { std::vector perm({2, 0, 1}); TiledArray::permutation_to_col_major(perm); - status = cuttPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0); + status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - status = cuttExecute(plan, a_um, b_um); + status = librettExecute(plan, a_um, b_um); - BOOST_CHECK(status == CUTT_SUCCESS); + BOOST_CHECK(status == LIBRETT_SUCCESS); - cuttDestroy(plan); + librettDestroy(plan); cudaDeviceSynchronize(); iter = 0; @@ -421,7 +428,7 @@ BOOST_AUTO_TEST_CASE(cutt_unified_mem_rank_three) { cudaFree(b_um); } -BOOST_AUTO_TEST_CASE(cutt_um_tensor) { +BOOST_AUTO_TEST_CASE(librett_um_tensor) { TiledArray::Range range{A, A}; using Tile = TiledArray::btasUMTensorVarray; @@ -453,7 +460,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_nonsym) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_nonsym) { TiledArray::Range range{B, A}; using Tile = TiledArray::btasUMTensorVarray; @@ -485,7 +492,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_nonsym) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_three) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) { TiledArray::Range range{A, B, C}; using Tile = TiledArray::btasUMTensorVarray; @@ -540,7 +547,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_three) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_four) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) { std::size_t a = 2; std::size_t b = 3; std::size_t c = 6; @@ -609,7 +616,7 @@ BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_four) { } } -BOOST_AUTO_TEST_CASE(cutt_um_tensor_rank_six) { +BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { std::size_t a = 2; std::size_t b = 3; std::size_t c = 6; diff --git a/tests/tensor_um.cpp b/tests/tensor_um.cpp index 310e04234f..33efbfd7d4 100644 --- a/tests/tensor_um.cpp +++ b/tests/tensor_um.cpp @@ -87,8 +87,7 @@ struct TensorUMFixture { const TensorUMFixture::range_type TensorUMFixture::r = make_range(81); -BOOST_FIXTURE_TEST_SUITE(tensor_um_suite, TensorUMFixture, - TA_UT_LABEL_SERIAL) +BOOST_FIXTURE_TEST_SUITE(tensor_um_suite, TensorUMFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(default_constructor) { // check constructor @@ -98,7 +97,6 @@ BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK(x.empty()); // Check that range data is correct - BOOST_CHECK_EQUAL(x.data(), static_cast(NULL)); BOOST_CHECK_EQUAL(x.size(), 0ul); BOOST_CHECK_EQUAL(x.range().volume(), 0ul); diff --git a/tests/tot_dist_array_part2.cpp b/tests/tot_dist_array_part2.cpp index fa13819fee..b916812884 100644 --- a/tests/tot_dist_array_part2.cpp +++ b/tests/tot_dist_array_part2.cpp @@ -255,7 +255,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(trange, TestParam, test_params) { } } -BOOST_AUTO_TEST_CASE_TEMPLATE(range, TestParam, test_params) { +BOOST_AUTO_TEST_CASE_TEMPLATE(tiles_range, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1)