Skip to content

Commit

Permalink
RCCL support (#93)
Browse files Browse the repository at this point in the history
* Initial support for RCCL

* OMNITRACE_USE_RCCLP + sampling tweaks

- also OMNITRACE_SAMPLING_KEEP_INTERNAL option
- minor modifications to sampling to use keep internal option + discard funlockfile

* Update docker and workflows to download RCCL

* Update CPack DEB with rocprofiler dependency

* Rework rccl into library and library/components folder

- add tpls/rccl/rccl/rccl.h

* Fix timemory includes

* rcclp inline definitions when disabled

* Tweaks to ubuntu-focal-external-rocm

- disable ompt
- enable building testing

* Tweaks to ubuntu-focal-external-rocm

- ctest exclude

* Tweak ubuntu-focal.yml

- remove source /.../setup-env.sh, replace with $GITHUB_ENV

* Fix ubuntu-focal-rocm + OMPI + root

* Improved rocm-smi error handling

- Recover from rocm-smi errors
- Disabling rocm-smi after recovering from errors
- Werror in developer mode
- Remove State::DelayedInit
- Add State::Disabled

* formatting

* Fix merge of OMNITRACE_SAMPLING_KEEP_INTERNAL

* Update RCCL include directory

- based on the ROCm version, we need to include either <rccl/rccl.h> or <rccl.h>

* RCCL Testing

- updated tests to use configuration files
- many tests generate a configuration file
- tests now have GPU option
- enable ncclCommCount, disable ncclGetVersion
- add testing for RCCLP via rccl-tests
- working directory of tests is PROJECT_BINARY_DIR
- add nccl/rccl functions to get_whole_function_names
- some clang compiler fixes

* Handle RCCL include w/o HIP

* RCCL requires HIP

* Update OMNITRACE_SAMPLING_CPUS for testing

* Update tests/CMakeLists.txt

* Debug settings

* Install MPI even when USE_MPI=OFF

* exclude printf

* skip mpi tests w/o USE_MPI or USE_MPI_HEADERS

* update ubuntu rocm workflow

* Fix configure env step for ubuntu rocm
  • Loading branch information
jrmadsen committed Jul 25, 2022
1 parent a539da1 commit 45be039
Show file tree
Hide file tree
Showing 29 changed files with 1,814 additions and 121 deletions.
1 change: 1 addition & 0 deletions .cmake-format.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ parse:
NAME: '*'
TARGET: '*'
MPI: '*'
GPU: '*'
NUM_PROCS: '*'
REWRITE_TIMEOUT: '*'
RUNTIME_TIMEOUT: '*'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/opensuse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,5 +121,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
1 change: 1 addition & 0 deletions .github/workflows/ubuntu-bionic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,5 +150,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
119 changes: 97 additions & 22 deletions .github/workflows/ubuntu-focal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
add-apt-repository -y ppa:ubuntu-toolchain-r/test &&
apt-get update &&
apt-get upgrade -y &&
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev ${{ matrix.compiler }} &&
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev libmpich-dev mpich ${{ matrix.compiler }} &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy &&
python3 -m pip install perfetto &&
Expand Down Expand Up @@ -166,6 +166,7 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
Expand All @@ -176,12 +177,23 @@ jobs:
strategy:
matrix:
compiler: ['g++']
rocm_version: ['4.3', '4.5', 'debian']
rocm_version: ['4.3', '4.5', '5.0']
mpi_headers: ['OFF']
build_jobs: ['4']
ctest_exclude: ['-LE "mpi-example|transpose"']
perfetto-tools: ['ON']
include:
- compiler: 'g++'
rocm_version: 'debian'
mpi_headers: 'ON'
build_jobs: '2'
ctest_exclude: '-LE transpose'
perfetto-tools: 'OFF'

env:
BUILD_TYPE: MinSizeRel
OMNITRACE_OUTPUT_PATH: omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX: "%argt%/"
OMPI_ALLOW_RUN_AS_ROOT: 1
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1

steps:
- uses: actions/checkout@v2
Expand All @@ -194,18 +206,41 @@ jobs:
wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list &&
apt-get update &&
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl &&
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev &&
python3 -m pip install --upgrade pip &&
python3 -m pip install 'cmake==3.16.3' &&
for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
python3 -m pip install 'cmake==3.21.4' &&
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done

- name: Configure Env
- name: Install RCCL
if: ${{ matrix.rocm_version != '4.3' }}
timeout-minutes: 5
run:
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV &&
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV &&
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV &&
echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH &&
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
apt-get install -y rccl-dev

- name: Configure Env
run: |
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
cat << EOF > test-install.cfg
OMNITRACE_USE_TIMEMORY = ON
OMNITRACE_USE_PERFETTO = ON
OMNITRACE_USE_PID = OFF
OMNITRACE_USE_SAMPLING = OFF
OMNITRACE_USE_PROCESS_SAMPLING = OFF
OMNITRACE_COUT_OUTPUT = ON
OMNITRACE_TIME_OUTPUT = OFF
OMNITRACE_TIMEMORY_COMPONENTS = cpu_clock cpu_util current_peak_rss kernel_mode_time monotonic_clock monotonic_raw_clock network_stats num_io_in num_io_out num_major_page_faults num_minor_page_faults page_rss peak_rss priority_context_switch process_cpu_clock process_cpu_util read_bytes read_char system_clock thread_cpu_clock thread_cpu_util timestamp trip_count user_clock user_mode_time virtual_memory voluntary_context_switch wall_clock written_bytes written_char
OMNITRACE_OUTPUT_PATH = omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX = %tag%/
OMNITRACE_DEBUG = OFF
OMNITRACE_VERBOSE = 3
OMNITRACE_DL_VERBOSE = 3
OMNITRACE_PERFETTO_BACKEND = system
EOF
realpath test-install.cfg
cat test-install.cfg
- name: Configure CMake
timeout-minutes: 10
Expand All @@ -217,22 +252,27 @@ jobs:
-DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
-DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }}
-DCMAKE_INSTALL_PREFIX=/opt/omnitrace
-DOMNITRACE_BUILD_TESTING=OFF
-DOMNITRACE_BUILD_TESTING=ON
-DOMNITRACE_BUILD_DEVELOPER=ON
-DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF
-DOMNITRACE_BUILD_LTO=OFF
-DOMNITRACE_USE_MPI=OFF
-DOMNITRACE_USE_MPI_HEADERS=ON
-DOMNITRACE_USE_HIP=ON
-DOMNITRACE_MAX_THREADS=32
-DOMNITRACE_USE_SANITIZER=OFF
-DOMNITRACE_USE_PAPI=OFF
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON
-DOMNITRACE_USE_OMPT=OFF
-DOMNITRACE_USE_PYTHON=ON
-DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }}
-DOMNITRACE_USE_SANITIZER=OFF
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=${{ matrix.perfetto-tools }}
-DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs
-DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10"
-DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }}

- name: Build
timeout-minutes: 60
run:
cmake --build build --target all --parallel 2 -- VERBOSE=1
cmake --build build --target all --parallel ${{ matrix.build_jobs }} -- VERBOSE=1

- name: Install
run:
Expand All @@ -244,20 +284,28 @@ jobs:
cd build &&
ldd ./bin/omnitrace &&
./bin/omnitrace --help &&
ctest -V -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
ctest -V --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure
ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure

- name: Configure Install Env
run: |
echo "/opt/omnitrace/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
echo "OMNITRACE_CONFIG_FILE=test-install.cfg" >> $GITHUB_ENV
- name: Test Install
timeout-minutes: 10
if: ${{ matrix.perfetto-tools == 'ON' }}
run: |
set -v
cat ${OMNITRACE_CONFIG_FILE}
omnitrace-perfetto-traced --background
export OMNITRACE_DEBUG=ON
export OMNITRACE_PERFETTO_BACKEND=system
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-python
omnitrace-python --help
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
Expand All @@ -272,6 +320,31 @@ jobs:
du -m ls-perfetto-trace.proto
/opt/conda/envs/py3.8/bin/python ./tests/validate-perfetto-proto.py -p -i ./ls-perfetto-trace.proto
- name: Test Install
timeout-minutes: 10
if: ${{ matrix.perfetto-tools == 'OFF' }}
run: |
set -v
cat ${OMNITRACE_CONFIG_FILE}
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-python
omnitrace-python --help
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
ldd $(which omnitrace)
omnitrace --help
omnitrace -e -v 1 -o sleep.inst --simulate -- sleep
omnitrace -e -v 1 --simulate -- sleep
omnitrace -e -v 1 -o sleep.inst -- sleep
./sleep.inst 5
omnitrace -e -v 1 -- sleep 5
cat omnitrace-tests-output/sleep.inst/wall_clock.txt
cat omnitrace-tests-output/sleep/wall_clock.txt
- name: Test User API
timeout-minutes: 10
run: |
Expand All @@ -293,6 +366,7 @@ jobs:
name: data-${{ github.job }}-files
path: |
omnitrace-tests-output/**/*.txt
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
Expand Down Expand Up @@ -445,5 +519,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
${{ github.workspace }}/build/omnitrace-tests-config/*.cfg
${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt
${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json
9 changes: 7 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support"
# Optional-feature switches. The rocm-smi and RCCL options both pass the current value
# of OMNITRACE_USE_HIP as their third argument.
# NOTE(review): presumably the third argument of omnitrace_add_option is the option's
# default value -- confirm against its definition.
omnitrace_add_option(
OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling"
${OMNITRACE_USE_HIP})
omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP})
omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON)
Expand Down Expand Up @@ -175,14 +176,18 @@ if(NOT OMNITRACE_USE_HIP)
set(OMNITRACE_USE_ROCM_SMI
OFF
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
set(OMNITRACE_USE_RCCL
OFF
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
elseif(
OMNITRACE_USE_HIP
AND NOT OMNITRACE_USE_ROCTRACER
AND NOT OMNITRACE_USE_ROCPROFILER
AND NOT OMNITRACE_USE_ROCM_SMI)
AND NOT OMNITRACE_USE_ROCM_SMI
AND NOT OMNITRACE_USE_RCCL)
omnitrace_message(
AUTHOR_WARNING
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, and rocm-smi options are disabled"
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled"
)
set(OMNITRACE_USE_HIP OFF)
endif()
Expand Down
4 changes: 4 additions & 0 deletions cmake/ConfigCPack.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ if(NOT OMNITRACE_BUILD_DYNINST)
endif()
endif()
if(ROCmVersion_FOUND)
set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCM_SMI_SUFFIX
" (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})")
Expand All @@ -167,6 +168,9 @@ endif()
# Add the ROCm development packages to the Debian dependency list for each enabled
# feature. Each package name is followed by its _<NAME>_SUFFIX variable.
# NOTE(review): the suffixes look like version constraints that are only assigned when
# ROCmVersion_FOUND is true; otherwise they presumably expand to nothing -- confirm.
if(OMNITRACE_USE_ROCTRACER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}")
endif()
if(OMNITRACE_USE_ROCPROFILER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}")
endif()
if(OMNITRACE_USE_MPI)
if("${OMNITRACE_MPI_IMPL}" STREQUAL "openmpi")
list(APPEND _DEBIAN_PACKAGE_DEPENDS "libopenmpi-dev")
Expand Down
94 changes: 94 additions & 0 deletions cmake/Modules/FindRCCL-Headers.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file
# Copyright.txt or https://cmake.org/licensing for details.

include(FindPackageHandleStandardArgs)

# ----------------------------------------------------------------------------------------#

# Directory inside the source tree that holds the project's own copy of the RCCL
# header. It is a cache entry, so it can be overridden at configure time, and it is
# the fallback used later in this module when no usable installed header is found.
set(RCCL-Headers_INCLUDE_DIR_INTERNAL
    "${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/rccl"
    CACHE PATH "Path to internal rccl.h")

# ----------------------------------------------------------------------------------------#

# Use the ROCM_PATH environment variable only when no CMake variable of that name is set.
if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
    set(ROCM_PATH "$ENV{ROCM_PATH}")
endif()

# Collect the candidate ROCm/RCCL roots that actually exist, resolved to real paths.
# The list is reset first so that values left behind by the calling scope (or by an
# earlier run of this module) cannot leak into the search hints.
set(_RCCL_PATHS "")
foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl)
    if(EXISTS "${_DIR}")
        get_filename_component(_ABS_DIR "${_DIR}" REALPATH)
        list(APPEND _RCCL_PATHS "${_ABS_DIR}")
    endif()
endforeach()

# Several candidates frequently resolve to the same real directory (e.g. a symlinked
# /opt/rocm); keep only the first occurrence so the search order is unchanged.
if(_RCCL_PATHS)
    list(REMOVE_DUPLICATES _RCCL_PATHS)
endif()

# ----------------------------------------------------------------------------------------#

# Look for an installed rccl package configuration file beneath the candidate
# directories. The lookup is QUIET: not finding it is handled below.
find_package(
    rccl QUIET CONFIG
    HINTS ${_RCCL_PATHS}
    PATHS ${_RCCL_PATHS}
    PATH_SUFFIXES rccl/lib/cmake)

# Choose the initial include directory: the one reported by the rccl package when it
# was found, otherwise the internal copy of the headers.
if(rccl_FOUND)
    set(_RCCL-Headers_DEFAULT "${rccl_INCLUDE_DIR}")
else()
    set(_RCCL-Headers_DEFAULT "${RCCL-Headers_INCLUDE_DIR_INTERNAL}")
endif()

# No FORCE here, so a value already present in the cache is left untouched.
set(RCCL-Headers_INCLUDE_DIR
    "${_RCCL-Headers_DEFAULT}"
    CACHE PATH "Path to RCCL headers")

unset(_RCCL-Headers_DEFAULT)

# The expected position of rccl.h relative to the include directory depends on the
# ROCm release (a warning was introduced with v5.2.0):
#
# * ROCm older than 5.2.0           -> <include-dir>/rccl.h
# * ROCm 5.2.0+ or version unknown  -> <include-dir>/rccl/rccl.h
#
# _RCCL-Headers_DIR is the sub-directory appended to the internal header directory so
# that the fallback location satisfies the same layout.
if(ROCmVersion_NUMERIC_VERSION AND ROCmVersion_NUMERIC_VERSION LESS 50200)
    set(_RCCL-Headers_FILE "rccl.h")
    set(_RCCL-Headers_DIR "/rccl")
else()
    set(_RCCL-Headers_FILE "rccl/rccl.h")
    set(_RCCL-Headers_DIR "")
endif()

# When the selected directory does not contain the header at the expected position,
# switch to the internal headers. FORCE is required because RCCL-Headers_INCLUDE_DIR
# is already a cache entry at this point.
if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}")
    set(_RCCL-Headers_FALLBACK
        "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}")
    # Report the directory that is really going to be cached (including any suffix).
    omnitrace_message(
        AUTHOR_WARNING
        "RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${_RCCL-Headers_FALLBACK}"
        )
    set(RCCL-Headers_INCLUDE_DIR
        "${_RCCL-Headers_FALLBACK}"
        CACHE PATH "Path to RCCL headers" FORCE)
    unset(_RCCL-Headers_FALLBACK)
endif()

unset(_RCCL-Headers_FILE)
unset(_RCCL-Headers_DIR)

# Hide the include directory entry from the default cache view in the CMake GUIs.
mark_as_advanced(RCCL-Headers_INCLUDE_DIR)

# ----------------------------------------------------------------------------------------#

# Sets RCCL-Headers_FOUND and handles the QUIET / REQUIRED arguments of find_package().
find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR)

# ------------------------------------------------------------------------------#

if(RCCL-Headers_FOUND)
    set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR})

    # Imported targets may only be created once per directory scope. Guarding the
    # creation lets find_package(RCCL-Headers) be invoked repeatedly without failing
    # on "target already exists".
    if(NOT TARGET roc::rccl-headers)
        add_library(roc::rccl-headers INTERFACE IMPORTED)
        # SYSTEM: the headers are consumed as system includes
        target_include_directories(roc::rccl-headers SYSTEM
                                   INTERFACE ${RCCL-Headers_INCLUDE_DIR})
    endif()

    # Second target named after the package; it simply forwards to roc::rccl-headers.
    if(NOT TARGET RCCL-Headers::RCCL-Headers)
        add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED)
        target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers)
    endif()
endif()

# ------------------------------------------------------------------------------#
Loading

0 comments on commit 45be039

Please sign in to comment.