Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RCCL support #93

Merged
merged 26 commits into from
Jul 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
bed55ad
Initial support for RCCL
jrmadsen Jul 18, 2022
8099c48
OMNITRACE_USE_RCCLP + sampling tweaks
jrmadsen Jul 18, 2022
2d61e18
Update docker and workflows to download RCCL
jrmadsen Jul 18, 2022
5450b24
Update CPack DEB with rocprofiler dependency
jrmadsen Jul 18, 2022
13f5a71
Rework rccl into library and library/components folder
jrmadsen Jul 18, 2022
4b6468b
Fix timemory includes
jrmadsen Jul 18, 2022
a0a5a5f
rcclp inline definitions when disabled
jrmadsen Jul 18, 2022
ea7f4d4
Tweaks to ubuntu-focal-external-rocm
jrmadsen Jul 18, 2022
0c63a45
Tweaks to ubuntu-focal-external-rocm
jrmadsen Jul 19, 2022
673870c
Tweak ubuntu-focal.yml
jrmadsen Jul 19, 2022
a743cce
Fix ubuntu-focal-rocm + OMPI + root
jrmadsen Jul 21, 2022
a4e94e2
Improved rocm-smi error handling
jrmadsen Jul 21, 2022
37f8353
formatting
jrmadsen Jul 21, 2022
c5850e5
Fix merge of OMNITRACE_SAMPLING_KEEP_INTERNAL
jrmadsen Jul 22, 2022
35105df
Update RCCL include directory
jrmadsen Jul 23, 2022
755e16c
RCCL Testing
jrmadsen Jul 25, 2022
f4e1c7f
Handle RCCL include w/o HIP
jrmadsen Jul 25, 2022
97d8cb4
RCCL requires HIP
jrmadsen Jul 25, 2022
0231f23
Update OMNITRACE_SAMPLING_CPUS for testing
jrmadsen Jul 25, 2022
a031b7e
Update tests/CMakeLists.txt
jrmadsen Jul 25, 2022
679dff8
Debug settings
jrmadsen Jul 25, 2022
df9c874
Install MPI even when USE_MPI=OFF
jrmadsen Jul 25, 2022
2781e63
exclude printf
jrmadsen Jul 25, 2022
105aecf
skip mpi tests w/o USE_MPI or USE_MPI_HEADERS
jrmadsen Jul 25, 2022
f4cfeca
update ubuntu rocm workflow
jrmadsen Jul 25, 2022
1f8037c
Fix configure env step for ubuntu rocm
jrmadsen Jul 25, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cmake-format.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ parse:
NAME: '*'
TARGET: '*'
MPI: '*'
GPU: '*'
NUM_PROCS: '*'
REWRITE_TIMEOUT: '*'
RUNTIME_TIMEOUT: '*'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/opensuse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,5 +121,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
1 change: 1 addition & 0 deletions .github/workflows/ubuntu-bionic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,5 +150,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
119 changes: 97 additions & 22 deletions .github/workflows/ubuntu-focal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
add-apt-repository -y ppa:ubuntu-toolchain-r/test &&
apt-get update &&
apt-get upgrade -y &&
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev ${{ matrix.compiler }} &&
apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev libmpich-dev mpich ${{ matrix.compiler }} &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy &&
python3 -m pip install perfetto &&
Expand Down Expand Up @@ -166,6 +166,7 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json

Expand All @@ -176,12 +177,23 @@ jobs:
strategy:
matrix:
compiler: ['g++']
rocm_version: ['4.3', '4.5', 'debian']
rocm_version: ['4.3', '4.5', '5.0']
mpi_headers: ['OFF']
build_jobs: ['4']
ctest_exclude: ['-LE "mpi-example|transpose"']
perfetto-tools: ['ON']
include:
- compiler: 'g++'
rocm_version: 'debian'
mpi_headers: 'ON'
build_jobs: '2'
ctest_exclude: '-LE transpose'
perfetto-tools: 'OFF'

env:
BUILD_TYPE: MinSizeRel
OMNITRACE_OUTPUT_PATH: omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX: "%argt%/"
OMPI_ALLOW_RUN_AS_ROOT: 1
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1

steps:
- uses: actions/checkout@v2
Expand All @@ -194,18 +206,41 @@ jobs:
wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list &&
apt-get update &&
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl &&
apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev &&
python3 -m pip install --upgrade pip &&
python3 -m pip install 'cmake==3.16.3' &&
for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
python3 -m pip install 'cmake==3.21.4' &&
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done

- name: Configure Env
- name: Install RCCL
if: ${{ matrix.rocm_version != '4.3' }}
timeout-minutes: 5
run:
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV &&
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV &&
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV &&
echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH &&
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
apt-get install -y rccl-dev

- name: Configure Env
run: |
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV
echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
cat << EOF > test-install.cfg
OMNITRACE_USE_TIMEMORY = ON
OMNITRACE_USE_PERFETTO = ON
OMNITRACE_USE_PID = OFF
OMNITRACE_USE_SAMPLING = OFF
OMNITRACE_USE_PROCESS_SAMPLING = OFF
OMNITRACE_COUT_OUTPUT = ON
OMNITRACE_TIME_OUTPUT = OFF
OMNITRACE_TIMEMORY_COMPONENTS = cpu_clock cpu_util current_peak_rss kernel_mode_time monotonic_clock monotonic_raw_clock network_stats num_io_in num_io_out num_major_page_faults num_minor_page_faults page_rss peak_rss priority_context_switch process_cpu_clock process_cpu_util read_bytes read_char system_clock thread_cpu_clock thread_cpu_util timestamp trip_count user_clock user_mode_time virtual_memory voluntary_context_switch wall_clock written_bytes written_char
OMNITRACE_OUTPUT_PATH = omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX = %tag%/
OMNITRACE_DEBUG = OFF
OMNITRACE_VERBOSE = 3
OMNITRACE_DL_VERBOSE = 3
OMNITRACE_PERFETTO_BACKEND = system
EOF
realpath test-install.cfg
cat test-install.cfg

- name: Configure CMake
timeout-minutes: 10
Expand All @@ -217,22 +252,27 @@ jobs:
-DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
-DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }}
-DCMAKE_INSTALL_PREFIX=/opt/omnitrace
-DOMNITRACE_BUILD_TESTING=OFF
-DOMNITRACE_BUILD_TESTING=ON
-DOMNITRACE_BUILD_DEVELOPER=ON
-DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF
-DOMNITRACE_BUILD_LTO=OFF
-DOMNITRACE_USE_MPI=OFF
-DOMNITRACE_USE_MPI_HEADERS=ON
-DOMNITRACE_USE_HIP=ON
-DOMNITRACE_MAX_THREADS=32
-DOMNITRACE_USE_SANITIZER=OFF
-DOMNITRACE_USE_PAPI=OFF
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON
-DOMNITRACE_USE_OMPT=OFF
-DOMNITRACE_USE_PYTHON=ON
-DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }}
-DOMNITRACE_USE_SANITIZER=OFF
-DOMNITRACE_INSTALL_PERFETTO_TOOLS=${{ matrix.perfetto-tools }}
-DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs
-DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10"
-DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }}

- name: Build
timeout-minutes: 60
run:
cmake --build build --target all --parallel 2 -- VERBOSE=1
cmake --build build --target all --parallel ${{ matrix.build_jobs }} -- VERBOSE=1

- name: Install
run:
Expand All @@ -244,20 +284,28 @@ jobs:
cd build &&
ldd ./bin/omnitrace &&
./bin/omnitrace --help &&
ctest -V -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
ctest -V --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure
ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log &&
ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure

- name: Configure Install Env
run: |
echo "/opt/omnitrace/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
echo "OMNITRACE_CONFIG_FILE=test-install.cfg" >> $GITHUB_ENV

- name: Test Install
timeout-minutes: 10
if: ${{ matrix.perfetto-tools == 'ON' }}
run: |
set -v
cat ${OMNITRACE_CONFIG_FILE}
omnitrace-perfetto-traced --background
export OMNITRACE_DEBUG=ON
export OMNITRACE_PERFETTO_BACKEND=system
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-python
omnitrace-python --help
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
Expand All @@ -272,6 +320,31 @@ jobs:
du -m ls-perfetto-trace.proto
/opt/conda/envs/py3.8/bin/python ./tests/validate-perfetto-proto.py -p -i ./ls-perfetto-trace.proto

- name: Test Install
timeout-minutes: 10
if: ${{ matrix.perfetto-tools == 'OFF' }}
run: |
set -v
cat ${OMNITRACE_CONFIG_FILE}
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-python
omnitrace-python --help
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
ldd $(which omnitrace)
omnitrace --help
omnitrace -e -v 1 -o sleep.inst --simulate -- sleep
omnitrace -e -v 1 --simulate -- sleep
omnitrace -e -v 1 -o sleep.inst -- sleep
./sleep.inst 5
omnitrace -e -v 1 -- sleep 5
cat omnitrace-tests-output/sleep.inst/wall_clock.txt
cat omnitrace-tests-output/sleep/wall_clock.txt

- name: Test User API
timeout-minutes: 10
run: |
Expand All @@ -293,6 +366,7 @@ jobs:
name: data-${{ github.job }}-files
path: |
omnitrace-tests-output/**/*.txt
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json

Expand Down Expand Up @@ -445,5 +519,6 @@ jobs:
with:
name: data-${{ github.job }}-files
path: |
${{ github.workspace }}/build/omnitrace-tests-config/*.cfg
${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt
${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json
9 changes: 7 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support"
omnitrace_add_option(
OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling"
${OMNITRACE_USE_HIP})
omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP})
omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON)
Expand Down Expand Up @@ -175,14 +176,18 @@ if(NOT OMNITRACE_USE_HIP)
set(OMNITRACE_USE_ROCM_SMI
OFF
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
set(OMNITRACE_USE_RCCL
OFF
CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE)
elseif(
OMNITRACE_USE_HIP
AND NOT OMNITRACE_USE_ROCTRACER
AND NOT OMNITRACE_USE_ROCPROFILER
AND NOT OMNITRACE_USE_ROCM_SMI)
AND NOT OMNITRACE_USE_ROCM_SMI
AND NOT OMNITRACE_USE_RCCL)
omnitrace_message(
AUTHOR_WARNING
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, and rocm-smi options are disabled"
"Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled"
)
set(OMNITRACE_USE_HIP OFF)
endif()
Expand Down
4 changes: 4 additions & 0 deletions cmake/ConfigCPack.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ if(NOT OMNITRACE_BUILD_DYNINST)
endif()
endif()
if(ROCmVersion_FOUND)
set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})")
set(_ROCM_SMI_SUFFIX
" (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})")
Expand All @@ -167,6 +168,9 @@ endif()
if(OMNITRACE_USE_ROCTRACER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}")
endif()
if(OMNITRACE_USE_ROCPROFILER)
list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}")
endif()
if(OMNITRACE_USE_MPI)
if("${OMNITRACE_MPI_IMPL}" STREQUAL "openmpi")
list(APPEND _DEBIAN_PACKAGE_DEPENDS "libopenmpi-dev")
Expand Down
94 changes: 94 additions & 0 deletions cmake/Modules/FindRCCL-Headers.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file
# Copyright.txt or https://cmake.org/licensing for details.

include(FindPackageHandleStandardArgs)

# ----------------------------------------------------------------------------------------#

# In-tree directory used as the "internal" RCCL include location. Stored in the cache
# (without FORCE), so a value already present in the cache is left as-is.
set(RCCL-Headers_INCLUDE_DIR_INTERNAL "${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/rccl"
    CACHE PATH "Path to internal rccl.h")

# ----------------------------------------------------------------------------------------#

# Use ROCM_PATH from the environment when no CMake variable of that name is set
# (the environment is only read at configure time).
if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
    set(ROCM_PATH "$ENV{ROCM_PATH}")
endif()

# Collect the candidate prefixes that exist on disk, resolved to their real paths.
# The accumulator is reset first so that including this module more than once in the
# same scope does not keep appending to a stale list.
unset(_RCCL_PATHS)
foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl)
    if(EXISTS "${_DIR}")
        get_filename_component(_ABS_DIR "${_DIR}" REALPATH)
        list(APPEND _RCCL_PATHS "${_ABS_DIR}")
    endif()
endforeach()

# Several candidates can resolve to the same real path (e.g. ROCM_PATH and /opt/rocm);
# keep each prefix once. Guarded because the list is undefined when nothing exists.
if(DEFINED _RCCL_PATHS)
    list(REMOVE_DUPLICATES _RCCL_PATHS)
endif()

# ----------------------------------------------------------------------------------------#

# Silently look for an installed rccl package configuration file, using the collected
# prefixes both as hints and as search paths, with rccl/lib/cmake as an extra suffix.
find_package(rccl QUIET CONFIG
    HINTS ${_RCCL_PATHS}
    PATHS ${_RCCL_PATHS}
    PATH_SUFFIXES rccl/lib/cmake)

# Seed the include-directory cache entry: take rccl_INCLUDE_DIR when the rccl package
# was found (presumably provided by that package's config -- verify), otherwise use the
# internal directory. No FORCE here, so a value already in the cache is preserved.
if(rccl_FOUND)
    set(RCCL-Headers_INCLUDE_DIR "${rccl_INCLUDE_DIR}" CACHE PATH "Path to RCCL headers")
else()
    set(RCCL-Headers_INCLUDE_DIR "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" CACHE PATH
                                                                         "Path to RCCL headers")
endif()

# Select the expected header layout. Starting with ROCm v5.2.0 a warning is triggered
# by the old include form, so the header is looked up as "rccl/rccl.h" with no extra
# directory suffix. Only a known ROCm version below 5.2.0 (numeric value < 50200) uses
# the bare "rccl.h" form together with a "/rccl" directory suffix. An unset (or zero)
# version gets the same layout as >= 5.2.0.
set(_RCCL-Headers_FILE "rccl/rccl.h")
set(_RCCL-Headers_DIR "")
if(ROCmVersion_NUMERIC_VERSION AND ROCmVersion_NUMERIC_VERSION LESS 50200)
    set(_RCCL-Headers_FILE "rccl.h")
    set(_RCCL-Headers_DIR "/rccl")
endif()

# Verify that the expected header is present in the selected include directory. When it
# is missing, warn and overwrite (FORCE) the cache entry with the internal include
# directory plus the layout-specific suffix.
if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}")
    # Compute the replacement once so the warning reports exactly the directory that is
    # written to the cache (the suffix is non-empty for ROCm versions below 5.2.0).
    set(_RCCL-Headers_FALLBACK_DIR
        "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}")
    omnitrace_message(
        AUTHOR_WARNING
        "RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${_RCCL-Headers_FALLBACK_DIR}"
    )
    set(RCCL-Headers_INCLUDE_DIR
        "${_RCCL-Headers_FALLBACK_DIR}"
        CACHE PATH "Path to RCCL headers" FORCE)
    unset(_RCCL-Headers_FALLBACK_DIR)
endif()

# Mark the include-directory cache entry as advanced and drop the temporaries that
# described the header layout.
mark_as_advanced(RCCL-Headers_INCLUDE_DIR)
unset(_RCCL-Headers_DIR)
unset(_RCCL-Headers_FILE)

# ----------------------------------------------------------------------------------------#

# Standard found/not-found handling: RCCL-Headers_INCLUDE_DIR is the only required
# variable and the default failure message is used.
find_package_handle_standard_args(RCCL-Headers REQUIRED_VARS RCCL-Headers_INCLUDE_DIR)

# ------------------------------------------------------------------------------#

# Publish the result: the conventional *_INCLUDE_DIRS variable plus two imported
# interface targets. roc::rccl-headers carries the include directory (as a SYSTEM
# include); RCCL-Headers::RCCL-Headers simply links to it.
if(RCCL-Headers_FOUND)
    set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR})

    # Each target is created only once: add_library() fails if a target with the same
    # name already exists, which would happen when this module is processed a second
    # time in the same directory scope.
    if(NOT TARGET roc::rccl-headers)
        add_library(roc::rccl-headers INTERFACE IMPORTED)
        target_include_directories(roc::rccl-headers SYSTEM
                                   INTERFACE "${RCCL-Headers_INCLUDE_DIR}")
    endif()

    if(NOT TARGET RCCL-Headers::RCCL-Headers)
        add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED)
        target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers)
    endif()
endif()

# ------------------------------------------------------------------------------#
Loading