From bed55ade25fe74a112930ae9c0a0f90f88d71f8e Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 01:53:52 -0500 Subject: [PATCH 01/26] Initial support for RCCL --- CMakeLists.txt | 1 + source/lib/omnitrace/CMakeLists.txt | 5 + source/lib/omnitrace/library.cpp | 6 + .../lib/omnitrace/library/components/fwd.hpp | 2 + source/lib/omnitrace/library/perfetto.hpp | 2 + source/lib/omnitrace/library/rcclp.cpp | 415 ++++++++++++++++++ source/lib/omnitrace/library/rcclp.hpp | 42 ++ 7 files changed, 473 insertions(+) create mode 100644 source/lib/omnitrace/library/rcclp.cpp create mode 100644 source/lib/omnitrace/library/rcclp.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ae15ed8b..8e688513c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support" omnitrace_add_option( OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling" ${OMNITRACE_USE_HIP}) +omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP}) omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON) diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt index bf75fdf2e..8efc80801 100644 --- a/source/lib/omnitrace/CMakeLists.txt +++ b/source/lib/omnitrace/CMakeLists.txt @@ -143,6 +143,11 @@ if(OMNITRACE_USE_ROCTRACER) ${CMAKE_CURRENT_LIST_DIR}/library/roctracer.cpp) endif() +if(OMNITRACE_USE_RCCL) + target_sources(omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/rcclp.cpp) +endif() + if(OMNITRACE_USE_ROCPROFILER) target_sources( omnitrace-object-library diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index c512e0977..7fa6fb394 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -46,6 +46,7 @@ #include "library/thread_data.hpp" #include "library/timemory.hpp" #include "library/tracing.hpp" +#include "library/rcclp.hpp" #include @@ -647,6 +648,9 @@ omnitrace_init_tooling_hidden() ompt::setup(); } + //if(get_use_rccl()) + rcclp::setup(); + if(get_use_perfetto() && !is_system_backend()) { #if defined(CUSTOM_DATA_SOURCE) @@ -840,6 +844,8 @@ omnitrace_finalize_hidden(void) } } + rcclp::shutdown(); + if(get_use_ompt()) { OMNITRACE_VERBOSE_F(1, "Shutting down OMPT...\n"); diff --git a/source/lib/omnitrace/library/components/fwd.hpp b/source/lib/omnitrace/library/components/fwd.hpp index d8e6abd5d..9401bacfa 100644 --- a/source/lib/omnitrace/library/components/fwd.hpp +++ b/source/lib/omnitrace/library/components/fwd.hpp @@ -74,6 +74,7 @@ TIMEMORY_DEFINE_NS_API(category, pthread) TIMEMORY_DEFINE_NS_API(category, kokkos) TIMEMORY_DEFINE_NS_API(category, mpi) TIMEMORY_DEFINE_NS_API(category, ompt) +TIMEMORY_DEFINE_NS_API(category, rccl) TIMEMORY_DEFINE_NS_API(category, critical_trace) TIMEMORY_DEFINE_NS_API(category, host_critical_trace) TIMEMORY_DEFINE_NS_API(category, device_critical_trace) @@ -93,6 +94,7 @@ TIMEMORY_DEFINE_NAME_TRAIT("pthread", category::pthread); TIMEMORY_DEFINE_NAME_TRAIT("kokkos", category::kokkos); TIMEMORY_DEFINE_NAME_TRAIT("mpi", category::mpi); TIMEMORY_DEFINE_NAME_TRAIT("ompt", category::ompt); +TIMEMORY_DEFINE_NAME_TRAIT("rccl", category::rccl); TIMEMORY_DEFINE_NAME_TRAIT("critical-trace", category::critical_trace); TIMEMORY_DEFINE_NAME_TRAIT("host-critical-trace", category::host_critical_trace); TIMEMORY_DEFINE_NAME_TRAIT("device-critical-trace", category::device_critical_trace); diff --git a/source/lib/omnitrace/library/perfetto.hpp b/source/lib/omnitrace/library/perfetto.hpp index c237c3df8..f9d7c81b0 100644 --- a/source/lib/omnitrace/library/perfetto.hpp +++ b/source/lib/omnitrace/library/perfetto.hpp @@ -83,6 +83,8 @@ perfetto::Category("kokkos").SetDescription("Kokkos regions"), \ perfetto::Category("mpi").SetDescription("MPI regions"), \ perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \ + perfetto::Category("rccl").SetDescription( \ + "ROCm Communication Collectives Library (RCCL) regions"), \ perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \ perfetto::Category("host-critical-trace") \ .SetDescription("Host-side critical traces"), \ diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp new file mode 100644 index 000000000..9f409d211 --- /dev/null +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -0,0 +1,415 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/category_region.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include + +#include + +#include +#include +#include +#include +#include + +TIMEMORY_DECLARE_COMPONENT(rccl_comm_data) + +#if !defined(NUM_TIMEMORY_RCCLP_WRAPPERS) +# define NUM_TIMEMORY_RCCLP_WRAPPERS 15 +#endif + +namespace tim +{ +namespace component +{ +template +struct rcclp_handle; +} +} // namespace tim + +struct rcclp_tag +{}; + +using api_t = rcclp_tag; +using rccl_data_tracker_t = tim::component::data_tracker; + +TIMEMORY_STATISTICS_TYPE(rccl_data_tracker_t, float) +TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, rccl_data_tracker_t, true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, rccl_data_tracker_t, true_type) + +using rccl_toolset_t = + tim::component_bundle, + tim::component::rccl_comm_data*>; +using rcclp_handle_t = omnitrace::comp::rcclp_handle; +static uint64_t global_id = std::numeric_limits::max(); +static void* librccl_handle = nullptr; + +namespace tim +{ +namespace component +{ +template +struct rcclp_handle : base, void> +{ + static constexpr size_t rcclp_wrapper_count = NUM_TIMEMORY_RCCLP_WRAPPERS; + + using value_type = void; + using this_type = rcclp_handle; + using base_type = base; + + using string_t = std::string; + using nccl_toolset_t = Toolset; + using rcclp_gotcha_t = + tim::component::gotcha; + using rcclp_tuple_t = tim::component_tuple; + using toolset_ptr_t = std::shared_ptr; + + static string_t label() { return "rcclp_handle"; } + static string_t description() { return "Handle for activating NCCL wrappers"; } + + void get() {} + + void start() + { + if(get_tool_count()++ == 0) + { + get_tool_instance() = std::make_shared("timemory_rcclp"); + get_tool_instance()->start(); + } + } + + void stop() + { + auto idx = --get_tool_count(); + if(get_tool_instance().get()) + { + get_tool_instance()->stop(); + if(idx == 0) get_tool_instance().reset(); + } + } + + int get_count() { return get_tool_count().load(); } + +private: + struct persistent_data + { + std::atomic m_configured; + std::atomic m_count; + toolset_ptr_t m_tool; + }; + + static persistent_data& get_persistent_data() + { + static persistent_data _instance; + return _instance; + } + + static std::atomic& get_configured() + { + return get_persistent_data().m_configured; + } + + static toolset_ptr_t& get_tool_instance() { return get_persistent_data().m_tool; } + + static std::atomic& get_tool_count() + { + return get_persistent_data().m_count; + } +}; + +template +static uint64_t +activate_rcclp() +{ + using handle_t = tim::component::rcclp_handle; + + static std::shared_ptr _handle; + + if(!_handle.get()) + { + _handle = std::make_shared(); + _handle->start(); + + auto cleanup_functor = [=]() { + if(_handle) + { + _handle->stop(); + _handle.reset(); + } + }; + + std::stringstream ss; + ss << "timemory-rcclp-" << typeid(Toolset).name() << "-" << typeid(Tag).name(); + tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); + return 1; + } + return 0; +} +// +//======================================================================================// +// +/// \fn uint64_t tim::component::deactivate_rcclp(uint64_t id) +/// \brief The thread that created the initial rcclp handle will turn off. Returns +/// the number of handles active +/// +template +static uint64_t +deactivate_rcclp(uint64_t id) +{ + if(id > 0) + { + std::stringstream ss; + ss << "timemory-rcclp-" << typeid(Toolset).name() << "-" << typeid(Tag).name(); + tim::manager::instance()->cleanup(ss.str()); + return 0; + } + return 1; +} + +// +template +void +configure_rcclp(const std::set& permit = {}, + const std::set& reject = {}) +{ + static constexpr size_t rcclp_wrapper_count = NUM_TIMEMORY_RCCLP_WRAPPERS; + + using string_t = std::string; + using rcclp_gotcha_t = tim::component::gotcha; + + static bool is_initialized = false; + if(!is_initialized) + { + // generate the gotcha wrappers + rcclp_gotcha_t::get_initializer() = []() { + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclReduce); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclBcast); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclBroadcast); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclAllReduce); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclReduceScatter); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclAllGather); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclGroupStart); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclGroupEnd); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclSend); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclRecv); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclCommCount); + }; + + // provide environment variable for suppressing wrappers + rcclp_gotcha_t::get_reject_list() = [reject]() { + auto _reject = reject; + // check environment + auto reject_list = tim::get_env("OMNITRACE_RCCLP_REJECT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(reject_list)) + _reject.insert(itr); + return _reject; + }; + + // provide environment variable for selecting wrappers + rcclp_gotcha_t::get_permit_list() = [permit]() { + auto _permit = permit; + // check environment + auto permit_list = tim::get_env("OMNITRACE_RCCLP_PERMIT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(permit_list)) + _permit.insert(itr); + return _permit; + }; + + is_initialized = true; + } +} +} // namespace component +} // namespace tim + +namespace omnitrace +{ +namespace rcclp +{ +void +configure() +{ + rccl_data_tracker_t::label() = "rccl_comm_data"; + rccl_data_tracker_t::description() = "Tracks RCCL communication data"; +} + +void +setup() +{ + configure(); + + // make sure the symbols are loaded to be wrapped + auto libpath = tim::get_env("OMNITRACE_RCCL_LIBRARY", "librccl.so"); + librccl_handle = dlopen(libpath.c_str(), RTLD_NOW | RTLD_GLOBAL); + if(!librccl_handle) fprintf(stderr, "%s\n", dlerror()); + dlerror(); // Clear any existing error + + auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true); + if(_data) + rccl_toolset_t::get_initializer() = [](rccl_toolset_t& cb) { + cb.initialize(); + }; + + comp::configure_rcclp(); + global_id = comp::activate_rcclp(); + if(librccl_handle) dlclose(librccl_handle); +} + +void +shutdown() +{ + if(global_id < std::numeric_limits::max()) + comp::deactivate_rcclp(global_id); +} +} // namespace rcclp +} // namespace omnitrace +// +//--------------------------------------------------------------------------------------// +// +namespace tim +{ +namespace component +{ +// +//--------------------------------------------------------------------------------------// +// +struct rccl_comm_data : base +{ + using value_type = void; + using this_type = rccl_comm_data; + using base_type = base; + using tracker_t = tim::auto_tuple; + using data_type = float; + + TIMEMORY_DEFAULT_OBJECT(rccl_comm_data) + + static void preinit() { omnitrace::rcclp::configure(); } + + void start() {} + void stop() {} + + static auto rccl_type_size(ncclDataType_t datatype) + { + switch(datatype) + { + case ncclInt8: + case ncclUint8: return 1; + case ncclFloat16: return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: return 8; + default: return 0; + }; + } + + // ncclReduce + void audit(const std::string& _name, const void*, void*, size_t count, + ncclDataType_t datatype, ncclRedOp_t, int root, ncclComm_t, hipStream_t) + { + int size = rccl_type_size(datatype); + add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", root)); + } + + // ncclSend + void audit(const std::string& _name, const void*, size_t count, + ncclDataType_t datatype, int peer, ncclComm_t, hipStream_t) + { + int size = rccl_type_size(datatype); + add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", peer)); + } + + // ncclBcast + // ncclRecv + void audit(const std::string& _name, void*, size_t count, ncclDataType_t datatype, + int root, ncclComm_t, hipStream_t) + { + int size = rccl_type_size(datatype); + add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", root)); + } + + // ncclBroadcast + void audit(const std::string& _name, const void*, void*, size_t count, + ncclDataType_t datatype, int root, ncclComm_t, hipStream_t) + { + int size = rccl_type_size(datatype); + add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", root)); + } + + // ncclAllReduce + // ncclReduceScatter + void audit(const std::string& _name, const void*, void*, size_t count, + ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, hipStream_t) + { + int size = rccl_type_size(datatype); + add(_name, count * size); + } + + // ncclAllGather + void audit(const std::string& _name, const void*, void*, size_t count, + ncclDataType_t datatype, ncclComm_t, hipStream_t) + { + int size = rccl_type_size(datatype); + add(_name, count * size); + } + +private: + template + void add(tracker_t& _t, data_type value, Args&&... args) + { + _t.store(std::plus{}, value); + TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward(args), value)); + } + + template + void add(const std::string& _name, data_type value, Args&&... args) + { + tracker_t _t(_name); + add(_t, value, std::forward(args)...); + } + + template + void add_secondary(tracker_t&, const std::string& _name, data_type value, + Args&&... args) + { + // if(tim::settings::add_secondary()) + { + tracker_t _s(_name); + add(_s, value, std::forward(args)...); + } + } +}; +} // namespace component +} // namespace tim + +TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t) diff --git a/source/lib/omnitrace/library/rcclp.hpp b/source/lib/omnitrace/library/rcclp.hpp new file mode 100644 index 000000000..f9844ef0e --- /dev/null +++ b/source/lib/omnitrace/library/rcclp.hpp @@ -0,0 +1,42 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "library/defines.hpp" + +namespace omnitrace +{ +namespace rcclp +{ +void +configure(); + +void +setup(); + +void +shutdown(); +} // namespace rcclp +} // namespace omnitrace From 8099c48f5ff61c439946d7a9c6003145214bd0ed Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 08:24:00 -0500 Subject: [PATCH 02/26] OMNITRACE_USE_RCCLP + sampling tweaks - also OMNITRACE_SAMPLING_KEEP_INTERNAL option - minor modifications to sampling to use keep internal option + discard funlockfile --- source/lib/omnitrace/library.cpp | 14 +++++++++--- source/lib/omnitrace/library/config.cpp | 29 +++++++++++++++++++++++++ source/lib/omnitrace/library/config.hpp | 6 +++++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 7fa6fb394..6b198d5ff 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -41,6 +41,7 @@ #include "library/ompt.hpp" #include "library/process_sampler.hpp" #include "library/ptl.hpp" +#include "library/rcclp.hpp" #include "library/rocprofiler.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" @@ -648,8 +649,11 @@ omnitrace_init_tooling_hidden() ompt::setup(); } - //if(get_use_rccl()) - rcclp::setup(); + if(get_use_rcclp()) + { + OMNITRACE_VERBOSE_F(1, "Setting up RCCLP...\n"); + rcclp::setup(); + } if(get_use_perfetto() && !is_system_backend()) { @@ -844,7 +848,11 @@ omnitrace_finalize_hidden(void) } } - rcclp::shutdown(); + if(get_use_rcclp()) + { + OMNITRACE_VERBOSE_F(1, "Shutting down RCCLP...\n"); + rcclp::shutdown(); + } if(get_use_ompt()) { diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 1fa362fa4..202f40a27 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -270,6 +270,11 @@ configure_settings(bool _init) "Enable support for Kokkos Tools", false, "kokkos", "backend"); + OMNITRACE_CONFIG_SETTING( + bool, "OMNITRACE_USE_RCCLP", + "Enable support for ROCm Communication Collectives Library (RCCL) Performance", + false, "rocm", "rccl", "backend"); + OMNITRACE_CONFIG_CL_SETTING( bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false, "--omnitrace-kokkos-kernel-logger", "kokkos", "debugging"); @@ -323,6 +328,13 @@ configure_settings(bool _init) "'all' and 'none' suppresses all GPU sampling", std::string{ "all" }, "rocm_smi", "rocm", "process_sampling"); + OMNITRACE_CONFIG_SETTING( + bool, "OMNITRACE_SAMPLING_KEEP_INTERNAL", + "Configure whether the statistical samples should include call-stack entries " + "from internal routines in omnitrace. E.g. when ON, the call-stack will show " + "functions like omnitrace_push_trace", + true, "sampling", "thread_sampling", "data"); + auto _backend = tim::get_env_choice( "OMNITRACE_PERFETTO_BACKEND", (_system_backend) ? "system" // if OMNITRACE_PERFETTO_BACKEND_SYSTEM is true, @@ -666,6 +678,7 @@ configure_mode_settings() _set("OMNITRACE_USE_ROCTRACER", false); _set("OMNITRACE_USE_ROCPROFILER", false); _set("OMNITRACE_USE_KOKKOSP", false); + _set("OMNITRACE_USE_RCCLP", false); _set("OMNITRACE_USE_OMPT", false); _set("OMNITRACE_USE_SAMPLING", false); _set("OMNITRACE_USE_PROCESS_SAMPLING", false); @@ -721,6 +734,7 @@ configure_mode_settings() _set("OMNITRACE_USE_ROCTRACER", false); _set("OMNITRACE_USE_ROCPROFILER", false); _set("OMNITRACE_USE_KOKKOSP", false); + _set("OMNITRACE_USE_RCCLP", false); _set("OMNITRACE_USE_OMPT", false); _set("OMNITRACE_USE_SAMPLING", false); _set("OMNITRACE_USE_PROCESS_SAMPLING", false); @@ -817,6 +831,7 @@ configure_disabled_settings() _handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto"); _handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory"); _handle_use_option("OMNITRACE_USE_OMPT", "ompt"); + _handle_use_option("OMNITRACE_USE_RCCLP", "rcclp"); _handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi"); _handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer"); _handle_use_option("OMNITRACE_USE_ROCPROFILER", "rocprofiler"); @@ -1355,6 +1370,13 @@ get_use_code_coverage() return static_cast&>(*_v->second).get(); } +bool +get_use_rcclp() +{ + static auto _v = get_config()->find("OMNITRACE_USE_RCCLP"); + return static_cast&>(*_v->second).get(); +} + bool get_critical_trace_debug() { @@ -1607,6 +1629,13 @@ get_sampling_gpus() #endif } +bool +get_sampling_keep_internal() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_KEEP_INTERNAL"); + return static_cast&>(*_v->second).get(); +} + bool get_trace_thread_locks() { diff --git a/source/lib/omnitrace/library/config.hpp b/source/lib/omnitrace/library/config.hpp index 3b0d2bbe2..2e0bb0700 100644 --- a/source/lib/omnitrace/library/config.hpp +++ b/source/lib/omnitrace/library/config.hpp @@ -216,6 +216,9 @@ get_use_sampling_cputime(); int get_sampling_rtoffset(); +bool +get_use_rcclp(); + bool get_timeline_sampling(); @@ -292,6 +295,9 @@ get_process_sampling_freq(); std::string get_sampling_gpus(); +bool +get_sampling_keep_internal(); + int64_t get_critical_trace_per_row(); From 2d61e182ccb1a4c8725dc03d17305719e1314a49 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 16:00:22 -0500 Subject: [PATCH 03/26] Update docker and workflows to download RCCL --- .github/workflows/ubuntu-focal.yml | 2 +- docker/Dockerfile.centos | 2 +- docker/Dockerfile.opensuse | 2 +- docker/Dockerfile.ubuntu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 56327d0ba..abb8de80f 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -194,7 +194,7 @@ jobs: wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list && apt-get update && - apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl && + apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl && python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' && for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done diff --git a/docker/Dockerfile.centos b/docker/Dockerfile.centos index 4889f4be2..bf2611339 100644 --- a/docker/Dockerfile.centos +++ b/docker/Dockerfile.centos @@ -24,7 +24,7 @@ ARG AMDGPU_RPM=21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm RUN yum install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \ amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms --skip-broken -y && \ - yum install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \ + yum install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \ yum update -y && \ yum clean all diff --git a/docker/Dockerfile.opensuse b/docker/Dockerfile.opensuse index 81c138607..d6a6e8278 100644 --- a/docker/Dockerfile.opensuse +++ b/docker/Dockerfile.opensuse @@ -25,7 +25,7 @@ RUN zypper --no-gpg-checks install -y https://repo.radeon.com/amdgpu-install/${A zypper addrepo https://download.opensuse.org/repositories/devel:languages:perl/SLE_15/devel:languages:perl.repo && \ zypper --non-interactive --gpg-auto-import-keys refresh && \ amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms -y && \ - zypper install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \ + zypper install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \ zypper clean --all ARG PYTHON_VERSIONS="6 7 8 9 10" diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index d822cddb6..2fedac199 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -28,7 +28,7 @@ RUN apt-get update && \ echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_REPO_VERSION}/ ${ROCM_REPO_DIST} main" | tee /etc/apt/sources.list.d/rocm.list && \ apt-get update && \ apt-get dist-upgrade -y && \ - apt-get install -y rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \ + apt-get install -y rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \ apt-get autoclean RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ From 5450b24a42f0c04b3612ed849c7565acdbc45f12 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 16:01:05 -0500 Subject: [PATCH 04/26] Update CPack DEB with rocprofiler dependency --- cmake/ConfigCPack.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/ConfigCPack.cmake b/cmake/ConfigCPack.cmake index 15f16e742..4aa3bdd2c 100644 --- a/cmake/ConfigCPack.cmake +++ b/cmake/ConfigCPack.cmake @@ -157,6 +157,7 @@ if(NOT OMNITRACE_BUILD_DYNINST) endif() endif() if(ROCmVersion_FOUND) + set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})") set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})") set(_ROCM_SMI_SUFFIX " (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})") @@ -167,6 +168,9 @@ endif() if(OMNITRACE_USE_ROCTRACER) list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}") endif() +if(OMNITRACE_USE_ROCPROFILER) + list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}") +endif() if(OMNITRACE_USE_MPI) if("${OMNITRACE_MPI_IMPL}" STREQUAL "openmpi") list(APPEND _DEBIAN_PACKAGE_DEPENDS "libopenmpi-dev") From 13f5a715f65bc2dcca58d95cef0721bb99cfe670 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 16:01:59 -0500 Subject: [PATCH 05/26] Rework rccl into library and library/components folder - add tpls/rccl/rccl/rccl.h --- cmake/Modules/FindRCCL-Headers.cmake | 77 +++ cmake/Packages.cmake | 14 + source/lib/omnitrace/CMakeLists.txt | 9 +- source/lib/omnitrace/library.cpp | 1 - source/lib/omnitrace/library/common.hpp | 1 + .../lib/omnitrace/library/components/fwd.hpp | 13 + .../omnitrace/library/components/rcclp.cpp | 269 +++++++++ .../omnitrace/library/components/rcclp.hpp | 216 ++++++++ source/lib/omnitrace/library/rcclp.cpp | 347 +----------- .../omnitrace/library/tpls/rccl/rccl/rccl.h | 522 ++++++++++++++++++ 10 files changed, 1127 insertions(+), 342 deletions(-) create mode 100644 cmake/Modules/FindRCCL-Headers.cmake create mode 100644 source/lib/omnitrace/library/components/rcclp.cpp create mode 100644 source/lib/omnitrace/library/components/rcclp.hpp create mode 100644 source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h diff --git a/cmake/Modules/FindRCCL-Headers.cmake b/cmake/Modules/FindRCCL-Headers.cmake new file mode 100644 index 000000000..89d92bb2c --- /dev/null +++ b/cmake/Modules/FindRCCL-Headers.cmake @@ -0,0 +1,77 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file +# Copyright.txt or https://cmake.org/licensing for details. + +include(FindPackageHandleStandardArgs) + +# ----------------------------------------------------------------------------------------# + +set(RCCL-Headers_INCLUDE_DIR_INTERNAL + "${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/rccl" + CACHE PATH "Path to internal rccl.h") + +# ----------------------------------------------------------------------------------------# + +if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "") + set(ROCM_PATH "$ENV{ROCM_PATH}") +endif() + +foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl) + if(EXISTS ${_DIR}) + get_filename_component(_ABS_DIR "${_DIR}" REALPATH) + list(APPEND _RCCL_PATHS ${_ABS_DIR}) + endif() +endforeach() + +# ----------------------------------------------------------------------------------------# + +find_package( + rccl + QUIET + CONFIG + HINTS + ${_RCCL_PATHS} + PATHS + ${_RCCL_PATHS} + PATH_SUFFIXES + rccl/lib/cmake) + +if(NOT rccl_FOUND) + set(RCCL-Headers_INCLUDE_DIR + "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + CACHE PATH "Path to RCCL headers") +else() + set(RCCL-Headers_INCLUDE_DIR + "${rccl_INCLUDE_DIR}" + CACHE PATH "Path to RCCL headers") +endif() + +if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/rccl/rccl.h") + omnitrace_message( + AUTHOR_WARNING + "RCCL header (${RCCL-Headers_INCLUDE_DIR}/rccl/rccl.h) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + ) + set(RCCL-Headers_INCLUDE_DIR + "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + CACHE PATH "Path to RCCL headers" FORCE) +endif() + +mark_as_advanced(RCCL-Headers_INCLUDE_DIR) + +# ----------------------------------------------------------------------------------------# + +find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR) + +# ------------------------------------------------------------------------------# + +if(RCCL-Headers_FOUND) + add_library(roc::rccl-headers INTERFACE IMPORTED) + set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR}) + + target_include_directories(roc::rccl-headers SYSTEM + INTERFACE ${RCCL-Headers_INCLUDE_DIR}) + + add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED) + target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers) +endif() + +# ------------------------------------------------------------------------------# diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index 72856d79b..54dbb381a 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -20,6 +20,8 @@ omnitrace_add_interface_library(omnitrace-rocprofiler "Provides flags and libraries for rocprofiler") omnitrace_add_interface_library(omnitrace-rocm-smi "Provides flags and libraries for rocm-smi") +omnitrace_add_interface_library( + omnitrace-rccl "Provides flags for ROCm Communication Collectives Library (RCCL)") omnitrace_add_interface_library(omnitrace-mpi "Provides MPI or MPI headers") omnitrace_add_interface_library(omnitrace-ptl "Enables PTL support (tasking)") omnitrace_add_interface_library(omnitrace-papi "Enable PAPI support") @@ -37,6 +39,7 @@ set(OMNITRACE_EXTENSION_LIBRARIES omnitrace::omnitrace-roctracer omnitrace::omnitrace-rocprofiler omnitrace::omnitrace-rocm-smi + omnitrace::omnitrace-rccl omnitrace::omnitrace-mpi omnitrace::omnitrace-ptl omnitrace::omnitrace-ompt @@ -196,6 +199,17 @@ if(OMNITRACE_USE_ROCM_SMI) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${rocm-smi_LIBRARY_DIRS}") endif() +# ----------------------------------------------------------------------------------------# +# +# RCCL +# +# ----------------------------------------------------------------------------------------# +if(OMNITRACE_USE_RCCL) + find_package(RCCL-Headers ${omnitrace_FIND_QUIETLY} REQUIRED) + target_link_libraries(omnitrace-rccl INTERFACE roc::rccl-headers) + omnitrace_target_compile_definitions(omnitrace-rccl INTERFACE OMNITRACE_USE_RCCL) +endif() + # ----------------------------------------------------------------------------------------# # # MPI diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt index 8efc80801..f08a1279e 100644 --- a/source/lib/omnitrace/CMakeLists.txt +++ b/source/lib/omnitrace/CMakeLists.txt @@ -34,6 +34,7 @@ target_link_libraries( $ $ $ + $ $,omnitrace::omnitrace-lto,>> $,omnitrace::omnitrace-static-libgcc,>> $,omnitrace::omnitrace-static-libstdcxx,>> @@ -100,6 +101,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp ${CMAKE_CURRENT_LIST_DIR}/library/process_sampler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/library/rocm.hpp ${CMAKE_CURRENT_LIST_DIR}/library/rocprofiler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/roctracer.hpp @@ -118,6 +120,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/rocprofiler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp @@ -144,8 +147,10 @@ if(OMNITRACE_USE_ROCTRACER) endif() if(OMNITRACE_USE_RCCL) - target_sources(omnitrace-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/rcclp.cpp) + target_sources( + omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/rcclp.cpp) endif() if(OMNITRACE_USE_ROCPROFILER) diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 6b198d5ff..7aec59d75 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -47,7 +47,6 @@ #include "library/thread_data.hpp" #include "library/timemory.hpp" #include "library/tracing.hpp" -#include "library/rcclp.hpp" #include diff --git a/source/lib/omnitrace/library/common.hpp b/source/lib/omnitrace/library/common.hpp index 64b13e818..1cc7832e8 100644 --- a/source/lib/omnitrace/library/common.hpp +++ b/source/lib/omnitrace/library/common.hpp @@ -47,6 +47,7 @@ TIMEMORY_DEFINE_NS_API(api, omnitrace) TIMEMORY_DEFINE_NS_API(api, sampling) TIMEMORY_DEFINE_NS_API(api, rocm_smi) +TIMEMORY_DEFINE_NS_API(api, rccl) namespace omnitrace { diff --git a/source/lib/omnitrace/library/components/fwd.hpp b/source/lib/omnitrace/library/components/fwd.hpp index 9401bacfa..9b25cbeea 100644 --- a/source/lib/omnitrace/library/components/fwd.hpp +++ b/source/lib/omnitrace/library/components/fwd.hpp @@ -22,7 +22,9 @@ #pragma once +#include "library/common.hpp" #include "library/defines.hpp" +#include "timemory/mpl/types.hpp" #include #include @@ -40,6 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling) TIMEMORY_DECLARE_COMPONENT(roctracer) TIMEMORY_DECLARE_COMPONENT(rocprofiler) +TIMEMORY_DECLARE_COMPONENT(rccl_comm_data) +TIMEMORY_DECLARE_COMPONENT(rcclp_handle) +TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl) +TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker) /// \struct tim::trait::name /// \brief provides a constexpr string in ::value @@ -152,6 +158,13 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type) #endif +#if !defined(OMNITRACE_USE_RCCL) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type) +#endif + #if !defined(TIMEMORY_USE_LIBUNWIND) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type) diff --git a/source/lib/omnitrace/library/components/rcclp.cpp b/source/lib/omnitrace/library/components/rcclp.cpp new file mode 100644 index 000000000..24dee893f --- /dev/null +++ b/source/lib/omnitrace/library/components/rcclp.cpp @@ -0,0 +1,269 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/rcclp.hpp" +#include "library/rcclp.hpp" + +#include + +std::ostream& +operator<<(std::ostream& _os, const ncclUniqueId& _v) +{ + for(auto itr : _v.internal) + _os << itr; + return _os; +} + +namespace tim +{ +namespace component +{ +uint64_t +activate_rcclp() +{ + using handle_t = tim::component::rcclp_handle; + + static auto _handle = std::shared_ptr{}; + + if(!_handle.get()) + { + _handle = std::make_shared(); + _handle->start(); + + auto cleanup_functor = [=]() { + if(_handle) + { + _handle->stop(); + _handle.reset(); + } + }; + + std::stringstream ss; + ss << "timemory-rcclp-" << demangle() << "-" + << demangle(); + tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); + return 1; + } + return 0; +} +// +//======================================================================================// +// +uint64_t +deactivate_rcclp(uint64_t id) +{ + if(id > 0) + { + std::stringstream ss; + ss << "timemory-rcclp-" << demangle() << "-" + << demangle(); + tim::manager::instance()->cleanup(ss.str()); + return 0; + } + return 1; +} +// +//======================================================================================// +// +void +configure_rcclp(const std::set& permit, const std::set& reject) +{ + static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS; + + using rcclp_gotcha_t = + tim::component::gotcha; + + static bool is_initialized = false; + if(!is_initialized) + { + // generate the gotcha wrappers + rcclp_gotcha_t::get_initializer() = []() { + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy); + }; + + // provide environment variable for suppressing wrappers + rcclp_gotcha_t::get_reject_list() = [reject]() { + auto _reject = reject; + // check environment + auto reject_list = + tim::get_env("OMNITRACE_RCCLP_REJECT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(reject_list)) + _reject.insert(itr); + return _reject; + }; + + // provide environment variable for selecting wrappers + rcclp_gotcha_t::get_permit_list() = [permit]() { + auto _permit = permit; + // check environment + auto permit_list = + tim::get_env("OMNITRACE_RCCLP_PERMIT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(permit_list)) + _permit.insert(itr); + return _permit; + }; + + is_initialized = true; + } +} + +void +rcclp_handle::start() +{ + if(get_tool_count()++ == 0) + { + get_tool_instance() = std::make_shared("timemory_rcclp"); + get_tool_instance()->start(); + } +} + +void +rcclp_handle::stop() +{ + auto idx = --get_tool_count(); + if(get_tool_instance().get()) + { + get_tool_instance()->stop(); + if(idx == 0) get_tool_instance().reset(); + } +} + +rcclp_handle::persistent_data& +rcclp_handle::get_persistent_data() +{ + static persistent_data _instance; + return _instance; +} + +std::atomic& +rcclp_handle::get_configured() +{ + return get_persistent_data().m_configured; +} + +rcclp_handle::toolset_ptr_t& +rcclp_handle::get_tool_instance() +{ + return get_persistent_data().m_tool; +} + +std::atomic& +rcclp_handle::get_tool_count() +{ + return get_persistent_data().m_count; +} + +void +rccl_comm_data::preinit() +{ + omnitrace::rcclp::configure(); +} + +// ncclReduce +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, + ncclComm_t, hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); +} + +// ncclSend +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, + size_t count, ncclDataType_t datatype, int peer, ncclComm_t, + hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", peer)); +} + +// ncclBcast +// ncclRecv +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count, + ncclDataType_t datatype, int root, ncclComm_t, hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); +} + +// ncclBroadcast +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, int root, ncclComm_t, + hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); +} + +// ncclAllReduce +// ncclReduceScatter +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, + hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size); +} + +// ncclAllGather +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size); +} + +} // namespace component +} // namespace tim + +TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t) diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp new file mode 100644 index 000000000..c80ac2a10 --- /dev/null +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -0,0 +1,216 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/common.hpp" +#include "library/components/category_region.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" +#include "timemory/components/macros.hpp" + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#if !defined(OMNITRACE_NUM_RCCLP_WRAPPERS) +# define OMNITRACE_NUM_RCCLP_WRAPPERS 25 +#endif + +TIMEMORY_COMPONENT_ALIAS( + rccl_toolset_t, + component_bundle, + rccl_comm_data*>) +TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t, + gotcha) + +#if !defined(OMNITRACE_USE_RCCL) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) +#endif + +TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float) +TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t, + true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t, + true_type) + +namespace tim +{ +namespace component +{ +uint64_t +activate_rcclp(); + +uint64_t +deactivate_rcclp(uint64_t id); + +void +configure_rcclp(const std::set& permit = {}, + const std::set& reject = {}); + +struct rcclp_handle : base +{ + static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS; + + using value_type = void; + using this_type = rcclp_handle; + using base_type = base; + + using rcclp_tuple_t = tim::component_tuple; + using toolset_ptr_t = std::shared_ptr; + + static std::string label() { return "rcclp_handle"; } + static std::string description() { return "Handle for activating NCCL wrappers"; } + static void get() {} + static void start(); + static void stop(); + static int get_count() { return get_tool_count().load(); } + +private: + struct persistent_data + { + std::atomic m_configured{ 0 }; + std::atomic m_count{ 0 }; + toolset_ptr_t m_tool = toolset_ptr_t{}; + }; + + static persistent_data& get_persistent_data(); + static std::atomic& get_configured(); + static toolset_ptr_t& get_tool_instance(); + static std::atomic& get_tool_count(); +}; + +struct rccl_comm_data : base +{ + using value_type = void; + using this_type = rccl_comm_data; + using base_type = base; + using tracker_t = tim::auto_tuple; + using data_type = float; + + TIMEMORY_DEFAULT_OBJECT(rccl_comm_data) + + static void preinit(); + static void start() {} + static void stop() {} + + static auto rccl_type_size(ncclDataType_t datatype) + { + switch(datatype) + { + case ncclInt8: + case ncclUint8: return 1; + case ncclFloat16: return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: return 8; + default: return 0; + }; + } + + // ncclReduce + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, + ncclComm_t, hipStream_t); + + // ncclSend + static void audit(const gotcha_data& _data, audit::incoming, const void*, + size_t count, ncclDataType_t datatype, int peer, ncclComm_t, + hipStream_t); + + // ncclBcast + // ncclRecv + static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count, + ncclDataType_t datatype, int root, ncclComm_t, hipStream_t); + + // ncclBroadcast + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, int root, ncclComm_t, + hipStream_t); + + // ncclAllReduce + // ncclReduceScatter + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, + hipStream_t); + + // ncclAllGather + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t); + +private: + template + static void add(tracker_t& _t, data_type value, Args&&... args) + { + _t.store(std::plus{}, value); + TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward(args), value)); + } + + template + static void add(const gotcha_data& _data, data_type value, Args&&... args) + { + tracker_t _t{ std::string_view{ _data.tool_id.c_str() } }; + add(_t, value, std::forward(args)...); + } + + template + static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value, + Args&&... args) + { + // if(tim::settings::add_secondary()) + { + tracker_t _s{ std::string_view{ _data.tool_id.c_str() } }; + add(_s, _data, value, std::forward(args)...); + } + } + + template + static void add(std::string_view _name, data_type value, Args&&... args) + { + tracker_t _t{ _name }; + add(_t, value, std::forward(args)...); + } + + template + static void add_secondary(tracker_t&, std::string_view _name, data_type value, + Args&&... args) + { + // if(tim::settings::add_secondary()) + { + tracker_t _s{ _name }; + add(_s, value, std::forward(args)...); + } + } +}; +} // namespace component +} // namespace tim diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp index 9f409d211..00c6b5d5f 100644 --- a/source/lib/omnitrace/library/rcclp.cpp +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -22,6 +22,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "library/components/rcclp.hpp" #include "library/components/category_region.hpp" #include "library/components/fwd.hpp" #include "library/defines.hpp" @@ -29,7 +30,7 @@ #include -#include +#include #include #include @@ -37,219 +38,9 @@ #include #include -TIMEMORY_DECLARE_COMPONENT(rccl_comm_data) - -#if !defined(NUM_TIMEMORY_RCCLP_WRAPPERS) -# define NUM_TIMEMORY_RCCLP_WRAPPERS 15 -#endif - -namespace tim -{ -namespace component -{ -template -struct rcclp_handle; -} -} // namespace tim - -struct rcclp_tag -{}; - -using api_t = rcclp_tag; -using rccl_data_tracker_t = tim::component::data_tracker; - -TIMEMORY_STATISTICS_TYPE(rccl_data_tracker_t, float) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, rccl_data_tracker_t, true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, rccl_data_tracker_t, true_type) - -using rccl_toolset_t = - tim::component_bundle, - tim::component::rccl_comm_data*>; -using rcclp_handle_t = omnitrace::comp::rcclp_handle; static uint64_t global_id = std::numeric_limits::max(); static void* librccl_handle = nullptr; -namespace tim -{ -namespace component -{ -template -struct rcclp_handle : base, void> -{ - static constexpr size_t rcclp_wrapper_count = NUM_TIMEMORY_RCCLP_WRAPPERS; - - using value_type = void; - using this_type = rcclp_handle; - using base_type = base; - - using string_t = std::string; - using nccl_toolset_t = Toolset; - using rcclp_gotcha_t = - tim::component::gotcha; - using rcclp_tuple_t = tim::component_tuple; - using toolset_ptr_t = std::shared_ptr; - - static string_t label() { return "rcclp_handle"; } - static string_t description() { return "Handle for activating NCCL wrappers"; } - - void get() {} - - void start() - { - if(get_tool_count()++ == 0) - { - get_tool_instance() = std::make_shared("timemory_rcclp"); - get_tool_instance()->start(); - } - } - - void stop() - { - auto idx = --get_tool_count(); - if(get_tool_instance().get()) - { - get_tool_instance()->stop(); - if(idx == 0) get_tool_instance().reset(); - } - } - - int get_count() { return get_tool_count().load(); } - -private: - struct persistent_data - { - std::atomic m_configured; - std::atomic m_count; - toolset_ptr_t m_tool; - }; - - static persistent_data& get_persistent_data() - { - static persistent_data _instance; - return _instance; - } - - static std::atomic& get_configured() - { - return get_persistent_data().m_configured; - } - - static toolset_ptr_t& get_tool_instance() { return get_persistent_data().m_tool; } - - static std::atomic& get_tool_count() - { - return get_persistent_data().m_count; - } -}; - -template -static uint64_t -activate_rcclp() -{ - using handle_t = tim::component::rcclp_handle; - - static std::shared_ptr _handle; - - if(!_handle.get()) - { - _handle = std::make_shared(); - _handle->start(); - - auto cleanup_functor = [=]() { - if(_handle) - { - _handle->stop(); - _handle.reset(); - } - }; - - std::stringstream ss; - ss << "timemory-rcclp-" << typeid(Toolset).name() << "-" << typeid(Tag).name(); - tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); - return 1; - } - return 0; -} -// -//======================================================================================// -// -/// \fn uint64_t tim::component::deactivate_rcclp(uint64_t id) -/// \brief The thread that created the initial rcclp handle will turn off. Returns -/// the number of handles active -/// -template -static uint64_t -deactivate_rcclp(uint64_t id) -{ - if(id > 0) - { - std::stringstream ss; - ss << "timemory-rcclp-" << typeid(Toolset).name() << "-" << typeid(Tag).name(); - tim::manager::instance()->cleanup(ss.str()); - return 0; - } - return 1; -} - -// -template -void -configure_rcclp(const std::set& permit = {}, - const std::set& reject = {}) -{ - static constexpr size_t rcclp_wrapper_count = NUM_TIMEMORY_RCCLP_WRAPPERS; - - using string_t = std::string; - using rcclp_gotcha_t = tim::component::gotcha; - - static bool is_initialized = false; - if(!is_initialized) - { - // generate the gotcha wrappers - rcclp_gotcha_t::get_initializer() = []() { - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclReduce); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclBcast); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclBroadcast); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclAllReduce); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclReduceScatter); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclAllGather); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclGroupStart); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclGroupEnd); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclSend); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclRecv); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclCommCount); - }; - - // provide environment variable for suppressing wrappers - rcclp_gotcha_t::get_reject_list() = [reject]() { - auto _reject = reject; - // check environment - auto reject_list = tim::get_env("OMNITRACE_RCCLP_REJECT_LIST", ""); - // add environment setting - for(const auto& itr : tim::delimit(reject_list)) - _reject.insert(itr); - return _reject; - }; - - // provide environment variable for selecting wrappers - rcclp_gotcha_t::get_permit_list() = [permit]() { - auto _permit = permit; - // check environment - auto permit_list = tim::get_env("OMNITRACE_RCCLP_PERMIT_LIST", ""); - // add environment setting - for(const auto& itr : tim::delimit(permit_list)) - _permit.insert(itr); - return _permit; - }; - - is_initialized = true; - } -} -} // namespace component -} // namespace tim - namespace omnitrace { namespace rcclp @@ -257,8 +48,8 @@ namespace rcclp void configure() { - rccl_data_tracker_t::label() = "rccl_comm_data"; - rccl_data_tracker_t::description() = "Tracks RCCL communication data"; + comp::rccl_data_tracker_t::label() = "rccl_comm_data"; + comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data"; } void @@ -274,12 +65,12 @@ setup() auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true); if(_data) - rccl_toolset_t::get_initializer() = [](rccl_toolset_t& cb) { + comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) { cb.initialize(); }; - comp::configure_rcclp(); - global_id = comp::activate_rcclp(); + comp::configure_rcclp(); + global_id = comp::activate_rcclp(); if(librccl_handle) dlclose(librccl_handle); } @@ -287,129 +78,7 @@ void shutdown() { if(global_id < std::numeric_limits::max()) - comp::deactivate_rcclp(global_id); + comp::deactivate_rcclp(global_id); } } // namespace rcclp } // namespace omnitrace -// -//--------------------------------------------------------------------------------------// -// -namespace tim -{ -namespace component -{ -// -//--------------------------------------------------------------------------------------// -// -struct rccl_comm_data : base -{ - using value_type = void; - using this_type = rccl_comm_data; - using base_type = base; - using tracker_t = tim::auto_tuple; - using data_type = float; - - TIMEMORY_DEFAULT_OBJECT(rccl_comm_data) - - static void preinit() { omnitrace::rcclp::configure(); } - - void start() {} - void stop() {} - - static auto rccl_type_size(ncclDataType_t datatype) - { - switch(datatype) - { - case ncclInt8: - case ncclUint8: return 1; - case ncclFloat16: return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: return 8; - default: return 0; - }; - } - - // ncclReduce - void audit(const std::string& _name, const void*, void*, size_t count, - ncclDataType_t datatype, ncclRedOp_t, int root, ncclComm_t, hipStream_t) - { - int size = rccl_type_size(datatype); - add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", root)); - } - - // ncclSend - void audit(const std::string& _name, const void*, size_t count, - ncclDataType_t datatype, int peer, ncclComm_t, hipStream_t) - { - int size = rccl_type_size(datatype); - add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", peer)); - } - - // ncclBcast - // ncclRecv - void audit(const std::string& _name, void*, size_t count, ncclDataType_t datatype, - int root, ncclComm_t, hipStream_t) - { - int size = rccl_type_size(datatype); - add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", root)); - } - - // ncclBroadcast - void audit(const std::string& _name, const void*, void*, size_t count, - ncclDataType_t datatype, int root, ncclComm_t, hipStream_t) - { - int size = rccl_type_size(datatype); - add(_name, count * size, TIMEMORY_JOIN('_', _name, "root", root)); - } - - // ncclAllReduce - // ncclReduceScatter - void audit(const std::string& _name, const void*, void*, size_t count, - ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, hipStream_t) - { - int size = rccl_type_size(datatype); - add(_name, count * size); - } - - // ncclAllGather - void audit(const std::string& _name, const void*, void*, size_t count, - ncclDataType_t datatype, ncclComm_t, hipStream_t) - { - int size = rccl_type_size(datatype); - add(_name, count * size); - } - -private: - template - void add(tracker_t& _t, data_type value, Args&&... args) - { - _t.store(std::plus{}, value); - TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward(args), value)); - } - - template - void add(const std::string& _name, data_type value, Args&&... args) - { - tracker_t _t(_name); - add(_t, value, std::forward(args)...); - } - - template - void add_secondary(tracker_t&, const std::string& _name, data_type value, - Args&&... args) - { - // if(tim::settings::add_secondary()) - { - tracker_t _s(_name); - add(_s, value, std::forward(args)...); - } - } -}; -} // namespace component -} // namespace tim - -TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t) diff --git a/source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h b/source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h new file mode 100644 index 000000000..5fb23b1ab --- /dev/null +++ b/source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h @@ -0,0 +1,522 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_H_ +#define NCCL_H_ + +#include +#include + +#define NCCL_MAJOR 2 +#define NCCL_MINOR 11 +#define NCCL_PATCH 4 +#define NCCL_SUFFIX "" + +#define NCCL_VERSION_CODE 21104 +#define NCCL_VERSION(X, Y, Z) \ + (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z)) + +#define RCCL_BFLOAT16 1 +#define RCCL_GATHER_SCATTER 1 +#define RCCL_ALLTOALLV 1 + +#ifdef __cplusplus +extern "C" +{ +#endif + + /*! @brief Opaque handle to communicator */ + typedef struct ncclComm* ncclComm_t; + +#define NCCL_UNIQUE_ID_BYTES 128 + typedef struct + { + char internal[NCCL_UNIQUE_ID_BYTES]; + } ncclUniqueId; + + /*! @brief Error type */ + typedef enum + { + ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclNumResults = 6 + } ncclResult_t; + + /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * + * @details This integer is coded with the MAJOR, MINOR and PATCH level of the + * NCCL library + */ + ncclResult_t ncclGetVersion(int* version); + /// @cond include_hidden + ncclResult_t pncclGetVersion(int* version); + /// @endcond + + /*! @brief Generates an ID for ncclCommInitRank + + @details + Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be + called once and the Id should be distributed to all ranks in the + communicator before calling ncclCommInitRank. + + @param[in] + uniqueId ncclUniqueId* + pointer to uniqueId + + */ + ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); + /// @cond include_hidden + ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); + /// @endcond + + /*! @brief Creates a new communicator (multi thread/process version). + + @details + rank must be between 0 and nranks-1 and unique within a communicator clique. + Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. + ncclCommInitRank implicitly syncronizes with other ranks, so it must be + called by different threads/processes or use ncclGroupStart/ncclGroupEnd. + + @param[in] + comm ncclComm_t* + communicator struct pointer + */ + ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank); + /// @cond include_hidden + ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank); + /// @endcond + + /*! @brief Creates a clique of communicators (single process version). + * + * @details This is a convenience function to create a single-process communicator + * clique. Returns an array of ndev newly initialized communicators in comm. comm + * should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is + * NULL, the first ndev HIP devices are used. Order of devlist defines user-order of + * processors within the communicator. + * */ + ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + /// @cond include_hidden + ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + /// @endcond + + /*! @brief Frees resources associated with communicator object, but waits for any + * operations that might still be running on the device */ + ncclResult_t ncclCommDestroy(ncclComm_t comm); + /// @cond include_hidden + ncclResult_t pncclCommDestroy(ncclComm_t comm); + /// @endcond + + /*! @brief Frees resources associated with communicator object and aborts any + * operations that might still be running on the device. */ + ncclResult_t ncclCommAbort(ncclComm_t comm); + /// @cond include_hidden + ncclResult_t pncclCommAbort(ncclComm_t comm); + /// @endcond + + /*! @brief Returns a human-readable error message. */ + const char* ncclGetErrorString(ncclResult_t result); + const char* pncclGetErrorString(ncclResult_t result); + + /*! @brief Checks whether the comm has encountered any asynchronous errors */ + ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); + /// @cond include_hidden + ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); + /// @endcond + + /*! @brief Gets the number of ranks in the communicator clique. */ + ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); + /// @cond include_hidden + ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); + /// @endcond + + /*! @brief Returns the rocm device number associated with the communicator. */ + ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); + /// @cond include_hidden + ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + /// @endcond + + /*! @brief Returns the user-ordered "rank" associated with the communicator. */ + ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); + /// @cond include_hidden + ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + /// @endcond + + /*! @brief Reduction operation selector */ + /* Reduction operation selector */ + typedef enum + { + ncclNumOps_dummy = 5 + } ncclRedOp_dummy_t; + typedef enum + { + ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. */ + ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) + } ncclRedOp_t; + + /*! @brief Data types */ + typedef enum + { + ncclInt8 = 0, + ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, + ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, + ncclHalf = 6, + ncclFloat32 = 7, + ncclFloat = 7, + ncclFloat64 = 8, + ncclDouble = 8, + ncclBfloat16 = 9, + ncclNumTypes = 10 + } ncclDataType_t; + + /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ + typedef enum + { + /* ncclScalarDevice: The scalar is in device-visible memory and will be + * dereferenced while the collective is running. */ + ncclScalarDevice = 0, + + /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be + * dereferenced before the ncclRedOpCreate***() function returns. */ + ncclScalarHostImmediate = 1 + } ncclScalarResidence_t; + + /* + * ncclRedOpCreatePreMulSum + * + * Creates a new reduction operator which pre-multiplies input values by a given + * scalar locally before reducing them with peer values via summation. For use + * only with collectives launched against *comm* and *datatype*. The + * *residence* argument indicates how/when the memory pointed to by *scalar* + * will be dereferenced. Upon return, the newly created operator's handle + * is stored in *op*. + */ + ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); + ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); + + /* + * ncclRedOpDestroy + * + * Destroys the reduction operator *op*. The operator must have been created by + * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be + * destroyed as soon as the last NCCL function which is given that operator returns. + */ + ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + + /* + * Collective communication operations + * + * Collective communication operations must be called separately for each + * communicator in a communicator clique. + * + * They return when operations have been enqueued on the CUDA stream. + * + * Since they may perform inter-CPU synchronization, each call has to be done + * from a different thread or process, or need to use Group Semantics (see + * below). + */ + + /*! + * @brief Reduce + * + * @details Reduces data arrays of length count in sendbuff into recvbuff using op + * operation. + * recvbuff may be NULL on all calls except for root device. + * root is the rank (not the CUDA device) where data will reside after the + * operation is complete. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, + ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief (deprecated) Broadcast (in-place) + * + * @details Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * This operation is implicitely in place. + */ + ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief Broadcast + * + * @details Copies count values from root to all other devices. + * root is the rank (not the HIP device) where data resides before the + * operation is started. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief All-Reduce + * + * @details Reduces data arrays of length count in sendbuff using op operation, and + * leaves identical copies of result on each recvbuff. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! + * @brief Reduce-Scatter + * + * @details Reduces data in sendbuff using op operation and leaves reduced result + * scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the result. + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ + ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief All-Gather + * + * @details Each device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ + ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief Send + * + * @details Send data from sendbuff to rank peer. + * Rank peer needs to call ncclRecv with the same datatype and the same count from + * this rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv + * operations need to progress concurrently to complete, they must be fused within a + * ncclGroupStart/ ncclGroupEnd section. + */ + ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief Receive + * + * @details Receive data from rank peer into recvbuff. + * Rank peer needs to call ncclSend with the same datatype and the same count to this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv + * operations need to progress concurrently to complete, they must be fused within a + * ncclGroupStart/ ncclGroupEnd section. + */ + ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief Gather + * + * @details Root device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ + ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief Scatter + * + * @details Scattered over the devices so that recvbuff on rank i will contain the + * i-th block of the data on root. + * + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ + ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief All-To-All + * + * @details Device (i) send (j)th block of data to device (j) and be placed as (i)th + * block. Each block for sending/receiving has count elements, which means + * that recvbuff and sendbuff should have a size of nranks*count elements. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief All-To-Allv + * + * @details Device (i) sends sendcounts[j] of data from offset sdispls[j] + * to device (j). In the same time, device (i) receives recvcounts[j] of data + * from device (j) to be placed at rdispls[j]. + + * sendcounts, sdispls, recvcounts and rdispls are all measured in the units + * of datatype, not bytes. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], + const size_t sdispls[], void* recvbuff, + const size_t recvcounts[], const size_t rdispls[], + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[], + const size_t sdispls[], void* recvbuff, + const size_t recvcounts[], const size_t rdispls[], + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /* + * Group semantics + * + * When managing multiple GPUs from a single thread, and since NCCL collective + * calls may perform inter-CPU synchronization, we need to "group" calls for + * different ranks/devices into a single call. + * + * Grouping NCCL calls as being part of the same collective operation is done + * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + * collective calls until the ncclGroupEnd call, which will wait for all calls + * to be complete. Note that for collective communication, ncclGroupEnd only + * guarantees that the operations are enqueued on the streams, not that + * the operation is effectively done. + * + * Both collective communication and ncclCommInitRank can be used in conjunction + * of ncclGroupStart/ncclGroupEnd, but not together. + * + * Group semantics also allow to fuse multiple operations on the same device + * to improve performance (for aggregated collective calls), or to permit + * concurrent progress of multiple send/receive operations. + */ + + /*! @brief Group Start + * + * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into + * a single NCCL operation. Nothing will be started on the CUDA stream until + * ncclGroupEnd. + */ + ncclResult_t ncclGroupStart(); + /// @cond include_hidden + ncclResult_t pncclGroupStart(); + /// @endcond + + /*! @brief Group End + * + * End a group call. Start a fused NCCL operation consisting of all calls since + * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations + * need to be called after ncclGroupEnd. + */ + ncclResult_t ncclGroupEnd(); + /// @cond include_hidden + ncclResult_t pncclGroupEnd(); + /// @endcond + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard From 4b6468b1c220317fe20e92f50fc2ab273a24a8e5 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 16:29:03 -0500 Subject: [PATCH 06/26] Fix timemory includes --- source/lib/omnitrace/library/components/fwd.hpp | 2 +- source/lib/omnitrace/library/components/rcclp.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/lib/omnitrace/library/components/fwd.hpp b/source/lib/omnitrace/library/components/fwd.hpp index 9b25cbeea..26e578c08 100644 --- a/source/lib/omnitrace/library/components/fwd.hpp +++ b/source/lib/omnitrace/library/components/fwd.hpp @@ -24,7 +24,6 @@ #include "library/common.hpp" #include "library/defines.hpp" -#include "timemory/mpl/types.hpp" #include #include @@ -34,6 +33,7 @@ #include #include #include +#include #include diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp index c80ac2a10..49fb25f17 100644 --- a/source/lib/omnitrace/library/components/rcclp.hpp +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -27,9 +27,9 @@ #include "library/components/fwd.hpp" #include "library/defines.hpp" #include "library/timemory.hpp" -#include "timemory/components/macros.hpp" #include +#include #include From a0a5a5fb0b1670ac6b838bdb3bca85864e2cbd05 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 18:38:53 -0500 Subject: [PATCH 07/26] rcclp inline definitions when disabled --- source/lib/omnitrace/library/rcclp.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/source/lib/omnitrace/library/rcclp.hpp b/source/lib/omnitrace/library/rcclp.hpp index f9844ef0e..1b04559d8 100644 --- a/source/lib/omnitrace/library/rcclp.hpp +++ b/source/lib/omnitrace/library/rcclp.hpp @@ -38,5 +38,20 @@ setup(); void shutdown(); + +#if !defined(OMNITRACE_USE_RCCL) || \ + (defined(OMNITRACE_USE_RCCL) && OMNITRACE_USE_RCCL == 0) +inline void +configure() +{} + +inline void +setup() +{} + +inline void +shutdown() +{} +#endif } // namespace rcclp } // namespace omnitrace From ea7f4d43bc0e99357c12ef143fc79ece6856c222 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 18 Jul 2022 18:52:57 -0500 Subject: [PATCH 08/26] Tweaks to ubuntu-focal-external-rocm - disable ompt - enable building testing --- .github/workflows/ubuntu-focal.yml | 35 +++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index abb8de80f..da5172786 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -176,7 +176,14 @@ jobs: strategy: matrix: compiler: ['g++'] - rocm_version: ['4.3', '4.5', 'debian'] + rocm_version: ['4.3', '4.5', '5.0'] + mpi_headers: ['OFF'] + build_jobs: ['4'] + include: + - compiler: 'g++' + rocm_version: 'debian' + mpi_headers: 'ON' + build_jobs: '2' env: BUILD_TYPE: MinSizeRel @@ -194,18 +201,23 @@ jobs: wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list && apt-get update && - apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl && + apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl && python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' && - for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done + for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done + + - name: Install RCCL + if: ${{ matrix.rocm_version != '4.3' }} + timeout-minutes: 5 + run: + apt-get install -y rccl-dev - name: Configure Env run: echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV && echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && - echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH && - echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV - name: Configure CMake timeout-minutes: 10 @@ -217,7 +229,7 @@ jobs: -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} -DCMAKE_INSTALL_PREFIX=/opt/omnitrace - -DOMNITRACE_BUILD_TESTING=OFF + -DOMNITRACE_BUILD_TESTING=ON -DOMNITRACE_BUILD_DEVELOPER=ON -DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF -DOMNITRACE_BUILD_LTO=OFF @@ -225,14 +237,19 @@ jobs: -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_HIP=ON -DOMNITRACE_MAX_THREADS=32 - -DOMNITRACE_USE_SANITIZER=OFF -DOMNITRACE_USE_PAPI=OFF + -DOMNITRACE_USE_OMPT=OFF + -DOMNITRACE_USE_PYTHON=ON + -DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }} + -DOMNITRACE_USE_SANITIZER=OFF -DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON + -DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs + -DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10" - name: Build timeout-minutes: 60 run: - cmake --build build --target all --parallel 2 -- VERBOSE=1 + cmake --build build --target all --parallel ${{ matrix.build_jobs }} -- VERBOSE=1 - name: Install run: @@ -258,6 +275,8 @@ jobs: ldd $(which omnitrace-avail) omnitrace-avail --help omnitrace-avail -a + which omnitrace-python + omnitrace-python --help which omnitrace-critical-trace ldd $(which omnitrace-critical-trace) which omnitrace From 0c63a45df16fa439e10513a5defa03d5fbfc5a19 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Tue, 19 Jul 2022 02:16:55 -0500 Subject: [PATCH 09/26] Tweaks to ubuntu-focal-external-rocm - ctest exclude --- .github/workflows/ubuntu-focal.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index da5172786..58035a860 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -179,11 +179,13 @@ jobs: rocm_version: ['4.3', '4.5', '5.0'] mpi_headers: ['OFF'] build_jobs: ['4'] + ctest_exclude: ['-LE "mpi-example|transpose"'] include: - compiler: 'g++' rocm_version: 'debian' mpi_headers: 'ON' build_jobs: '2' + ctest_exclude: '-LE transpose' env: BUILD_TYPE: MinSizeRel @@ -261,8 +263,8 @@ jobs: cd build && ldd ./bin/omnitrace && ./bin/omnitrace --help && - ctest -V -N -O omnitrace-ctest-${{ github.job }}-commands.log && - ctest -V --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure + ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log && + ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure - name: Test Install timeout-minutes: 10 From 673870c6c2278fbc2f4a5bd53d6de08e44cffddd Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Tue, 19 Jul 2022 11:30:22 -0500 Subject: [PATCH 10/26] Tweak ubuntu-focal.yml - remove source /.../setup-env.sh, replace with $GITHUB_ENV --- .github/workflows/ubuntu-focal.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 58035a860..60c035fce 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -266,6 +266,11 @@ jobs: ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log && ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure + - name: Configure Install Env + run: + echo "/opt/omnitrace/bin" >> $GITHUB_PATH && + echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + - name: Test Install timeout-minutes: 10 run: | From a743cced24a0193daa66bc90c68c7c2cef3bddc9 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 21 Jul 2022 01:55:56 -0500 Subject: [PATCH 11/26] Fix ubuntu-focal-rocm + OMPI + root --- .github/workflows/ubuntu-focal.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 60c035fce..a1369e8b2 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -191,6 +191,8 @@ jobs: BUILD_TYPE: MinSizeRel OMNITRACE_OUTPUT_PATH: omnitrace-tests-output OMNITRACE_OUTPUT_PREFIX: "%argt%/" + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 steps: - uses: actions/checkout@v2 @@ -247,6 +249,7 @@ jobs: -DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON -DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs -DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10" + -DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }} - name: Build timeout-minutes: 60 From a4e94e253239da33d1f7e44b49f5cd467ed147c4 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 21 Jul 2022 15:55:56 -0500 Subject: [PATCH 12/26] Improved rocm-smi error handling - Recover from rocm-smi errors - Disabling rocm-smi after recovering from errors - Werror in developer mode - Remove State::DelayedInit - Add State::Disabled --- source/lib/omnitrace/library/components/rocm_smi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/omnitrace/library/components/rocm_smi.cpp b/source/lib/omnitrace/library/components/rocm_smi.cpp index 4b3abdc0b..90f364e44 100644 --- a/source/lib/omnitrace/library/components/rocm_smi.cpp +++ b/source/lib/omnitrace/library/components/rocm_smi.cpp @@ -30,11 +30,11 @@ # undef NDEBUG #endif -#include "library/components/rocm_smi.hpp" #include "library/common.hpp" #include "library/components/fwd.hpp" #include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" +#include "library/components/rocm_smi.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" #include "library/debug.hpp" From 37f8353d0d5a177aa484351d6e0bb58e5f6dee4f Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 21 Jul 2022 15:58:02 -0500 Subject: [PATCH 13/26] formatting --- source/lib/omnitrace/library/components/rocm_smi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/omnitrace/library/components/rocm_smi.cpp b/source/lib/omnitrace/library/components/rocm_smi.cpp index 90f364e44..4b3abdc0b 100644 --- a/source/lib/omnitrace/library/components/rocm_smi.cpp +++ b/source/lib/omnitrace/library/components/rocm_smi.cpp @@ -30,11 +30,11 @@ # undef NDEBUG #endif +#include "library/components/rocm_smi.hpp" #include "library/common.hpp" #include "library/components/fwd.hpp" #include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" -#include "library/components/rocm_smi.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" #include "library/debug.hpp" From c5850e51682642fb9173af8b779b736db1f83505 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Fri, 22 Jul 2022 14:59:00 -0500 Subject: [PATCH 14/26] Fix merge of OMNITRACE_SAMPLING_KEEP_INTERNAL --- source/lib/omnitrace/library/config.cpp | 14 -------------- source/lib/omnitrace/library/config.hpp | 3 --- 2 files changed, 17 deletions(-) diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 202f40a27..1399a1ad5 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -328,13 +328,6 @@ configure_settings(bool _init) "'all' and 'none' suppresses all GPU sampling", std::string{ "all" }, "rocm_smi", "rocm", "process_sampling"); - OMNITRACE_CONFIG_SETTING( - bool, "OMNITRACE_SAMPLING_KEEP_INTERNAL", - "Configure whether the statistical samples should include call-stack entries " - "from internal routines in omnitrace. E.g. when ON, the call-stack will show " - "functions like omnitrace_push_trace", - true, "sampling", "thread_sampling", "data"); - auto _backend = tim::get_env_choice( "OMNITRACE_PERFETTO_BACKEND", (_system_backend) ? "system" // if OMNITRACE_PERFETTO_BACKEND_SYSTEM is true, @@ -1629,13 +1622,6 @@ get_sampling_gpus() #endif } -bool -get_sampling_keep_internal() -{ - static auto _v = get_config()->find("OMNITRACE_SAMPLING_KEEP_INTERNAL"); - return static_cast&>(*_v->second).get(); -} - bool get_trace_thread_locks() { diff --git a/source/lib/omnitrace/library/config.hpp b/source/lib/omnitrace/library/config.hpp index 2e0bb0700..0fc21f699 100644 --- a/source/lib/omnitrace/library/config.hpp +++ b/source/lib/omnitrace/library/config.hpp @@ -295,9 +295,6 @@ get_process_sampling_freq(); std::string get_sampling_gpus(); -bool -get_sampling_keep_internal(); - int64_t get_critical_trace_per_row(); From 35105df3fbd0c979930923e9c1766213c6d3b7cf Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Sat, 23 Jul 2022 01:44:06 -0500 Subject: [PATCH 15/26] Update RCCL include directory - based on ROCm version we need with or --- cmake/Modules/FindRCCL-Headers.cmake | 23 ++++++++++++++++--- .../omnitrace/library/components/rcclp.hpp | 6 ++++- source/lib/omnitrace/library/rcclp.cpp | 6 ++++- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/cmake/Modules/FindRCCL-Headers.cmake b/cmake/Modules/FindRCCL-Headers.cmake index 89d92bb2c..f70876833 100644 --- a/cmake/Modules/FindRCCL-Headers.cmake +++ b/cmake/Modules/FindRCCL-Headers.cmake @@ -45,16 +45,33 @@ else() CACHE PATH "Path to RCCL headers") endif() -if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/rccl/rccl.h") +# because of the annoying warning starting with v5.2.0, we've got to do this crap +if(ROCmVersion_NUMERIC_VERSION) + if(ROCmVersion_NUMERIC_VERSION LESS 50200) + set(_RCCL-Headers_FILE "rccl.h") + set(_RCCL-Headers_DIR "/rccl") + else() + set(_RCCL-Headers_FILE "rccl/rccl.h") + set(_RCCL-Headers_DIR "") + endif() +else() + set(_RCCL-Headers_FILE "rccl/rccl.h") + set(_RCCL-Headers_DIR "") +endif() + +if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}") omnitrace_message( AUTHOR_WARNING - "RCCL header (${RCCL-Headers_INCLUDE_DIR}/rccl/rccl.h) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + "RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}" ) set(RCCL-Headers_INCLUDE_DIR - "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}" CACHE PATH "Path to RCCL headers" FORCE) endif() +unset(_RCCL-Headers_FILE) +unset(_RCCL-Headers_DIR) + mark_as_advanced(RCCL-Headers_INCLUDE_DIR) # ----------------------------------------------------------------------------------------# diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp index 49fb25f17..12845bb49 100644 --- a/source/lib/omnitrace/library/components/rcclp.hpp +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -31,7 +31,11 @@ #include #include -#include +#if OMNITRACE_HIP_VERSION >= 50200 +# include +#else +# include +#endif #include #include diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp index 00c6b5d5f..fae71f533 100644 --- a/source/lib/omnitrace/library/rcclp.cpp +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -30,7 +30,11 @@ #include -#include +#if OMNITRACE_HIP_VERSION >= 50200 +# include +#else +# include +#endif #include #include From 755e16c02c709e3d6bd01d69fefd2e4b246428c4 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 01:54:24 -0500 Subject: [PATCH 16/26] RCCL Testing - updated tests to use configuration files - many tests generate a configuration file - tests how have GPU option - enable ncclCommCount, disable ncclGetVersion - add testing for RCCLP via rccl-tests - working directory of tests is PROJECT_BINARY_DIR - add nccl/rccl functions to get_whole_function_names - some clang compiler fixes --- .cmake-format.yaml | 1 + .github/workflows/opensuse.yml | 1 + .github/workflows/ubuntu-bionic.yml | 1 + .github/workflows/ubuntu-focal.yml | 3 + CMakeLists.txt | 2 +- examples/CMakeLists.txt | 1 + examples/rccl/CMakeLists.txt | 61 +++++ examples/transpose/CMakeLists.txt | 13 +- source/bin/omnitrace/details.cpp | 8 +- .../omnitrace/library/components/rcclp.cpp | 4 +- tests/CMakeLists.txt | 223 ++++++++++++++++-- 11 files changed, 285 insertions(+), 33 deletions(-) create mode 100644 examples/rccl/CMakeLists.txt diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 6ee48f754..8e661d6b8 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -28,6 +28,7 @@ parse: NAME: '*' TARGET: '*' MPI: '*' + GPU: '*' NUM_PROCS: '*' REWRITE_TIMEOUT: '*' RUNTIME_TIMEOUT: '*' diff --git a/.github/workflows/opensuse.yml b/.github/workflows/opensuse.yml index 71290a708..1bc056db3 100644 --- a/.github/workflows/opensuse.yml +++ b/.github/workflows/opensuse.yml @@ -121,5 +121,6 @@ jobs: with: name: data-${{ github.job }}-files path: | + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json diff --git a/.github/workflows/ubuntu-bionic.yml b/.github/workflows/ubuntu-bionic.yml index 823f44e69..8b2b8806d 100644 --- a/.github/workflows/ubuntu-bionic.yml +++ b/.github/workflows/ubuntu-bionic.yml @@ -150,5 +150,6 @@ jobs: with: name: data-${{ github.job }}-files path: | + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index a1369e8b2..3e58e2547 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -166,6 +166,7 @@ jobs: with: name: data-${{ github.job }}-files path: | + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json @@ -322,6 +323,7 @@ jobs: name: data-${{ github.job }}-files path: | omnitrace-tests-output/**/*.txt + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json @@ -474,5 +476,6 @@ jobs: with: name: data-${{ github.job }}-files path: | + ${{ github.workspace }}/build/omnitrace-tests-config/*.cfg ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e688513c..434902911 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support" omnitrace_add_option( OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling" ${OMNITRACE_USE_HIP}) -omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP}) +omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ON) omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a3a5d7506..3cfdd7958 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -32,3 +32,4 @@ add_subdirectory(openmp) add_subdirectory(mpi) add_subdirectory(python) add_subdirectory(lulesh) +add_subdirectory(rccl) diff --git a/examples/rccl/CMakeLists.txt b/examples/rccl/CMakeLists.txt new file mode 100644 index 000000000..275b16012 --- /dev/null +++ b/examples/rccl/CMakeLists.txt @@ -0,0 +1,61 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +project(omnitrace-rccl-example LANGUAGES CXX) + +find_package(rccl) +find_package(hip HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR}) + +function(rccl_message) + if("${CMAKE_PROJECT_NAME}" STREQUAL "omnitrace") + omnitrace_message(${ARGN}) + else() + message(${ARGN}) + endif() +endfunction() + +if(hip_FOUND AND rccl_FOUND) + include(FetchContent) + fetchcontent_declare( + rccl-tests GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rccl-tests.git) + + # After the following call, the CMake targets defined by googletest and Catch2 will be + # available to the rest of the build + fetchcontent_makeavailable(rccl-tests) + + get_filename_component(rccl_ROOT_DIR "${rccl_INCLUDE_DIR}" DIRECTORY) + + rccl_message(STATUS "Building rccl-tests...") + execute_process( + COMMAND make HIP_HOME=${ROCM_PATH} RCCL_HOME=${rccl_ROOT_DIR} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/rccl-tests-src + RESULT_VARIABLE _RCCL_BUILD_RET + ERROR_VARIABLE _RCCL_BUILD_ERR + OUTPUT_VARIABLE _RCCL_BUILD_OUT + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE) + + if(NOT _RCCL_BUILD_RET EQUAL 0) + rccl_message(${_RCCL_BUILD_OUT}) + rccl_message(AUTHOR_WARNING "Failed to build rccl-tests: ${_RCCL_BUILD_ERR}") + else() + file(GLOB RCCL_TEST_EXECUTABLES + ${CMAKE_BINARY_DIR}/_deps/rccl-tests-src/build/*_perf) + set(_RCCL_TEST_TARGETS) + + foreach(_EXE ${RCCL_TEST_EXECUTABLES}) + get_filename_component(_EXE_NAME "${_EXE}" NAME) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${_EXE} + ${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME}) + add_executable(rccl-tests::${_EXE_NAME} IMPORTED GLOBAL) + set_property( + TARGET rccl-tests::${_EXE_NAME} + PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME}) + list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}") + endforeach() + + set(RCCL_TEST_TARGETS + "${_RCCL_TEST_TARGETS}" + CACHE INTERNAL "rccl-test targets") + endif() +else() + rccl_message(AUTHOR_WARNING "${PROJECT_NAME} skipped. Missing RCCL and/or HIP...") +endif() diff --git a/examples/transpose/CMakeLists.txt b/examples/transpose/CMakeLists.txt index 72962cecb..828e0be93 100644 --- a/examples/transpose/CMakeLists.txt +++ b/examples/transpose/CMakeLists.txt @@ -40,13 +40,16 @@ endif() add_executable(transpose transpose.cpp) -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" + AND NOT CMAKE_CXX_COMPILER_IS_HIPCC + AND NOT HIPCC_EXECUTABLE) target_link_libraries( transpose - PRIVATE - $,omnitrace::omnitrace-compile-options,> - $,hip::host,> - $,hip::device,>) + PRIVATE $ + $ $) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_link_libraries( + transpose PRIVATE $) else() target_compile_options(transpose PRIVATE -W -Wall) endif() diff --git a/source/bin/omnitrace/details.cpp b/source/bin/omnitrace/details.cpp index e63aaaaa8..4a1bbe155 100644 --- a/source/bin/omnitrace/details.cpp +++ b/source/bin/omnitrace/details.cpp @@ -52,7 +52,13 @@ get_whole_function_names() "rocr::core::BusyWaitSignal::WaitAcquire", "rocr::core::BusyWaitSignal::WaitRelaxed", "rocr::HSA::hsa_signal_wait_scacquire", "rocr::os::ThreadTrampoline", "rocr::image::ImageRuntime::CreateImageManager", - "rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info", "event_base_loop" + "rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info", + "event_base_loop", "bootstrapRoot", "bootstrapNetAccept", "ncclCommInitRank", + "ncclCommInitAll", "ncclCommDestroy", "ncclCommCount", "ncclCommCuDevice", + "ncclCommUserRank", "ncclReduce", "ncclBcast", "ncclBroadcast", "ncclAllReduce", + "ncclReduceScatter", "ncclAllGather", "ncclGroupStart", "ncclGroupEnd", + "ncclSend", "ncclRecv", "ncclGather", "ncclScatter", "ncclAllToAll", + "ncclAllToAllv" }; #else // should hopefully be removed soon diff --git a/source/lib/omnitrace/library/components/rcclp.cpp b/source/lib/omnitrace/library/components/rcclp.cpp index 24dee893f..4f42487d1 100644 --- a/source/lib/omnitrace/library/components/rcclp.cpp +++ b/source/lib/omnitrace/library/components/rcclp.cpp @@ -97,12 +97,12 @@ configure_rcclp(const std::set& permit, const std::set { // generate the gotcha wrappers rcclp_gotcha_t::get_initializer() = []() { - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion); // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId); TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank); TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll); TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount); TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5850c0cfb..8f5b10752 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -30,8 +30,8 @@ set(_test_openmp_env "OMP_PROC_BIND=spread" "OMP_PLACES=threads" "OMP_NUM_THREAD set(_base_environment "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}" - "${_test_library_path}") + "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_FILE_OUTPUT=ON" "${_test_openmp_env}" "${_test_library_path}") set(_flat_environment "OMNITRACE_USE_PERFETTO=ON" @@ -42,12 +42,8 @@ set(_flat_environment "OMNITRACE_TIMELINE_PROFILE=OFF" "OMNITRACE_COLLAPSE_PROCESSES=ON" "OMNITRACE_COLLAPSE_THREADS=ON" - "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" - "OMP_PROC_BIND=spread" - "OMP_PLACES=threads" - "OMP_NUM_THREADS=2" - "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" - ) + "${_test_openmp_env}" + "${_test_library_path}") set(_lock_environment "OMNITRACE_USE_SAMPLING=OFF" @@ -62,18 +58,34 @@ set(_lock_environment "${_test_library_path}") set(_ompt_environment - "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_TIME_OUTPUT=OFF" - "OMNITRACE_USE_OMPT=ON" "OMNITRACE_CRITICAL_TRACE=OFF" "${_test_openmp_env}" + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_USE_OMPT=ON" + "OMNITRACE_CRITICAL_TRACE=OFF" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss" + "${_test_openmp_env}" "${_test_library_path}") set(_perfetto_environment - "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=OFF" "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}" + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=OFF" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_PERFETTO_BACKEND=inprocess" + "OMNITRACE_PERFETTO_FILL_POLICY=ring_buffer" + "${_test_openmp_env}" "${_test_library_path}") set(_timemory_environment - "OMNITRACE_USE_PERFETTO=OFF" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}" + "OMNITRACE_USE_PERFETTO=OFF" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss" + "${_test_openmp_env}" "${_test_library_path}") set(_test_environment ${_base_environment} "OMNITRACE_CRITICAL_TRACE=OFF") @@ -86,7 +98,7 @@ set(_python_environment "OMNITRACE_TIME_OUTPUT=OFF" "OMNITRACE_TREE_OUTPUT=OFF" "OMNITRACE_USE_PID=OFF" - "OMNITRACE_TIMEMORY_COMPONENTS=trip_count" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" "${_test_library_path}" "PYTHONPATH=${PROJECT_BINARY_DIR}/lib/python/site-packages") @@ -100,6 +112,18 @@ set(_attach_environment "OMNITRACE_USE_KOKKOSP=ON" "OMNITRACE_TIME_OUTPUT=OFF" "OMNITRACE_USE_PID=OFF" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" + "${_test_openmp_env}" + "${_test_library_path}") + +set(_rccl_environment + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=OFF" + "OMNITRACE_USE_PROCESS_SAMPLING=ON" + "OMNITRACE_USE_RCCLP=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_USE_PID=OFF" "${_test_openmp_env}" "${_test_library_path}") @@ -123,6 +147,81 @@ endif() # -------------------------------------------------------------------------------------- # +set(_VALID_GPU OFF) +if(OMNITRACE_USE_HIP) + set(_VALID_GPU ON) + find_program( + OMNITRACE_ROCM_SMI_EXE + NAMES rocm-smi + HINTS ${ROCmVersion_DIR} + PATHS ${ROCmVersion_DIR} + PATH_SUFFIXES bin) + if(OMNITRACE_ROCM_SMI_EXE) + execute_process( + COMMAND ${OMNITRACE_ROCM_SMI_EXE} + OUTPUT_VARIABLE _RSMI_OUT + ERROR_VARIABLE _RSMI_ERR + RESULT_VARIABLE _RSMI_RET) + if(_RSMI_RET EQUAL 0) + if("${_RSMI_OUTPUT}" MATCHES "ERROR" OR "${_RSMI_ERR}" MATCHES "ERROR") + set(_VALID_GPU OFF) + endif() + else() + set(_VALID_GPU OFF) + endif() + endif() + if(NOT _VALID_GPU) + omnitrace_message(AUTHOR_WARNING + "rocm-smi did not successfully run. Disabling GPU tests...") + endif() +endif() + +set(LULESH_USE_GPU ${LULESH_USE_HIP}) +if(LULESH_USE_CUDA) + set(LULESH_USE_GPU ON) +endif() + +# -------------------------------------------------------------------------------------- # + +function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV) + set(_FILE_CONTENTS) + set(_ENV_CONTENTS) + + foreach(_VAL ${${_ENV}}) + if("${_VAL}" MATCHES "^OMNITRACE_") + set(_FILE_CONTENTS "${_FILE_CONTENTS}${_VAL}\n") + else() + list(APPEND _ENV_CONTENTS "${_VAL}") + endif() + endforeach() + + set(_CONFIG_FILE ${PROJECT_BINARY_DIR}/omnitest-test-config/${_FILE}) + file( + WRITE ${_CONFIG_FILE} + "# auto-generated by cmake + +# default values +OMNITRACE_CI = ON +OMNITRACE_VERBOSE = 1 +OMNITRACE_DL_VERBOSE = 1 +OMNITRACE_SAMPLING_FREQ = 50 +OMNITRACE_SAMPLING_DELAY = 0.05 +OMNITRACE_SAMPLING_CPUS = 0-3 +OMNITRACE_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES +OMNITRACE_ROCTRACER_HSA_API = ON +OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON + +# test-specific values +${_FILE_CONTENTS} +") + list(APPEND _ENV_CONTENTS "OMNITRACE_CONFIG_FILE=${_CONFIG_FILE}") + set(${_ENV} + "${_ENV_CONTENTS}" + PARENT_SCOPE) +endfunction() + +# -------------------------------------------------------------------------------------- # + function(OMNITRACE_ADD_TEST) foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN) foreach(_TYPE PASS FAIL SKIP) @@ -135,10 +234,17 @@ function(OMNITRACE_ADD_TEST) cmake_parse_arguments( TEST "SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING" # options - "NAME;TARGET;MPI;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value args + "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value + # args "${_KWARGS}" # multiple value args ${ARGN}) + if(TEST_GPU AND NOT _VALID_GPU) + omnitrace_message(STATUS + "${TEST_NAME} requires a GPU and no valid GPUs were found") + return() + endif() + if("${TEST_MPI}" STREQUAL "") set(TEST_MPI OFF) endif() @@ -189,7 +295,7 @@ function(OMNITRACE_ADD_TEST) add_test( NAME ${TEST_NAME}-baseline COMMAND ${COMMAND_PREFIX} $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() if(NOT TEST_SKIP_REWRITE) @@ -199,7 +305,7 @@ function(OMNITRACE_ADD_TEST) $ -o $/${TEST_NAME}.inst ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) if(NOT TEST_SKIP_SAMPLING) add_test( @@ -208,7 +314,7 @@ function(OMNITRACE_ADD_TEST) $ -o $/${TEST_NAME}.samp -M sampling ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() add_test( @@ -216,7 +322,7 @@ function(OMNITRACE_ADD_TEST) COMMAND ${COMMAND_PREFIX} $/${TEST_NAME}.inst ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) if(NOT TEST_SKIP_SAMPLING) add_test( @@ -225,7 +331,7 @@ function(OMNITRACE_ADD_TEST) ${COMMAND_PREFIX} $/${TEST_NAME}.samp ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() endif() @@ -234,14 +340,14 @@ function(OMNITRACE_ADD_TEST) NAME ${TEST_NAME}-runtime-instrument COMMAND $ ${TEST_RUNTIME_ARGS} -- $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) if(NOT TEST_SKIP_SAMPLING) add_test( NAME ${TEST_NAME}-runtime-instrument-sampling COMMAND $ -M sampling ${TEST_RUNTIME_ARGS} -- $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() endif() @@ -303,6 +409,7 @@ function(OMNITRACE_ADD_TEST) endforeach() if(TEST ${TEST_NAME}-${_TEST}) + omnitrace_write_test_config(${TEST_NAME}-${_TEST}.cfg _environ) set_tests_properties( ${TEST_NAME}-${_TEST} PROPERTIES ENVIRONMENT @@ -431,6 +538,7 @@ endfunction() # general config file tests # # -------------------------------------------------------------------------------------- # + file( WRITE ${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg " @@ -476,6 +584,7 @@ omnitrace_add_test( NAME transpose TARGET transpose MPI ${TRANSPOSE_USE_MPI} + GPU ON NUM_PROCS ${NUM_PROCS} REWRITE_ARGS -e -v 2 --print-instructions -E uniform_int_distribution RUNTIME_ARGS @@ -497,6 +606,7 @@ omnitrace_add_test( TARGET transpose LABELS "loops" MPI ${TRANSPOSE_USE_MPI} + GPU ON NUM_PROCS ${NUM_PROCS} REWRITE_ARGS -e @@ -617,7 +727,8 @@ omnitrace_add_test( args --min-instructions 0 - ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_USE_MPIP=ON" + ENVIRONMENT + "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON" REWRITE_RUN_PASS_REGEX ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" ) @@ -649,6 +760,7 @@ omnitrace_add_test( NAME lulesh TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos" REWRITE_ARGS -e -v 2 --label file line return args @@ -674,6 +786,7 @@ omnitrace_add_test( NAME lulesh-baseline-kokkosp-libomnitrace TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p @@ -686,6 +799,7 @@ omnitrace_add_test( NAME lulesh-baseline-kokkosp-libomnitrace-dl TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p @@ -698,6 +812,7 @@ omnitrace_add_test( NAME lulesh-kokkosp TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos" REWRITE_ARGS -e -v 2 @@ -721,6 +836,7 @@ omnitrace_add_test( NAME lulesh-perfetto TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;loops" REWRITE_ARGS -e -v 2 @@ -743,6 +859,7 @@ omnitrace_add_test( NAME lulesh-timemory TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;loops" REWRITE_ARGS -e -v 2 -l --dynamic-callsites --traps --allow-overlapping @@ -965,6 +1082,64 @@ if(TARGET parallel-overhead AND _VALID_PTRACE_SCOPE) "Dyninst was unable to attach to the specified process") endif() +# -------------------------------------------------------------------------------------- # +# +# rccl tests +# +# -------------------------------------------------------------------------------------- # + +foreach(_TARGET ${RCCL_TEST_TARGETS}) + string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}") + string(REPLACE "_" "-" _NAME "${_NAME}") + omnitrace_add_test( + SKIP_SAMPLING + NAME rccl-test-${_NAME} + TARGET ${_TARGET} + LABELS "rccl-tests;rcclp" + MPI ON + GPU ON + NUM_PROCS 1 + REWRITE_ARGS + -e + -v + 2 + -i + 8 + --label + file + line + return + args + RUNTIME_ARGS + -e + -v + 1 + -i + 8 + --label + file + line + return + args + RUN_ARGS -t + 1 + -g + 1 + -i + 10 + -w + 2 + -m + 2 + -p + -c + 1 + -z + -s + 1 + ENVIRONMENT "${_rccl_environment}") +endforeach() + # -------------------------------------------------------------------------------------- # # # python tests From f4e1c7f0afd15221fb955ada3399b80586a9dead Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 02:19:39 -0500 Subject: [PATCH 17/26] Handle RCCL include w/o HIP --- source/lib/omnitrace/library/components/rcclp.hpp | 2 +- source/lib/omnitrace/library/rcclp.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp index 12845bb49..172ec97f0 100644 --- a/source/lib/omnitrace/library/components/rcclp.hpp +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -31,7 +31,7 @@ #include #include -#if OMNITRACE_HIP_VERSION >= 50200 +#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 # include #else # include diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp index fae71f533..420f5b72b 100644 --- a/source/lib/omnitrace/library/rcclp.cpp +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -30,7 +30,7 @@ #include -#if OMNITRACE_HIP_VERSION >= 50200 +#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 # include #else # include From 97d8cb478260643c756fce8da08be21dc1358c73 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 04:53:16 -0500 Subject: [PATCH 18/26] RCCL requires HIP --- CMakeLists.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 434902911..c55fba1b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support" omnitrace_add_option( OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling" ${OMNITRACE_USE_HIP}) -omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ON) +omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP}) omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON) @@ -176,14 +176,18 @@ if(NOT OMNITRACE_USE_HIP) set(OMNITRACE_USE_ROCM_SMI OFF CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE) + set(OMNITRACE_USE_RCCL + OFF + CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE) elseif( OMNITRACE_USE_HIP AND NOT OMNITRACE_USE_ROCTRACER AND NOT OMNITRACE_USE_ROCPROFILER - AND NOT OMNITRACE_USE_ROCM_SMI) + AND NOT OMNITRACE_USE_ROCM_SMI + AND NOT OMNITRACE_USE_RCCL) omnitrace_message( AUTHOR_WARNING - "Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, and rocm-smi options are disabled" + "Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled" ) set(OMNITRACE_USE_HIP OFF) endif() From 0231f23335573ac03b47e671ac5ecef8565ac7c2 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 05:59:17 -0500 Subject: [PATCH 19/26] Update OMNITRACE_SAMPLING_CPUS for testing --- tests/CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8f5b10752..95a53e24b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,6 +12,11 @@ if(NOT DEFINED NUM_PROCS) set(NUM_PROCS 2) endif() +math(EXPR NUM_SAMPLING_PROCS "${NUM_PROCS_REAL}-1") +if(NUM_SAMPLING_PROCS GREATER 3) + set(NUM_SAMPLING_PROCS 3) +endif() + math(EXPR NUM_THREADS "${NUM_PROCS_REAL} + (${NUM_PROCS_REAL} / 2)") if(NUM_THREADS GREATER 12) set(NUM_THREADS 12) @@ -195,7 +200,7 @@ function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV) endif() endforeach() - set(_CONFIG_FILE ${PROJECT_BINARY_DIR}/omnitest-test-config/${_FILE}) + set(_CONFIG_FILE ${PROJECT_BINARY_DIR}/omnitrace-tests-config/${_FILE}) file( WRITE ${_CONFIG_FILE} "# auto-generated by cmake @@ -206,7 +211,7 @@ OMNITRACE_VERBOSE = 1 OMNITRACE_DL_VERBOSE = 1 OMNITRACE_SAMPLING_FREQ = 50 OMNITRACE_SAMPLING_DELAY = 0.05 -OMNITRACE_SAMPLING_CPUS = 0-3 +OMNITRACE_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS} OMNITRACE_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES OMNITRACE_ROCTRACER_HSA_API = ON OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON From a031b7e810234bcbd49cf51b73e0a658cb69b282 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 06:38:32 -0500 Subject: [PATCH 20/26] Update tests/CMakeLists.txt --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 95a53e24b..92756d128 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -47,6 +47,7 @@ set(_flat_environment "OMNITRACE_TIMELINE_PROFILE=OFF" "OMNITRACE_COLLAPSE_PROCESSES=ON" "OMNITRACE_COLLAPSE_THREADS=ON" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" "${_test_openmp_env}" "${_test_library_path}") From 679dff8d11502f53794fb961ab6c23f1f1b387e3 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 08:03:01 -0500 Subject: [PATCH 21/26] Debug settings --- source/lib/omnitrace/library/config.cpp | 19 +++++++++++++++++++ tests/CMakeLists.txt | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 1399a1ad5..5f1a1e736 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -24,6 +24,7 @@ #include "library/debug.hpp" #include "library/defines.hpp" #include "library/gpu.hpp" +#include "library/mproc.hpp" #include "library/perfetto.hpp" #include "library/runtime.hpp" @@ -46,9 +47,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -587,12 +590,28 @@ configure_settings(bool _init) } if(!_found_sep && _cmd.size() > 1) _cmd.insert(_cmd.begin() + 1, "--"); + auto _pid = getpid(); + auto _ppid = getppid(); + auto _proc = mproc::get_concurrent_processes(_ppid); + bool _main_proc = (_proc.size() < 2 || *_proc.begin() == _pid); + for(auto&& itr : tim::delimit(_config->get("OMNITRACE_CONFIG_FILE"), ";:")) { if(_config->get_suppress_config()) continue; OMNITRACE_BASIC_VERBOSE(1, "Reading config file %s\n", itr.c_str()); _config->read(itr); + if(_config->get("OMNITRACE_CI") && _main_proc) + { + std::ifstream _in{ itr }; + std::stringstream _iss{}; + while (_in) { + std::string _s{}; + getline(_in, _s); + _iss << _s << "\n"; + } + OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(), _iss.str().c_str()); + } } settings::suppress_config() = true; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 92756d128..1615f2759 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -190,11 +190,12 @@ endif() # -------------------------------------------------------------------------------------- # function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV) + set(_ENV_ONLY "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS)=") set(_FILE_CONTENTS) set(_ENV_CONTENTS) foreach(_VAL ${${_ENV}}) - if("${_VAL}" MATCHES "^OMNITRACE_") + if("${_VAL}" MATCHES "^OMNITRACE_" AND NOT "${_VAL}" MATCHES "${_ENV_ONLY}") set(_FILE_CONTENTS "${_FILE_CONTENTS}${_VAL}\n") else() list(APPEND _ENV_CONTENTS "${_VAL}") @@ -221,6 +222,7 @@ OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON ${_FILE_CONTENTS} ") list(APPEND _ENV_CONTENTS "OMNITRACE_CONFIG_FILE=${_CONFIG_FILE}") + list(APPEND _ENV_CONTENTS "OMNITRACE_DEBUG_SETTINGS=1") set(${_ENV} "${_ENV_CONTENTS}" PARENT_SCOPE) From df9c8741325cb03fb04df2970d9732e0f6e29ab1 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 08:07:54 -0500 Subject: [PATCH 22/26] Install MPI even when USE_MPI=OFF --- .github/workflows/ubuntu-focal.yml | 4 ++-- source/lib/omnitrace/library/config.cpp | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 3e58e2547..8325c19a3 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -66,7 +66,7 @@ jobs: add-apt-repository -y ppa:ubuntu-toolchain-r/test && apt-get update && apt-get upgrade -y && - apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev ${{ matrix.compiler }} && + apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev libmpich-dev mpich ${{ matrix.compiler }} && python3 -m pip install --upgrade pip && python3 -m pip install numpy && python3 -m pip install perfetto && @@ -206,7 +206,7 @@ jobs: wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list && apt-get update && - apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl && + apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev && python3 -m pip install --upgrade pip && python3 -m pip install 'cmake==3.16.3' && for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 5f1a1e736..7d82292f7 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -590,9 +590,9 @@ configure_settings(bool _init) } if(!_found_sep && _cmd.size() > 1) _cmd.insert(_cmd.begin() + 1, "--"); - auto _pid = getpid(); - auto _ppid = getppid(); - auto _proc = mproc::get_concurrent_processes(_ppid); + auto _pid = getpid(); + auto _ppid = getppid(); + auto _proc = mproc::get_concurrent_processes(_ppid); bool _main_proc = (_proc.size() < 2 || *_proc.begin() == _pid); for(auto&& itr : @@ -603,14 +603,16 @@ configure_settings(bool _init) _config->read(itr); if(_config->get("OMNITRACE_CI") && _main_proc) { - std::ifstream _in{ itr }; + std::ifstream _in{ itr }; std::stringstream _iss{}; - while (_in) { + while(_in) + { std::string _s{}; getline(_in, _s); _iss << _s << "\n"; } - OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(), _iss.str().c_str()); + OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(), + _iss.str().c_str()); } } From 2781e63d1a476a44a7515132e99fd3d2ca1ca19a Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 10:08:38 -0500 Subject: [PATCH 23/26] exclude printf --- source/bin/omnitrace/module_function.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source/bin/omnitrace/module_function.cpp b/source/bin/omnitrace/module_function.cpp index e693d28d1..39a3f4b45 100644 --- a/source/bin/omnitrace/module_function.cpp +++ b/source/bin/omnitrace/module_function.cpp @@ -429,6 +429,7 @@ module_function::is_routine_constrained() const static std::regex exclude( "(omnitrace|tim::|N3tim|MPI_Init|MPI_Finalize|dyninst|tm_clones)", regex_opts); + static std::regex exclude_printf("(|v|f)printf$", regex_opts); static std::regex exclude_cxx( "(std::_Sp_counted_base|std::(use|has)_facet|std::locale|::sentry|^std::_|::_(M|" "S)_|::basic_string[a-zA-Z,<>: ]+::_M_create|::__|::_(Alloc|State)|" @@ -456,6 +457,11 @@ module_function::is_routine_constrained() const return _report("Excluding", "critical", 3); } + if(std::regex_search(function_name, exclude_printf)) + { + return _report("Excluding", "critical-printf", 3); + } + if(whole.count(function_name) > 0) { return _report("Excluding", "critical-whole-match", 3); From 105aecf6f32f08c1ab0b9fc5e408696af4ab5740 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 10:09:00 -0500 Subject: [PATCH 24/26] skip mpi tests w/o USE_MPI or USE_MPI_HEADERS --- tests/CMakeLists.txt | 134 ++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1615f2759..985d404e2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -695,74 +695,76 @@ omnitrace_add_test( ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF" REWRITE_FAIL_REGEX "0 instrumented loops in procedure") -omnitrace_add_test( - SKIP_RUNTIME - NAME "mpi" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1" - REWRITE_RUN_PASS_REGEX - "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" - REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)") +if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS) + omnitrace_add_test( + SKIP_RUNTIME + NAME "mpi" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1" + REWRITE_RUN_PASS_REGEX + "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" + REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)") -omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME "mpi-flat-mpip" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - LABELS "mpip" - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT - "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON" - REWRITE_RUN_PASS_REGEX - ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" - ) + omnitrace_add_test( + SKIP_RUNTIME SKIP_SAMPLING + NAME "mpi-flat-mpip" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + LABELS "mpip" + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT + "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON" + REWRITE_RUN_PASS_REGEX + ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" + ) -omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME "mpi-flat" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - LABELS "mpip" - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF" - REWRITE_RUN_PASS_REGEX - ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" - ) + omnitrace_add_test( + SKIP_RUNTIME SKIP_SAMPLING + NAME "mpi-flat" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + LABELS "mpip" + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF" + REWRITE_RUN_PASS_REGEX + ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" + ) +endif() omnitrace_add_test( NAME lulesh From f4cfecad48f576fc0154df105813e40fe0b524d6 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 10:09:18 -0500 Subject: [PATCH 25/26] update ubuntu rocm workflow --- .github/workflows/ubuntu-focal.yml | 61 +++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 8325c19a3..90352feef 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -181,17 +181,17 @@ jobs: mpi_headers: ['OFF'] build_jobs: ['4'] ctest_exclude: ['-LE "mpi-example|transpose"'] + perfetto-tools: ['ON'] include: - compiler: 'g++' rocm_version: 'debian' mpi_headers: 'ON' build_jobs: '2' ctest_exclude: '-LE transpose' + perfetto-tools: 'OFF' env: BUILD_TYPE: MinSizeRel - OMNITRACE_OUTPUT_PATH: omnitrace-tests-output - OMNITRACE_OUTPUT_PREFIX: "%argt%/" OMPI_ALLOW_RUN_AS_ROOT: 1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 @@ -208,7 +208,7 @@ jobs: apt-get update && apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev && python3 -m pip install --upgrade pip && - python3 -m pip install 'cmake==3.16.3' && + python3 -m pip install 'cmake==3.21.4' && for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done - name: Install RCCL @@ -223,6 +223,24 @@ jobs: echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + cat << EOF > test-install.cfg + OMNITRACE_USE_TIMEMORY = ON + OMNITRACE_USE_PERFETTO = ON + OMNITRACE_USE_PID = OFF + OMNITRACE_USE_SAMPLING = OFF + OMNITRACE_USE_PROCESS_SAMPLING = OFF + OMNITRACE_COUT_OUTPUT = ON + OMNITRACE_TIME_OUTPUT = OFF + OMNITRACE_TIMEMORY_COMPONENTS = cpu_clock cpu_util current_peak_rss kernel_mode_time monotonic_clock monotonic_raw_clock network_stats num_io_in num_io_out num_major_page_faults num_minor_page_faults page_rss peak_rss priority_context_switch process_cpu_clock process_cpu_util read_bytes read_char system_clock thread_cpu_clock thread_cpu_util timestamp trip_count user_clock user_mode_time virtual_memory voluntary_context_switch wall_clock written_bytes written_char + OMNITRACE_OUTPUT_PATH = omnitrace-tests-output + OMNITRACE_OUTPUT_PREFIX = %tag%/ + OMNITRACE_DEBUG = OFF + OMNITRACE_VERBOSE = 3 + OMNITRACE_DL_VERBOSE = 3 + OMNITRACE_PERFETTO_BACKEND = system + EOF + realpath test-install.cfg + cat test-install.cfg - name: Configure CMake timeout-minutes: 10 @@ -239,7 +257,6 @@ jobs: -DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF -DOMNITRACE_BUILD_LTO=OFF -DOMNITRACE_USE_MPI=OFF - -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_HIP=ON -DOMNITRACE_MAX_THREADS=32 -DOMNITRACE_USE_PAPI=OFF @@ -247,7 +264,7 @@ jobs: -DOMNITRACE_USE_PYTHON=ON -DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }} -DOMNITRACE_USE_SANITIZER=OFF - -DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON + -DOMNITRACE_INSTALL_PERFETTO_TOOLS=${{ matrix.perfetto-tools }} -DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs -DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10" -DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }} @@ -271,17 +288,18 @@ jobs: ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure - name: Configure Install Env - run: - echo "/opt/omnitrace/bin" >> $GITHUB_PATH && + run: | + echo "/opt/omnitrace/bin" >> $GITHUB_PATH echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "OMNITRACE_CONFIG_FILE=test-install.cfg" >> $GITHUB_ENV - name: Test Install timeout-minutes: 10 + if: ${{ matrix.perfetto-tools == 'ON' }} run: | set -v + cat ${OMNITRACE_CONFIG_FILE} omnitrace-perfetto-traced --background - export OMNITRACE_DEBUG=ON - export OMNITRACE_PERFETTO_BACKEND=system which omnitrace-avail ldd $(which omnitrace-avail) omnitrace-avail --help @@ -302,6 +320,31 @@ jobs: du -m ls-perfetto-trace.proto /opt/conda/envs/py3.8/bin/python ./tests/validate-perfetto-proto.py -p -i ./ls-perfetto-trace.proto + - name: Test Install + timeout-minutes: 10 + if: ${{ matrix.perfetto-tools == 'OFF' }} + run: | + set -v + cat ${OMNITRACE_CONFIG_FILE} + which omnitrace-avail + ldd $(which omnitrace-avail) + omnitrace-avail --help + omnitrace-avail -a + which omnitrace-python + omnitrace-python --help + which omnitrace-critical-trace + ldd $(which omnitrace-critical-trace) + which omnitrace + ldd $(which omnitrace) + omnitrace --help + omnitrace -e -v 1 -o sleep.inst --simulate -- sleep + omnitrace -e -v 1 --simulate -- sleep + omnitrace -e -v 1 -o sleep.inst -- sleep + ./sleep.inst 5 + omnitrace -e -v 1 -- sleep 5 + cat omnitrace-tests-output/sleep.inst/wall_clock.txt + cat omnitrace-tests-output/sleep/wall_clock.txt + - name: Test User API timeout-minutes: 10 run: | From 1f8037c92e22564f4e66152d68e967c83779a1b1 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 25 Jul 2022 11:07:46 -0500 Subject: [PATCH 26/26] Fix configure env step for ubuntu rocm --- .github/workflows/ubuntu-focal.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 90352feef..c70b01302 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -218,10 +218,10 @@ jobs: apt-get install -y rccl-dev - name: Configure Env - run: - echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV && - echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && - echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && + run: | + echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV + echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV + echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV cat << EOF > test-install.cfg OMNITRACE_USE_TIMEMORY = ON