Skip to content

Commit

Permalink
OMNITRACE_ROCM_SMI_METRICS (#331)
Browse files Browse the repository at this point in the history
* OMNITRACE_ROCM_SMI_METRICS

- configuration variable OMNITRACE_ROCM_SMI_METRICS for specifying which rocm-smi metrics to collect
- auto-disable metric collection when rsmi_dev_X_get returns RSMI_STATUS_NOT_SUPPORTED

* Bump version to 1.11.1

* Python formatting

* Update python/libpyomnitrace.cpp

- fix usage of substr (ignored return value)

* Update python/gui/source/gui.py

- Fix E721
  - do not compare types, for exact checks use `is` / `is not`, for instance checks use `isinstance()`
  • Loading branch information
jrmadsen committed Feb 8, 2024
1 parent 77d5281 commit 15127c0
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 35 deletions.
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.11.0
1.11.1
5 changes: 5 additions & 0 deletions source/lib/core/config.cpp
Expand Up @@ -647,6 +647,11 @@ configure_settings(bool _init)
"is collected on every available device",
"", "rocprofiler", "rocm", "hardware_counters");

OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_ROCM_SMI_METRICS",
"rocm-smi metrics to collect: busy, temp, power, mem_usage",
"busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm",
"process_sampling", "advanced");

OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE_DEBUG",
"Enable debugging for critical trace", _omnitrace_debug,
"debugging", "critical_trace", "advanced");
Expand Down
133 changes: 105 additions & 28 deletions source/lib/omnitrace/library/rocm_smi.cpp
Expand Up @@ -44,7 +44,9 @@

#include <timemory/backends/threading.hpp>
#include <timemory/components/timing/backends.hpp>
#include <timemory/mpl/type_traits.hpp>
#include <timemory/units.hpp>
#include <timemory/utility/delimit.hpp>
#include <timemory/utility/locking.hpp>

#include <rocm_smi/rocm_smi.h>
Expand All @@ -58,8 +60,8 @@
#include <sys/resource.h>
#include <thread>

#define OMNITRACE_ROCM_SMI_CALL(ERROR_CODE) \
::omnitrace::rocm_smi::check_error(ERROR_CODE, __FILE__, __LINE__)
#define OMNITRACE_ROCM_SMI_CALL(...) \
::omnitrace::rocm_smi::check_error(__FILE__, __LINE__, __VA_ARGS__)

namespace omnitrace
{
Expand All @@ -70,6 +72,13 @@ using sampler_instances = thread_data<bundle_t, category::rocm_smi>;

namespace
{
auto&
get_settings(uint32_t _dev_id)
{
static auto _v = std::unordered_map<uint32_t, rocm_smi::settings>{};
return _v[_dev_id];
}

bool&
is_initialized()
{
Expand All @@ -78,9 +87,16 @@ is_initialized()
}

void
check_error(rsmi_status_t _code, const char* _file, int _line)
check_error(const char* _file, int _line, rsmi_status_t _code, bool* _option = nullptr)
{
if(_code == RSMI_STATUS_SUCCESS) return;
if(_code == RSMI_STATUS_SUCCESS)
return;
else if(_code == RSMI_STATUS_NOT_SUPPORTED && _option)
{
*_option = false;
return;
}

const char* _msg = nullptr;
auto _err = rsmi_status_string(_code, &_msg);
if(_err != RSMI_STATUS_SUCCESS)
Expand Down Expand Up @@ -120,24 +136,29 @@ data::sample(uint32_t _dev_id)
m_dev_id = _dev_id;
m_ts = _ts;

#define OMNITRACE_RSMI_GET(FUNCTION, ...) \
try \
#define OMNITRACE_RSMI_GET(OPTION, FUNCTION, ...) \
if(OPTION) \
{ \
OMNITRACE_ROCM_SMI_CALL(FUNCTION(__VA_ARGS__)); \
} catch(std::runtime_error & _e) \
{ \
OMNITRACE_VERBOSE_F( \
0, "[%s] Exception: %s. Disabling future samples from rocm-smi...\n", \
#FUNCTION, _e.what()); \
get_state().store(State::Disabled); \
try \
{ \
OMNITRACE_ROCM_SMI_CALL(FUNCTION(__VA_ARGS__), &OPTION); \
} catch(std::runtime_error & _e) \
{ \
OMNITRACE_VERBOSE_F( \
0, "[%s] Exception: %s. Disabling future samples from rocm-smi...\n", \
#FUNCTION, _e.what()); \
get_state().store(State::Disabled); \
} \
}

OMNITRACE_RSMI_GET(rsmi_dev_busy_percent_get, _dev_id, &m_busy_perc);
OMNITRACE_RSMI_GET(rsmi_dev_temp_metric_get, _dev_id, RSMI_TEMP_TYPE_EDGE,
RSMI_TEMP_CURRENT, &m_temp);
OMNITRACE_RSMI_GET(rsmi_dev_power_ave_get, _dev_id, 0, &m_power);
OMNITRACE_RSMI_GET(rsmi_dev_memory_usage_get, _dev_id, RSMI_MEM_TYPE_VRAM,
&m_mem_usage);
OMNITRACE_RSMI_GET(get_settings(m_dev_id).busy, rsmi_dev_busy_percent_get, _dev_id,
&m_busy_perc);
OMNITRACE_RSMI_GET(get_settings(m_dev_id).temp, rsmi_dev_temp_metric_get, _dev_id,
RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_CURRENT, &m_temp);
OMNITRACE_RSMI_GET(get_settings(m_dev_id).power, rsmi_dev_power_ave_get, _dev_id, 0,
&m_power);
OMNITRACE_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get,
_dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage);

#undef OMNITRACE_RSMI_GET
}
Expand Down Expand Up @@ -249,7 +270,19 @@ data::post_process(uint32_t _dev_id)
OMNITRACE_CI_THROW(!_thread_info, "Missing thread info for thread 0");
if(!_thread_info) return;

auto _settings = get_settings(_dev_id);

auto _process_perfetto = [&]() {
auto _idx = std::array<uint64_t, 4>{};
{
_idx.fill(_idx.size());
uint64_t nidx = 0;
if(_settings.busy) _idx.at(0) = nidx++;
if(_settings.temp) _idx.at(1) = nidx++;
if(_settings.power) _idx.at(2) = nidx++;
if(_settings.mem_usage) _idx.at(3) = nidx++;
}

for(auto& itr : _rocm_smi)
{
using counter_track = perfetto_counter_track<data>;
Expand All @@ -259,10 +292,15 @@ data::post_process(uint32_t _dev_id)
auto addendum = [&](const char* _v) {
return JOIN(" ", "GPU", _v, JOIN("", '[', _dev_id, ']'), "(S)");
};
counter_track::emplace(_dev_id, addendum("Busy"), "%");
counter_track::emplace(_dev_id, addendum("Temperature"), "deg C");
counter_track::emplace(_dev_id, addendum("Power"), "watts");
counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes");

if(_settings.busy) counter_track::emplace(_dev_id, addendum("Busy"), "%");
if(_settings.temp)
counter_track::emplace(_dev_id, addendum("Temperature"), "deg C");
if(_settings.power)
counter_track::emplace(_dev_id, addendum("Power"), "watts");
if(_settings.mem_usage)
counter_track::emplace(_dev_id, addendum("Memory Usage"),
"megabytes");
}
uint64_t _ts = itr.m_ts;
if(!_thread_info->is_valid_time(_ts)) continue;
Expand All @@ -271,11 +309,19 @@ data::post_process(uint32_t _dev_id)
double _temp = itr.m_temp / 1.0e3;
double _power = itr.m_power / 1.0e6;
double _usage = itr.m_mem_usage / static_cast<double>(units::megabyte);
TRACE_COUNTER("device_busy", counter_track::at(_dev_id, 0), _ts, _busy);
TRACE_COUNTER("device_temp", counter_track::at(_dev_id, 1), _ts, _temp);
TRACE_COUNTER("device_power", counter_track::at(_dev_id, 2), _ts, _power);
TRACE_COUNTER("device_memory_usage", counter_track::at(_dev_id, 3), _ts,
_usage);

if(_settings.busy)
TRACE_COUNTER("device_busy", counter_track::at(_dev_id, _idx.at(0)), _ts,
_busy);
if(_settings.temp)
TRACE_COUNTER("device_temp", counter_track::at(_dev_id, _idx.at(1)), _ts,
_temp);
if(_settings.power)
TRACE_COUNTER("device_power", counter_track::at(_dev_id, _idx.at(2)), _ts,
_power);
if(_settings.mem_usage)
TRACE_COUNTER("device_memory_usage",
counter_track::at(_dev_id, _idx.at(3)), _ts, _usage);
}
};

Expand All @@ -288,6 +334,11 @@ data::post_process(uint32_t _dev_id)
using samp_bundle_t = tim::lightweight_tuple<sampling_gpu_busy, sampling_gpu_temp,
sampling_gpu_power, sampling_gpu_memory>;

trait::runtime_enabled<sampling_gpu_busy>::set(_settings.busy);
trait::runtime_enabled<sampling_gpu_temp>::set(_settings.temp);
trait::runtime_enabled<sampling_gpu_power>::set(_settings.power);
trait::runtime_enabled<sampling_gpu_memory>::set(_settings.mem_usage);

using entry_t = critical_trace::entry;
auto _gpu_entries = critical_trace::get_entries(
[](const entry_t& _e) { return (_e.device == critical_trace::Device::GPU); });
Expand Down Expand Up @@ -391,13 +442,39 @@ setup()

data::device_list = _devices;

auto _metrics = get_setting_value<std::string>("OMNITRACE_ROCM_SMI_METRICS");

try
{
for(auto itr : _devices)
{
uint16_t dev_id = 0;
OMNITRACE_ROCM_SMI_CALL(rsmi_dev_id_get(itr, &dev_id));
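// dev_id holds the device ID of device `itr` upon a successful call.
// NOTE(review): this is the value reported by rsmi_dev_id_get, not the device
// index `itr`; the settings below are keyed by it, whereas data::sample() and
// data::post_process() appear to key get_settings() by the index they pass to
// the rsmi_dev_*_get calls -- confirm both sides use the same key.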

if(_metrics && !_metrics->empty())
{
using key_pair_t = std::pair<std::string_view, bool&>;
const auto supported = std::unordered_map<std::string_view, bool&>{
key_pair_t{ "busy", get_settings(dev_id).busy },
key_pair_t{ "temp", get_settings(dev_id).temp },
key_pair_t{ "power", get_settings(dev_id).power },
key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage },
};

get_settings(dev_id) = { false, false, false, false };
for(const auto& metric : tim::delimit(*_metrics, ",;:\t\n "))
{
auto iitr = supported.find(metric);
if(iitr == supported.end())
OMNITRACE_FAIL_F("unsupported rocm-smi metric: %s\n",
metric.c_str());

OMNITRACE_VERBOSE_F(1, "Enabling rocm-smi metric '%s'\n",
metric.c_str());
iitr->second = true;
}
}
}

is_initialized() = true;
Expand Down
8 changes: 8 additions & 0 deletions source/lib/omnitrace/library/rocm_smi.hpp
Expand Up @@ -69,6 +69,14 @@ void set_state(State);
uint32_t
device_count();

/// Per-device switches selecting which rocm-smi metrics are collected.
///
/// Every metric is enabled by default. The type is a plain aggregate, so the
/// member order (busy, temp, power, mem_usage) is significant for positional
/// initialization such as `{ false, false, false, false }`.
struct settings
{
    bool busy{ true };       ///< collect the device busy percentage
    bool temp{ true };       ///< collect the device temperature
    bool power{ true };      ///< collect the device power reading
    bool mem_usage{ true };  ///< collect the device memory usage
};

struct data
{
using msec_t = std::chrono::milliseconds;
Expand Down
10 changes: 6 additions & 4 deletions source/python/gui/source/gui.py
Expand Up @@ -191,7 +191,7 @@ def update_line_graph(

def reset_input_filters(workloads, max_points, verbosity):
sortOptions = ["Alphabetical", "Max Speedup", "Min Speedup", "Impact"]
if type(workloads) == str:
if isinstance(workloads, str):
workloads = [workloads]

input_filters = [
Expand Down Expand Up @@ -241,9 +241,11 @@ def build_causal_layout(
]

app.layout = html.Div(
style={"backgroundColor": "rgb(255, 255, 255)"}
if light_mode
else {"backgroundColor": "rgb(50, 50, 50)"}
style=(
{"backgroundColor": "rgb(255, 255, 255)"}
if light_mode
else {"backgroundColor": "rgb(50, 50, 50)"}
)
)

line_graph1, line_graph2 = build_line_graph()
Expand Down
2 changes: 1 addition & 1 deletion source/python/gui/source/header.py
Expand Up @@ -186,7 +186,7 @@ def get_header(dropDownMenuItems, input_filters):
ul = html.Div(
id="nav-center",
className="nav-center",
children=filter_children
children=filter_children,
# [
# html.Li(className="filter", children=filter_children),
# refresh(),
Expand Down
2 changes: 1 addition & 1 deletion source/python/libpyomnitrace.cpp
Expand Up @@ -135,7 +135,7 @@ PYBIND11_MODULE(libpyomnitrace, omni)
}
if(!_cmd_line.empty())
{
_cmd_line.substr(_cmd_line.find_first_not_of(' '));
_cmd_line = _cmd_line.substr(_cmd_line.find_first_not_of(' '));
tim::set_env("OMNITRACE_COMMAND_LINE", _cmd_line, 0);
}
omnitrace_init("trace", false, _cmd.c_str());
Expand Down

0 comments on commit 15127c0

Please sign in to comment.