Skip to content

Commit

Permalink
[PROTON][AMD] Add Proton HIP GPU Utilization Metrics (triton-lang#4119)
Browse files Browse the repository at this point in the history
This PR adds Proton HIP GPU utilization metrics and an associated test.
  • Loading branch information
CRobeck authored Jun 12, 2024
1 parent 6c3005c commit c1776fa
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 14 deletions.
4 changes: 2 additions & 2 deletions third_party/proton/csrc/include/Driver/Device.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ struct Device {
uint64_t memoryClockRate; // khz
uint64_t busWidth;
uint64_t numSms;
uint64_t arch;
std::string arch;

Device() = default;

Device(DeviceType type, uint64_t id, uint64_t clockRate,
uint64_t memoryClockRate, uint64_t busWidth, uint64_t numSms,
uint64_t arch)
std::string arch)
: type(type), id(id), clockRate(clockRate),
memoryClockRate(memoryClockRate), busWidth(busWidth), numSms(numSms),
arch(arch) {}
Expand Down
5 changes: 5 additions & 0 deletions third_party/proton/csrc/include/Driver/GPU/HipApi.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@ hipError_t deviceGetAttribute(int *value, hipDeviceAttribute_t attribute,

template <bool CheckSuccess> hipError_t getDeviceCount(int *count);

template <bool CheckSuccess>
hipError_t getDeviceProperties(hipDeviceProp_t *prop, int deviceId);

Device getDevice(uint64_t index);

const std::string getHipArchName(uint64_t index);

const char *getKernelNameRef(const hipFunction_t f);
const char *getKernelNameRefByPtr(const void *hostFunction, hipStream_t stream);

Expand Down
3 changes: 2 additions & 1 deletion third_party/proton/csrc/lib/Driver/GPU/CudaApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ Device getDevice(uint64_t index) {
int minor;
cuda::deviceGetAttribute<true>(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
auto arch = major * 10 + minor;
std::string arch = std::to_string(major * 10 + minor);

return Device(DeviceType::CUDA, index, clockRate, memoryClockRate, busWidth,
numSms, arch);
}
Expand Down
26 changes: 23 additions & 3 deletions third_party/proton/csrc/lib/Driver/GPU/HipApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ DEFINE_DISPATCH(ExternLibHip, deviceGetAttribute, hipDeviceGetAttribute, int *,

DEFINE_DISPATCH(ExternLibHip, getDeviceCount, hipGetDeviceCount, int *);

DEFINE_DISPATCH(ExternLibHip, getDeviceProperties, hipGetDeviceProperties,
hipDeviceProp_t *, int);

Device getDevice(uint64_t index) {
int clockRate;
(void)hip::deviceGetAttribute<true>(&clockRate, hipDeviceAttributeClockRate,
Expand All @@ -37,13 +40,30 @@ Device getDevice(uint64_t index) {
(void)hip::deviceGetAttribute<true>(
&smCount, hipDeviceAttributeMultiprocessorCount, index);

// TODO: Compute capability is a NVIDIA concept. It doesn't map naturally to
// AMD GPUs. Figure out a better way to support this.
uint64_t arch = 0;
std::string arch = getHipArchName(index);

return Device(DeviceType::HIP, index, clockRate, memoryClockRate, busWidth,
smCount, arch);
}

// TODO: hipDeviceProp_t was changed to point to hipDeviceProp_tR0600 instead of
// hipDeviceProp_tR0000 as part of a breaking API change in ROCm 6.0.
// https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/driver.c
// uses hipDeviceProp_tR0000 and includes the hip_deprecated.h header file to
// remain backward compatible with ROCm 5.x. PyTorch still needs to support 5.x,
// and the hipDeviceProp_tR0600 symbol does not exist before ROCm 6.0. Using
// hipDeviceProp_tR0000 here with ROCm 6.1 causes stack corruption. Therefore
// we will use hipDeviceProp_t and investigate whether we can unify the
// definitions in the two files.

// Returns the architecture (processor) name, e.g. "gfx90a", of the HIP device
// at `index`.
//
// hipDeviceProp_t::gcnArchName holds a target ID whose processor name may be
// followed by ':'-separated feature settings (e.g. "gfx90a:sramecc+:xnack-").
// Only the leading processor name is returned. The name is cut at the first
// ':' rather than at a fixed six characters so that longer processor names
// such as "gfx1100" are not truncated.
const std::string getHipArchName(uint64_t index) {
  hipDeviceProp_t devProp;
  (void)hip::getDeviceProperties<true>(&devProp, index);
  const std::string gcnArchName(devProp.gcnArchName);
  // find() yields npos when there is no ':' and substr(0, npos) then returns
  // the whole name unchanged.
  return gcnArchName.substr(0, gcnArchName.find(':'));
}

const char *getKernelNameRef(const hipFunction_t f) {
typedef const char *(*hipKernelNameRef_t)(const hipFunction_t);
static hipKernelNameRef_t func = nullptr;
Expand Down
11 changes: 8 additions & 3 deletions third_party/proton/proton/viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,19 @@ def get_min_time_flops(df, device_info):
continue
max_flops = 0
if device_type == "CUDA":
if arch == 80:
if arch == "80":
max_flops = 624e12 / (width / 8)
elif arch == 89:
elif arch == "89":
# TODO(Keren): Implement fp16 acc-> 660.6 fp8
max_flops = (330.3 * 1e12) / (width / 8)
elif arch == 90:
elif arch == "90":
# 114 sms and 1755mhz is the base number of sms and clock rate of H100 pcie
max_flops = ((num_sms / 114 * clock_rate / (1755 * 1e3) * 1513) * 1e12) / (width / 8)
elif device_type == "HIP":
if arch == "gfx90a":
max_flops = 383e12 / (width / 8)
elif arch == "gfx941" or arch == "gfx942":
max_flops = 2614.9e12 / (width / 8)
else:
raise ValueError(f"Unsupported device type: {device_type}")
min_time_flops.loc[idx, "min_time"] += device_frames[f"flops{width}"].fillna(0) / max_flops
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@
{
"CUDA": {
"0": {
"arch": 89,
"arch": "89",
"bus_width": 384,
"clock_rate": 2625000,
"memory_clock_rate": 10501000,
"num_sms": 128
},
"1": {
"arch": 90,
"arch": "90",
"bus_width": 6144,
"clock_rate": 1980000,
"memory_clock_rate": 2619000,
Expand Down
64 changes: 64 additions & 0 deletions third_party/proton/test/example_hip.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
[
{
"children": [
{
"children": [],
"frame": {
"name": "foo0",
"type": "function"
},
"metrics": {
"Count": 1,
"DeviceId": "1",
"DeviceType": "HIP",
"Time (ns)": 204800,
"flops8": 1e11,
"bytes": 1e8
}
},
{
"children": [],
"frame": {
"name": "foo1",
"type": "function"
},
"metrics": {
"Count": 1,
"DeviceId": "0",
"DeviceType": "HIP",
"Time (ns)": 204800,
"flops8": 1e10,
"bytes": 1e7
}
}
],
"frame": {
"name": "ROOT",
"type": "function"
},
"metrics": {
"Count": 0,
"Time (ns)": 0,
"flops8": 0,
"bytes": 0
}
},
{
"HIP": {
"0": {
"arch": "gfx90a",
"bus_width": 4096,
"clock_rate": 1700000,
"memory_clock_rate": 1600000,
"num_sms": 104
},
"1": {
"arch": "gfx941",
"bus_width": 8192,
"clock_rate": 5200000,
"memory_clock_rate": 2525000,
"num_sms": 304
}
}
}
]
25 changes: 22 additions & 3 deletions third_party/proton/test/test_viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import numpy as np

file_path = __file__
example_file = file_path.replace("test_viewer.py", "example.json")
cuda_example_file = file_path.replace("test_viewer.py", "example_cuda.json")
hip_example_file = file_path.replace("test_viewer.py", "example_hip.json")


def test_help():
Expand All @@ -13,7 +14,7 @@ def test_help():


def test_min_time_flops():
with open(example_file, "r") as f:
with open(cuda_example_file, "r") as f:
gf, _, device_info = get_raw_metrics(f)
ret = get_min_time_flops(gf.dataframe, device_info)
device0_idx = gf.dataframe["DeviceId"] == "0"
Expand All @@ -22,10 +23,19 @@ def test_min_time_flops():
np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000025]], atol=1e-5)
# sm90
np.testing.assert_allclose(ret[device1_idx].to_numpy(), [[0.00005]], atol=1e-5)
with open(hip_example_file, "r") as f:
gf, _, device_info = get_raw_metrics(f)
ret = get_min_time_flops(gf.dataframe, device_info)
device0_idx = gf.dataframe["DeviceId"] == "0"
device1_idx = gf.dataframe["DeviceId"] == "1"
# MI200
np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000026]], atol=1e-5)
# MI300
np.testing.assert_allclose(ret[device1_idx].to_numpy(), [[0.000038]], atol=1e-5)


def test_min_time_bytes():
with open(example_file, "r") as f:
with open(cuda_example_file, "r") as f:
gf, _, device_info = get_raw_metrics(f)
ret = get_min_time_bytes(gf.dataframe, device_info)
device0_idx = gf.dataframe["DeviceId"] == "0"
Expand All @@ -34,3 +44,12 @@ def test_min_time_bytes():
np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[9.91969e-06]], atol=1e-6)
# sm90
np.testing.assert_allclose(ret[device1_idx].to_numpy(), [[2.48584e-05]], atol=1e-6)
with open(hip_example_file, "r") as f:
gf, _, device_info = get_raw_metrics(f)
ret = get_min_time_bytes(gf.dataframe, device_info)
device0_idx = gf.dataframe["DeviceId"] == "0"
device1_idx = gf.dataframe["DeviceId"] == "1"
# MI200
np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[6.10351e-06]], atol=1e-6)
# MI300
np.testing.assert_allclose(ret[device1_idx].to_numpy(), [[1.93378e-05]], atol=1e-6)

0 comments on commit c1776fa

Please sign in to comment.