In [1]:
using TranscodingStreams, CodecZlib, JSON, Statistics

In [2]:
prefix = "/pscratch/sd/b/blaschke/GB25/runs/"

"/pscratch/sd/b/blaschke/GB25/runs/"

In [3]:
run_dir = "2025-04-07T00-53-00.048_ytYO"

"2025-04-07T00-53-00.048_ytYO"

In [6]:
function test_dir(ng::Int)::String
    nstr = lpad(ng, 5, "0")
    "ngpu=$(nstr)"
end

get_jobid(pth::String)::String = readdir(pth) |> 
    filter(endswith(".err"))                  |> 
    only                                      |>
    Base.Fix2(split, ".")                     |>
    first

function profile_for_job(root::String, jid::String, rank::Int)::String
    pdir  = joinpath(root, "profiling", "$(jid).$(rank)", "loop", "plugins", "profile")
    ddir  = readdir(pdir) |> only
    tpath = joinpath(pdir, ddir) |> readdir |>
        Base.Fix1(filter, endswith(".trace.json.gz")) |>
        only
    joinpath(pdir, ddir, tpath)
end

function get_nccl_stats(pth::String)
    stream = GzipDecompressorStream(open(pth))
    tprof  = readchomp(stream) |> JSON.parse
    close(stream)

    nccl_events = tprof["traceEvents"] |> filter(x->"name" in keys(x) && contains(x["name"], "nccl"))

    nccl_stats = Dict()
    for event in nccl_events
        if !("name" in keys(event)) || !("dur" in keys(event))
            continue
        end
        push!(get!(nccl_stats, event["name"], []), event["dur"])
    end

    nccl_summary = Dict()
    for event_type in keys(nccl_stats)
        events = nccl_stats[event_type]
        nccl_summary[event_type] = (
            mean = mean(events),
            std  = std(events),
            sum  = sum(events),
            freq = length(events)
        )
    end

    nccl_summary, nccl_stats
end

get_nccl_stats (generic function with 1 method)

In [7]:
jid   = get_jobid(joinpath(prefix, run_dir, test_dir(4)))
tpath = profile_for_job(joinpath(prefix, run_dir), jid, 0)

"/pscratch/sd/b/blaschke/GB25/runs/2025-04-07T00-53-00.048_ytYO/profiling/37518693.0/loop/plugins/profile/2025_04_07_01_01_36/nid001448.trace.json.gz"

In [13]:
st, _ = get_nccl_stats(tpath)

(Dict{Any, Any}("ncclDevKernel_AllReduce_Sum_f64_RING_LL(ncclDevKernelArgsStorage<4096ul>)" => (mean = 7.663837837837841, std = 19.350766902041155, sum = 163331.71200000006, freq = 21312), "ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<4096ul>)" => (mean = 6.934242884789518, std = 10.178049737066774, sum = 3.124458896e6, freq = 450584), "ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<4096ul>)" => (mean = 16.872203703703697, std = 34.81005526843525, sum = 10933.187999999996, freq = 648)), Dict{Any, Any}("ncclDevKernel_AllReduce_Sum_f64_RING_LL(ncclDevKernelArgsStorage<4096ul>)" => Any[4.168, 7.084, 14.208, 10.289, 11.04, 9.344, 8.032, 8.608, 3.377, 3.747  …  85.695, 64.576, 3.537, 8.608, 8.064, 3.276, 7.328, 7.104, 8.096, 6.784], "ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<4096ul>)" => Any[8.216, 9.719, 9.017, 8.507, 20.607, 13.984, 11.967, 13.312, 4.558, 5.511  …  5.08, 5.07, 10.624, 3.397, 8.768, 10.08, 3.547, 8.96, 4.298, 11.68], "ncclDevKernel_AllGather_RING_LL(nccl

In [14]:
k_sendrecv = keys(st) |> filter(startswith("ncclDevKernel_SendRecv")) |> only

"ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<4096ul>)"

In [15]:
st[k_sendrecv]

(mean = 6.934242884789518, std = 10.178049737066774, sum = 3.124458896e6, freq = 450584)