In [1]:
using LIKWID

In [2]:
"""
    matmul(n, k=n)

Build a random `n × k` matrix and a random `k × n` matrix, multiply them with a
hand-written triple loop, and return the resulting `n × n` matrix.

The inner step performs exactly one multiplication and one addition per
`(row, col, inner)` triple.
"""
function matmul(n, k=n)
    A = rand(n, k)
    B = rand(k, n)
    C = zeros(n, n)
    # Columns outermost, rows inside: consecutive writes to C walk down a column.
    for col in axes(C, 2)
        for row in axes(C, 1)
            acc = zero(eltype(C))
            for inner in axes(A, 2)
                acc += A[row, inner] * B[inner, col]
            end
            C[row, col] = acc
        end
    end
    return C
end

matmul (generic function with 2 methods)

In [3]:
# Measure a single call to matmul(1000, 100) with the "FLOPS_DP" performance group.
# The two values returned by @perfmon are bound to the globals `metrics` and `events`.
# NOTE(review): the trailing `;` presumably keeps the notebook from echoing the
# returned value, so only the tables printed by @perfmon itself are shown.
metrics, events = @perfmon "FLOPS_DP" matmul(1000, 100);


Group: FLOPS_DP
┌──────────────────────────────────┬───────────┐
│                            Event │  Thread 1 │
├──────────────────────────────────┼───────────┤
│                 ACTUAL_CPU_CLOCK │ 4.36872e8 │
│                    MAX_CPU_CLOCK │ 4.37114e8 │
│             RETIRED_INSTRUCTIONS │ 1.32528e9 │
│              CPU_CLOCKS_UNHALTED │ 4.26696e8 │
│ RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL │   2.002e8 │
│                            MERGE │       0.0 │
└──────────────────────────────────┴───────────┘
┌──────────────────────┬──────────┐
│               Metric │ Thread 1 │
├──────────────────────┼──────────┤
│  Runtime (RDTSC) [s] │ 0.109318 │
│ Runtime unhalted [s] │ 0.109424 │
│          Clock [MHz] │  3990.26 │
│                  CPI │ 0.321968 │
│         DP [MFLOP/s] │  1831.36 │
└──────────────────────┴──────────┘


In [4]:
"""
    count_flops(f)

Run `f` under `perfmon` with the `"FLOPS_DP"` group (table printing disabled)
and return the number of floating-point operations as an `Int`.

The count is reconstructed as the reported `"DP [MFLOP/s]"` rate (scaled to
FLOP/s) multiplied by the reported `"Runtime (RDTSC) [s]"`, then rounded.
In this file `f` is always a zero-argument closure.
"""
function count_flops(f)
    results, _ = perfmon(f, "FLOPS_DP"; print=false)
    # Entry 1 of the group's results — presumably the first monitored thread
    # ("Thread 1" in the printed tables); confirm if several threads are measured.
    entry = results["FLOPS_DP"][1]
    seconds = entry["Runtime (RDTSC) [s]"]
    rate = entry["DP [MFLOP/s]"] * 1e6  # MFLOP/s -> FLOP/s
    return round(Int, rate * seconds)
end

count_flops (generic function with 1 method)

In [5]:
# Problem size: matmul(N, K) multiplies an N×K matrix by a K×N matrix.
N, K = 1000, 100
# Count the floating-point operations of one matmul call.
count_flops() do
    matmul(N, K)
end

200200000

In [6]:
using Test
# Expected count: the triple loop does one multiply and one add for each of the
# N * N * K index triples; the two rand calls inside matmul draw 2 * N * K numbers
# (counted as one FLOP each, consistent with the measurement shown in this notebook).
nflops_matmul = 2 * K * N^2
nflops_rand = 2 * K * N
# Both operands are Ints, for which ≈ has zero default tolerance, so this is an
# exact comparison.
@test count_flops(() -> matmul(N, K)) ≈ nflops_matmul + nflops_rand

[32m[1mTest Passed[22m[39m