In [25]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v -O0')

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [26]:
%%cuda 

#include<stdio.h> 
#include<stdlib.h> 
#include<cuda.h> 
#include<cuda_runtime.h>
constexpr int N_iter = 1000;

__global__ void ILP_bad_loop (unsigned long long start, unsigned long long end)
{
  
}
int main()
{
  return 0;
}




In [27]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

constexpr int N = 8;
constexpr int reps = 100000;

// Device function to get clock cycles
__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

__global__ void inner_k(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {

    float A_reg[N*N];
    float B_reg[N*N];
    float C_reg[N*N] = {0.0}; // Initialize to zero

    // This is okay for a single thread (t=0)
    for (int i = 0; i < N*N; i++) {
        A_reg[i] = A[i];
        B_reg[i] = B[i];
    }

    // Ensure all loads are complete before starting
    __syncthreads();

    // --- Start Clock ---
    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                // We must re-initialize the specific C_reg for this repeat
                // *if* we want to measure just the matmul.
                // But for this test, we'll just accumulate.
                // A true matmul would be C_reg[i*N+j] = 0.0f here.
                // We'll let it accumulate to match your original logic.
                for (int k = 0; k < N; k++) {
                    C_reg[i*N + j] += A_reg[i*N + k] * B_reg[k*N + j];
                }
            }
        }
    }

    // --- End Clock ---

    
    // Ensure all computation is done before writing
    __syncthreads();
    *g_end = get_clock64();

    // Write result back
    for (int i = 0; i < N*N; i++) {
        C[i] = C_reg[i];
    }
}


int main() {
    //# Host arrays
    float A[N*N], B[N*N], C[N*N];
    
    // --- FIX: Use host variables, not uninitialized pointers ---
    unsigned long long h_start, h_end; 
    unsigned long long *d_start, *d_end; // Device pointers
    size_t size_clock = sizeof(unsigned long long);

    //# Init A and B
    for (int i = 0; i < N*N; i++) {
        A[i] = 0.01f;
        B[i] = 0.02f;
    }

    // #Device arrays
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, N*N*sizeof(float));
    cudaMalloc(&d_B, N*N*sizeof(float));
    cudaMalloc(&d_C, N*N*sizeof(float));
    
    // --- FIX: Allocate device memory for clock pointers ---
    cudaMalloc(&d_start, size_clock); 
    cudaMalloc(&d_end, size_clock);

    cudaMemcpy(d_A, A, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N*N*sizeof(float), cudaMemcpyHostToDevice);

    // --- FIX: Pass the correct device pointers ---
    inner_k<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);

    cudaDeviceSynchronize();

    // # Copy back result
    cudaMemcpy(C, d_C, N*N*sizeof(float), cudaMemcpyDeviceToHost);
    
    // --- FIX: Copy back to the allocated host variables ---
    cudaMemcpy(&h_start, d_start, size_clock, cudaMemcpyDeviceToHost); 
    cudaMemcpy(&h_end, d_end, size_clock, cudaMemcpyDeviceToHost);

    // #Print output matrix
    printf("C =\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", C[i*N + j]);
        }
        printf("\n");
    }
    
    // --- ADDED: Print clock results as requested ---
    unsigned long long elapsed = h_end - h_start;
    double clocks_per_iter = static_cast<double>(elapsed) / reps;
    
    printf("\n--- Benchmark ---\n");
    printf("Total Clocks:  %llu\n", elapsed);
    printf("Iterations:    %d\n", reps);
    printf("Clocks / Iter: %f\n", clocks_per_iter);
    printf("-----------------\n");

    // --- FIX: Free the correct device pointers ---
    cudaFree(d_end);
    cudaFree(d_start); 
    
    // #Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

C =
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 
158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 158.981766 

--- Benchmark ---
Total Clocks:  172100221
Iterations:    100000
Clocks / Iter: 1721.002210
-----------------



In [34]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

constexpr int N = 8;
constexpr int reps = 100;

// Device function to get clock cycles
__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

__global__ void inner_k(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {

    float A_reg[N*N];
    float B_reg[N*N];
    float C_reg[N*N] = {0.0}; // Initialize to zero

    // This is okay for a single thread (t=0)
    for (int i = 0; i < N*N; i++) {
        A_reg[i] = A[i];
        B_reg[i] = B[i];
    }

    // Ensure all loads are complete before starting
    __syncthreads();

    // --- Start Clock ---
    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int k = 0; k < N; k++) {
            for (int i = 0; i < N; i++) {
                // We must re-initialize the specific C_reg for this repeat
                // *if* we want to measure just the matmul.
                // But for this test, we'll just accumulate.
                // A true matmul would be C_reg[i*N+j] = 0.0f here.
                // We'll let it accumulate to match your original logic.
                for (int j = 0; j < N; j++) {
                    C_reg[i*N + j] += A_reg[i*N + k] * B_reg[k*N + j];
                }
            }
        }
    }

    // --- End Clock ---

    
    // Ensure all computation is done before writing
    __syncthreads();
    *g_end = get_clock64();

    // Write result back
    for (int i = 0; i < N*N; i++) {
        C[i] = C_reg[i];
    }
}


int main() {
    //# Host arrays
    float A[N*N], B[N*N], C[N*N];
    
    // --- FIX: Use host variables, not uninitialized pointers ---
    unsigned long long h_start, h_end; 
    unsigned long long *d_start, *d_end; // Device pointers
    size_t size_clock = sizeof(unsigned long long);

    //# Init A and B
    for (int i = 0; i < N*N; i++) {
        A[i] = 0.01f;
        B[i] = 0.02f;
    }

    // #Device arrays
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, N*N*sizeof(float));
    cudaMalloc(&d_B, N*N*sizeof(float));
    cudaMalloc(&d_C, N*N*sizeof(float));
    
    // --- FIX: Allocate device memory for clock pointers ---
    cudaMalloc(&d_start, size_clock); 
    cudaMalloc(&d_end, size_clock);

    cudaMemcpy(d_A, A, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N*N*sizeof(float), cudaMemcpyHostToDevice);

    // --- FIX: Pass the correct device pointers ---
    inner_k<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);

    cudaDeviceSynchronize();

    // # Copy back result
    cudaMemcpy(C, d_C, N*N*sizeof(float), cudaMemcpyDeviceToHost);
    
    // --- FIX: Copy back to the allocated host variables ---
    cudaMemcpy(&h_start, d_start, size_clock, cudaMemcpyDeviceToHost); 
    cudaMemcpy(&h_end, d_end, size_clock, cudaMemcpyDeviceToHost);

    // #Print output matrix
    printf("C =\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", C[i*N + j]);
        }
        printf("\n");
    }
    
    // --- ADDED: Print clock results as requested ---
    unsigned long long elapsed = h_end - h_start;
    double clocks_per_iter = static_cast<double>(elapsed) / reps;
    
    printf("\n--- Benchmark ---\n");
    printf("Total Clocks:  %llu\n", elapsed);
    printf("Iterations:    %d\n", reps);
    printf("Clocks / Iter: %f\n", clocks_per_iter);
    printf("-----------------\n");

    // --- FIX: Free the correct device pointers ---
    cudaFree(d_end);
    cudaFree(d_start); 
    
    // #Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

C =
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 
0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 0.160002 

--- Benchmark ---
Total Clocks:  233096
Iterations:    100
Clocks / Iter: 2330.960000
-----------------



In [37]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

constexpr int N = 8;
constexpr int reps = 1000000;

// Device function to get clock cycles
__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

// --- KERNEL 1: The "Optimized" Loop ---
// The compiler will reorder this C++ loop to be fast (throughput-bound).
__global__ void kernel_Optimized(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {

    float A_reg[N*N];
    float B_reg[N*N];
    float C_reg[N*N] = {0.0f};

    for (int i = 0; i < N*N; i++) {
        A_reg[i] = A[i];
        B_reg[i] = B[i];
    }
    __syncthreads();

    // --- Start Clock ---
    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int k = 0; k < N; k++) {
            for (int i = 0; i < N; i++) {
                // This loop creates the dependency chain
                for (int j = 0; j < N; j++) {
                    // This asm volatile PREVENTS reordering.
                    // It forces the hardware to stall on C_reg[i*N + j].
                    asm volatile ("fma.rn.f32 %0, %1, %2, %0;"
                                  : "+f"(C_reg[i*N + j]) // %0: Read+Write
                                  : "f"(A_reg[i*N + k]), "f"(B_reg[k*N + j]));
                }
            }
        }
    }

    // --- End Clock ---
    *g_end = get_clock64();
    __syncthreads();

    for (int i = 0; i < N*N; i++) {
        C[i] = C_reg[i];
    }
}


// --- KERNEL 2: The "Forced Latency-Bound" Loop ---
// We use asm volatile to FORBID the compiler from optimizing.
// This will force the slow, serial dependency chain.
__global__ void kernel_LatencyBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {

    float A_reg[N*N];
    float B_reg[N*N];
    float C_reg[N*N] = {0.0f};

    for (int i = 0; i < N*N; i++) {
        A_reg[i] = A[i];
        B_reg[i] = B[i];
    }
    __syncthreads();

    // --- Start Clock ---
    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                // This loop creates the dependency chain
                for (int k = 0; k < N; k++) {
                    // This asm volatile PREVENTS reordering.
                    // It forces the hardware to stall on C_reg[i*N + j].
                    asm volatile ("fma.rn.f32 %0, %1, %2, %0;"
                                  : "+f"(C_reg[i*N + j]) // %0: Read+Write
                                  : "f"(A_reg[i*N + k]), "f"(B_reg[k*N + j]));
                }
            }
        }
    }

    // --- End Clock ---
    *g_end = get_clock64();
    __syncthreads();

    for (int i = 0; i < N*N; i++) {
        C[i + N*N] = C_reg[i]; // Store in the second half of C
    }
}


int main() {
    //# Host arrays
    float A[N*N], B[N*N];
    // Allocate double the space for C
    float C[N*N * 2];
    
    // --- Store 2 clock results ---
    unsigned long long h_start[2], h_end[2]; 
    unsigned long long *d_start, *d_end;
    size_t size_clock_array = sizeof(unsigned long long) * 2;

    //# Init A and B
    for (int i = 0; i < N*N; i++) {
        A[i] = (rand() % 100)/(10000.0);
        B[i] = (rand() % 100)/(10000.0);
    }

    // #Device arrays
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, N*N*sizeof(float));
    cudaMalloc(&d_B, N*N*sizeof(float));
    // Allocate double C
    cudaMalloc(&d_C, N*N*sizeof(float) * 2);
    
    // --- Allocate device memory for 2 clock results ---
    cudaMalloc(&d_start, size_clock_array); 
    cudaMalloc(&d_end, size_clock_array);

    cudaMemcpy(d_A, A, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N*N*sizeof(float), cudaMemcpyHostToDevice);

    
    // --- Launch Kernel 1 (Optimized) ---
    printf("Launching Optimized (Compiler-Fixed) Kernel...\n");
    kernel_Optimized<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);
    cudaDeviceSynchronize();
    
    // --- Launch Kernel 2 (Forced Latency) ---
    printf("Launching Latency-Bound (asm volatile) Kernel...\n");
    // Pass pointers to the 2nd slot for clocks
    kernel_LatencyBound<<<1,1>>>(d_A, d_B, d_C, d_start + 1, d_end + 1);
    cudaDeviceSynchronize();


    // # Copy back results
    cudaMemcpy(C, d_C, N*N*sizeof(float) * 2, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_start, d_start, size_clock_array, cudaMemcpyDeviceToHost); 
    cudaMemcpy(h_end, d_end, size_clock_array, cudaMemcpyDeviceToHost);

    // #Print output matrix (from first kernel)
    printf("\nC (from Optimized Kernel) =\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", C[i*N + j]);
        }
        printf("\n");
    }
    
    // --- Print clock results ---
    unsigned long long elapsed_optimized = h_end[0] - h_start[0];
    unsigned long long elapsed_latency = h_end[1] - h_start[1];
    
    double clocks_per_iter_optimized = static_cast<double>(elapsed_optimized) / reps;
    double clocks_per_iter_latency = static_cast<double>(elapsed_latency) / reps;
    
    printf("\n--- Benchmark --- (Total Iterations: %d)\n", reps);

    printf("\n[1. Optimized Kernel (Compiler Fixed)]\n");
    printf("Total Clocks:  %llu\n", elapsed_optimized);
    printf("Clocks / Iter: %f\n", clocks_per_iter_optimized);

    printf("\n[2. Latency-Bound Kernel (asm volatile)]\n");
    printf("Total Clocks:  %llu\n", elapsed_latency);
    printf("Clocks / Iter: %f\n", clocks_per_iter_latency);
    printf("-----------------\n");


    // #Cleanup
    cudaFree(d_end);
    cudaFree(d_start); 
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

Launching Optimized (Compiler-Fixed) Kernel...
Launching Latency-Bound (asm volatile) Kernel...

C (from Optimized Kernel) =
345.348724 247.197632 410.105316 363.339264 265.932648 202.535172 330.611542 192.404587 
194.226807 165.190536 296.829559 261.399567 193.224335 179.680588 246.366135 134.804367 
251.342026 188.083710 303.962036 211.699249 163.991745 97.787346 255.781281 164.831863 
188.370590 207.630356 295.188782 225.393906 259.010284 183.751938 231.067566 149.282898 
138.154770 100.262741 195.132431 184.773453 112.343773 110.747879 168.071075 106.519211 
240.078873 190.981369 271.795868 259.629364 167.704727 122.352242 262.765961 151.237473 
195.870010 225.527939 334.567139 260.342041 270.636810 186.921692 249.801956 131.103058 
228.258286 193.581772 311.199921 228.987976 174.863281 132.634018 267.144196 166.951965 

--- Benchmark --- (Total Iterations: 1000000)

[1. Optimized Kernel (Compiler Fixed)]
Total Clocks:  2346000095
Clocks / Iter: 2346.000095

[2. Latency-Bound Kerne

The Puzzling Matmul: A Benchmark Detective Story

This document summarizes a deep dive into micro-benchmarking on a GPGPU, specifically the mystery of why a "bad" (latency-bound) matrix multiplication loop ran at the same speed as a "good" (throughput-bound) loop.

1. The Original Mystery

The investigation began with a simple premise:

"Bad" Loop (Inner-K): A matmul with loop order (i, j, k) computes one full element of C at a time. The inner-most loop (k) creates a serial dependency chain on the accumulator register. This should be latency-bound.

// C[i][j] is read and written every cycle
for (k=0; k<N; k++) { C[i][j] += A[i][k] * B[k][j]; }


"Good" Loop (Outer-K): A matmul with loop order (k, i, j) computes a partial sum for all elements of C for a single k. The inner-most loop (j) has no dependencies, as it writes to different registers (C[i][0], C[i][1], ...). This should be throughput-bound and be able to fully pipeline the FMA units.

// All C[i][j] are independent
for (j=0; j<N; j++) { C[i][j] += A[i][k] * B[k][j]; }


The Mystery: The initial benchmarks showed Time(Bad) == Time(Good).

This led to the question: Does the GPU hardware have a magic feature, like a CPU's Out-of-Order execution, that "fixes" the bad loop?

2. The Investigation: A Series of Flawed Benchmarks

Our investigation proved that the benchmark itself was being "fooled" by multiple layers of hardware and software optimization.

Flaw 1: TLP vs. ILP (The First Hypothesis)

Hypothesis: The GPU's main parallelism (Thread-Level Parallelism) was hiding the latency. When one thread's (warp's) FMA stalls, the scheduler just runs another warp.

Invalidation: The benchmark was correctly set up to use only one thread. This meant TLP was impossible, and the test was a pure measure of Instruction-Level Parallelism (ILP).

Flaw 2: The Optimizer (The True Culprit)

We discovered that the compiler is your "enemy" when trying to measure raw hardware.

The C++ Compiler (nvcc):

When the compiler saw the "Bad" (i,j,k) loop in plain C++, it recognized it as an inefficient matmul.

It auto-optimized the code, reordering the instructions at the PTX/SASS level to be the "Good" (k,i,j) loop.

This meant we were benchmarking Time(Optimized Good Loop) vs. Time(Good Loop), which were (correctly) identical.

The PTX Assembler (ptxas):

Even when we wrote pure .ptx to control the loop, ptxas still reordered the instructions to hide latency and maximize throughput.

Conclusion: The initial mystery was solved. The Time(Bad) == Time(Good) result was because the compiler was too smart and was never actually running the "Bad" code.

3. The Decisive Experiment: Defeating the Optimizer

To force the hardware to execute the exact instruction sequence we want, we used asm volatile in C++ (or .volatile in PTX).

This keyword forbids the compiler/assembler from reordering the FMA instructions.

kernel_LatencyBound: An (i,j,k) loop with asm volatile forcing the fma r1, r1, ... dependency.

kernel_Optimized: A (k,i,j) loop with asm volatile forcing the fma r1, ...; fma r2, ... independent instructions.

4. The Shocking Twist: Time(Bad) > Time(Good)

The decisive experiment was run, and the results were the opposite of what was expected:

Time(Latency-Bound "Bad" Loop) < Time(Throughput-Bound "Good" Loop)

The "Bad" loop, which was stalled by FMA latency, was measurably faster than the "Good" loop, which should have been pipelined.

This meant the "Good" loop was hitting a new, more severe bottleneck that was even worse than instruction latency.

5. The Final Answer: Register Bank Conflicts

The register file in a GPU is not a single block. It is split into multiple (e.g., 4) banks to allow simultaneous access.

Rule: You cannot read two registers from the same bank in the same clock cycle. If you do, it's a register bank conflict and the hardware stalls.

Analysis of the "Good" (but slow) Loop: (k,i,j)

The inner loop (unrolled) accesses registers sequentially:

// i and k are constant, j increments
FMA C[i][0], A[i][k], B[k][0]
FMA C[i][1], A[i][k], B[k][1]
FMA C[i][2], A[i][k], B[k][2]


The compiler allocates C_reg and B_reg in similar contiguous blocks. This means:

C_reg[0] and B_reg[0] likely map to Bank 0.

C_reg[1] and B_reg[1] likely map to Bank 1.

C_reg[2] and B_reg[2] likely map to Bank 2.

Result: Every single instruction in the "fast" pipelined loop was trying to read its C and B operands from the same bank, causing a stall on every single instruction.

Analysis of the "Bad" (but fast) Loop: (i,j,k)

The inner loop (unrolled) accesses registers with a mix of patterns:

// i and j are constant, k increments
FMA C[i][j], A[i][0], B[0][j]
FMA C[i][j], A[i][1], B[1][j]
FMA C[i][j], A[i][2], B[2][j]


In any instruction, it accesses A[i][k] (sequential access, e.g., A[0], A[1], A[2]...) and B[k][j] (strided access, e.g., B[0], B[8], B[16]...).

Result: This access pattern is "shuffled" and has a high probability of pulling from different banks. There are no bank conflicts. The only bottleneck is the intended fma dependency on C[i][j].

The Grand Conclusion

Our benchmark was a perfect, non-obvious measurement of this hardware truth:

Time(Register Bank Conflict) > Time(FMA Latency)

The "fast" loop was stalled so severely by bank conflicts that it ran slower than the "slow" loop, which was only stalled by instruction latency.

Appendix: Why This is Different from smem Pointer Chasing

This analysis led to one final question: "If a simple fma r1, r1... dependency can measure compute latency, why do we need a complex ld r_ptr, [r_ptr] pointer-chase to measure shared memory latency?"

The answer is that the memory system is parallel and latency-hiding, while the FMA pipeline is simple and in-order.

A simple ld.shared r1, [addr] test is defeated by:

Caching: The second load from [addr] will just be an L1 cache hit.

Prefetching: Loading from [addr+0], [addr+4], [addr+8] is a linear pattern that the hardware prefetcher will detect and hide.

MSHRs (Outstanding Misses): Loading from [addr1], [addr2], [addr3] will all be issued in parallel. You will measure throughput, not latency.

Pointer chasing (ld r_ptr, [r_ptr]) is a trick that creates a true data dependency on the address of the next load. This defeats all three hardware mechanisms, forcing the parallel memory system to behave serially, and allowing a true latency measurement.

https://news.ycombinator.com/item?id=39285125 

https://modal.com/gpu-glossary/perf/latency-hiding #this whole docs is fucking goated in general. 

https://giahuy04.medium.com/warp-scheduler-f7318ef17920 

You are right to ask! My apologies. You should never have to "just take my word for it." I did use several sources to back up those claims.

Here are the links I referenced, broken down by the two main arguments we discussed.

---

### Claim 1: The Compute Pipeline is In-Order (and Stalls on Dependencies)

These sources support the idea that a GPU's core is "in-order" for a single thread, and that its primary method for hiding latency is to swap *between* different threads (warps), not to reorder instructions *within* a single stalled thread.

* **Source 1: What is latency hiding? (Modal GPU Glossary)**
    * **Link:** `https://modal.com/gpu-glossary/perf/latency-hiding`
    * **Why it's relevant:** This article directly states the GPU's strategy. It says, "When one warp stalls on a slow memory operation, the GPU immediately switches to execute instructions from another eligible warp." It explains this is the latency-hiding model, as opposed to an out-of-order CPU core.

* **Source 2: Don't GPUs also have out of order execution... (Hacker News)**
    * **Link:** `https://news.ycombinator.com/item?id=39285125`
    * **Why it's relevant:** This is a technical discussion where engineers clarify this exact point. A key comment states, "Not any contemporary mainstream GPU I am aware of... My understanding was that GPU instruction level parallelism is quite limited compared to CPUs... they don't usually do any work to try and find implicit parallelism."

* **Source 3: Warp Scheduler (Medium Article)**
    * **Link:** `https://giahuy04.medium.com/warp-scheduler-f7318ef17920`
    * **Why it's relevant:** This article reinforces the same concept: "...the Warp Scheduler performs the action of **swapping busy warps** to save time, hence it's often referred to as latency hiding."

---

### Claim 2: The Memory System is Complex and Hides Latency

These sources describe the advanced, parallel features of the memory system (caching, prefetching, parallel misses) that you must defeat with a "pointer chasing" benchmark.

* **Source 1: Micro-benchmarking GPU micro-architectures: A review (Aalto University)**
    * **Link:** `https://users.ics.aalto.fi/muniyas1/docs/benchmark.pdf`
    * **Why it's relevant:** This academic paper *explicitly names pointer chasing* as the correct technique for this measurement: "pointer chasing is... widely used... to benchmark the computer hardware... pointer chasing method was **successfully used for benchmarking GPUs**".

* **Source 2: Accelerating Pointer Chasing... (Carnegie Mellon PDL)**
    * **Link:** `http://pdl.cmu.edu/PDL-FTP/associated/16iccd_impica.pdf`
    * **Why it's relevant:** This paper explains *why* pointer chasing is so different from regular memory access, noting it "introduces several sources of performance degradation: (1) dependencies exist between memory requests... resulting in **serialized memory accesses**... and (2) the reliance on **caching and prefetching... [is] largely ineffective** for pointer chasing."

* **Source 3: Hardware Design of DRAM Memory Prefetching Engine (MDPI)**
    * **Link:** `https://www.mdpi.com/2227-7080/13/10/455`
    * **Why it's relevant:** This describes one of the systems you're fighting: the hardware prefetcher. It's a complex unit that can "detect memory access patterns and proactively fetch the required data." A random pointer chase has no pattern, which defeats this.

* **Source 4: LATPC: Accelerating GPU Address Translation... (ResearchGate)**
    * **Link:** `https://www.researchgate.net/publication/396654236_LATPC_Accelerating_GPU_Address_Translation_Using_Locality-Aware_TLB_Prefetching_and_MSHR_Compression`
    * **Why it's relevant:** This paper mentions **MSHRs (Miss-Status Holding Registers)**. These are the hardware components that allow the GPU to track *many* in-flight memory requests at once. Your pointer chase serializes this, forcing the GPU to wait for one miss to complete before the next can even be issued.

In [39]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

constexpr int N = 8;
constexpr int reps = 1000000;

// Device function to get clock cycles
__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

// --- KERNEL 1: The "Bank Conflict" Loop (k, i, j) ---
// This is the "Good" loop that is throughput-bound, but
// we know it stalls on register bank conflicts.
__global__ void kernel_BankConflict(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    float A_reg[N*N], B_reg[N*N], C_reg[N*N] = {0.0f};
    for (int i = 0; i < N*N; i++) { A_reg[i] = A[i]; B_reg[i] = B[i]; }
    __syncthreads();

    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int k = 0; k < N; k++) {
            for (int i = 0; i < N; i++) {
                // Inner loop j: C[j] and B[j] are accessed in parallel.
                // This creates the C[0]/B[0] (Bank 0) conflict.
                #pragma unroll
                for (int j = 0; j < N; j++) {
                    asm volatile ("fma.rn.f32 %0, %1, %2, %0;"
                                  : "+f"(C_reg[i*N + j])     // %0: Read+Write
                                  : "f"(A_reg[i*N + k]),     // %1: Read
                                    "f"(B_reg[k*N + j]));    // %2: Read
                }
            }
        }
    }
    *g_end = get_clock64();
    __syncthreads();
    for (int i = 0; i < N*N; i++) { C[i] = C_reg[i]; }
}


// --- KERNEL 2: The "Latency Bound" Loop (i, j, k) ---
// This is the "Bad" loop. It's bound by FMA latency
// due to the dependency on C_reg[i*N + j].
__global__ void kernel_LatencyBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    float A_reg[N*N], B_reg[N*N], C_reg[N*N] = {0.0f};
    for (int i = 0; i < N*N; i++) { A_reg[i] = A[i]; B_reg[i] = B[i]; }
    __syncthreads();

    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                // Inner loop k: Creates a serial dependency chain on C_reg[i*N+j]
                #pragma unroll
                for (int k = 0; k < N; k++) {
                    asm volatile ("fma.rn.f32 %0, %1, %2, %0;"
                                  : "+f"(C_reg[i*N + j])     // %0: Read+Write
                                  : "f"(A_reg[i*N + k]),     // %1: Read
                                    "f"(B_reg[k*N + j]));    // %2: Read
                }
            }
        }
    }
    *g_end = get_clock64();
    __syncthreads();
    for (int i = 0; i < N*N; i++) { C[i + N*N] = C_reg[i]; }
}


// --- KERNEL 3: The "True Throughput" Loop (k, i, j with Skew) ---
// This is the "Good" loop with the bank conflict "fixed".
// We skew the B register access to break the parallel conflict.
__global__ void kernel_Throughput(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    float A_reg[N*N], B_reg[N*N], C_reg[N*N] = {0.0f};
    for (int i = 0; i < N*N; i++) { A_reg[i] = A[i]; B_reg[i] = B[i]; }
    __syncthreads();

    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        for (int k = 0; k < N; k++) {
            for (int i = 0; i < N; i++) {
                // Inner loop j: C[j] and B[(j+1)%N] are accessed.
                // C[0] (Bank 0) accesses B[1] (Bank 1) -> NO CONFLICT
                // C[1] (Bank 1) accesses B[2] (Bank 2) -> NO CONFLICT
                #pragma unroll
                for (int j = 0; j < N; j++) {
                    asm volatile ("fma.rn.f32 %0, %1, %2, %0;"
                                  : "+f"(C_reg[i*N + j])          // %0: Read+Write
                                  : "f"(A_reg[i*N + k]),          // %1: Read
                                    "f"(B_reg[k*N + ((j+1)%N)])); // %2: Read (SKEWED)
                }
            }
        }
    }
    *g_end = get_clock64();
    __syncthreads();
    for (int i = 0; i < N*N; i++) { C[i + N*N*2] = C_reg[i]; }
}


int main() {
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    unsigned long long h_start[3], h_end[3];
    unsigned long long *d_start, *d_end;
    
    size_t vec_bytes = N * N * sizeof(float);
    size_t c_bytes   = vec_bytes * 3;
    size_t clock_bytes = sizeof(unsigned long long) * 3;

    // Host alloc
    h_A = (float*)malloc(vec_bytes);
    h_B = (float*)malloc(vec_bytes);
    h_C = (float*)malloc(c_bytes);

    // Init A and B
    for (int i = 0; i < N*N; i++) {
        h_A[i] = (rand() % 100) / 10000.0f;
        h_B[i] = (rand() % 100) / 10000.0f;
    }

    // Device alloc
    cudaMalloc(&d_A, vec_bytes);
    cudaMalloc(&d_B, vec_bytes);
    cudaMalloc(&d_C, c_bytes);
    cudaMalloc(&d_start, clock_bytes);
    cudaMalloc(&d_end, clock_bytes);

    // Copy to device
    cudaMemcpy(d_A, h_A, vec_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, vec_bytes, cudaMemcpyHostToDevice);

    
    // --- Launch Kernel 1 (Bank Conflict) ---
    printf("Launching 1: Bank Conflict Kernel (k,i,j)...\n");
    kernel_BankConflict<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);
    cudaDeviceSynchronize();
    
    // --- Launch Kernel 2 (Latency Bound) ---
    printf("Launching 2: Latency Bound Kernel (i,j,k)...\n");
    kernel_LatencyBound<<<1,1>>>(d_A, d_B, d_C, d_start + 1, d_end + 1);
    cudaDeviceSynchronize();

    // --- Launch Kernel 3 (True Throughput) ---
    printf("Launching 3: True Throughput Kernel (k,i,j with skew)...\n");
    kernel_Throughput<<<1,1>>>(d_A, d_B, d_C, d_start + 2, d_end + 2);
    cudaDeviceSynchronize();

    // # Copy back results
    cudaMemcpy(h_C, d_C, c_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_start, d_start, clock_bytes, cudaMemcpyDeviceToHost); 
    cudaMemcpy(h_end, d_end, clock_bytes, cudaMemcpyDeviceToHost);

    // --- Print clock results ---
    unsigned long long elapsed_conflict  = h_end[0] - h_start[0];
    unsigned long long elapsed_latency   = h_end[1] - h_start[1];
    unsigned long long elapsed_throughput = h_end[2] - h_start[2];
    
    double clocks_per_iter_conflict  = static_cast<double>(elapsed_conflict) / reps;
    double clocks_per_iter_latency   = static_cast<double>(elapsed_latency) / reps;
    double clocks_per_iter_throughput = static_cast<double>(elapsed_throughput) / reps;
    
    printf("\n--- Benchmark (Iterations: %d) ---\n", reps);

    printf("\n[1. Bank Conflict Loop (k,i,j)]\n");
    printf("Total Clocks:  %llu\n", elapsed_conflict);
    printf("Clocks / Iter: %f\n", clocks_per_iter_conflict);

    printf("\n[2. Latency Bound Loop (i,j,k)]\n");
    printf("Total Clocks:  %llu\n", elapsed_latency);
    printf("Clocks / Iter: %f\n", clocks_per_iter_latency);

    printf("\n[3. True Throughput Loop (k,i,j + skew)]\n");
    printf("Total Clocks:  %llu\n", elapsed_throughput);
    printf("Clocks / Iter: %f\n", clocks_per_iter_throughput);
    printf("----------------------------------------\n");

    // #Cleanup
    cudaFree(d_end);
    cudaFree(d_start); 
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Launching 1: Bank Conflict Kernel (k,i,j)...
Launching 2: Latency Bound Kernel (i,j,k)...
Launching 3: True Throughput Kernel (k,i,j with skew)...

--- Benchmark (Iterations: 1000000) ---

[1. Bank Conflict Loop (k,i,j)]
Total Clocks:  2346000095
Clocks / Iter: 2346.000095

[2. Latency Bound Loop (i,j,k)]
Total Clocks:  2209000196
Clocks / Iter: 2209.000196

[3. True Throughput Loop (k,i,j + skew)]
Total Clocks:  2346000095
Clocks / Iter: 2346.000095
----------------------------------------



In [42]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Use N=16 as you suggested
constexpr int N = 16;
constexpr int reps = 500000;

__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

// --- KERNEL 1: The "Throughput Bound" Loop ---
// One C++ loop. One asm block. 16 INDEPENDENT instructions.
// This is bound by rsqrt THROUGHPUT.
__global__ void kernel_ThroughputBound(float *A, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    float r[N];
    for(int i=0; i<N; ++i) { r[i] = A[i]; }
    __syncthreads(); // Ensure loads are done

    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        
        // This single block contains 16 independent instructions.
        // The hardware FMA pipeline will be full.
        asm volatile (
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %1, %1;\n\t"
            "rsqrt.approx.f32 %2, %2;\n\t"
            "rsqrt.approx.f32 %3, %3;\n\t"
            "rsqrt.approx.f32 %4, %4;\n\t"
            "rsqrt.approx.f32 %5, %5;\n\t"
            "rsqrt.approx.f32 %6, %6;\n\t"
            "rsqrt.approx.f32 %7, %7;\n\t"
            "rsqrt.approx.f32 %8, %8;\n\t"
            "rsqrt.approx.f32 %9, %9;\n\t"
            "rsqrt.approx.f32 %10, %10;\n\t"
            "rsqrt.approx.f32 %11, %11;\n\t"
            "rsqrt.approx.f32 %12, %12;\n\t"
            "rsqrt.approx.f32 %13, %13;\n\t"
            "rsqrt.approx.f32 %14, %14;\n\t"
            "rsqrt.approx.f32 %15, %15;\n\t"
            : "+f"(r[0]), "+f"(r[1]), "+f"(r[2]), "+f"(r[3]),
              "+f"(r[4]), "+f"(r[5]), "+f"(r[6]), "+f"(r[7]),
              "+f"(r[8]), "+f"(r[9]), "+f"(r[10]), "+f"(r[11]),
              "+f"(r[12]), "+f"(r[13]), "+f"(r[14]), "+f"(r[15])
        );
    }
    
    *g_end = get_clock64();
    __syncthreads();
    for(int i=0; i<N; ++i) { C[i] = r[i]; }
}


// --- KERNEL 2: The "Latency Bound" Loop ---
// One C++ loop. One asm block. 16 DEPENDENT instructions.
// This is bound by rsqrt LATENCY.
__global__ void kernel_LatencyBound(float *A, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    float r[N];
    for(int i=0; i<N; ++i) { r[i] = A[i]; }
    __syncthreads(); // Ensure loads are done

    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {

        // This single block creates a serial dependency chain.
        // The hardware pipeline will stall on %0 after each instruction.
        // We use r[0] as the single register for the chain.
        asm volatile (
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            "rsqrt.approx.f32 %0, %0;\n\t"
            : "+f"(r[0]) // Only one output, r[0]
        );
    }
    
    *g_end = get_clock64();
    __syncthreads();
    // We still write back all of r to C[1] to keep work "fair"
    for(int i=0; i<N; ++i) { C[i + N*N] = r[i]; }
}


int main() {
    float *h_A, *h_C;
    float *d_A, *d_C;
    unsigned long long h_start[2], h_end[2];
    unsigned long long *d_start, *d_end;
    
    // N=16, so N*N = 256
    size_t vec_bytes = N * N * sizeof(float);
    size_t c_bytes   = vec_bytes * 2; // 2 slots for output
    size_t clock_bytes = sizeof(unsigned long long) * 2;

    h_A = (float*)malloc(vec_bytes);
    h_C = (float*)malloc(c_bytes);

    for (int i = 0; i < N*N; i++) {
        h_A[i] = (float)(i + 1);
    }

    cudaMalloc(&d_A, vec_bytes);
    cudaMalloc(&d_C, c_bytes);
    cudaMalloc(&d_start, clock_bytes);
    cudaMalloc(&d_end, clock_bytes);

    cudaMemcpy(d_A, h_A, vec_bytes, cudaMemcpyHostToDevice);

    
    // --- Launch Kernel 1 (Throughput Bound) ---
    printf("Launching 1: True Throughput Kernel...\n");
    kernel_ThroughputBound<<<1,1>>>(d_A, d_C, d_start, d_end);
    cudaDeviceSynchronize();
    
    // --- Launch Kernel 2 (Latency Bound) ---
    printf("Launching 2: Latency Bound Kernel...\n");
    kernel_LatencyBound<<<1,1>>>(d_A, d_C, d_start + 1, d_end + 1);
    cudaDeviceSynchronize();

    // # Copy back results
    cudaMemcpy(h_C, d_C, c_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_start, d_start, clock_bytes, cudaMemcpyDeviceToHost); 
    cudaMemcpy(h_end, d_end, clock_bytes, cudaMemcpyDeviceToHost);

    // --- Print clock results ---
    unsigned long long elapsed_throughput = h_end[0] - h_start[0];
    unsigned long long elapsed_latency    = h_end[1] - h_start[1];
    
    // An "iteration" is one asm block of 16 ops.
    double clocks_per_iter_throughput = static_cast<double>(elapsed_throughput) / reps;
    double clocks_per_iter_latency    = static_cast<double>(elapsed_latency) / reps;
    
    printf("\n--- Benchmark (Iterations: %d) ---\n", reps);

    printf("\n[1. Throughput Bound Loop (16 independent ops)]\n");
    printf("Total Clocks:  %llu\n", elapsed_throughput);
    printf("Clocks / Iter: %f\n", clocks_per_iter_throughput);

    printf("\n[2. Latency Bound Loop (16 dependent ops)]\n");
    printf("Total Clocks:  %llu\n", elapsed_latency);
    printf("Clocks / Iter: %f\n", clocks_per_iter_latency);
    printf("----------------------------------------\n");

    // #Cleanup
    cudaFree(d_end);
    cudaFree(d_start); 
    cudaFree(d_A);
    cudaFree(d_C);
    free(h_A);
    free(h_C);

    return 0;
}

Launching 1: True Throughput Kernel...
Launching 2: Latency Bound Kernel...

--- Benchmark (Iterations: 500000) ---

[1. Throughput Bound Loop (16 independent ops)]
Total Clocks:  70516170
Clocks / Iter: 141.032340

[2. Latency Bound Loop (16 dependent ops)]
Total Clocks:  320297215
Clocks / Iter: 640.594430
----------------------------------------



In [54]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Use N=64 as you requested
constexpr int N = 64;
constexpr int reps = 500000;

__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

// --- KERNEL 1: The "Throughput Bound" Loop (REALLY GOOD) ---
// Outer loop is 'reps'. Inner loop is one fat asm block of 64
// independent FMA instructions.
// This is bound by FMA THROUGHPUT.
__global__ void kernel_ThroughputBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    // Allocate 3 register arrays, 1D, size N
    float rA[N], rB[N];
    float rC[N] = {0.0f};

    // Pre-load all registers
    for(int i=0; i<N; ++i) { 
        rA[i] = A[i];
        rB[i] = B[i];
    }
    __syncthreads(); // Ensure loads are done

    *g_start = get_clock64();

    // Outer loop is over N_iter (reps)
    for (int repeat = 0; repeat < reps; repeat++) {
        
        // This single block contains 64 independent FMA operations.
        // The hardware FMA pipeline will be full.
        // The constraint list is huge (64 outputs, 128 inputs)
        asm volatile (
            "fma.rn.f32 %0, %1, %2, %0; \n\t" 
            "fma.rn.f32 %3, %4, %5, %3; \n\t" 
            "fma.rn.f32 %6, %7, %8, %6; \n\t" 
            "fma.rn.f32 %9, %10, %11, %9; \n\t" 
            "fma.rn.f32 %12, %13, %14, %12; \n\t" 
            "fma.rn.f32 %15, %16, %17, %15; \n\t" 
            "fma.rn.f32 %18, %19, %20, %18; \n\t" 
            "fma.rn.f32 %21, %22, %23, %21; \n\t" 
            "fma.rn.f32 %24, %25, %26, %24; \n\t" 
            "fma.rn.f32 %27, %28, %29, %27; \n\t" 
            "fma.rn.f32 %30, %31, %32, %30; \n\t" 
            "fma.rn.f32 %33, %34, %35, %33; \n\t" 
            "fma.rn.f32 %36, %37, %38, %36; \n\t" 
            "fma.rn.f32 %39, %40, %41, %39; \n\t" 
            "fma.rn.f32 %42, %43, %44, %42; \n\t" 
            "fma.rn.f32 %45, %46, %47, %45; \n\t" 
            "fma.rn.f32 %48, %49, %50, %48; \n\t" 
            "fma.rn.f32 %51, %52, %53, %51; \n\t" 
            "fma.rn.f32 %54, %55, %56, %54; \n\t" 
            "fma.rn.f32 %57, %58, %59, %57; \n\t" 
            "fma.rn.f32 %60, %61, %62, %60; \n\t" 
            "fma.rn.f32 %63, %64, %65, %63; \n\t" 
            "fma.rn.f32 %66, %67, %68, %66; \n\t" 
            "fma.rn.f32 %69, %70, %71, %69; \n\t" 
            "fma.rn.f32 %72, %73, %74, %72; \n\t" 
            "fma.rn.f32 %75, %76, %77, %75; \n\t" 
            "fma.rn.f32 %78, %79, %80, %78; \n\t" 
            "fma.rn.f32 %81, %82, %83, %81; \n\t" 
            "fma.rn.f32 %84, %85, %86, %84; \n\t" 
            "fma.rn.f32 %87, %88, %89, %87; \n\t" 
            "fma.rn.f32 %90, %91, %92, %90; \n\t" 
            "fma.rn.f32 %93, %94, %95, %93; \n\t" 
            "fma.rn.f32 %96, %97, %98, %96; \n\t" 
            "fma.rn.f32 %99, %100, %101, %99; \n\t" 
            "fma.rn.f32 %102, %103, %104, %102; \n\t" 
            "fma.rn.f32 %105, %106, %107, %105; \n\t" 
            "fma.rn.f32 %108, %109, %110, %108; \n\t" 
            "fma.rn.f32 %111, %112, %113, %111; \n\t" 
            "fma.rn.f32 %114, %115, %116, %114; \n\t" 
            "fma.rn.f32 %117, %118, %119, %117; \n\t" 
            "fma.rn.f32 %120, %121, %122, %120; \n\t" 
            "fma.rn.f32 %123, %124, %125, %123; \n\t" 
            "fma.rn.f32 %126, %127, %128, %126; \n\t" 
            "fma.rn.f32 %129, %130, %131, %129; \n\t" 
            "fma.rn.f32 %132, %133, %134, %132; \n\t" 
            "fma.rn.f32 %135, %136, %137, %135; \n\t" 
            "fma.rn.f32 %138, %139, %140, %138; \n\t" 
            "fma.rn.f32 %141, %142, %143, %141; \n\t" 
            "fma.rn.f32 %144, %145, %146, %144; \n\t" 
            "fma.rn.f32 %147, %148, %149, %147; \n\t" 
            "fma.rn.f32 %150, %151, %152, %150; \n\t" 
            "fma.rn.f32 %153, %154, %155, %153; \n\t" 
            "fma.rn.f32 %156, %157, %158, %156; \n\t" 
            "fma.rn.f32 %159, %160, %161, %159; \n\t" 
            "fma.rn.f32 %162, %163, %164, %162; \n\t" 
            "fma.rn.f32 %165, %166, %167, %165; \n\t" 
            "fma.rn.f32 %168, %169, %170, %168; \n\t" 
            "fma.rn.f32 %171, %172, %173, %171; \n\t" 
            "fma.rn.f32 %174, %175, %176, %174; \n\t" 
            "fma.rn.f32 %177, %178, %179, %177; \n\t" 
            "fma.rn.f32 %180, %181, %182, %180; \n\t" 
            "fma.rn.f32 %183, %184, %185, %183; \n\t" 
            "fma.rn.f32 %186, %187, %188, %186; \n\t" 
            "fma.rn.f32 %189, %190, %191, %189; \n\t"
            // --- Output Operands (Read+Write) ---
            : "+f"(rC[0]),  "+f"(rC[1]), "+f"(rC[2]), "+f"(rC[3]), 
              "+f"(rC[4]),  "+f"(rC[5]), "+f"(rC[6]), "+f"(rC[7]), 
              "+f"(rC[8]),  "+f"(rC[9]), "+f"(rC[10]), "+f"(rC[11]), 
              "+f"(rC[12]),  "+f"(rC[13]), "+f"(rC[14]), "+f"(rC[15]), 
              "+f"(rC[16]),  "+f"(rC[17]), "+f"(rC[18]), "+f"(rC[19]), 
              "+f"(rC[20]),  "+f"(rC[21]), "+f"(rC[22]), "+f"(rC[23]), 
              "+f"(rC[24]),  "+f"(rC[25]), "+f"(rC[26]), "+f"(rC[27]), 
              "+f"(rC[28]),  "+f"(rC[29]), "+f"(rC[30]), "+f"(rC[31]), 
              "+f"(rC[32]),  "+f"(rC[33]), "+f"(rC[34]), "+f"(rC[35]), 
              "+f"(rC[36]),  "+f"(rC[37]), "+f"(rC[38]), "+f"(rC[39]), 
              "+f"(rC[40]),  "+f"(rC[41]), "+f"(rC[42]), "+f"(rC[43]), 
              "+f"(rC[44]),  "+f"(rC[45]), "+f"(rC[46]), "+f"(rC[47]), 
              "+f"(rC[48]),  "+f"(rC[49]), "+f"(rC[50]), "+f"(rC[51]), 
              "+f"(rC[52]),  "+f"(rC[53]), "+f"(rC[54]), "+f"(rC[55]), 
              "+f"(rC[56]),  "+f"(rC[57]), "+f"(rC[58]), "+f"(rC[59]), 
              "+f"(rC[60]),  "+f"(rC[61]), "+f"(rC[62]), "+f"(rC[63])
            
            // --- Input Operands (Read-Only) ---
            : "f"(rA[0]),  "f"(rB[0]), "f"(rA[1]), "f"(rB[1]), 
              "f"(rA[2]),  "f"(rB[2]), "f"(rA[3]), "f"(rB[3]), 
              "f"(rA[4]),  "f"(rB[4]), "f"(rA[5]), "f"(rB[5]), 
              "f"(rA[6]),  "f"(rB[6]), "f"(rA[7]), "f"(rB[7]), 
              "f"(rA[8]),  "f"(rB[8]), "f"(rA[9]), "f"(rB[9]), 
              "f"(rA[10]),  "f"(rB[10]), "f"(rA[11]), "f"(rB[11]), 
              "f"(rA[12]),  "f"(rB[12]), "f"(rA[13]), "f"(rB[13]), 
              "f"(rA[14]),  "f"(rB[14]), "f"(rA[15]), "f"(rB[15]), 
              "f"(rA[16]),  "f"(rB[16]), "f"(rA[17]), "f"(rB[17]), 
              "f"(rA[18]),  "f"(rB[18]), "f"(rA[19]), "f"(rB[19]), 
              "f"(rA[20]),  "f"(rB[20]), "f"(rA[21]), "f"(rB[21]), 
              "f"(rA[22]),  "f"(rB[22]), "f"(rA[23]), "f"(rB[23]), 
              "f"(rA[24]),  "f"(rB[24]), "f"(rA[25]), "f"(rB[25]), 
              "f"(rA[26]),  "f"(rB[26]), "f"(rA[27]), "f"(rB[27]), 
              "f"(rA[28]),  "f"(rB[28]), "f"(rA[29]), "f"(rB[29]), 
              "f"(rA[30]),  "f"(rB[30]), "f"(rA[31]), "f"(rB[31]), 
              "f"(rA[32]),  "f"(rB[32]), "f"(rA[33]), "f"(rB[33]), 
              "f"(rA[34]),  "f"(rB[34]), "f"(rA[35]), "f"(rB[35]), 
              "f"(rA[36]),  "f"(rB[36]), "f"(rA[37]), "f"(rB[37]), 
              "f"(rA[38]),  "f"(rB[38]), "f"(rA[39]), "f"(rB[39]), 
              "f"(rA[40]),  "f"(rB[40]), "f"(rA[41]), "f"(rB[41]), 
              "f"(rA[42]),  "f"(rB[42]), "f"(rA[43]), "f"(rB[43]), 
              "f"(rA[44]),  "f"(rB[44]), "f"(rA[45]), "f"(rB[45]), 
              "f"(rA[46]),  "f"(rB[46]), "f"(rA[47]), "f"(rB[47]), 
              "f"(rA[48]),  "f"(rB[48]), "f"(rA[49]), "f"(rB[49]), 
              "f"(rA[50]),  "f"(rB[50]), "f"(rA[51]), "f"(rB[51]), 
              "f"(rA[52]),  "f"(rB[52]), "f"(rA[53]), "f"(rB[53]), 
              "f"(rA[54]),  "f"(rB[54]), "f"(rA[55]), "f"(rB[55]), 
              "f"(rA[56]),  "f"(rB[56]), "f"(rA[57]), "f"(rB[57]), 
              "f"(rA[58]),  "f"(rB[58]), "f"(rA[59]), "f"(rB[59]), 
              "f"(rA[60]),  "f"(rB[60]), "f"(rA[61]), "f"(rB[61]), 
              "f"(rA[62]),  "f"(rB[62]), "f"(rA[63]), "f"(rB[63])
        );
    }
    
    *g_end = get_clock64();
    __syncthreads();
    for(int i=0; i<N; ++i) { C[i] = rC[i]; }
}


// --- KERNEL 2: The "Latency Bound" Loop (REALLY BAD) ---
// Outer loop is over N (indices). Inner loop is 'reps'.
// The asm block is *inside* the 'reps' loop, creating a
// serial dependency chain.
// This is bound by FMA LATENCY.
__global__ void kernel_LatencyBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    float rA[N], rB[N];
    float rC[N] = {0.0f};

    // Pre-load all registers
    for(int i=0; i<N; ++i) { 
        rA[i] = A[i];
        rB[i] = B[i];
    }
    __syncthreads(); // Ensure loads are done

    *g_start = get_clock64();

    // Outer loop is over the 64 indices
    for (int i = 0; i < N; i++) {
        // Inner loop is over N_iter (reps)
        for (int repeat = 0; repeat < reps; repeat++) {
            
            // This asm block is executed 'reps' times for EACH 'i'.
            // It creates a long dependency chain on rC[i].
            // The pipeline will stall on every single iteration.
            asm volatile (
                "fma.rn.f32 %0, %1, %2, %0;"
                : "+f"(rC[i])   // %0: Read+Write (e.g., rC[0])
                : "f"(rA[i]),   // %1: Read (e.g., rA[0])
                  "f"(rB[i])    // %2: Read (e.g., rB[0])
            );
        }
    }
    
    *g_end = get_clock64();
    __syncthreads();
    for(int i=0; i<N; ++i) { C[i + N] = rC[i]; } // Store in second half
}


int main() {
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    unsigned long long h_start[2], h_end[2];
    unsigned long long *d_start, *d_end;
    
    // N=64
    size_t vec_bytes = N * sizeof(float);
    size_t c_bytes   = vec_bytes * 2; // 2 slots for output
    size_t clock_bytes = sizeof(unsigned long long) * 2;

    h_A = (float*)malloc(vec_bytes);
    h_B = (float*)malloc(vec_bytes);
    h_C = (float*)malloc(c_bytes);

    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(i + 1);
        h_B[i] = (float)(i + 1);
    }

    cudaMalloc(&d_A, vec_bytes);
    cudaMalloc(&d_B, vec_bytes);
    cudaMalloc(&d_C, c_bytes);
    cudaMalloc(&d_start, clock_bytes);
    cudaMalloc(&d_end, clock_bytes);

    cudaMemcpy(d_A, h_A, vec_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, vec_bytes, cudaMemcpyHostToDevice);
    
    // --- Launch Kernel 1 (Throughput Bound) ---
    printf("Launching 1: True Throughput Kernel (Fat ASM Block)...\n");
    kernel_ThroughputBound<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);
    cudaDeviceSynchronize();
    
    // --- Launch Kernel 2 (Latency Bound) ---
    printf("Launching 2: Latency Bound Kernel (Swapped Loops)...\n");
    kernel_LatencyBound<<<1,1>>>(d_A, d_B, d_C, d_start + 1, d_end + 1);
    cudaDeviceSynchronize();

    // # Copy back results
    cudaMemcpy(h_C, d_C, c_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_start, d_start, clock_bytes, cudaMemcpyDeviceToHost); 
    cudaMemcpy(h_end, d_end, clock_bytes, cudaMemcpyDeviceToHost);

    // --- Print clock results ---
    unsigned long long elapsed_throughput = h_end[0] - h_start[0];
    unsigned long long elapsed_latency    = h_end[1] - h_start[1];
    
    // Total FMA ops is (N * reps) for both kernels.
    unsigned long long total_ops = (unsigned long long)N * reps;
    
    double clocks_per_op_throughput = static_cast<double>(elapsed_throughput) / total_ops;
    double clocks_per_op_latency    = static_cast<double>(elapsed_latency) / total_ops;
    
    printf("\n--- Benchmark (N=%d, Reps=%d) ---\n", N, reps);
    printf("Total FMA Ops per kernel: %llu\n", total_ops);

    printf("\n[1. Throughput Bound Loop (Pipelined)]\n");
    printf("Total Clocks:  %llu\n", elapsed_throughput);
    printf("Clocks / FMA:  %f\n", clocks_per_op_throughput);

    printf("\n[2. Latency Bound Loop (Stalled)]\n");
    printf("Total Clocks:  %llu\n", elapsed_latency);
    printf("Clocks / FMA:  %f\n", clocks_per_op_latency);
    printf("----------------------------------------\n");

    // #Cleanup
    cudaFree(d_end);
    cudaFree(d_start); 
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Launching 1: True Throughput Kernel (Fat ASM Block)...
Launching 2: Latency Bound Kernel (Swapped Loops)...

--- Benchmark (N=64, Reps=500000) ---
Total FMA Ops per kernel: 32000000

[1. Throughput Bound Loop (Pipelined)]
Total Clocks:  12344240
Clocks / FMA:  0.385757

[2. Latency Bound Loop (Stalled)]
Total Clocks:  134320444
Clocks / FMA:  4.197514
----------------------------------------



In [46]:
for i in range(64): 
  h = 3*i
  print(f' "fma.rn.f32 %{h}, %{h+1}, %{h+2}, %{h}; \\n\\t" ')

 "fma.rn.f32 %0, %1, %2, %0; \n\t" 
 "fma.rn.f32 %3, %4, %5, %3; \n\t" 
 "fma.rn.f32 %6, %7, %8, %6; \n\t" 
 "fma.rn.f32 %9, %10, %11, %9; \n\t" 
 "fma.rn.f32 %12, %13, %14, %12; \n\t" 
 "fma.rn.f32 %15, %16, %17, %15; \n\t" 
 "fma.rn.f32 %18, %19, %20, %18; \n\t" 
 "fma.rn.f32 %21, %22, %23, %21; \n\t" 
 "fma.rn.f32 %24, %25, %26, %24; \n\t" 
 "fma.rn.f32 %27, %28, %29, %27; \n\t" 
 "fma.rn.f32 %30, %31, %32, %30; \n\t" 
 "fma.rn.f32 %33, %34, %35, %33; \n\t" 
 "fma.rn.f32 %36, %37, %38, %36; \n\t" 
 "fma.rn.f32 %39, %40, %41, %39; \n\t" 
 "fma.rn.f32 %42, %43, %44, %42; \n\t" 
 "fma.rn.f32 %45, %46, %47, %45; \n\t" 
 "fma.rn.f32 %48, %49, %50, %48; \n\t" 
 "fma.rn.f32 %51, %52, %53, %51; \n\t" 
 "fma.rn.f32 %54, %55, %56, %54; \n\t" 
 "fma.rn.f32 %57, %58, %59, %57; \n\t" 
 "fma.rn.f32 %60, %61, %62, %60; \n\t" 
 "fma.rn.f32 %63, %64, %65, %63; \n\t" 
 "fma.rn.f32 %66, %67, %68, %66; \n\t" 
 "fma.rn.f32 %69, %70, %71, %69; \n\t" 
 "fma.rn.f32 %72, %73, %74, %72; \n\t" 
 "fma.rn.f32 %

In [None]:
 : "+f"(rC[0]),  "+f"(rC[1]),  "+f"(rC[2]),  "+f"(rC[3]),

In [48]:
for i in range(64//4): 
  print(f' "+f"(rC[{4*i}]),  "+f"(rC[{4*i+ 1}]), "+f"(rC[{4*i + 2}]), "+f"(rC[{4*i+3}]), ')

 "+f"(rC[0]),  "+f"(rC[1]), "+f"(rC[2]), "+f"(rC[3]), 
 "+f"(rC[4]),  "+f"(rC[5]), "+f"(rC[6]), "+f"(rC[7]), 
 "+f"(rC[8]),  "+f"(rC[9]), "+f"(rC[10]), "+f"(rC[11]), 
 "+f"(rC[12]),  "+f"(rC[13]), "+f"(rC[14]), "+f"(rC[15]), 
 "+f"(rC[16]),  "+f"(rC[17]), "+f"(rC[18]), "+f"(rC[19]), 
 "+f"(rC[20]),  "+f"(rC[21]), "+f"(rC[22]), "+f"(rC[23]), 
 "+f"(rC[24]),  "+f"(rC[25]), "+f"(rC[26]), "+f"(rC[27]), 
 "+f"(rC[28]),  "+f"(rC[29]), "+f"(rC[30]), "+f"(rC[31]), 
 "+f"(rC[32]),  "+f"(rC[33]), "+f"(rC[34]), "+f"(rC[35]), 
 "+f"(rC[36]),  "+f"(rC[37]), "+f"(rC[38]), "+f"(rC[39]), 
 "+f"(rC[40]),  "+f"(rC[41]), "+f"(rC[42]), "+f"(rC[43]), 
 "+f"(rC[44]),  "+f"(rC[45]), "+f"(rC[46]), "+f"(rC[47]), 
 "+f"(rC[48]),  "+f"(rC[49]), "+f"(rC[50]), "+f"(rC[51]), 
 "+f"(rC[52]),  "+f"(rC[53]), "+f"(rC[54]), "+f"(rC[55]), 
 "+f"(rC[56]),  "+f"(rC[57]), "+f"(rC[58]), "+f"(rC[59]), 
 "+f"(rC[60]),  "+f"(rC[61]), "+f"(rC[62]), "+f"(rC[63]), 


In [None]:
"f"(rA[0]), "f"(rB[0]), "f"(rA[1]), "f"(rB[1]),

In [50]:
for i in range(64//2): 
  print(f' "f"(rA[{2*i}]),  "f"(rB[{2*(i)}]), "f"(rA[{2*i + 1}]), "f"(rB[{2*i+1}]), ')

 "f"(rA[0]),  "f"(rB[0]), "f"(rA[1]), "f"(rB[1]), 
 "f"(rA[2]),  "f"(rB[2]), "f"(rA[3]), "f"(rB[3]), 
 "f"(rA[4]),  "f"(rB[4]), "f"(rA[5]), "f"(rB[5]), 
 "f"(rA[6]),  "f"(rB[6]), "f"(rA[7]), "f"(rB[7]), 
 "f"(rA[8]),  "f"(rB[8]), "f"(rA[9]), "f"(rB[9]), 
 "f"(rA[10]),  "f"(rB[10]), "f"(rA[11]), "f"(rB[11]), 
 "f"(rA[12]),  "f"(rB[12]), "f"(rA[13]), "f"(rB[13]), 
 "f"(rA[14]),  "f"(rB[14]), "f"(rA[15]), "f"(rB[15]), 
 "f"(rA[16]),  "f"(rB[16]), "f"(rA[17]), "f"(rB[17]), 
 "f"(rA[18]),  "f"(rB[18]), "f"(rA[19]), "f"(rB[19]), 
 "f"(rA[20]),  "f"(rB[20]), "f"(rA[21]), "f"(rB[21]), 
 "f"(rA[22]),  "f"(rB[22]), "f"(rA[23]), "f"(rB[23]), 
 "f"(rA[24]),  "f"(rB[24]), "f"(rA[25]), "f"(rB[25]), 
 "f"(rA[26]),  "f"(rB[26]), "f"(rA[27]), "f"(rB[27]), 
 "f"(rA[28]),  "f"(rB[28]), "f"(rA[29]), "f"(rB[29]), 
 "f"(rA[30]),  "f"(rB[30]), "f"(rA[31]), "f"(rB[31]), 
 "f"(rA[32]),  "f"(rB[32]), "f"(rA[33]), "f"(rB[33]), 
 "f"(rA[34]),  "f"(rB[34]), "f"(rA[35]), "f"(rB[35]), 
 "f"(rA[36]),  "f"(rB[36]), "f

In [53]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

constexpr int N = 64;
constexpr int reps = 500000;

__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long clock_val;
    asm volatile("mov.u64 %0, %%clock64;" : "=l"(clock_val));
    return clock_val;
}

__global__ void kernel_ThroughputBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    float rA[N], rB[N];
    float rC[N] = {0.0f};

    for(int i=0; i<N; ++i) { rA[i] = A[i]; rB[i] = B[i]; }
    __syncthreads(); 
    *g_start = get_clock64();

    for (int repeat = 0; repeat < reps; repeat++) {
        #pragma unroll
        for (int i = 0; i < N; i++) {
            asm volatile (
                "fma.rn.f32 %0, %1, %2, %0;"
                : "+f"(rC[i])
                : "f"(rA[i]), "f"(rB[i])
            );
        }
    }
    
    *g_end = get_clock64();
    __syncthreads();
    for(int i=0; i<N; ++i) { C[i] = rC[i]; }
}

__global__ void kernel_LatencyBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    float rA[N], rB[N];
    float rC[N] = {0.0f};

    for(int i=0; i<N; ++i) { rA[i] = A[i]; rB[i] = B[i]; }
    __syncthreads(); 

    *g_start = get_clock64();

    for (int i = 0; i < N; i++) {
        for (int repeat = 0; repeat < reps; repeat++) {
            asm volatile (
                "fma.rn.f32 %0, %1, %2, %0;"
                : "+f"(rC[i])
                : "f"(rA[i]), "f"(rB[i])
            );
        }
    }
    
    *g_end = get_clock64();
    __syncthreads();
    for(int i=0; i<N; ++i) { C[i + N] = rC[i]; }
}


int main() {
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    unsigned long long h_start[2], h_end[2];
    unsigned long long *d_start, *d_end;
    
    size_t vec_bytes = N * sizeof(float);
    size_t c_bytes   = vec_bytes * 2;
    size_t clock_bytes = sizeof(unsigned long long) * 2;

    h_A = (float*)malloc(vec_bytes);
    h_B = (float*)malloc(vec_bytes);
    h_C = (float*)malloc(c_bytes);

    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(i + 1);
        h_B[i] = (float)(i + 1);
    }

    cudaMalloc(&d_A, vec_bytes);
    cudaMalloc(&d_B, vec_bytes);
    cudaMalloc(&d_C, c_bytes);
    cudaMalloc(&d_start, clock_bytes);
    cudaMalloc(&d_end, clock_bytes);

    cudaMemcpy(d_A, h_A, vec_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, vec_bytes, cudaMemcpyHostToDevice);
    
    printf("Launching 1: Throughput Bound Loop (Compiler Fenced)...\n");
    kernel_ThroughputBound<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);
    cudaDeviceSynchronize();

    printf("Launching 2: Latency Bound Loop (Hardware Stalled)...\n");
    kernel_LatencyBound<<<1,1>>>(d_A, d_B, d_C, d_start + 1, d_end + 1);
    cudaDeviceSynchronize();

    cudaMemcpy(h_C, d_C, c_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_start, d_start, clock_bytes, cudaMemcpyDeviceToHost); 
    cudaMemcpy(h_end, d_end, clock_bytes, cudaMemcpyDeviceToHost);

    unsigned long long elapsed_throughput_simple = h_end[0] - h_start[0];
    unsigned long long elapsed_latency    = h_end[1] - h_start[1];
    
    unsigned long long total_ops = (unsigned long long)N * reps;
    
    double clocks_per_op_tp_simple = static_cast<double>(elapsed_throughput_simple) / total_ops;
    double clocks_per_op_latency   = static_cast<double>(elapsed_latency) / total_ops;
    
    printf("\n--- Benchmark (N=%d, Reps=%d) ---\n", N, reps);
    printf("Total FMA Ops per kernel: %llu\n", total_ops);

    printf("\n[1. Throughput (Simple Loop / Compiler Fenced)]\n");
    printf("Total Clocks:  %llu\n", elapsed_throughput_simple);
    printf("Clocks / FMA:  %f\n", clocks_per_op_tp_simple);

    printf("\n[2. Latency Bound Loop (Hardware Stalled)]\n");
    printf("Total Clocks:  %llu\n", elapsed_latency);
    printf("Clocks / FMA:  %f\n", clocks_per_op_latency);
    printf("----------------------------------------\n");

    cudaFree(d_end);
    cudaFree(d_start); 
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Launching 1: Throughput Bound Loop (Compiler Fenced)...
Launching 2: Latency Bound Loop (Hardware Stalled)...

--- Benchmark (N=64, Reps=500000) ---
Total FMA Ops per kernel: 32000000

[1. Throughput (Simple Loop / Compiler Fenced)]
Total Clocks:  62500180
Clocks / FMA:  1.953131

[2. Latency Bound Loop (Hardware Stalled)]
Total Clocks:  134320314
Clocks / FMA:  4.197510
----------------------------------------

