In [17]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
# Compiler flags handed to nvcc4jupyter (presumably used for every later %%cuda cell):
#   -arch=sm_100a  target architecture
#   -Xptxas=-v     forwards -v to ptxas (verbose resource-usage report)
#   -O0            optimization level 0 passed to nvcc
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v -O0')

Source files will be saved in "/tmp/tmpl_a417v3".


In [18]:
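# Scratch generator (kept for reference): prints one `st.global.f32` line per
# accumulator %fA0..%fA83, at byte offset 4*i from %thread_write_addr.
# for i in range(84):
#   print(f"st.global.f32 [%thread_write_addr + {4*i}],  %fA{i};")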

In [19]:

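# Scratch generator (kept for reference): prints 20 consecutive `fma.rn.f32`
# lines for each of the 84 accumulators (%fA{i} = %fB{i} * %fC{i} + %fA{i}).
# for i in range(84):
#   for _ in range(20):
#     print(f"    fma.rn.f32 %fA{i}, %fB{i}, %fC{i}, %fA{i}; ")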

In [20]:
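# Scratch generator (kept for reference): prints one `mov.f32` line per %fC{i},
# loading the immediate value 3*i/100.
# for i in range(84): 
#   print(f"mov.f32 %fC{i}, {(3*i/100)};")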

In [29]:
import sys
import numpy as np
# Switched from 'from cuda import cuda' to 'cuda.bindings.driver' to fix deprecation warning
from cuda.bindings import driver as cuda

def check_cuda_errors(result):
    """Validate a CUDA driver API return value and unwrap its payload.

    Args:
        result: Either a bare status code or a tuple whose first element is
            the status code and whose second element (if any) is the value
            produced by the call.

    Returns:
        The second tuple element when present, otherwise ``None``.

    Raises:
        RuntimeError: If the status differs from ``CUDA_SUCCESS``.
    """
    if isinstance(result, tuple):
        status = result[0]
        # Only the first payload slot is handed back; single-element tuples
        # (status only) yield None.
        payload = result[1] if len(result) > 1 else None
    else:
        status, payload = result, None

    if status != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"CUDA Error: {status}")
    return payload

def run_cuda_kernel(ptx_path, kernel_name):
    """Load a PTX file, launch one kernel from it and fetch its outputs.

    The kernel is launched with a 1x1x1 grid and ``NUM_THREADS`` threads per
    block and receives three device pointers, in this order: start-clock
    buffer, end-clock buffer, results buffer.

    Args:
        ptx_path: Path of the PTX file to load.
        kernel_name: Name of the entry point to look up in the loaded module.

    Returns:
        ``(h_start_clock, h_end_clock, h_results)`` as numpy arrays
        (uint64, uint64, float32) on success, or ``(None, None, None)`` if
        the PTX file is missing or any step fails (the error is printed).

    The CUDA context created here is always destroyed before returning,
    including on error paths, so repeated calls do not accumulate contexts.
    """
    # --- Configuration ---
    # Launch parameters are intentionally hardcoded.
    NUM_THREADS = 1        # a single thread in a single block
    REGS_PER_THREAD = 84   # number of float32 results each thread writes back

    context = None
    try:
        # --- 1. Initialize CUDA ---
        check_cuda_errors(cuda.cuInit(0))
        device = check_cuda_errors(cuda.cuDeviceGet(0))
        context = check_cuda_errors(cuda.cuCtxCreate(0, device))
        print(f"Context created on device: {device}")

        # --- 2. Load PTX Module ---
        try:
            with open(ptx_path, "rb") as f:
                ptx_data = f.read()
            # Ensure null-termination for the driver
            if not ptx_data.endswith(b'\0'):
                ptx_data += b'\0'
        except FileNotFoundError:
            print(f"Error: Could not find '{ptx_path}'. Make sure it's in the same directory.")
            return None, None, None

        module = check_cuda_errors(cuda.cuModuleLoadData(ptx_data))
        kernel = check_cuda_errors(cuda.cuModuleGetFunction(module, kernel_name.encode("utf-8")))

        # --- 3. Allocate Host Memory ---
        # One timestamp pair per thread
        h_start_clock = np.zeros(NUM_THREADS, dtype=np.uint64)
        h_end_clock = np.zeros(NUM_THREADS, dtype=np.uint64)

        # Results: NUM_THREADS * REGS_PER_THREAD float32 values
        total_floats = NUM_THREADS * REGS_PER_THREAD
        h_results = np.zeros(total_floats, dtype=np.float32)

        print(f"Allocating {h_results.nbytes / 1024:.2f} KB for results...")

        # --- 4. Allocate Device Memory ---
        # Not freed individually; destroying the context below releases them.
        d_start_clock = check_cuda_errors(cuda.cuMemAlloc(h_start_clock.nbytes))
        d_end_clock = check_cuda_errors(cuda.cuMemAlloc(h_end_clock.nbytes))
        d_results = check_cuda_errors(cuda.cuMemAlloc(h_results.nbytes))

        # --- 5. Prepare Kernel Arguments ---
        # The driver API expects an array of pointers to the arguments.
        # Each device pointer is converted to its integer address and stored
        # in a one-element numpy array so that we can take its host address.
        arg_start = np.array([int(d_start_clock)], dtype=np.uint64)
        arg_end = np.array([int(d_end_clock)], dtype=np.uint64)
        arg_res = np.array([int(d_results)], dtype=np.uint64)

        # These arrays must stay alive until the launch call has returned.
        args = np.array([
            arg_start.ctypes.data,
            arg_end.ctypes.data,
            arg_res.ctypes.data
        ], dtype=np.uint64)

        # --- 6. Launch Kernel ---
        print(f"Launching Kernel '{kernel_name}' from '{ptx_path}'...")
        check_cuda_errors(cuda.cuLaunchKernel(
            kernel,
            1, 1, 1,            # Grid (1 block)
            NUM_THREADS, 1, 1,  # Block (NUM_THREADS threads)
            0,                  # Shared Mem
            0,                  # Stream
            args.ctypes.data,   # Kernel Arguments
            0                   # Extra (NULL)
        ))

        # Wait for the kernel to finish before reading anything back
        check_cuda_errors(cuda.cuCtxSynchronize())

        # --- 7. Copy Back Results ---
        check_cuda_errors(cuda.cuMemcpyDtoH(h_start_clock.ctypes.data, d_start_clock, h_start_clock.nbytes))
        check_cuda_errors(cuda.cuMemcpyDtoH(h_end_clock.ctypes.data, d_end_clock, h_end_clock.nbytes))
        check_cuda_errors(cuda.cuMemcpyDtoH(h_results.ctypes.data, d_results, h_results.nbytes))

        # --- 8. Return Results ---
        return h_start_clock, h_end_clock, h_results

    except RuntimeError as e:
        print(e)
        return None, None, None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None, None, None
    finally:
        # --- Cleanup ---
        # IMPORTANT: always destroy the context (success, missing file, or
        # driver error) to free GPU memory for the next run.
        if context is not None:
            print("Destroying context...")
            try:
                check_cuda_errors(cuda.cuCtxDestroy(context))
            except RuntimeError as e:
                # Results (if any) were already copied to the host, so a
                # failed teardown is reported but does not discard them.
                print(f"Warning: failed to destroy context: {e}")


In [30]:
# Run kernel `fma_max_pressure` from ilp_good.ptx; yields per-thread start clocks,
# end clocks and the float32 results (all three are None if the run failed).
starts_good, ends_good, res_good = run_cuda_kernel("/root/CudaNotebooks2/ILP_DEMON/ilp_good.ptx", "fma_max_pressure")

Context created on device: <CUdevice 0>
Allocating 0.33 KB for results...
Launching Kernel 'fma_max_pressure' from '/root/CudaNotebooks2/ILP_DEMON/ilp_good.ptx'...
Destroying context...


In [31]:
# Same launch as for the "good" variant, but using the kernel from ilp_bad.ptx;
# yields start clocks, end clocks and float32 results (None on failure).
starts_bad, ends_bad, res_bad = run_cuda_kernel("/root/CudaNotebooks2/ILP_DEMON/ilp_bad.ptx", "fma_retarded_pressure")

Context created on device: <CUdevice 0>
Allocating 0.33 KB for results...
Launching Kernel 'fma_retarded_pressure' from '/root/CudaNotebooks2/ILP_DEMON/ilp_bad.ptx'...
Destroying context...


In [32]:
# Sanity check: both PTX variants should produce (numerically close) identical results.
print(np.allclose(res_bad,res_good))

True


In [33]:
# Elapsed counter ticks per thread for the "good" kernel (end - start, uint64).
# Units are whatever the kernel writes — presumably clock cycles; confirm in the PTX.
times_good = ends_good - starts_good

In [34]:
# Elapsed counter ticks per thread for the "bad" kernel (end - start, uint64).
# Units are whatever the kernel writes — presumably clock cycles; confirm in the PTX.
times_bad = ends_bad - starts_bad

In [35]:
times_good

array([2709], dtype=uint64)

In [36]:
times_bad

array([2868], dtype=uint64)

In [37]:
# Total FMAs assumed per thread: 84 accumulators x 20 FMAs each. This matches the
# commented-out generator loops earlier in the notebook — TODO confirm against the PTX files.
N_fmas = np.array([84*20]) 
# FMAs retired per elapsed tick for the "good" kernel.
N_fmas_per_clock_good = N_fmas/times_good

In [38]:
# FMAs retired per elapsed tick for the "bad" kernel (same FMA count as the "good" one).
N_fmas_per_clock_bad = N_fmas/times_bad

In [39]:
N_fmas_per_clock_bad

array([0.58577406])

In [40]:
N_fmas_per_clock_good

array([0.62015504])

In [42]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Number of accumulators (and of A/B input elements) each kernel works on.
// The hand-written asm block in kernel_ThroughputBound is spelled out for exactly 64.
constexpr int N = 64;
// Number of FMA updates applied to every accumulator inside the timed region.
constexpr int reps = 500000;

// Reads the PTX special register %clock64 and returns its current value.
// `volatile` keeps the compiler from removing or merging the read.
__device__ __forceinline__ unsigned long long get_clock64() {
    unsigned long long ticks;
    asm volatile(
        "mov.u64 %0, %%clock64;"
        : "=l"(ticks)
    );
    return ticks;
}

// --- KERNEL 1: The "Throughput Bound" Loop ---
// Outer loop is 'reps'. Each iteration executes one large asm block holding
// N (= 64) mutually independent FMAs: rC[i] = rA[i] * rB[i] + rC[i].
// Because no FMA in the block depends on another one, the run time is
// intended to be limited by FMA issue throughput rather than FMA latency.
//
// Parameters:
//   A, B     - input arrays, N floats each are read
//   C        - output array, the first N floats are written
//   g_start  - receives the clock64 value sampled right before the timed loop
//   g_end    - receives the clock64 value sampled right after the timed loop
__global__ void kernel_ThroughputBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {

    // The asm block below is written out by hand for exactly 64 accumulators.
    static_assert(N == 64, "kernel_ThroughputBound's asm block is hard-coded for N == 64");

    // Three per-thread arrays of size N (intended to be kept in registers)
    float rA[N], rB[N];
    float rC[N] = {0.0f};

    // Pre-load all operands
    for(int i=0; i<N; ++i) {
        rA[i] = A[i];
        rB[i] = B[i];
    }
    __syncthreads(); // Block-wide barrier before the timed region starts

    *g_start = get_clock64();

    // Outer loop is over N_iter (reps)
    for (int repeat = 0; repeat < reps; repeat++) {

        // Operand numbering of inline PTX: ALL outputs come first, then the
        // inputs, in textual order. Hence
        //   %i          = rC[i]   (i = 0..63, read+write)
        //   %(64 + 2*i) = rA[i]   (read-only)
        //   %(65 + 2*i) = rB[i]   (read-only)
        // The previous template numbered operands as if they were interleaved
        // (C0, A0, B0, C1, ...), which multiplied the wrong values and wrote
        // into read-only input operands.
        asm volatile (
            "fma.rn.f32 %0, %64, %65, %0; \n\t"
            "fma.rn.f32 %1, %66, %67, %1; \n\t"
            "fma.rn.f32 %2, %68, %69, %2; \n\t"
            "fma.rn.f32 %3, %70, %71, %3; \n\t"
            "fma.rn.f32 %4, %72, %73, %4; \n\t"
            "fma.rn.f32 %5, %74, %75, %5; \n\t"
            "fma.rn.f32 %6, %76, %77, %6; \n\t"
            "fma.rn.f32 %7, %78, %79, %7; \n\t"
            "fma.rn.f32 %8, %80, %81, %8; \n\t"
            "fma.rn.f32 %9, %82, %83, %9; \n\t"
            "fma.rn.f32 %10, %84, %85, %10; \n\t"
            "fma.rn.f32 %11, %86, %87, %11; \n\t"
            "fma.rn.f32 %12, %88, %89, %12; \n\t"
            "fma.rn.f32 %13, %90, %91, %13; \n\t"
            "fma.rn.f32 %14, %92, %93, %14; \n\t"
            "fma.rn.f32 %15, %94, %95, %15; \n\t"
            "fma.rn.f32 %16, %96, %97, %16; \n\t"
            "fma.rn.f32 %17, %98, %99, %17; \n\t"
            "fma.rn.f32 %18, %100, %101, %18; \n\t"
            "fma.rn.f32 %19, %102, %103, %19; \n\t"
            "fma.rn.f32 %20, %104, %105, %20; \n\t"
            "fma.rn.f32 %21, %106, %107, %21; \n\t"
            "fma.rn.f32 %22, %108, %109, %22; \n\t"
            "fma.rn.f32 %23, %110, %111, %23; \n\t"
            "fma.rn.f32 %24, %112, %113, %24; \n\t"
            "fma.rn.f32 %25, %114, %115, %25; \n\t"
            "fma.rn.f32 %26, %116, %117, %26; \n\t"
            "fma.rn.f32 %27, %118, %119, %27; \n\t"
            "fma.rn.f32 %28, %120, %121, %28; \n\t"
            "fma.rn.f32 %29, %122, %123, %29; \n\t"
            "fma.rn.f32 %30, %124, %125, %30; \n\t"
            "fma.rn.f32 %31, %126, %127, %31; \n\t"
            "fma.rn.f32 %32, %128, %129, %32; \n\t"
            "fma.rn.f32 %33, %130, %131, %33; \n\t"
            "fma.rn.f32 %34, %132, %133, %34; \n\t"
            "fma.rn.f32 %35, %134, %135, %35; \n\t"
            "fma.rn.f32 %36, %136, %137, %36; \n\t"
            "fma.rn.f32 %37, %138, %139, %37; \n\t"
            "fma.rn.f32 %38, %140, %141, %38; \n\t"
            "fma.rn.f32 %39, %142, %143, %39; \n\t"
            "fma.rn.f32 %40, %144, %145, %40; \n\t"
            "fma.rn.f32 %41, %146, %147, %41; \n\t"
            "fma.rn.f32 %42, %148, %149, %42; \n\t"
            "fma.rn.f32 %43, %150, %151, %43; \n\t"
            "fma.rn.f32 %44, %152, %153, %44; \n\t"
            "fma.rn.f32 %45, %154, %155, %45; \n\t"
            "fma.rn.f32 %46, %156, %157, %46; \n\t"
            "fma.rn.f32 %47, %158, %159, %47; \n\t"
            "fma.rn.f32 %48, %160, %161, %48; \n\t"
            "fma.rn.f32 %49, %162, %163, %49; \n\t"
            "fma.rn.f32 %50, %164, %165, %50; \n\t"
            "fma.rn.f32 %51, %166, %167, %51; \n\t"
            "fma.rn.f32 %52, %168, %169, %52; \n\t"
            "fma.rn.f32 %53, %170, %171, %53; \n\t"
            "fma.rn.f32 %54, %172, %173, %54; \n\t"
            "fma.rn.f32 %55, %174, %175, %55; \n\t"
            "fma.rn.f32 %56, %176, %177, %56; \n\t"
            "fma.rn.f32 %57, %178, %179, %57; \n\t"
            "fma.rn.f32 %58, %180, %181, %58; \n\t"
            "fma.rn.f32 %59, %182, %183, %59; \n\t"
            "fma.rn.f32 %60, %184, %185, %60; \n\t"
            "fma.rn.f32 %61, %186, %187, %61; \n\t"
            "fma.rn.f32 %62, %188, %189, %62; \n\t"
            "fma.rn.f32 %63, %190, %191, %63; \n\t"
            // --- Output Operands (Read+Write): %0 .. %63 ---
            : "+f"(rC[0]),  "+f"(rC[1]), "+f"(rC[2]), "+f"(rC[3]),
              "+f"(rC[4]),  "+f"(rC[5]), "+f"(rC[6]), "+f"(rC[7]),
              "+f"(rC[8]),  "+f"(rC[9]), "+f"(rC[10]), "+f"(rC[11]),
              "+f"(rC[12]),  "+f"(rC[13]), "+f"(rC[14]), "+f"(rC[15]),
              "+f"(rC[16]),  "+f"(rC[17]), "+f"(rC[18]), "+f"(rC[19]),
              "+f"(rC[20]),  "+f"(rC[21]), "+f"(rC[22]), "+f"(rC[23]),
              "+f"(rC[24]),  "+f"(rC[25]), "+f"(rC[26]), "+f"(rC[27]),
              "+f"(rC[28]),  "+f"(rC[29]), "+f"(rC[30]), "+f"(rC[31]),
              "+f"(rC[32]),  "+f"(rC[33]), "+f"(rC[34]), "+f"(rC[35]),
              "+f"(rC[36]),  "+f"(rC[37]), "+f"(rC[38]), "+f"(rC[39]),
              "+f"(rC[40]),  "+f"(rC[41]), "+f"(rC[42]), "+f"(rC[43]),
              "+f"(rC[44]),  "+f"(rC[45]), "+f"(rC[46]), "+f"(rC[47]),
              "+f"(rC[48]),  "+f"(rC[49]), "+f"(rC[50]), "+f"(rC[51]),
              "+f"(rC[52]),  "+f"(rC[53]), "+f"(rC[54]), "+f"(rC[55]),
              "+f"(rC[56]),  "+f"(rC[57]), "+f"(rC[58]), "+f"(rC[59]),
              "+f"(rC[60]),  "+f"(rC[61]), "+f"(rC[62]), "+f"(rC[63])

            // --- Input Operands (Read-Only): %64 .. %191, as (rA[i], rB[i]) pairs ---
            : "f"(rA[0]),  "f"(rB[0]), "f"(rA[1]), "f"(rB[1]),
              "f"(rA[2]),  "f"(rB[2]), "f"(rA[3]), "f"(rB[3]),
              "f"(rA[4]),  "f"(rB[4]), "f"(rA[5]), "f"(rB[5]),
              "f"(rA[6]),  "f"(rB[6]), "f"(rA[7]), "f"(rB[7]),
              "f"(rA[8]),  "f"(rB[8]), "f"(rA[9]), "f"(rB[9]),
              "f"(rA[10]),  "f"(rB[10]), "f"(rA[11]), "f"(rB[11]),
              "f"(rA[12]),  "f"(rB[12]), "f"(rA[13]), "f"(rB[13]),
              "f"(rA[14]),  "f"(rB[14]), "f"(rA[15]), "f"(rB[15]),
              "f"(rA[16]),  "f"(rB[16]), "f"(rA[17]), "f"(rB[17]),
              "f"(rA[18]),  "f"(rB[18]), "f"(rA[19]), "f"(rB[19]),
              "f"(rA[20]),  "f"(rB[20]), "f"(rA[21]), "f"(rB[21]),
              "f"(rA[22]),  "f"(rB[22]), "f"(rA[23]), "f"(rB[23]),
              "f"(rA[24]),  "f"(rB[24]), "f"(rA[25]), "f"(rB[25]),
              "f"(rA[26]),  "f"(rB[26]), "f"(rA[27]), "f"(rB[27]),
              "f"(rA[28]),  "f"(rB[28]), "f"(rA[29]), "f"(rB[29]),
              "f"(rA[30]),  "f"(rB[30]), "f"(rA[31]), "f"(rB[31]),
              "f"(rA[32]),  "f"(rB[32]), "f"(rA[33]), "f"(rB[33]),
              "f"(rA[34]),  "f"(rB[34]), "f"(rA[35]), "f"(rB[35]),
              "f"(rA[36]),  "f"(rB[36]), "f"(rA[37]), "f"(rB[37]),
              "f"(rA[38]),  "f"(rB[38]), "f"(rA[39]), "f"(rB[39]),
              "f"(rA[40]),  "f"(rB[40]), "f"(rA[41]), "f"(rB[41]),
              "f"(rA[42]),  "f"(rB[42]), "f"(rA[43]), "f"(rB[43]),
              "f"(rA[44]),  "f"(rB[44]), "f"(rA[45]), "f"(rB[45]),
              "f"(rA[46]),  "f"(rB[46]), "f"(rA[47]), "f"(rB[47]),
              "f"(rA[48]),  "f"(rB[48]), "f"(rA[49]), "f"(rB[49]),
              "f"(rA[50]),  "f"(rB[50]), "f"(rA[51]), "f"(rB[51]),
              "f"(rA[52]),  "f"(rB[52]), "f"(rA[53]), "f"(rB[53]),
              "f"(rA[54]),  "f"(rB[54]), "f"(rA[55]), "f"(rB[55]),
              "f"(rA[56]),  "f"(rB[56]), "f"(rA[57]), "f"(rB[57]),
              "f"(rA[58]),  "f"(rB[58]), "f"(rA[59]), "f"(rB[59]),
              "f"(rA[60]),  "f"(rB[60]), "f"(rA[61]), "f"(rB[61]),
              "f"(rA[62]),  "f"(rB[62]), "f"(rA[63]), "f"(rB[63])
        );
    }


    __syncthreads();
    *g_end = get_clock64();
    // Write the accumulators back so the host can inspect them
    for(int i=0; i<N; ++i) { C[i] = rC[i]; }
}


// --- KERNEL 2: The "Latency Bound" Loop ---
// Outer loop is over the N accumulator indices, inner loop is 'reps'.
// Every inner iteration issues a single FMA whose result feeds the next
// iteration (rC[i] = rA[i] * rB[i] + rC[i]), i.e. one serial dependency
// chain per index. The run time is therefore intended to be bound by FMA
// latency rather than throughput.
//
// Parameters:
//   A, B     - input arrays, N floats each are read
//   C        - output array, elements N .. 2N-1 are written (second half)
//   g_start  - receives the clock64 value sampled right before the timed loops
//   g_end    - receives the clock64 value sampled right after the timed loops
__global__ void kernel_LatencyBound(float *A, float *B, float *C, unsigned long long *g_start, unsigned long long *g_end) {
    
    // Per-thread operand and accumulator arrays of size N
    float rA[N], rB[N];
    float rC[N] = {0.0f};

    // Pre-load all operands
    for(int i=0; i<N; ++i) { 
        rA[i] = A[i];
        rB[i] = B[i];
    }
    __syncthreads(); // Block-wide barrier before the timed region starts

    *g_start = get_clock64();

    // Outer loop is over the N accumulator indices
    for (int i = 0; i < N; i++) {
        // Inner loop is over N_iter (reps)
        for (int repeat = 0; repeat < reps; repeat++) {
            
            // Executed 'reps' times for EACH 'i'. Each FMA reads the rC[i]
            // written by the previous iteration, forming a long dependency
            // chain; 'volatile' keeps the compiler from removing or merging
            // the repeated statements.
            asm volatile (
                "fma.rn.f32 %0, %1, %2, %0;"
                : "+f"(rC[i])   // %0: read+write accumulator
                : "f"(rA[i]),   // %1: read-only multiplicand
                  "f"(rB[i])    // %2: read-only multiplier
            );
        }
    }
    
    
    __syncthreads();
    *g_end = get_clock64();
    // Store into the second half of C so kernel 1's output (first half) is kept
    for(int i=0; i<N; ++i) { C[i + N] = rC[i]; } // Store in second half
}


// Host driver: runs both kernels once with a single thread each, then reports
// elapsed clocks and clocks per FMA for each of them.
// Returns 0 on success; terminates with EXIT_FAILURE if a host allocation or
// any CUDA runtime call / kernel launch fails.
int main() {
    // Abort with a readable message as soon as a CUDA runtime call fails;
    // otherwise a failed allocation or launch would silently produce
    // meaningless timings.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    unsigned long long h_start[2], h_end[2];
    unsigned long long *d_start, *d_end;

    // Buffer sizes: N floats per input, 2*N floats of output (one half per
    // kernel), and one start/end clock slot per kernel.
    size_t vec_bytes = N * sizeof(float);
    size_t c_bytes   = vec_bytes * 2; // 2 slots for output
    size_t clock_bytes = sizeof(unsigned long long) * 2;

    h_A = (float*)malloc(vec_bytes);
    h_B = (float*)malloc(vec_bytes);
    h_C = (float*)malloc(c_bytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(i + 1);
        h_B[i] = (float)(i + 1);
    }

    check(cudaMalloc(&d_A, vec_bytes), "cudaMalloc d_A");
    check(cudaMalloc(&d_B, vec_bytes), "cudaMalloc d_B");
    check(cudaMalloc(&d_C, c_bytes), "cudaMalloc d_C");
    check(cudaMalloc(&d_start, clock_bytes), "cudaMalloc d_start");
    check(cudaMalloc(&d_end, clock_bytes), "cudaMalloc d_end");

    check(cudaMemcpy(d_A, h_A, vec_bytes, cudaMemcpyHostToDevice), "copy A to device");
    check(cudaMemcpy(d_B, h_B, vec_bytes, cudaMemcpyHostToDevice), "copy B to device");

    // --- Launch Kernel 1 (Throughput Bound) ---
    printf("Launching 1: True Throughput Kernel (Fat ASM Block)...\n");
    kernel_ThroughputBound<<<1,1>>>(d_A, d_B, d_C, d_start, d_end);
    check(cudaGetLastError(), "kernel_ThroughputBound launch");
    check(cudaDeviceSynchronize(), "kernel_ThroughputBound execution");

    // --- Launch Kernel 2 (Latency Bound) ---
    // Uses the second clock slot; the kernel itself writes the second half of d_C.
    printf("Launching 2: Latency Bound Kernel (Swapped Loops)...\n");
    kernel_LatencyBound<<<1,1>>>(d_A, d_B, d_C, d_start + 1, d_end + 1);
    check(cudaGetLastError(), "kernel_LatencyBound launch");
    check(cudaDeviceSynchronize(), "kernel_LatencyBound execution");

    // Copy back results
    check(cudaMemcpy(h_C, d_C, c_bytes, cudaMemcpyDeviceToHost), "copy C to host");
    check(cudaMemcpy(h_start, d_start, clock_bytes, cudaMemcpyDeviceToHost), "copy start clocks to host");
    check(cudaMemcpy(h_end, d_end, clock_bytes, cudaMemcpyDeviceToHost), "copy end clocks to host");

    // Both kernels are meant to apply the same 'reps' FMAs to every
    // accumulator, so the two halves of C should agree. Report to stderr
    // only on mismatch so the normal stdout report is unchanged.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        if (h_C[i] != h_C[i + N]) {
            mismatches++;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "Warning: %d of %d accumulators differ between the two kernels\n", mismatches, N);
    }

    // --- Print clock results ---
    unsigned long long elapsed_throughput = h_end[0] - h_start[0];
    unsigned long long elapsed_latency    = h_end[1] - h_start[1];

    // Total FMA ops is (N * reps) for both kernels.
    unsigned long long total_ops = (unsigned long long)N * reps;

    double clocks_per_op_throughput = static_cast<double>(elapsed_throughput) / total_ops;
    double clocks_per_op_latency    = static_cast<double>(elapsed_latency) / total_ops;

    printf("\n--- Benchmark (N=%d, Reps=%d) ---\n", N, reps);
    printf("Total FMA Ops per kernel: %llu\n", total_ops);

    printf("\n[1. Throughput Bound Loop (Pipelined)]\n");
    printf("Total Clocks:  %llu\n", elapsed_throughput);
    printf("Clocks / FMA:  %f\n", clocks_per_op_throughput);

    printf("\n[2. Latency Bound Loop (Stalled)]\n");
    printf("Total Clocks:  %llu\n", elapsed_latency);
    printf("Clocks / FMA:  %f\n", clocks_per_op_latency);
    printf("----------------------------------------\n");

    // Cleanup
    cudaFree(d_end);
    cudaFree(d_start);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Launching 1: True Throughput Kernel (Fat ASM Block)...
Launching 2: Latency Bound Kernel (Swapped Loops)...

--- Benchmark (N=64, Reps=500000) ---
Total FMA Ops per kernel: 32000000

[1. Throughput Bound Loop (Pipelined)]
Total Clocks:  12344243
Clocks / FMA:  0.385758

[2. Latency Bound Loop (Stalled)]
Total Clocks:  134320234
Clocks / FMA:  4.197507
----------------------------------------

