In [1]:
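# Scratch generator (kept for reference): prints one `st.global.f32` line per
# register %fA0..%fA83, each storing to %thread_write_addr at a 4-byte stride.
# Uncomment to regenerate those PTX lines.
# for i in range(84):
#   print(f"st.global.f32 [%thread_write_addr + {4*i}],  %fA{i};")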

In [2]:

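# Scratch generator (kept for reference): prints 20 consecutive `fma.rn.f32`
# lines for each of the 84 registers, i.e. %fA{i} = %fB{i} * %fC{i} + %fA{i}.
# Uncomment to regenerate those PTX lines.
# for i in range(84):
#   for _ in range(20):
#     print(f"    fma.rn.f32 %fA{i}, %fB{i}, %fC{i}, %fA{i}; ")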

In [3]:
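# Scratch generator (kept for reference): prints one `mov.f32` line per
# register, loading %fC{i} with the constant 3*i/100.
# Uncomment to regenerate those PTX lines.
# for i in range(84): 
#   print(f"mov.f32 %fC{i}, {(3*i/100)};")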

In [4]:
import sys
import numpy as np
# Switched from 'from cuda import cuda' to 'cuda.bindings.driver' to fix deprecation warning
from cuda.bindings import driver as cuda

def check_cuda_errors(result):
    """Validate a CUDA driver API result and unwrap its payload.

    Parameters
    ----------
    result : CUresult or tuple
        Either a bare status code, or a tuple whose first element is the
        status code followed by zero or more return values.

    Returns
    -------
    object
        ``None`` when the call produced no payload, the payload itself when
        there is exactly one value, or a tuple of all payload values when the
        call returned several (previously everything after the first value
        was silently discarded).

    Raises
    ------
    RuntimeError
        If the status code is not ``CUDA_SUCCESS``, or if ``result`` is an
        empty tuple and therefore carries no status code at all.
    """
    if isinstance(result, tuple):
        if not result:
            # Without a status code there is nothing to validate; fail loudly
            # with the same exception type callers already handle.
            raise RuntimeError("CUDA Error: empty result tuple (no status code)")
        err, *payload = result
    else:
        err, payload = result, []

    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"CUDA Error: {err}")

    if not payload:
        return None
    if len(payload) == 1:
        return payload[0]
    # Multi-output calls: hand back every value instead of only the first.
    return tuple(payload)

def _release_cuda_resources(device_buffers, module, context):
    """Best-effort teardown of everything run_cuda_kernel acquired.

    Device buffers are freed first (most recent first), then the module is
    unloaded, then the context is destroyed. A failing release is reported
    and skipped so the remaining resources still get a chance to be released.
    """
    teardown = [(cuda.cuMemFree, buf) for buf in reversed(device_buffers)]
    if module is not None:
        teardown.append((cuda.cuModuleUnload, module))
    if context is not None:
        teardown.append((cuda.cuCtxDestroy, context))

    for release, handle in teardown:
        try:
            check_cuda_errors(release(handle))
        except Exception as e:
            name = getattr(release, "__name__", repr(release))
            print(f"Warning: {name} failed during cleanup: {e}")


def run_cuda_kernel(ptx_path, kernel_name):
    """Load a PTX file, launch one kernel from it once, and fetch its outputs.

    The kernel is launched as a single block of ``NUM_THREADS`` threads and
    receives three device pointers, in this order: start-clock buffer,
    end-clock buffer, results buffer. All CUDA resources created here
    (context, module, device buffers) are released before returning, on both
    success and failure.

    Parameters
    ----------
    ptx_path : str
        Path of the PTX file to load.
    kernel_name : str
        Name of the kernel entry point inside that PTX module.

    Returns
    -------
    tuple
        ``(h_start_clock, h_end_clock, h_results)``: two ``uint64`` arrays of
        length ``NUM_THREADS`` and one ``float32`` array of length
        ``NUM_THREADS * REGS_PER_THREAD``. If anything fails, the error is
        printed and ``(None, None, None)`` is returned instead.
    """
    # --- Configuration ---
    # Launch parameters stay hardcoded as requested
    NUM_THREADS = 1        # A single thread (not a full 32-thread warp)
    REGS_PER_THREAD = 84   # float32 values written back per thread

    # Handles are tracked from the start so the `finally` clause can release
    # whatever was acquired, no matter where the function exits.
    context = None
    module = None
    device_buffers = []

    try:
        # --- 1. Initialize CUDA ---
        check_cuda_errors(cuda.cuInit(0))
        device = check_cuda_errors(cuda.cuDeviceGet(0))
        context = check_cuda_errors(cuda.cuCtxCreate(0, device))
        print(f"Context created on device: {device}")

        # --- 2. Load PTX Module ---
        try:
            with open(ptx_path, "rb") as f:
                ptx_data = f.read()
            # Ensure null-termination for the driver
            if not ptx_data.endswith(b'\0'):
                ptx_data += b'\0'
        except FileNotFoundError:
            print(f"Error: Could not find '{ptx_path}'. Make sure it's in the same directory.")
            return None, None, None

        module = check_cuda_errors(cuda.cuModuleLoadData(ptx_data))
        kernel = check_cuda_errors(cuda.cuModuleGetFunction(module, kernel_name.encode("utf-8")))

        # --- 3. Allocate Host Memory ---
        # Timestamps: one uint64 slot per thread
        h_start_clock = np.zeros(NUM_THREADS, dtype=np.uint64)
        h_end_clock = np.zeros(NUM_THREADS, dtype=np.uint64)

        # Results: NUM_THREADS threads * REGS_PER_THREAD values * 4 bytes each
        total_floats = NUM_THREADS * REGS_PER_THREAD
        h_results = np.zeros(total_floats, dtype=np.float32)

        print(f"Allocating {h_results.nbytes / 1024:.2f} KB for results...")

        # --- 4. Allocate Device Memory ---
        # Each buffer is registered for cleanup immediately after allocation,
        # so a failure in a later allocation does not leak the earlier ones.
        d_start_clock = check_cuda_errors(cuda.cuMemAlloc(h_start_clock.nbytes))
        device_buffers.append(d_start_clock)
        d_end_clock = check_cuda_errors(cuda.cuMemAlloc(h_end_clock.nbytes))
        device_buffers.append(d_end_clock)
        d_results = check_cuda_errors(cuda.cuMemAlloc(h_results.nbytes))
        device_buffers.append(d_results)

        # --- 5. Prepare Kernel Arguments ---
        # The driver API expects an array of pointers to the arguments.
        # Each device pointer is converted to a plain integer and stored in a
        # one-element numpy array so that its host address can be taken.
        arg_start = np.array([int(d_start_clock)], dtype=np.uint64)
        arg_end = np.array([int(d_end_clock)], dtype=np.uint64)
        arg_res = np.array([int(d_results)], dtype=np.uint64)

        args = np.array([
            arg_start.ctypes.data,
            arg_end.ctypes.data,
            arg_res.ctypes.data
        ], dtype=np.uint64)

        # --- 6. Launch Kernel ---
        print(f"Launching Kernel '{kernel_name}' from '{ptx_path}'...")
        check_cuda_errors(cuda.cuLaunchKernel(
            kernel,
            1, 1, 1,            # Grid (1 block)
            NUM_THREADS, 1, 1,  # Block (NUM_THREADS threads)
            0,                  # Shared Mem
            0,                  # Stream
            args.ctypes.data,   # Kernel Arguments
            0                   # Extra (NULL)
        ))

        # Synchronize so the kernel has finished before results are read back
        check_cuda_errors(cuda.cuCtxSynchronize())

        # --- 7. Copy Back Results ---
        check_cuda_errors(cuda.cuMemcpyDtoH(h_start_clock.ctypes.data, d_start_clock, h_start_clock.nbytes))
        check_cuda_errors(cuda.cuMemcpyDtoH(h_end_clock.ctypes.data, d_end_clock, h_end_clock.nbytes))
        check_cuda_errors(cuda.cuMemcpyDtoH(h_results.ctypes.data, d_results, h_results.nbytes))

        # --- 8. Return Results ---
        # Host arrays stay valid after the device-side cleanup below.
        return h_start_clock, h_end_clock, h_results

    except RuntimeError as e:
        print(e)
        return None, None, None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None, None, None
    finally:
        # Runs on every exit path, including the early return for a missing
        # PTX file, so repeated calls do not pile up contexts and buffers.
        _release_cuda_resources(device_buffers, module, context)


In [5]:
# Run the "good" variant (ilp_good.ptx, entry point fma_max_pressure).
# run_cuda_kernel returns (start clocks, end clocks, results), or
# (None, None, None) if anything failed, so later cells assume success.
# NOTE(review): the absolute /root/... path is machine-specific; consider a
# configurable base directory.
starts_good, ends_good, res_good = run_cuda_kernel("/root/CudaNotebooks2/ILP_DEMON/ilp_good.ptx", "fma_max_pressure")

Context created on device: <CUdevice 0>
Allocating 0.33 KB for results...
Launching Kernel 'fma_max_pressure' from '/root/CudaNotebooks2/ILP_DEMON/ilp_good.ptx'...


In [7]:
# Run the "bad" variant (ilp_bad.ptx) through the same harness so its timing
# and results can be compared with the good variant.
# NOTE(review): the absolute /root/... path is machine-specific. The entry
# point name must match the one in the PTX file; consider renaming it there
# to something neutral (e.g. fma_low_ilp) and updating this string with it.
starts_bad, ends_bad, res_bad = run_cuda_kernel("/root/CudaNotebooks2/ILP_DEMON/ilp_bad.ptx", "fma_retarded_pressure")

Context created on device: <CUdevice 0>
Allocating 0.33 KB for results...
Launching Kernel 'fma_retarded_pressure' from '/root/CudaNotebooks2/ILP_DEMON/ilp_bad.ptx'...


In [8]:
# Sanity check before comparing timings: both kernel variants must produce
# numerically matching results (within np.allclose's default tolerances).
print(np.allclose(res_bad,res_good))

True


In [9]:
# Elapsed clock count per thread for the good variant: end stamp minus start
# stamp. Both arrays are uint64, so the result would wrap if end < start.
times_good = ends_good - starts_good

In [10]:
# Elapsed clock count per thread for the bad variant: end stamp minus start
# stamp. Both arrays are uint64, so the result would wrap if end < start.
times_bad = ends_bad - starts_bad

In [11]:
# Display the elapsed clock count measured for the good variant.
times_good

array([2709], dtype=uint64)

In [12]:
# Display the elapsed clock count measured for the bad variant.
times_bad

array([2868], dtype=uint64)

In [13]:
# FMA count used for the throughput figures: 84 * 20. Presumably 84 registers
# with 20 FMAs each, matching the commented-out generator loops at the top of
# the notebook; TODO confirm against the PTX files actually loaded.
N_fmas = np.array([84*20]) 
# Throughput of the good variant: FMA count divided by elapsed clock count.
N_fmas_per_clock_good = N_fmas/times_good

In [14]:
# Throughput of the bad variant: same FMA count divided by its elapsed clock count.
N_fmas_per_clock_bad = N_fmas/times_bad

In [15]:
# Display FMAs per clock for the bad variant.
N_fmas_per_clock_bad

array([0.58577406])

In [16]:
# Display FMAs per clock for the good variant.
N_fmas_per_clock_good

array([0.62015504])