In [1]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v -O0 -I/workspace/cutlass/include')

Source files will be saved in "/tmp/tmpk_1rluod".


In [None]:
%%cuda 

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h> // Driver API
#include <stdint.h>
#include <time.h>
#include <string>    // For std::string
#include <fstream>   // For std::ifstream
#include <sstream>   // For std::stringstream

// Driver API error checking macro.
// Evaluates `call` exactly once; if the result is not CUDA_SUCCESS, prints the
// driver's error description together with the file and line of the call site
// and terminates the process with EXIT_FAILURE.
// The description pointer is pre-initialised and the return value of
// cuGetErrorString is checked, so a failed lookup prints a fallback text
// instead of handing an invalid pointer to printf. Locals carry a trailing
// underscore to avoid clashing with names used in the `call` expression.
#define CHECK_CUDA_DRIVER(call) \
    do { \
        const CUresult check_err_ = (call); \
        if (check_err_ != CUDA_SUCCESS) { \
            const char* check_str_ = NULL; \
            if (cuGetErrorString(check_err_, &check_str_) != CUDA_SUCCESS || check_str_ == NULL) { \
                check_str_ = "unknown error"; \
            } \
            printf("CUDA Driver error: %s in %s at line %d\n", check_str_, __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Launch geometry and debug-buffer sizing shared by the host code below.
constexpr int N_ctas = 2;               // number of CTAs (thread blocks) in the grid
constexpr int N_threads_per_block = 32; // threads launched per CTA
// The host prints two values per CTA ("Alloc 0" and "Alloc 1"), so the debug
// buffer size is derived from N_ctas rather than hard-coded; this keeps the
// result loop in bounds when N_ctas is changed. Evaluates to 4 for N_ctas == 2.
constexpr int N_debug_vals = 2 * N_ctas;

// --- Function to read a file into a string ---
// Returns the full contents of `filename` as a std::string.
// If the file cannot be opened, prints an error message and terminates the
// process with EXIT_FAILURE.
std::string readPtxFile(const std::string& filename) {
    std::ifstream ptx_stream(filename);
    if (ptx_stream.is_open()) {
        // Drain the whole stream buffer into an in-memory string in one shot.
        std::ostringstream contents;
        contents << ptx_stream.rdbuf();
        return contents.str();
    }

    printf("Error: Could not open PTX file: %s\n", filename.c_str());
    exit(EXIT_FAILURE);
}

int main()
{
  uint32_t* h_debug_val;
  size_t size = N_debug_vals * sizeof(uint32_t);
  
  CUdevice device;
  CUcontext context;
  CUmodule module;
  CUfunction kernel;
  CUdeviceptr d_debug_val;

  // --- 1. Init CUDA Driver API ---
  CHECK_CUDA_DRIVER(cuInit(0));
  CHECK_CUDA_DRIVER(cuDeviceGet(&device, 0));
  CHECK_CUDA_DRIVER(cuCtxCreate(&context, 0, device));

  // --- 2. Load PTX Kernel from file ---
  std::string ptx_content = readPtxFile("/root/CudaNotebooks/cheat_sheet/tcgen5alloc.ptx");
  CHECK_CUDA_DRIVER(cuModuleLoadData(&module, ptx_content.c_str()));
  CHECK_CUDA_DRIVER(cuModuleGetFunction(&kernel, module, "alloc_dealloc_tmem_ptx"));

  // --- 3. Allocate Host and Device Memory ---
  h_debug_val = (uint32_t*)malloc(size);
  CHECK_CUDA_DRIVER(cuMemAlloc(&d_debug_val, size));

  // --- 4. Init Host Data and Copy H -> D ---
  srand(time(NULL));
  printf("Host array before kernel (random values):\n");
  for (int i = 0; i < N_debug_vals; i++) {
    h_debug_val[i] = rand();
    printf("h_debug_val[%d] = %u\n", i, h_debug_val[i]);
  }
  
  CHECK_CUDA_DRIVER(cuMemcpyHtoD(d_debug_val, h_debug_val, size));

  // --- 5. Launch Kernel ---
  // Setup kernel parameters: one argument, a pointer to the device buffer
  void* args[] = { &d_debug_val };

  CHECK_CUDA_DRIVER(cuLaunchKernel(
      kernel,
      N_ctas, 1, 1,           // Grid dimensions
      N_threads_per_block, 1, 1, // Block dimensions
      0,                       // Shared memory (0, we use static .shared)
      NULL,                    // Stream
      args,                    // Kernel arguments
      NULL                     // Extra
  ));
  
  CHECK_CUDA_DRIVER(cuCtxSynchronize());

  // --- 6. Copy D -> H and Print ---
  CHECK_CUDA_DRIVER(cuMemcpyDtoH(h_debug_val, d_debug_val, size));

  printf("\nKernel executed successfully.\n");
  printf("TMEM addresses from device:\n");
  for (int i = 0; i < N_ctas; i++) {
    printf("CTA %d, Alloc 0 TMEM address: %u \n", i, h_debug_val[2*i + 0]);
    printf("CTA %d, Alloc 1 TMEM address: %u \n", i, h_debug_val[2*i + 1]);
  }

  // --- 7. Cleanup ---
  CHECK_CUDA_DRIVER(cuMemFree(d_debug_val));
  CHECK_CUDA_DRIVER(cuModuleUnload(module));
  CHECK_CUDA_DRIVER(cuCtxDestroy(context));
  free(h_debug_val);

  return 0;
}

ptxas info    : 0 bytes gmem
/usr/bin/ld: /tmp/tmpxft_00002eef_00000000-13_single_file.o: in function `main':
tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x1fb): undefined reference to `cuInit'
/usr/bin/ld: tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x21e): undefined reference to `cuGetErrorString'
/usr/bin/ld: tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x263): undefined reference to `cuDeviceGet'
/usr/bin/ld: tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x286): undefined reference to `cuGetErrorString'
/usr/bin/ld: tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x2d1): undefined reference to `cuCtxCreate_v2'
/usr/bin/ld: tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x2f4): undefined reference to `cuGetErrorString'
/usr/bin/ld: tmpxft_00002eef_00000000-6_single_file.compute_100a.cudafe1.cpp:(.text+0x396): undefined reference to `