In [1]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
set_defaults(compiler_args='-arch=sm_90a -Xptxas=-v')

Source files will be saved in "/tmp/tmp3k45waxx".


In [None]:
%%cuda
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include <cuda/barrier>
#include <cuda/ptx>
using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace ptx = cuda::ptx;

//# Number of ints staged through shared memory by each thread block.
static constexpr size_t buf_len = 1024;
//# Number of thread blocks launched; each block owns one buf_len-sized tile.
static constexpr int N_blocks = 4;
//# Total number of ints in the global buffer (an element count, NOT a byte count).
static constexpr size_t data_len = N_blocks * buf_len;
static constexpr int N_threads_per_block = 64;

//# Bulk async copies move multiples of 16 bytes between 16-byte aligned addresses.
static_assert((buf_len * sizeof(int)) % 16 == 0,
              "per-block tile size must be a multiple of 16 bytes for bulk copies");

//# Adds one to every int of the tile owned by the calling block.
//#
//# Block b processes data[offset + b*buf_len .. offset + (b+1)*buf_len). The tile
//# is staged through shared memory with bulk asynchronous (TMA) copies.
//#
//# Launch layout : 1D grid, 1D block; any block size works (block-stride loop).
//# Shared memory : buf_len * sizeof(int) bytes of static shared memory + one barrier.
//# Preconditions : data + offset must be 16-byte aligned and the buffer must hold
//#                 at least offset + gridDim.x * buf_len ints.
//# Requires      : sm_90 or newer (cp.async.bulk).
__global__ void add_one_kernel(int* data, size_t offset)
{
  //# Shared memory buffer. The destination shared memory buffer of
  //# a bulk operation should be 16 byte aligned.
  __shared__ alignas(16) int smem_data[buf_len];

  //# Start of this block's private tile in global memory. Without the blockIdx
  //# term every block would race on the same buf_len elements.
  int* const tile = data + offset + static_cast<size_t>(blockIdx.x) * buf_len;

  //# 1. a) Initialize shared memory barrier with the number of threads participating in the barrier.
  //#    b) Make initialized barrier visible in async proxy.
  #pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ barrier bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);                      // a)
    ptx::fence_proxy_async(ptx::space_shared);   // b)
  }
  __syncthreads();

  //# 2. Initiate TMA transfer to copy global to shared memory.
  if (threadIdx.x == 0) {
    //# 3a. cuda::memcpy_async arrives on the barrier and communicates
    //#     how many bytes are expected to come in (the transaction count)
    cuda::memcpy_async(
        smem_data,
        tile,
        cuda::aligned_size_t<16>(sizeof(smem_data)),
        bar
    );
  }
  //# 3b. All threads arrive on the barrier
  barrier::arrival_token token = bar.arrive();

  //# 3c. Wait for the data to have arrived.
  bar.wait(std::move(token));

  //# 4. Add one to every element, in place in shared memory (block-stride loop).
  for (size_t i = threadIdx.x; i < buf_len; i += blockDim.x) {
    smem_data[i] += 1;
  }

  //# 5. Wait for shared memory writes to be visible to TMA engine.
  ptx::fence_proxy_async(ptx::space_shared);
  __syncthreads();
  //# After syncthreads, writes by all threads are visible to TMA engine.

  //# 6. Initiate TMA transfer to copy shared memory to global memory
  if (threadIdx.x == 0) {
    ptx::cp_async_bulk(
        ptx::space_global,
        ptx::space_shared,
        tile, smem_data, sizeof(smem_data));
    //# 7. Wait for TMA transfer to have finished reading shared memory.
    //# Create a "bulk async-group" out of the previous bulk copy operation.
    ptx::cp_async_bulk_commit_group();
    //# Wait for the group to have completed reading from shared memory.
    ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>());
  }
}

//# Aborts the program with a readable message when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

//# Host driver: uploads a zero-filled buffer, runs add_one_kernel with one block
//# per tile, prints the first 100 results and verifies that every element is 1.
int main()
{
  //# All allocation/copy sizes are in BYTES; data_len is an element count.
  constexpr size_t data_bytes = data_len * sizeof(int);

  int* data_h = nullptr;
  check_cuda(cudaHostAlloc(&data_h, data_bytes, cudaHostAllocDefault), "cudaHostAlloc");
  //# cudaHostAlloc does not initialise memory; start from a known state.
  for (size_t i = 0; i < data_len; ++i) {
    data_h[i] = 0;
  }

  int* data_d = nullptr;
  size_t offset = 0;
  check_cuda(cudaMalloc(&data_d, data_bytes), "cudaMalloc");
  check_cuda(cudaMemcpy(data_d, data_h, data_bytes, cudaMemcpyHostToDevice), "cudaMemcpy H2D");

  add_one_kernel<<<N_blocks, N_threads_per_block>>>(data_d, offset);
  check_cuda(cudaGetLastError(), "add_one_kernel launch");
  check_cuda(cudaDeviceSynchronize(), "add_one_kernel execution");

  check_cuda(cudaMemcpy(data_h, data_d, data_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
  for (int i = 0; i < 100; i ++)
  {
    printf("D[%d]: %d ", i, data_h[i]);
  }
  printf("\n");

  //# Check the whole buffer, not only the printed prefix.
  size_t mismatches = 0;
  for (size_t i = 0; i < data_len; ++i) {
    if (data_h[i] != 1) {
      ++mismatches;
    }
  }
  if (mismatches != 0) {
    fprintf(stderr, "verification failed: %zu of %zu elements are not 1\n", mismatches, data_len);
  }

  check_cuda(cudaFreeHost(data_h), "cudaFreeHost");
  check_cuda(cudaFree(data_d), "cudaFree");
  return mismatches == 0 ? 0 : 1;
}


D[0]: 1 D[1]: 1 D[2]: 1 D[3]: 1 D[4]: 1 D[5]: 1 D[6]: 1 D[7]: 1 D[8]: 1 D[9]: 1 D[10]: 1 D[11]: 1 D[12]: 1 D[13]: 1 D[14]: 1 D[15]: 1 D[16]: 1 D[17]: 1 D[18]: 1 D[19]: 1 D[20]: 1 D[21]: 1 D[22]: 1 D[23]: 1 D[24]: 1 D[25]: 1 D[26]: 1 D[27]: 1 D[28]: 1 D[29]: 1 D[30]: 1 D[31]: 1 D[32]: 1 D[33]: 1 D[34]: 1 D[35]: 1 D[36]: 1 D[37]: 1 D[38]: 1 D[39]: 1 D[40]: 1 D[41]: 1 D[42]: 1 D[43]: 1 D[44]: 1 D[45]: 1 D[46]: 1 D[47]: 1 D[48]: 1 D[49]: 1 D[50]: 1 D[51]: 1 D[52]: 1 D[53]: 1 D[54]: 1 D[55]: 1 D[56]: 1 D[57]: 1 D[58]: 1 D[59]: 1 D[60]: 1 D[61]: 1 D[62]: 1 D[63]: 1 D[64]: 1 D[65]: 1 D[66]: 1 D[67]: 1 D[68]: 1 D[69]: 1 D[70]: 1 D[71]: 1 D[72]: 1 D[73]: 1 D[74]: 1 D[75]: 1 D[76]: 1 D[77]: 1 D[78]: 1 D[79]: 1 D[80]: 1 D[81]: 1 D[82]: 1 D[83]: 1 D[84]: 1 D[85]: 1 D[86]: 1 D[87]: 1 D[88]: 1 D[89]: 1 D[90]: 1 D[91]: 1 D[92]: 1 D[93]: 1 D[94]: 1 D[95]: 1 D[96]: 1 D[97]: 1 D[98]: 1 D[99]: 1 


In [24]:
%%cuda
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include <cuda/barrier>
#include <cuda/ptx>
#include<cuda.h>
using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace ptx = cuda::ptx;
#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H



/**
 * @brief Macro to wrap CUDA API calls.
 * * Usage: CUDA_CHECK(cudaMalloc(&d_a, size));
 *
 * The argument is evaluated exactly once and forwarded to gpuAssert together
 * with the call site's file and line. The body is wrapped in do { } while (0)
 * so the macro behaves like a single statement and stays safe inside an
 * unbraced if/else.
 */
#define CUDA_CHECK(call) \
    do { \
        gpuAssert((call), __FILE__, __LINE__); \
    } while (0)

/**
 * @brief Reports a failed CUDA runtime call and optionally terminates.
 * * Does nothing when @p code is cudaSuccess. Otherwise prints the error name,
 * its description and the originating file/line to stderr.
 * @param code  Status returned by the CUDA runtime function.
 * @param file  Source file of the failing call (normally __FILE__).
 * @param line  Source line of the failing call (normally __LINE__).
 * @param abort When true (default), resets the device and exits with @p code
 *              as the process status; when false, returns after logging.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    // Fast path: nothing to report.
    if (code == cudaSuccess)
    {
        return;
    }

    // Symbolic name (e.g. cudaErrorMemoryAllocation) and human-readable
    // description (e.g. "out of memory") of the failure.
    const char *error_name = cudaGetErrorName(code);
    const char *error_text = cudaGetErrorString(code);

    fprintf(stderr, "CUDA_CHECK Error: %s %s\n", error_name, error_text);
    fprintf(stderr, "  File: %s\n", file);
    fprintf(stderr, "  Line: %d\n", line);

    if (!abort)
    {
        return;
    }

    // Reset the device before exiting so profiling data gets flushed.
    cudaDeviceReset();
    exit(code);
}

/**
 * @brief Macro to check for errors after a kernel launch.
 * * Kernel launches are asynchronous. This macro checks:
 * 1. If the launch itself failed (invalid configuration), via cudaGetLastError().
 * 2. Synchronizes the device to check for execution errors (bus errors, segfaults).
 * * WARNING: Using this slows down code significantly because of the synchronization.
 * Use only for debugging.
 *
 * The body is wrapped in do { } while (0) so the macro acts as one statement
 * (safe in unbraced if/else), and the local status variable carries a
 * macro-specific name so it cannot collide with a variable at the call site.
 */
#define CUDA_CHECK_KERNEL() \
    do { \
        cudaError_t cuda_check_kernel_err_ = cudaGetLastError(); \
        if (cuda_check_kernel_err_ != cudaSuccess) { \
            fprintf(stderr, "Kernel Launch Error (Sync): %s\n", cudaGetErrorString(cuda_check_kernel_err_)); \
            gpuAssert(cuda_check_kernel_err_, __FILE__, __LINE__); \
        } \
        cuda_check_kernel_err_ = cudaDeviceSynchronize(); \
        if (cuda_check_kernel_err_ != cudaSuccess) { \
            fprintf(stderr, "Kernel Execution Error (Async): %s\n", cudaGetErrorString(cuda_check_kernel_err_)); \
            gpuAssert(cuda_check_kernel_err_, __FILE__, __LINE__); \
        } \
    } while (0)

#endif // CUDA_UTILS_H

#include <cudaTypedefs.h> // PFN_cuTensorMapEncodeTiled, CUtensorMap

//# Looks up the driver entry point for cuTensorMapEncodeTiled (12.0 ABI) through
//# the runtime API, so the program does not have to link against libcuda directly.
//#
//# Returns a callable function pointer. On any failure (runtime error, symbol not
//# found, driver too old) an error is printed to stderr and the process exits;
//# the function never returns a null pointer.
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() {
  // Start from a failure state so a query that never writes the status is still caught.
  cudaDriverEntryPointQueryResult driver_status = cudaDriverEntryPointSymbolNotFound;
  void* cuTensorMapEncodeTiled_ptr = nullptr;
  CUDA_CHECK(cudaGetDriverEntryPointByVersion("cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr, 12000, cudaEnableDefault, &driver_status));

  // Checked at runtime rather than with assert(): assert compiles away under
  // NDEBUG, which would let a null function pointer escape to the caller.
  if (driver_status != cudaDriverEntryPointSuccess || cuTensorMapEncodeTiled_ptr == nullptr) {
    fprintf(stderr, "cuTensorMapEncodeTiled entry point unavailable (query status %d)\n",
            static_cast<int>(driver_status));
    exit(EXIT_FAILURE);
  }

  return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}
//# Descriptor parameters for a 2D tiled tensor map over an N x N float matrix,
//# accessed in BN x BN tiles. The arrays below are passed to
//# cuTensorMapEncodeTiled from main().
constexpr uint32_t N = 4096; 
constexpr uint32_t BN = 32;
constexpr uint32_t rank = 2; 
constexpr uint32_t GMEM_WIDTH = N; 
constexpr uint32_t GMEM_HEIGHT = N; 
constexpr uint32_t SMEM_WIDTH = BN; 
constexpr uint32_t SMEM_HEIGHT = BN;
uint64_t size[rank] = {GMEM_WIDTH, GMEM_HEIGHT}; //# Global extents in elements, fastest-varying dimension first.
uint64_t stride[rank-1] =  {GMEM_WIDTH*sizeof(float)}; //# Bytes from one row to the next (row-major); only rank-1 strides are given.
uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT}; //# Tile (box) extents in elements, same dimension order as size.
//# NOTE(review): the original author understood the tile to be laid out densely
//# (linearly) in shared memory, so no shared-memory stride is specified here —
//# confirm against the tensor-map documentation.
uint32_t elem_stride[rank] = {1,1}; //# Passed as elementStrides; one entry per dimension.
//# NOTE(review): {1,1} presumably means every element is visited with no skipping
//# along either dimension — TODO confirm the exact semantics of values other
//# than 1 before changing this.

//# Allocates an N x N float matrix on host and device, uploads it, and encodes
//# a 2D tiled tensor map (CUtensorMap) describing BN x BN tiles of the device
//# matrix. Returns 0 on success and 1 if the tensor map could not be encoded.
int main()
{
  float* A_h = nullptr;
  float* A_d = nullptr;
  //# Widen before multiplying so the byte count cannot overflow 32 bits.
  const size_t elem_count = static_cast<size_t>(N) * N;
  const size_t size_tensor = elem_count * sizeof(float);

  CUDA_CHECK(cudaHostAlloc(&A_h, size_tensor, cudaHostAllocDefault));
  //# cudaHostAlloc does not initialise memory; upload a defined (all-zero) matrix.
  for (size_t i = 0; i < elem_count; ++i) {
    A_h[i] = 0.0f;
  }
  CUDA_CHECK(cudaMalloc(&A_d, size_tensor));
  CUDA_CHECK(cudaMemcpy(A_d, A_h, size_tensor, cudaMemcpyHostToDevice));

  //# The tensor map must describe the DEVICE buffer itself, i.e. the value of
  //# A_d, not the address of the host variable that holds it.
  void *tensor_ptr = A_d;

  CUtensorMap tensor_map{};
  auto encode_tiled = get_cuTensorMapEncodeTiled(); //#function pointer to the tensor map creation api
  CUresult res = encode_tiled(
  &tensor_map,                //# CUtensorMap *tensorMap,
  //# Element type must match the buffer (float) and the byte strides above.
  CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
  rank,                       //# cuuint32_t tensorRank,
  tensor_ptr,                 //# void *globalAddress,
  size,                       //# const cuuint64_t *globalDim,
  stride,                     //# const cuuint64_t *globalStrides,
  box_size,                   //# const cuuint32_t *boxDim,
  elem_stride,                //# const cuuint32_t *elementStrides,
  //# Interleave patterns can be used to accelerate loading of values that
  //# are less than 4 bytes long.
  CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
  //# Swizzling can be used to avoid shared memory bank conflicts.
  CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
  //# L2 Promotion can be used to widen the effect of a cache-policy to a wider
  //# set of L2 cache lines.
  CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
  //# Any element that is outside of bounds will be set to zero by the TMA transfer.
  CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
);

  //# The driver API reports failures through CUresult, not cudaError_t.
  const bool encoded = (res == CUDA_SUCCESS);
  if (!encoded) {
    fprintf(stderr, "cuTensorMapEncodeTiled failed with CUresult %d\n", static_cast<int>(res));
  }

  //# Release both buffers on every path.
  CUDA_CHECK(cudaFree(A_d));
  CUDA_CHECK(cudaFreeHost(A_h));

  return encoded ? 0 : 1;
}


