In [1]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
# Default nvcc flags applied to the %%cuda cells of this notebook:
#   -arch=sm_100a   build for the architecture-specific sm_100a target
#                   (presumably required by the tcgen05 inline PTX in the CUDA cell - TODO confirm)
#   -Xptxas=-v      forward -v to ptxas so it prints its verbose report
#   -O0             host optimization level 0
#   -I/workspace/cutlass/include   adds the CUTLASS headers to the include path
#                   (NOTE(review): no CUTLASS header is included by the visible CUDA cell)
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v -O0 -I/workspace/cutlass/include')

Source files will be saved in "/tmp/tmp4n309q96".


In [None]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdint.h>
#include <cuda_bf16.h>
/**
 * Builds the 64-bit shared-memory matrix descriptor consumed by the tcgen05
 * family of instructions (for example tcgen05.cp) on Blackwell (sm_100a).
 *
 * Bit layout of the returned value (ranges are inclusive):
 *    0-13  matrix start address            (encoded, see below)
 *   14-15  reserved, left 0
 *   16-29  leading dimension byte offset   (encoded)
 *   30-31  reserved, left 0
 *   32-45  stride dimension byte offset    (encoded)
 *   46-48  fixed constant 0b001
 *   49-51  matrix base offset
 *   52     leading dimension stride mode   (0: relative, 1: absolute)
 *   53-60  fixed constant 0
 *   61-63  swizzling mode
 *
 * Encoding rule for the three 14-bit fields:
 *   matrix-descriptor-encode(x) = (x & 0x3FFFF) >> 4
 * which is the same as dropping the low 4 bits and keeping 14 bits.
 *
 * @param smem_start_offset    32-bit byte offset of the matrix start in shared
 *                             memory. Must be 16-byte aligned (the low 4 bits
 *                             are discarded by the encoding).
 * @param lbo_bytes            Leading dimension byte offset. Must be 16-byte aligned.
 * @param sbo_bytes            Stride dimension byte offset. Must be 16-byte aligned.
 * @param swizzle_mode         Swizzling mode (0, 1, 2, 4 or 6); only the low
 *                             3 bits are used.
 * @param lbo_mode_is_absolute true selects absolute LBO mode (bit 52 = 1);
 *                             the default false selects relative mode (bit 52 = 0).
 *
 * @return The assembled 64-bit descriptor.
 */
__host__ __device__ inline uint64_t create_smem_descriptor(
    uint32_t smem_start_offset,
    uint32_t lbo_bytes,
    uint32_t sbo_bytes,
    int swizzle_mode,
    bool lbo_mode_is_absolute = false)
{
    // --- 1. Encode the 14-bit address and stride fields ---
    // Shift by 4 (divide by 16) and keep 14 bits; equivalent to (x & 0x3FFFF) >> 4.
    const uint64_t encoded_addr = (static_cast<uint64_t>(smem_start_offset) >> 4) & 0x3FFFULL;
    const uint64_t encoded_lbo  = (static_cast<uint64_t>(lbo_bytes)  >> 4) & 0x3FFFULL;
    const uint64_t encoded_sbo  = (static_cast<uint64_t>(sbo_bytes)  >> 4) & 0x3FFFULL;

    // --- 2. Matrix base offset (bits 49-51) ---
    // "base offset = (pattern start addr >> 0x7) & 0x7"; it is 0 when no
    // swizzling is used. The pattern start address is taken to be
    // smem_start_offset here.
    const uint64_t base_offset =
        (swizzle_mode != 0) ? ((static_cast<uint64_t>(smem_start_offset) >> 7) & 0x7ULL)
                            : 0ULL;

    // --- 3. Assemble the descriptor ---
    uint64_t desc = 0;
    desc |= encoded_addr;                // bits  0-13: start address
    desc |= (encoded_lbo << 16);         // bits 16-29: leading dimension byte offset
    desc |= (encoded_sbo << 32);         // bits 32-45: stride dimension byte offset
    desc |= (0b001ULL << 46);            // bits 46-48: fixed constant 0b001
    desc |= (base_offset << 49);         // bits 49-51: matrix base offset
    if (lbo_mode_is_absolute)
    {
        desc |= (1ULL << 52);            // bit 52: absolute leading-dimension stride mode
    }

    // Bits 53-60 are a fixed constant of 0. The ISA table prints this value as
    // "0xb00000000", which is a typo for the 8-bit binary literal 0b00000000,
    // NOT the hex byte 0xb0. Nothing is OR-ed in for this field.

    // bits 61-63: swizzling mode
    desc |= ((static_cast<uint64_t>(swizzle_mode) & 0x7ULL) << 61);

    return desc;
}

// Issues one tcgen05.cp.cta_group::1.128x256b copy from shared memory into
// tensor memory.
//   tensor_memory_address    : destination tensor-memory address. This is a
//                              32-bit operand, so it is bound with the "r"
//                              constraint; the cast lets callers pass either a
//                              32-bit or a wider integer.
//   shared_memory_descriptor : 64-bit shared-memory matrix descriptor ("l").
// The expansion keeps its trailing semicolon, so it must be used as a
// stand-alone statement. Requires an sm_100a build.
// NOTE(review): the copy is asynchronous per the PTX ISA; confirm completion
// tracking at the call site before reading the destination.
#define TC_copy_one_cta_128x256_bf16(tensor_memory_address, shared_memory_descriptor)\
  asm volatile(                                                                      \
    "tcgen05.cp.cta_group::1.128x256b [%0], %1;"                                     \
    :                                                                                \
    : "r"(static_cast<uint32_t>(tensor_memory_address)),                             \
      "l"(static_cast<uint64_t>(shared_memory_descriptor))                           \
    : "memory");

// Tile geometry: 128 lanes, each holding 256 bits.
constexpr int n_lanes = 128;
constexpr int n_32_bit_cols = 256 / 32;                       // 32-bit columns per lane
constexpr int n_16_bit_elems_per_lane = n_32_bit_cols * 2;    // two 16-bit elements per 32-bit column
constexpr int n_elem_cols = n_16_bit_elems_per_lane;          // element columns per lane
constexpr int N = n_elem_cols * n_lanes;                      // total 16-bit elements in the tile

// LBO / SBO computed as for a plain row-major matrix:
//   SBO = bytes in one row of n_16_bit_elems_per_lane bf16 values,
//   LBO = bytes in a run of 8 bf16 values.
// NOTE(review): confirm that this row-major interpretation matches what the
// descriptor's leading/stride byte-offset fields expect for the chosen layout.
constexpr int SBO_bytes = sizeof(__nv_bfloat16) * n_16_bit_elems_per_lane;
constexpr int LBO_bytes = sizeof(__nv_bfloat16) * 8;


// Placeholder kernel: the body is intentionally empty for now, so neither A
// nor B is read or written.
__global__ void tc_load(__nv_bfloat16* A, __nv_bfloat16* B)
{
}

// Host driver: allocates pinned host and device buffers for an N-element bf16
// tile, builds the shared-memory descriptor on the host and copies it to the
// device, fills A with 0..N-1, uploads it, prints the tile geometry and frees
// everything. No kernel is launched here.
// Returns 0 on success; exits with EXIT_FAILURE on the first failing CUDA call.
int main()
{
  // Every CUDA runtime call returns a status; stop at the first failure so a
  // later call does not fail mysteriously because of an earlier error.
  auto check = [](cudaError_t status, const char* what)
  {
    if (status != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  __nv_bfloat16 *A_h = nullptr, *B_h = nullptr, *A_d = nullptr, *B_d = nullptr;
  const size_t size = N*sizeof(__nv_bfloat16);
  check(cudaHostAlloc(&A_h, size, cudaHostAllocDefault), "cudaHostAlloc(A_h)");
  check(cudaHostAlloc(&B_h, size, cudaHostAllocDefault), "cudaHostAlloc(B_h)");
  check(cudaMalloc(&A_d, size), "cudaMalloc(A_d)");
  check(cudaMalloc(&B_d, size), "cudaMalloc(B_d)");

  // Descriptor inputs: offset 0, row-major LBO/SBO, no swizzling, relative LBO mode.
  const uint32_t smem_addr = 0;
  const uint32_t lbo = LBO_bytes;
  const uint32_t sbo = SBO_bytes;
  const int swizzle_mode = 0;  // int, matching create_smem_descriptor's parameter type

  // Build the descriptor in pinned host memory, then mirror it on the device.
  uint64_t* smem_descriptor = nullptr;
  const size_t size_int64 = sizeof(uint64_t);
  check(cudaHostAlloc(&smem_descriptor, size_int64, cudaHostAllocDefault),
        "cudaHostAlloc(smem_descriptor)");
  smem_descriptor[0] = create_smem_descriptor(smem_addr, lbo, sbo, swizzle_mode, false);
  uint64_t* smem_desc = nullptr;
  check(cudaMalloc(&smem_desc, size_int64), "cudaMalloc(smem_desc)");
  check(cudaMemcpy(smem_desc, smem_descriptor, size_int64, cudaMemcpyHostToDevice),
        "cudaMemcpy(smem_desc)");

  // Fill A with its own index so each element is identifiable after a copy.
  for (int i = 0; i < N; i++)
  {
    A_h[i] = __float2bfloat16((float)i);
  }
  check(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice), "cudaMemcpy(A_d)");

  printf("n_lanes_of_tmem: %d, n_bits_per_lane: %d, n_cols_per_lane: %d, n_bf16's_per_lane: %d \n",
         n_lanes, 32*n_32_bit_cols, n_32_bit_cols, n_16_bit_elems_per_lane);

  check(cudaFree(A_d), "cudaFree(A_d)");
  check(cudaFree(B_d), "cudaFree(B_d)");
  check(cudaFree(smem_desc), "cudaFree(smem_desc)");
  check(cudaFreeHost(smem_descriptor), "cudaFreeHost(smem_descriptor)");
  check(cudaFreeHost(A_h), "cudaFreeHost(A_h)");
  check(cudaFreeHost(B_h), "cudaFreeHost(B_h)");
  return 0;

}

n_lanes_of_tmem: 128, n_bits_per_lane: 256, n_cols_per_lane: 8, n_bf16's_per_lane: 16 



OK, this is badly broken and I am very frustrated right now.