In [1]:
%load_ext nvcc4jupyter

# Pass '-arch=sm_90' as the default compiler argument.
# NOTE(review): presumably this applies to every later %%cuda cell - confirm
# against the nvcc4jupyter documentation.
from nvcc4jupyter import set_defaults
set_defaults(compiler_args='-arch=sm_90')

Source files will be saved in "/tmp/tmp34l4zaxy".


In [6]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cudaTypedefs.h> 
#include <vector_types.h> 
#include <cuda/barrier>
#include <cuda/ptx>

using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;

// -------------------------------------------------------------------------
// CONFIGURATION
// -------------------------------------------------------------------------
// The matrix holds M x N int4 elements (16 bytes each) and is processed in
// tiles of BM rows x BN columns.
//
// With a 128-byte swizzle the 16-byte chunks of each 128-byte shared-memory
// row are permuted; 8 int4 elements make up one such row (8 * 16 = 128).
constexpr uint32_t M = 32;   // matrix rows
constexpr uint32_t N = 32;   // matrix columns, in int4 elements
constexpr uint32_t BM = 32;  // tile rows

// Tile columns, in int4 elements. The inner box dimension must not exceed
// 128 bytes for SWIZZLE_128B; int4 is 16 bytes, so BN <= 128 / 16 = 8.
constexpr uint32_t BN = 8;

// The tensor map is described with INT32 elements, so one int4 counts as four.
constexpr uint32_t INT4_COMPONENTS = 4;

// Enforce at compile time what the comments above only state.
static_assert(BN * INT4_COMPONENTS * sizeof(int32_t) <= 128,
              "SWIZZLE_128B requires the inner box dimension to be <= 128 bytes");
static_assert(BN * INT4_COMPONENTS <= 256 && BM <= 256,
              "each tensor-map box dimension is limited to 256 elements");
static_assert(M % BM == 0 && N % BN == 0,
              "the tile shape must divide the matrix shape exactly");

// -------------------------------------------------------------------------
// HOST HELPERS
// -------------------------------------------------------------------------
// Evaluates a CUDA runtime call exactly once. If the result is not
// cudaSuccess, prints the error string with file and line to stderr and
// terminates the process with the error code as exit status.
// The do { } while (0) wrapper makes `CUDA_CHECK(x);` a single statement, so
// it is safe inside an unbraced if/else. The trailing underscore on the local
// avoids shadowing a caller variable named `err` used inside `call`.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at %s:%d\n", \
                    cudaGetErrorString(err_), __FILE__, __LINE__); \
            exit(err_); \
        } \
    } while (0)

// Looks up the driver entry point "cuTensorMapEncodeTiled" (version 12000)
// through the runtime API, so the program does not have to link against the
// driver library directly.
//
// Returns: a non-null function pointer of type PFN_cuTensorMapEncodeTiled_v12000.
// Terminates the process if the runtime call fails or if the query reports
// that the symbol could not be resolved.
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() {
    // Start from a failure state so an untouched out-parameter is never
    // mistaken for success.
    cudaDriverEntryPointQueryResult driver_status = cudaDriverEntryPointSymbolNotFound;
    void* ptr = nullptr;
    CUDA_CHECK(cudaGetDriverEntryPointByVersion("cuTensorMapEncodeTiled", &ptr, 12000, cudaEnableDefault, &driver_status));

    // The runtime call can succeed while the symbol lookup itself fails; that
    // outcome is reported through driver_status, not the return code.
    if (driver_status != cudaDriverEntryPointSuccess || ptr == nullptr) {
        fprintf(stderr,
                "Could not resolve driver entry point cuTensorMapEncodeTiled (query status %d)\n",
                static_cast<int>(driver_status));
        exit(EXIT_FAILURE);
    }
    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(ptr);
}

// -------------------------------------------------------------------------
// DEVICE KERNEL
// -------------------------------------------------------------------------
// Loads one BM x BN tile of int4 elements into shared memory with a TMA bulk
// tensor copy, then writes the shared-memory tile, read as a plain 2-D array,
// to Output_Buffer. Because the shared tile is read WITHOUT undoing any
// swizzle, the output shows the physical shared-memory layout the TMA
// produced for the swizzle mode encoded in the tensor map.
//
// Parameters:
//   Output_Buffer - device buffer of M * N int4 elements, row-major with N
//                   elements per row.
//   tensor_map    - tensor map describing the source matrix as INT32 elements
//                   with a box of {BN * INT4_COMPONENTS, BM}.
//
// Launch layout: grid = (N / BN, M / BM), block = (BN, BM); threads outside
// the BN x BM tile take part in the barrier but write nothing.
// Shared memory: BM * BN * sizeof(int4) bytes static, plus one barrier.
// Requires SM90 (TMA bulk tensor copies).
__global__ void show_swizzle_kernel(int4* Output_Buffer, const __grid_constant__ CUtensorMap tensor_map)
{
    // Tile origin in int4 elements. Derived from the tile constants rather
    // than blockDim so that the TMA coordinates and the output index always
    // agree, whatever the block shape.
    const uint32_t tile_col = blockIdx.x * BN;
    const uint32_t tile_row = blockIdx.y * BM;

    // Destination of the TMA copy. The 128-byte swizzle pattern is derived
    // from shared-memory address bits and repeats every 1024 bytes, so the
    // buffer is aligned to 1024 bytes (128 is not enough to pin the pattern).
    __shared__ alignas(1024) int4 As[BM][BN];

    #pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ barrier bar;

    // Exactly one thread of the block drives the barrier setup and the TMA
    // copy. Testing threadIdx.x alone would select one thread per row of a
    // 2-D block and repeat both steps for each of them.
    const bool is_leader = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);

    if (is_leader) {
        // Every thread of the block arrives once per phase.
        init(&bar, blockDim.x * blockDim.y * blockDim.z);
        // Make the initialised barrier visible to the async (TMA) proxy.
        cde::fence_proxy_async_shared_cta();
    }
    __syncthreads();

    // 1. TMA LOAD (SWIZZLED)
    barrier::arrival_token token;
    if (is_leader) {
        // The tensor map counts INT32 elements, so the x coordinate is the
        // int4 column scaled by the number of ints per int4.
        const int tma_x = static_cast<int>(tile_col * INT4_COMPONENTS);
        const int tma_y = static_cast<int>(tile_row);

        cde::cp_async_bulk_tensor_2d_global_to_shared(&As, &tensor_map, tma_x, tma_y, bar);

        // Arrive and register the number of bytes the copy will deliver.
        token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(As));
    } else {
        token = bar.arrive();
    }

    // Returns once all threads have arrived and all expected bytes have
    // landed; the tile is then readable by every thread, so no additional
    // proxy fence or block barrier is needed for this read-only use.
    bar.wait(std::move(token));

    // 2. STANDARD READ (LINEAR)
    // Index the tile like an ordinary array and mirror it to global memory.
    const uint32_t smem_col = threadIdx.x;
    const uint32_t smem_row = threadIdx.y;

    // Guard against blocks larger than the tile.
    if (smem_col < BN && smem_row < BM) {
        const uint32_t gmem_index = (tile_row + smem_row) * N + (tile_col + smem_col);
        Output_Buffer[gmem_index] = As[smem_row][smem_col];
    }
}

int main()
{
    // Sizes
    size_t size_bytes = M * N * sizeof(int4);
    int4 *h_data, *d_data_in, *d_data_out;
    
    cudaHostAlloc(&h_data, size_bytes, cudaHostAllocDefault);
    cudaMalloc(&d_data_in, size_bytes);
    cudaMalloc(&d_data_out, size_bytes);

    // Init with sequential numbers to make swizzle obvious
    // value = row * N + col
    for (int i = 0; i < M * N; i++) {
        h_data[i].x = i; 
        h_data[i].y = i; 
        h_data[i].z = i; 
        h_data[i].w = i; 
    }

    cudaMemcpy(d_data_in, h_data, size_bytes, cudaMemcpyHostToDevice);
    cudaMemset(d_data_out, 0, size_bytes);

    // -------------------------------------------------------------------------
    // TMA SETUP - EXPLICITLY ENABLING SWIZZLE_128B
    // -------------------------------------------------------------------------
    auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();
    CUtensorMap tensor_map{};
    
    // Config: INT32 type, Rank 2
    uint64_t tensor_shape[] = {N * INT4_COMPONENTS, M}; 
    uint64_t tensor_stride[] = {N * sizeof(int4)}; 
    
    // BOX SHAPE: {32, 32} ints = {128 bytes, 32 rows}
    // This satisfies the 128B constraint.
    uint32_t smem_box[] = {BN * INT4_COMPONENTS, BM}; 
    uint32_t element_stride[] = {1, 1};

    CUresult res = cuTensorMapEncodeTiled(
        &tensor_map,
        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32, 
        2,
        d_data_in, 
        tensor_shape,       
        tensor_stride,      
        smem_box,     
        element_stride,
        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B, // NOW VALID because box width is 128B
        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    );

    if (res != CUDA_SUCCESS) { printf("TMA Encode failed %d\n", res); return 1; }

    // Launch
    // Grid X needs to cover N using blocks of BN
    dim3 grid(N/BN, M/BM);
    // Block dims must match our tile size
    dim3 block(BN, BM);
    
    show_swizzle_kernel<<<grid, block>>>(d_data_out, tensor_map);
    CUDA_CHECK(cudaDeviceSynchronize());

    // Read back
    cudaMemcpy(h_data, d_data_out, size_bytes, cudaMemcpyDeviceToHost);

    // -------------------------------------------------------------------------
    // INSPECT RESULTS
    // -------------------------------------------------------------------------
    printf("Inspection of First Row (0 to 15):\n");
    printf("Expected: 0, 1, 2, 3, 4, ...\n");
    printf("Actual  : ");
    
    // We expect to see local permutations
    for (int i = 0; i < 16; i++) {
        printf("%d, ", h_data[i].x);
    }
    printf("...\n\n");

    // Check if swizzling happened
    bool swizzled = false;
    for (int i = 0; i < 16; i++) {
        if (h_data[i].x != i) {
            swizzled = true;
            break;
        }
    }

    if (swizzled) {
        printf("SUCCESS: The data is scrambled! The TMA Swizzle pattern is visible.\n");
    } else {
        printf("FAILURE: The data looks linear. Swizzle 128B was not effective.\n");
    }

    cudaFree(d_data_in);
    cudaFree(d_data_out);
    cudaFreeHost(h_data);
    return 0;
}

Inspection of First Row (0 to 15):
Expected: 0, 1, 2, 3, 4, ...
Actual  : 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...

FAILURE: The data looks linear. Swizzle 128B was not effective.

