In [2]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v')

import numpy as np
import sys



np.set_printoptions(
    threshold=sys.maxsize,       # show the entire array
    precision=2,            # 2 decimal places
    suppress=True,          # no scientific notation
    floatmode='fixed',      # consistent decimal formatting
    linewidth=np.inf,       # prevent line wrapping

)

# example array
a = np.arange(12).reshape(3,4) / 3.0

# create string with exact control
s = np.array2string(
    a,
    max_line_width=10**6,                     # avoid wrapping
    precision=2,                              # float precision
    separator='',                             # <-- no commas/spaces between elements
    formatter={'float_kind': lambda x: f"{x:.2f}"},
    threshold=sys.maxsize,                    # show whole array
    floatmode='fixed'                         # fixed-point formatting
)

print(s)


Source files will be saved in "/tmp/tmpwjqdgl8f".
[[0.000.330.671.00]
 [1.331.672.002.33]
 [2.673.003.333.67]]


In [3]:

class matmul_spec: 
  
  def __init__ (self, M,N,K, threads_per_block, max_n_blocks_per_SMs, n_SMs, TM, BM, BK):
    # Storing inputs for a complete representation 
    self.M = M
    self.N = N
    self.K_dim = K # Renamed K to K_dim to avoid potential conflict/confusion
    self.TM = TM
    self.BM = BM
    self.BK = BK
    self.n_SMs = n_SMs
    self.max_n_blocks_per_SMs = max_n_blocks_per_SMs

    self.n_compute = M*N  #the number of elements of C that we need to compute
    self.tpb = threads_per_block 
    self.bpg = max_n_blocks_per_SMs * n_SMs 
    
    # Ensure WPT calculation handles the case where the divisor is zero to prevent errors
    divisor = self.tpb * self.bpg
    self.WPT = self.n_compute // divisor if divisor != 0 else np.inf
    
    self.WPB = self.WPT * self.tpb
    
    # Tiling calculations
    self.thread_tile = (TM, self.WPT//TM)
    self.block_tile = (BM, self.WPB//BM)
    self.A_load_tile = (BM, BK)
    self.B_load_tile = (BK, self.WPB//BM)
    
  def __repr__(self):
    """
    Provides a quick, readable representation of the matmul_spec object, 
    displaying both configuration and calculated workload/tiling.
    """
    return (
        f"matmul_spec(\n"
        f"  --- CONFIGURATION ---\n"
        f"  Matrix Dims:    (M={self.M}, N={self.N}, K={self.K_dim})\n"
        f"  Hardware:       (n_SMs={self.n_SMs}, max_blocks_per_SM={self.max_n_blocks_per_SMs})\n"
        f"  Tiling (Base):  (TM={self.TM}, BM={self.BM}, BK={self.BK})\n"
        f"  \n"
        f"  --- CALCULATED SPECS ---\n"
        f"  Total Elements (C): {self.n_compute}\n"
        f"  Threads/Block (tpb): {self.tpb}\n"
        f"  Blocks/Grid (bpg):  {self.bpg}\n"
        f"  Work/Thread (WPT):  {self.WPT}\n"
        f"  Work/Block (WPB):   {self.WPB}\n"
        f"  \n"
        f"  --- TILES ---\n"
        f"  C Thread Tile:  {self.thread_tile}\n"
        f"  C Block Tile:   {self.block_tile}\n"
        f"  A Load Tile:    {self.A_load_tile}\n"
        f"  B Load Tile:    {self.B_load_tile}\n"
        f")"
    )

# Example usage (for testing)


  
    

In [4]:
%%cuda 
#include <cuda_runtime.h> 
#include <cuda.h> 
#include<cooperative_groups.h> 
#include<stdlib.h>
#include<stdio.h>
/*
instead of being a pussy ass bitch, lets thing about a basic smem reduction kernel. 
say N is our big problem size, and BN is what a singular block would work on. 
indeed the partition [0, BN-1], [BN, 2BN-1]... [(k-1)BN, N-1] would be a k blocks. 
And this partition is okay, because shared memory and shit is per block. 
So, we have num_TPB threads in a block, ITS MATHIN TIME 

*/


__global__ void grid_sync_reduction(float* A_in, float*A_out)
{

}

#define CUDA_CHECK(call)                                                          \
    do {                                                                          \
        cudaError_t err = call;                                                   \
        if (err != cudaSuccess) {                                                 \
            fprintf(stderr, "CUDA Error at %s:%d: %s\n",                          \
                    __FILE__, __LINE__, cudaGetErrorString(err));                 \
            return 1;                                                             \
        }                                                                         \
    } while (0)

int main ()
{
  int device = 0; 
  int supports_coop_launch = 0; 
  int num_TPB = 32*16; 
  int blocks_per_sm = 0; 
  cudaDeviceProp deviceProp;
  int num_SMs_on_device = 0;
  int K_num_blocks = 0;

  // 1. Get Cooperative Launch Attribute
  CUDA_CHECK(cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, device));
  
  // 2. Get Device Properties
  CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
  num_SMs_on_device = deviceProp.multiProcessorCount;
  
  // 3. Calculate Occupancy (Max Active Blocks Per SM)
  // This assumes 'grid_sync_reduction' is the actual kernel name.
  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, (void*)grid_sync_reduction, num_TPB, 0));
  
  // 4. Calculate Total Co-Schedulable Blocks
  K_num_blocks = blocks_per_sm * num_SMs_on_device;

  // ----------------------------------------------------------------------
  // --- Pretty Print of all important numbers ---
  printf("\n--- CUDA Device Occupancy Analysis (Device %d) ---\n", device);
  printf("Device Name: %s\n", deviceProp.name);
  printf("----------------------------------------------------\n");
  printf("1. Supports Cooperative Launch:         %s (%d)\n", 
         supports_coop_launch ? "Yes" : "No", supports_coop_launch);
  printf("2. Kernel Threads Per Block (num_TPB):  %d\n", num_TPB);
  printf("3. Number of Streaming Multiprocessors: %d\n", num_SMs_on_device);
  printf("4. Max Active Blocks Per SM (Occupancy):%d\n", blocks_per_sm);
  printf("5. Total Co-Schedulable Blocks (K_num_blocks):\n");
  printf("   (SMs * Blocks/SM) = %d * %d = %d\n", num_SMs_on_device, blocks_per_sm, K_num_blocks);
  printf("----------------------------------------------------\n\n");
  
  return 0;
}



--- CUDA Device Occupancy Analysis (Device 0) ---
Device Name: NVIDIA GeForce RTX 5090
----------------------------------------------------
1. Supports Cooperative Launch:         Yes (1)
2. Kernel Threads Per Block (num_TPB):  512
3. Number of Streaming Multiprocessors: 170
4. Max Active Blocks Per SM (Occupancy):3
5. Total Co-Schedulable Blocks (K_num_blocks):
   (SMs * Blocks/SM) = 170 * 3 = 510
----------------------------------------------------




In [5]:
M,N,K = 4096, 4096, 4096
threads_per_block = 32*16
max_blocks_per_sm = 3
num_SMs = 170
TM = 8
BM = 128
BK = 16
spec = matmul_spec(M,N,K,threads_per_block,max_blocks_per_sm, num_SMs, TM,BM, BK)

In [6]:
print(spec)

matmul_spec(
  --- CONFIGURATION ---
  Matrix Dims:    (M=4096, N=4096, K=4096)
  Hardware:       (n_SMs=170, max_blocks_per_SM=3)
  Tiling (Base):  (TM=8, BM=128, BK=16)
  
  --- CALCULATED SPECS ---
  Total Elements (C): 16777216
  Threads/Block (tpb): 512
  Blocks/Grid (bpg):  510
  Work/Thread (WPT):  64
  Work/Block (WPB):   32768
  
  --- TILES ---
  C Thread Tile:  (8, 8)
  C Block Tile:   (128, 256)
  A Load Tile:    (128, 16)
  B Load Tile:    (16, 256)
)


In [7]:
512//128

4

In [8]:
def make_banks (m,n): 
  X = (np.arange(m*n) % 32).reshape(m,n)
  return X

In [None]:
np.set_printoptions(
    threshold=sys.maxsize,       # show the entire array
    precision=2,            # 2 decimal places
    suppress=True,          # no scientific notation
    floatmode='fixed',      # consistent decimal formatting
    linewidth=np.inf,       # prevent line wrapping

)

A = make_banks(128,16)
print(A)

"""
in 1 clock cycle, if many threads, issue a memory instruction to different 
addresses of the same bank, the whole bank will serialized. 
10 
t_x, t_y + 8, t_y + 2*8  
"""

[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23 24 25

'\nin 1 clock cycle, if many threads, issue a memory instruction to different \naddresses of the same bank, the whole bank will serialized. \n10 \n\n'

In [15]:
#the acess pattern spec should be like: 
"""
iterator: i RANGE_i, j RANGE_j, .. 


"""



def access_pattern_to_bank (thread_block_shape = (128,4), smem_shape = (128,17)): 
  TM = thread_block_shape[0]
  TN = thread_block_shape[1] 
  
  SM = smem_shape[0]
  SN = smem_shape[1] 
  
  threads = np.arange(TM*TN)
  smem_banks = np.arange(SM*SN) % 32 
  smem_addr = np.arange(SM*SN)

  #tx = t // TN 
  #ty = t % TN 
  tx = threads // TN 
  ty = threads % TN 
  seq = []
  addr_seq = []
  #ownership[t] = 
  for i in range (4): 
    seq.append(smem_banks[tx*TN + 4*ty + i])
    addr_seq.append(smem_addr[tx*TN + 4*ty + i])
    
    
  return addr_seq, seq
  
    
    
    
    
  
  
  
  

In [None]:
X,Y = access_pattern_to_bank((128, 4), (128,17))

In [14]:
for q in X: 
  print("----\n")
  print(q)
  print("\n")
  print("----\n")

----

[ 0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0
 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24
 16 20 24 28 20 24 28  0 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12 16
  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0 24 28  0  4 28  0  4  8
  0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0
 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24
 16 20 24 28 20 24 28  0 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12 16
  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0 24 28  0  4 28  0  4  8
  0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0
 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24
 16 20 24 28 20 24 28  0 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12 16
  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0 24 28  0  4 28  0  4  8
  0  4  8 12  4  8 12 16  8 12 16 20 12 16 20 24 16 20 24 28 20 24 28  0
 24 28  0  4 28  0  4  8  0  4  8 12  4  8 12

In [None]:
for q in Y: 
  print("----\n")
  print(q)
  print("\n")
  print("----\n")