In [3]:
import numpy as np 

class layout: 
  def __init__ (self, m: int, shape: tuple[int, ...], stride: tuple[int, ...]): 
    if (len(shape) != m) or (len(stride) != m): 
      raise ValueError("m is the dimensionality, both the shape and stride must be an m-tuple")
    for i in range(m): 
      assert isinstance(shape[i], int)  
      assert isinstance(stride[i], int) 
      assert shape[i] > 1
      assert stride[i] > 0
      
    self.m = m 
    self.N = np.prod(shape).item()
    self.shape = shape
    self.stride = stride
    
    shape_stride = [1 for i in range(m)] 
    for i in reversed(range(m-1)): 
      shape_stride[i] = shape[i+1]*shape_stride[i+1]
      
    self.shape_stride = tuple(shape_stride)
    
    self.one_d_domain = np.arange(self.N).reshape(self.N, 1)
    self.md_domain = np.indices(self.shape)
    
    self.p_shape_stride = np.array([self.N] + list(self.shape_stride))
    self.P_one_d_to_md = (self.one_d_domain // self.p_shape_stride[1:].reshape(1,self.m)) % (self.p_shape_stride[:m].reshape(1,self.m))
    
    self.P_stride = np.array(self.stride).reshape(tuple([self.m] + [1 for _ in range (self.m)]))
    self.P_md_to_one_d = np.sum(self.md_domain * self.P_stride, axis = 0)

    self.layout = np.sum((self.P_one_d_to_md.T).reshape(tuple([m] + list(self.shape)))*self.P_stride, axis = 0)
    self.test_layout = [0 for _ in range(self.N)] 
    for i in range(self.N): 
      md_index = self.one_d_to_md(i)
      print(md_index)
      
    self.perm = [i for i in range(self.m)] 
    self.sorted_stride = stride
    for i in range(self.m): 
      for j in range(self.m):
        if self.sorted_stride[i] <= self.sorted_stride[j]: 
          p_i = self.perm[i]
          p_j = self.perm[j]
          s_i = self.sorted_stride[i]
          s_j = self.sorted_stride[j]
          self.perm[i], self.perm[j] = p_j, p_i
          self.sorted_stride[i], self.sorted_stride[j] = s_j, s_i
          
         
        
    
    
  def md_to_one_d (self, md_index): 
    assert len(md_index) == self.m 
    for i in range(self.m): 
      assert md_index[i] >= 0
      assert md_index[i] < self.shape[i]
    
    one_d_index = 0
    for i in range(self.m): 
      one_d_index += self.stride[i]*md_index[i]
    
    return one_d_index
  
  def one_d_to_md (self, one_d_index): 
    assert one_d_index >= 0 
    assert one_d_index < self.N 
    
    md_index = [0 for _ in range(self.m)]
    
    temp_shape_stride = tuple([self.N]) + self.shape_stride
    for i in range(self.m): 
      md_index[i] = ((one_d_index)//temp_shape_stride[i+1]) % temp_shape_stride[i]
      
    return tuple(md_index)

  def __repr__(self):
      return (f"N_elements = {self.N} \n"
              f"layout(m={self.m}, \n"
              f"       shape={self.shape}, \n"
              f"       stride={self.stride}, \n"
              f"       shape_stride = {self.shape_stride})")
      
  
  

In [4]:
shape = (3,4,2)
stride = (8,2,1)
m = 3

matrix = layout(m, shape, stride)
print(matrix.P_md_to_one_d)
print(matrix.P_md_to_one_d.shape)
print(matrix.P_one_d_to_md)
print(matrix.P_one_d_to_md.shape)

(0, 0, 0)
(0, 0, 1)
(0, 1, 0)
(0, 1, 1)
(0, 2, 0)
(0, 2, 1)
(0, 3, 0)
(0, 3, 1)
(1, 4, 0)
(1, 4, 1)
(1, 5, 0)
(1, 5, 1)
(1, 6, 0)
(1, 6, 1)
(1, 7, 0)
(1, 7, 1)
(2, 0, 0)
(2, 0, 1)
(2, 1, 0)
(2, 1, 1)
(2, 2, 0)
(2, 2, 1)
(2, 3, 0)
(2, 3, 1)


TypeError: 'tuple' object does not support item assignment

In [5]:
print(matrix)

NameError: name 'matrix' is not defined

In [6]:
q = matrix.one_d_to_md(21)

q

NameError: name 'matrix' is not defined

In [7]:
l = matrix.md_to_one_d(q)

NameError: name 'matrix' is not defined

In [8]:
l

NameError: name 'l' is not defined

In [9]:
shape = (3,4,2)
domain = np.indices(shape)

In [10]:
domain.shape

(3, 3, 4, 2)

In [11]:
matrix.layout

print(matrix.layout.shape)


NameError: name 'matrix' is not defined

In [91]:
import numpy as np 
import math
class layout: 
  def __init__ (self, m: int, shape: tuple[int, ...], stride: tuple[int, ...]): 
    if (len(shape) != m) or (len(stride) != m): 
      raise ValueError("m is the dimensionality, both the shape and stride must be an m-tuple")
    assert shape[0] > 0 
    assert stride[0] > 0
    for i in range(1,m): 
      assert isinstance(shape[i], int)  | isinstance(shape[i], np.int64)
      assert isinstance(stride[i], int)  | isinstance(stride[i], np.int64)
      assert shape[i] > 1
      assert stride[i] > 0
      
    self.m = m 
    self.N = np.prod(shape).item()
    self.shape = np.array(shape)
    self.stride = np.array(stride)
    
    shape_stride = [1 for i in range(m)] 
    for i in reversed(range(m-1)): 
      shape_stride[i] = shape[i+1]*shape_stride[i+1]
      
    self.shape_stride = np.array(shape_stride)
    self.one_d_domain = np.arange(self.N)
    self.fan_map = (self.one_d_domain.reshape(self.N,1) // self.shape_stride.reshape(1, self.m)) % (self.shape.reshape(1, self.m))
    #the shape of the fan map is (N, m) the reason being, each one d element is a position in N (the outermost) dimension
    #the rest of the inner dimensions, is the literal m_tuple co-ordinate of that that one. that is: 
    #fan_map(i) (which is a tuple depicting co-oridnates)  =  tuple(self.fan_map[i]) (as a numpy slice)
    #the fan map itself, simply presents a row major layout ordering. 
    #the stride now, is going to induce the map from the m_tuple, back to the 1d. so composed, we get the layout
    self.layout = np.sum(self.fan_map * self.stride.reshape(1, self.m), axis = 1)
    n_warps = math.ceil(self.N/32.0) 
    n_lanes = 32
    self.pad_amount = (n_warps*n_lanes) - self.N 
    self.padded_layout = np.pad(self.layout, (0, self.pad_amount))
    
    self.bank_conflicts = np.zeros((n_warps,n_lanes)).astype(int)
    self.layout_bank_id = (self.padded_layout.reshape(n_warps, n_lanes) % 32)
    for i in range(n_warps): 
      for j in range(n_lanes): 
        if (i*32 + j < self.N):
          self.bank_conflicts[i, self.layout_bank_id[i,j]] += 1
        
        
    self.perm = [i for i in range(self.m)] 
    self.sorted_stride = list(stride)
    for i in range(self.m-1): 
      for j in range(i+1, self.m):
        if self.sorted_stride[i] <= self.sorted_stride[j]: 
          p_i = self.perm[i]
          p_j = self.perm[j]
          s_i = self.sorted_stride[i]
          s_j = self.sorted_stride[j]
          self.perm[i], self.perm[j] = p_j, p_i
          self.sorted_stride[i], self.sorted_stride[j] = s_j, s_i
          
    self.sorted_shape = [self.shape[self.perm[i]] for i in range(self.m)]
    self.is_injective = True 
    unique_len = np.unique(self.layout).size
    layout_len = self.layout.size
    if unique_len < layout_len: self.is_injective = False
    self.layout_injective = True 
    if self.sorted_stride[self.m-1] < 1: 
      self.layout_injective = False 
      
    for j in reversed(range(self.m - 1)): 
      if self.sorted_stride[j] < self.sorted_stride[j+1]*self.sorted_shape[j+1]: 
        self.layout_injective = False
        break
    
  def __repr__(self):
    return (f"N_elements = {self.N} \n"
            f"layout(m={self.m}, \n"
            f"       shape={self.shape}, \n"
            f"       stride={self.stride}, \n"
            f"       shape_stride = {self.shape_stride})")
    
  

In [92]:
tensor = layout(4, (2,4,8,2), (50,2,4,4))

In [93]:
### For a given shape, when is a stride going to produce an injective map? 
### idea 1. the number of elements is same, that is stride product equals shape product
### that doesn't work. consider shape = (4,3,4,4) stride = (2,2,3,16)
### what if stride(i) <= row_major_stride(i)? 


S = 2*2*3*4*4
print(S)
shape = (4,3,4,4) 
row_major_stride = (48,16,4,1)
stride = (49,17,5,1)

L = layout(4, shape, stride)

192


In [94]:
L.layout_injective

False

In [95]:
#to get injective layouts, pick a shape S, and now write a non increasing stride $D$ (unstrictly decereasing) 
#for which D_{m-1} > 0, and D_{k} >= D_{k+1}*S_{K+1} for each k in [0,m-2] now any permutation of this layout
#(applying the same permutation to both S and D) will be injective. 
from itertools import permutations

S = (2,4,3,2)
D = (63,15,5,2)

def generate_permuted_layouts(S,D): 
  S_np = np.array(S).astype(int)
  D_np = np.array(D).astype(int)
  m = len(S) 
  assert m == len(D)
  assert m < 7
  perms = permutations([i for i in range(m)])
  layouts = []
  for pi in perms: 
    pi_np = np.array(list(pi)).astype(int) 
    S_pi = S_np[pi_np].astype(int)
    D_pi = D_np[pi_np].astype(int)

    new_S = tuple(np.asarray(S_pi))
    new_D = tuple(np.asarray(D_pi))

    
    layouts.append(layout(m, new_S, new_D))
  return layouts

In [96]:
layouts = generate_permuted_layouts(S,D)

print(layouts[2])
for l in layouts: 
  print(l.layout_injective)
  print("foo")
  print(l.is_injective)
  

N_elements = 48 
layout(m=4, 
       shape=[2 3 4 2], 
       stride=[63  5 15  2], 
       shape_stride = [24  8  2  1])
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True
True
foo
True


In [97]:
x = np.random.randn(4)
x

array([ 1.52149872, -0.69421857, -0.15926733, -0.0499443 ])

In [98]:
np.pad(x, (0,4))

array([ 1.52149872, -0.69421857, -0.15926733, -0.0499443 ,  0.        ,
        0.        ,  0.        ,  0.        ])

# Warp periodicity. 
For the purposes of SMEM bank conflict, different warps issue load instructions on different clock cycles. In fact, a good way to measure The latenecy and throughput of an instruction on a single thread is to write asm=volatile micro benchmarks (I will do this sometime), to measure single instance throughput, just spawn a bunch of threads, issue that instruction on independent peices of data in a for loop, concurrently accross all threads, wait for them to finish, run iterations and get time. 

if total time is $T$, time per iteration is $Q = T/N_iter$ 
time per iteration per thread = $H =Q/N_threads$ 

Now, suppose the single thread latency and throughput of this instruction is $L, T$ respectively. 
if our inner for loop is size $G$, then since each iteration does independent work, per thread, we should be able to spawn an instruction every $T$ clock cycles, and so totally, we take $TG$ time to spawn them, but an instruction that spawns at clock cycle $iT$ is going to finish computing at clock cycle $iT + L$, but nevertheless, no matter how long the latency of this instruction is, the last issued instruction is the one thats going to finish the last. so at most, the time of this loop, per iteration of testing, per thread, (that is $H$ = TG + L) where $L$ is latency. 


To get $L$ on the other hand, make each run of the loop dependent on the previous iteration and get $H_latency = GL$ where $G$ is the length of the loop 

so we can solve for $T,L$ using these two equations 



Now, moving on, a layout map $f_L :[N_{elems}] -> Q \subset \mathbb N$ can be interpreted as: 
the domain is a set of threads, and the range $Q$ is addresses in the shared memory that a thread is assigned to. in this view, we know that [N_{elems}] can be partitioned into warps of 32 congtigous threads, and bank conflicts are whithin a warp, but are independent across warps.


so then, we say that $f_L$ has a period of $p$ warps, (Say for example 2 warps), then every 64 thread chunk's layout is just a shift of the pattern of the previous two warps, and so on. 

Ie for some constant $c$, f_L(i) = (i//p)*c + f_L(i%p). 

(i//p) obviousl indicates which period this i belongs 2, and $c$ depicts the shifting factor of the period. 




In [99]:
for l in layouts: 
  print(l.bank_conflicts[1])

[0 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 2 0 1 1 0 2 0 1 0 0 1 0 1]
[0 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 1 0 2 0 1 0 0 1 0 1]
[0 0 1 0 2 0 1 1 0 2 0 1 0 0 0 0 0 1 0 2 0 1 1 0 2 0 1 0 0 0 0 0]
[0 0 1 0 2 0 1 1 0 2 0 1 0 0 0 0 0 1 0 2 0 1 1 0 2 0 1 0 0 0 0 0]
[0 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1 0 0 0 0 1]
[0 1 0 0 1 0 1 1 0 2 0 1 0 0 1 0 1 0 0 1 0 1 1 0 2 0 1 0 0 0 0 1]
[0 0 1 0 1 0 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0]
[0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1]
[0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1]
[0 0 0 0 1 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 1 2 2 1 1 0 0 0 0 0 0 0 0 0 1 1 2 2 1 1 0 0 0 0]
[0 0 0 0 0 0 0 1 1 2 2 1 1 0 0 0 0 0 0 0 0 0 1 1 2 2 1 1 0 0 0 0]
[0 0 0 0 0 0 0 1 1 2 2 1 1 0 0 0 0 0 0 0 0 0 1 1 2 2 1 1 0 0 0 0]
[0 0 0 0 0

In [106]:
warp_shape = (6,32)
warp_stride = (96,3)
L = layout(2, warp_shape, warp_stride)
L

N_elements = 192 
layout(m=2, 
       shape=[ 6 32], 
       stride=[96  3], 
       shape_stride = [32  1])

In [107]:
L.layout
L.bank_conflicts


array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])