In [3]:
import os
os.environ['TRITON_DEBUG'] = '1'
os.environ['TRITON_INTERPRET'] = '1'
import triton
import triton.language as tl
import torch

@triton.jit
def blockwise_multiply_kernel(A_ptr, B_ptr, out_ptr):
    """Add one element of B to a 2-element block of A and store the sums.

    Program instance ``pid`` (grid axis 0) handles elements ``2 * pid`` and
    ``2 * pid + 1`` of A and out, combined with element ``pid`` of B. With
    the single-program launch used in this notebook that is elements 0-1
    of A and element 0 of B.

    NOTE: despite "multiply" in the name, the operation is an addition; the
    notebook checks the result against ``A + B``.

    Args:
        A_ptr: pointer to the input values; two are read per program instance.
        B_ptr: pointer to the per-block values; one is read per program
            instance.
        out_ptr: pointer to the output; two values are written per program
            instance.

    No masking is applied, so the caller must make sure these accesses are
    in range for the tensors it passes.
    """
    # The parameter order matches the positional launch (A, B, out).
    pid = tl.program_id(0)
    offsets = pid * 2 + tl.arange(0, 2)

    # Load this program's 2-element block of A.
    a_vals = tl.load(A_ptr + offsets)

    # Load the single B element for this block. Reading two elements here
    # would run past the end of a B that holds one value per block.
    b_val = tl.load(B_ptr + pid)

    # The scalar is broadcast across the block.
    result = a_vals + b_val

    # Store the results.
    tl.store(out_ptr + offsets, result)

def multiply_blocks(A, B):
    """Launch ``blockwise_multiply_kernel`` once on ``A`` and ``B``.

    Allocates an output tensor with ``torch.empty_like(A)`` and launches a
    single program instance of the kernel, passing ``(A, B, out)``
    positionally.

    Args:
        A: torch tensor of input values. The example in this notebook passes
            2 elements, which is the block the kernel indexes with
            ``tl.arange(0, 2)``.
        B: torch tensor of per-block values. The example in this notebook
            passes 1 element.

    Returns:
        The tensor allocated with ``torch.empty_like(A)``. Only the elements
        the kernel writes are defined; with one program instance and a
        2-element block, anything past the first two elements of a larger
        ``A`` stays uninitialized.
    """
    out = torch.empty_like(A)
    # One program instance only: this is the reduced, single-block debug
    # launch, not a grid sized from the number of blocks in A.
    single_program_grid = (1,)
    blockwise_multiply_kernel[single_program_grid](A, B, out)
    return out

# Example usage: build a two-element A and a one-element B on the first
# CUDA device, run the launcher, and show the result.
A, B = (
    torch.tensor(values, device=torch.device('cuda:0'))
    for values in (
        [0.000298662664135918, 0.000298662664135918],
        [0.021757077425718307],
    )
)
out = multiply_blocks(A, B)
print(out)


tensor([0.0221, 0.0221], device='cuda:0')


In [4]:
# Read the first output element back as a Python number; the tensor repr
# printed above rounds to four decimals.
out[0].item()

0.022055739536881447

In [6]:
# Reference value: the same sum computed directly by PyTorch, for comparison
# with the first element of the kernel output.
(A + B)[0].item()

0.022055739536881447

In [59]:
# View A as 2 rows of 4, transpose to 4x2, multiply by B, then transpose
# back and flatten to 8 elements.
# NOTE(review): A and B are presumably redefined in cells not shown here
# (the output implies an 8-element A and a B of roughly [1, 2], one factor
# per row of 4) - confirm.
print((A.reshape(2, 4).T*B).T.reshape(8))

tensor([ 0.,  1.,  2.,  3.,  8., 10., 12., 14.], device='cuda:0')


In [47]:
# Display the current value of A (the echoed output shows 8 elements, 0-7).
A

tensor([0., 1., 2., 3., 4., 5., 6., 7.], device='cuda:0')

In [1]:
def find_matching_pairs(target, i_limit, j_limit):
    """Return every ``(i, j)`` pair that satisfies ``i * j + j == target``.

    Only pairs with ``0 <= i < i_limit`` and ``0 <= j < j_limit`` are
    considered. Because ``i * j + j == j * (i + 1)``, each ``j`` admits at
    most one ``i``, so the search costs ``j_limit`` steps instead of
    ``i_limit * j_limit``.

    Args:
        target: value the expression must equal.
        i_limit: exclusive upper bound for ``i``.
        j_limit: exclusive upper bound for ``j``.

    Returns:
        List of ``(i, j)`` tuples ordered by ``i`` then ``j``, the same order
        a nested ``for i ... for j ...`` scan would find them in.
    """
    if target == 0:
        # j * (i + 1) is zero only for j == 0, and then for every i.
        return [(i, 0) for i in range(i_limit)] if j_limit > 0 else []

    pairs = []
    for j in range(1, j_limit):
        quotient, remainder = divmod(target, j)
        i = quotient - 1
        if remainder == 0 and 0 <= i < i_limit:
            pairs.append((i, j))
    pairs.sort()
    return pairs


# Which (i, j) in a 262144 x 32 index space gives i*j + j == 6558820?
matches = find_matching_pairs(6558820, 262144, 32)
for i, j in matches:
    print(i, j)

In [2]:
# Size of the (i, j) index space searched in the previous cell:
# 262144 values of i times 32 values of j.
262144 * 32

8388608

In [None]:
# Scalars from the kernel example: `offset` is the element stored in B and
# `value` is the element repeated in A.
# NOTE(review): as bare Python literals these are ordinary Python floats;
# the trailing notes record the torch dtype they were taken from - confirm.
offset = 0.021757077425718307  # torch.float32
value = 0.000298662664135918  # torch.float32

In [11]:
# Build a 4-element block of zeros with dtype bfloat16.
# NOTE(review): the echoed result is a uint16 array - presumably how the
# interpreter mode (TRITON_INTERPRET=1) represents bfloat16 values; confirm.
tl.zeros((4,), dtype=tl.bfloat16)

array([0, 0, 0, 0], dtype=uint16)