In [1]:
import torch 
x = torch.arange(3).reshape(3,1)
y = torch.arange(2).reshape(1,2)
print("x:\n", x)
print("y:\n", y)
print("Broadcasted add:\n", x + y)

x:
 tensor([[0],
        [1],
        [2]])
y:
 tensor([[0, 1]])
Broadcasted add:
 tensor([[0, 1],
        [1, 2],
        [2, 3]])


In [17]:
# masking 
t = torch.tensor([[1, 2, 3], [4, 5, 6]])
mask = t > 2
filtered = t[mask]
print("Mask:\n", mask)
print("Filtered:", t[mask])
rows = torch.tensor([0, 1])
cols = torch.tensor([2, 1])
print("Fancy index:", t[rows, cols])

# When you do t[rows, cols], PyTorch zips up both tensors:
# If rows and cols have the same shape, each pair defines one coordinate to grab.

Mask:
 tensor([[False, False,  True],
        [ True,  True,  True]])
Filtered: tensor([3, 4, 5, 6])
Fancy index: tensor([3, 5])


In [25]:
x = torch.ones(3, 4)
print(x)
print("Sum:", x.sum())
print(x.shape)
print("Mean over rows:", x.sum(dim=1))
print("Mean over cols :", x.sum(dim=0))


tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
Sum: tensor(12.)
torch.Size([3, 4])
Mean over rows: tensor([4., 4., 4.])
Mean over cols : tensor([3., 3., 3., 3.])


In [None]:
a = torch.arange(24).reshape(2,3,4)
p = a.permute(1,0,2)
print(p.shape)
b = torch.ones_like(a) # shape of b  3 2 4 
stacked = torch.stack([a, b], dim=0) # 2 2 3 4 
print(stacked.shape) 
concat = torch.cat([a, b], dim=2)
print(concat.shape) # 2 3 8 

torch.Size([3, 2, 4])
torch.Size([2, 2, 3, 4])
torch.Size([2, 3, 8])


In [51]:
# Split a tensor of shape (10, 20) into five equal chunks along dim=1, then recombine them using torch.cat().
a = torch.randn(10, 20)
x = a.reshape(5,-1, 4 )
print(x.shape)
# combine usign cat 
y = torch.cat([x], dim = 0 )
print(y.shape)
# use torch.chunk instead 
chunks = torch.chunk(a, 5, dim=1)
print(len(chunks))        # 5 chunks
print(chunks[0].shape)
y = torch.cat(chunks, dim = -1)
print(y.shape)

torch.Size([5, 10, 4])
torch.Size([5, 10, 4])
5
torch.Size([10, 4])
torch.Size([10, 20])


In [52]:
import torch

A = torch.randn(3, 4)
B = torch.randn(4, 2)
C = torch.einsum('ik,kj->ij', A, B)
print(C.shape)  # (3,2)


torch.Size([3, 2])


In [59]:
a = torch.randn(3)
b = torch.randn(3)
print(a.shape)
y = torch.einsum("i,j->ij", a, b)
print(y)
# batch matrix multiplication torch.einsum("bij,bjk->bik", a, b)

torch.Size([3])
tensor([[ 1.0403, -0.8417,  0.4006],
        [-0.7540,  0.6100, -0.2904],
        [-0.7912,  0.6401, -0.3047]])


In [75]:
x = torch.arange(24).reshape(2, 3, 4)
y = x.permute(1, 0, 2)      # change axis order
print(y.shape) # 3, 2,4 
print(y.is_contiguous())    # often False
z = y.contiguous().view(-1, 4) # 3, 2,4 view changes to 6, 4 
print(z.shape)


# ques2 
x = torch.randn(2, 3,4 )
print(x.is_contiguous())
y = x.permute(2,1,0  ) # .permute often needs contiguous 
print(y.shape)
print(y.is_contiguous())
z = x.reshape(4, 3, 2 )
print(z.shape)
print(z.is_contiguous())

torch.Size([3, 2, 4])
False
torch.Size([6, 4])
True
torch.Size([4, 3, 2])
False
torch.Size([4, 3, 2])
True


In [82]:
A = torch.randn(5, 3, 4)
B = torch.randn(5, 4, 2)
C = torch.bmm(A, B)
print(C.shape) # 5 3 2 
A = torch.randn(10, 32, 100)
B = torch.randn(10, 100, 64)
C = torch.matmul(A, B, )
print(C.shape)
#. torch.bmm → when you’re multiplying clean batches inside a model (e.g., attention heads).
#torch.matmul → when your shapes might vary or need broadcasting flexibility.

torch.Size([5, 3, 2])
torch.Size([10, 32, 64])


In [88]:
x = torch.tensor([[1], [2], [3]]) # shap 3 , 1

y = x.expand(3,4)
print(y.shape)
print(y)
z =  x.repeat(3,4)
print(z.shape)

torch.Size([3, 4])
tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])
torch.Size([9, 4])


In [89]:
a = torch.tensor([[1,2,3],
                  [4,5,6]])
b = torch.tensor([[10],
                  [20]])
print(a+b) # 10 12 13, 25, 25, 26

tensor([[11, 12, 13],
        [24, 25, 26]])


In [92]:
x = torch.arange(8).reshape(2, 2, 2)
y = x.permute(2, 0, 1)
print(y.shape)
x

torch.Size([2, 2, 2])


tensor([[[0, 1],
         [2, 3]],

        [[4, 5],
         [6, 7]]])

In [95]:
a = torch.randn(3, 1, 5)
b = torch.randn(1, 4, 1)
c = a + b
c.shape


torch.Size([3, 4, 5])

In [99]:
assert torch.allclose(c[0, :, 0 ] ,  b[0, :, 0 ]+ a [0, 0 ,0 ])

In [104]:
# torch repeat vs expand
x = torch.tensor([[1], [2], [3]])      # (3,1)
y = x.expand(3, 4)
z = x.repeat(1, 4)
z
# .expand() just views the same data in memory, pretending the tensor grew by using stride tricks (no copy).
#.repeat() actually copies data, creating a fresh chunk of memory.

tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])

tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])

In [106]:
class WeirdSquare(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # save tensor for backward
        ctx.save_for_backward(input)
        return input**2

    @staticmethod
    def backward(ctx, grad_output):
        # compute custom grad
        x, = ctx.saved_tensors

        return 4*x
x = torch.tensor(3.0, requires_grad=True)
y = WeirdSquare.apply(x)
y.backward()
print(x.grad)  # should be 12

tensor(12.)


In [None]:
class SmoothL1Beta(torch.autograd.Function):
    @staticmethod
    def forward(ctx, y_hat, y, beta):
        beta_clamped = beta.clamp_min(1e-6)
        r = y_hat - y
        mask = (r.abs() < beta_clamped)
        ctx.save_for_backward(r, beta_clamped, mask)
        loss = torch.where(mask,
                           0.5 * r**2 / beta_clamped,
                           r.abs() - 0.5 * beta_clamped)
        return loss.sum()

    @staticmethod
    def backward(ctx, grad_output):
        r, beta_clamped, mask = ctx.saved_tensors

        # ∂L/∂r (which is also ∂L/∂y_hat)
        grad_r = torch.where(mask, r / beta_clamped, r.sign())

        # ∂L/∂β
        grad_beta = torch.where(mask,
                                -0.5 * r**2 / (beta_clamped**2),
                                -0.5 * torch.ones_like(beta_clamped))

        grad_yhat = grad_output * grad_r
        grad_y = -grad_yhat   # because L depends on (y_hat - y)
        grad_beta = grad_output * grad_beta

        return grad_yhat, grad_y, grad_beta


In [126]:
def stable_logsumexp(x, dim = 1):
    x_max = x.max(dim=dim, keepdim=True).values
    return x_max+ torch.log(torch.sum(torch.exp(x - x_max), dim=dim, keepdim=True))
    # pull xmax outside 
x = (torch.randn(4, 7) * 10).requires_grad_() 
y = stable_logsumexp(x, dim=1).sum()
y.backward()
assert torch.allclose(x.grad, torch.softmax(x, dim=1), atol=1e-5, rtol=1e-5)


In [None]:
# LayerNorm backward check (spot the bug)
mu = x.mean(dim=-1, keepdim=True)
var = ((x - mu)**2).mean(dim=-1, keepdim=True)
y = (x - mu) / torch.sqrt(var + 1e-5)
out = y * gamma + beta

In [128]:
# gradient vs activation checkpoinintg 

import torch
# Activation Checkpointing
from torch.utils.checkpoint import checkpoint

# Example function that logs what it's doing
def fn(x):
    print("  running forward fn()")
    return x ** 2

x = torch.tensor(3.0, requires_grad=True)

print("---- normal forward ----")
y1 = fn(x)
y1.backward()
print("  backward done\n")

x = torch.tensor(3.0, requires_grad=True)
print("---- checkpointed forward ----")
y2 = checkpoint(fn, x)
y2.backward()
print("  backward done\n")




---- normal forward ----
  running forward fn()
  backward done

---- checkpointed forward ----
  running forward fn()
  running forward fn()
  backward done



Gradient Checkpointing (the broader umbrella term)

Technically, activation checkpointing is a specific kind of gradient checkpointing.
“Gradient checkpointing” means any strategy that saves only a subset of activations (“checkpoints”) and recomputes everything else on backward to reduce memory load.

So:

“Activation checkpointing” → PyTorch’s implementation: store only segment boundaries, recompute insides.

“Gradient checkpointing” → general term covering that plus fancier graph-level algorithms like:

Rematerialization (used in JAX, DeepSpeed)

Gradient-remap checkpointing (saves intermediate grads instead of activations)

Selective gradient offload (move grads/acts to CPU or NVMe, not recompute them)

ZeRO-Offload / DeepSpeed Stage-3 (shard or offload gradients, parameters, optimizer states)

In [None]:
## mixed precision logic 
# scaler = torch.cuda.amp.GradScaler()
# for inputs, labels in dataloader:
#     optimizer.zero_grad()
#     with torch.cuda.amp.autocast():
#         outputs = model(inputs)
#         loss = loss_fn(outputs, labels)
#     scaler.scale(loss).backward()
#     scaler.step(optimizer)
#     scaler.update()
# It automatically runs each operation in the cheapest safe precision:

# FP16/BF16 for matmuls, convolutions, attention blocks — anything with big, well-behaved values.

# FP32 for operations prone to numerical instability — e.g., softmax, normalization, variance, loss functions.
# PyTorch tracks this automatically via op-specific “whitelists” and “blacklists.”


# 2. scaler.scale() / scaler.step()

# scaler.scale(loss) multiplies the loss by a dynamic scaling factor before backward, magnifying small gradients so they don’t underflow to zero in FP16.

# The backward pass computes scaled gradients in FP16/FP32 mix.

# scaler.step(optimizer) first unscales those gradients (divides by the same scale), checks for NaNs/Infs, and only updates weights if they’re finite.

# scaler.update() adjusts the scale up or down depending on whether gradients overflowed.
#so we scale up the loss, compute gradient, scale down by the smae scale , check for inf or Nans and then update, adjust the scale usign scale.update() if needed


# 3. Why this matters
# FP16 has only ~10 bits of mantissa → values smaller than ~1e-7 vanish (underflow).
# Without scaling, gradients can become all zeros or NaN.
# AMP keeps most compute in FP16 for speed and memory efficiency, but protects numerically sensitive ops and gradients in FP32 for precision.

In [133]:
import torch

# --- Tensors for Your Practice ---

# A 2D tensor (a matrix)
tensor_a = torch.tensor([[1, 2, 3, 4],
                         [5, 6, 7, 8],
                         [9, 10, 11, 12]], dtype=torch.float32)

# A 3D tensor representing a small batch of 2 images, each with 1 channel and 3x3 pixels
tensor_b = torch.arange(18, dtype=torch.float32).reshape(2, 1, 3, 3)

# A 1D tensor (a vector)
tensor_c = torch.tensor([10, 20, 30])

# A tensor with random values for matrix multiplication practice
tensor_d = torch.randn(4, 5)


# --- Print tensors to see what you're working with ---
print("Tensor A (shape {}):\n{}".format(tensor_a.shape, tensor_a))
print("\nTensor B (shape {}):\n{}".format(tensor_b.shape, tensor_b))
print("\nTensor C (shape {}):\n{}".format(tensor_c.shape, tensor_c))
print("\nTensor D (shape {}):\n{}".format(tensor_d.shape, tensor_d))

Tensor A (shape torch.Size([3, 4])):
tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])

Tensor B (shape torch.Size([2, 1, 3, 3])):
tensor([[[[ 0.,  1.,  2.],
          [ 3.,  4.,  5.],
          [ 6.,  7.,  8.]]],


        [[[ 9., 10., 11.],
          [12., 13., 14.],
          [15., 16., 17.]]]])

Tensor C (shape torch.Size([3])):
tensor([10, 20, 30])

Tensor D (shape torch.Size([4, 5])):
tensor([[-0.5172, -1.1899, -0.3756,  0.0917,  2.8955],
        [-0.9880,  0.0651,  0.7884,  0.3716,  0.2358],
        [ 0.9267,  1.2892, -0.8128,  1.4235,  0.8271],
        [ 1.7586,  1.6394,  0.7147, -1.2115, -0.7852]])


In [168]:
tensor_a

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])

In [None]:
tensor_b.shape

tensor([[[[ 0.,  1.,  2.],
          [ 3.,  4.,  5.],
          [ 6.,  7.,  8.]]],


        [[[ 9., 10., 11.],
          [12., 13., 14.],
          [15., 16., 17.]]]])

In [174]:
max_b  = torch.max(tensor_b)
tensor_b = tensor_b/ max_b

In [200]:
import torch
import math

# --- Tensors for Advanced Practice ---

# A batch of logits (raw outputs from a model)
# Shape: (batch_size, num_classes)
logits = torch.tensor([[2.0, 1.0, 0.1],
                       [-0.5, 1.5, 2.5]], dtype=torch.float32)

# A batch of integer class labels for one-hot encoding
# Shape: (batch_size,)
labels = torch.tensor([0, 2])

# Tensors for a simplified self-attention mechanism
# Shape: (batch_size, sequence_length, embedding_dim)
query = torch.randn(2, 4, 8) # 2 batches of 4 tokens, each with an 8-dim embedding
key = torch.randn(2, 4, 8)
value = torch.randn(2, 4, 8)

# A target tensor for the gradient problem
target = torch.tensor([1.0, 1.0])

In [201]:
def softmax(z):
    z = torch.exp(z)
    return z/ torch.sum(z, dim = 1 , keepdim= True)

In [202]:
softmax(logits)

tensor([[0.6590, 0.2424, 0.0986],
        [0.0351, 0.2595, 0.7054]])

In [204]:
n_classes = logits.shape[1]
labels = labels.unsqueeze(1)
one_hot = torch.zeros_like(logits) # batch x classes 
one_hot.scatter_(dim= 1, src = 1, index = labels )

TypeError: scatter_() received an invalid combination of arguments - got (src=int, index=Tensor, dim=int, ), but expected one of:
 * (int dim, Tensor index, Tensor src)
      didn't match because some of the arguments have invalid types: (dim=int, index=Tensor, !src=int!, )
 * (int dim, Tensor index, Tensor src, *, str reduce)
 * (int dim, Tensor index, Number value)
      didn't match because some of the keywords were incorrect: src
 * (int dim, Tensor index, Number value, *, str reduce)


In [226]:
src  = torch.arange(1,11, dtype = torch.float32).reshape((2,5))
index = torch.tensor([[0, 1, 2, 0]])
target = torch.zeros(5,10,dtype = torch.float32)
target.shape

torch.Size([5, 10])

In [227]:
target.scatter_(1, index, src )

tensor([[4., 2., 3., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [229]:
import torch
import torch.nn as nn

# --- Data for a simple linear regression problem ---
# We want to learn the relationship: y = 3x_1 + 2x_2 + noise
X = torch.randn(100, 2)
noise = torch.randn(100, 1) * 0.1
y = 3 * X[:, 0:1] + 2 * X[:, 1:2] + noise

# --- Data for an embedding lookup problem ---
# An embedding matrix for a vocabulary of size 10, with 4-dim vectors
embedding_matrix = torch.rand(10, 4)

# A batch of 2 "sentences", each with 5 word indices
sentences_indices = torch.tensor([[1, 5, 2, 8, 0],
                                  [3, 4, 6, 9, 7]], dtype=torch.long)

In [253]:
class LinearRegression(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2,1)
    def forward(self, x):
        y = self.linear(x)
        return y 
model = LinearRegression()
output = model(X)
print(output.shape)
loss = nn.MSELoss()
optim = torch.optim.SGD(model.parameters(), lr = 0.01)
for _ in range(1000):

    y_hat = model(X)
    mse_loss =loss(y_hat, y )
    optim.zero_grad()
    mse_loss.backward()
    optim.step()
    
print(mse_loss.item())

torch.Size([100, 1])
0.00871671736240387


In [258]:
model.linear.bias

Parameter containing:
tensor([0.0002], requires_grad=True)

In [261]:

# --- Data for an embedding lookup problem ---
# An embedding matrix for a vocabulary of size 10, with 4-dim vectors
embedding_matrix = torch.rand(10, 4)

# A batch of 2 "sentences", each with 5 word indices
sentences_indices = torch.tensor([[1, 5, 2, 8, 0],
                                  [3, 4, 6, 9, 7]], dtype=torch.long)

A = embedding_matrix[sentences_indices]
A.shape

torch.Size([2, 5, 4])

In [262]:
# 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- Data for the new problems ---

# A list of tensors, each representing a sentence with different lengths.
# The numbers are word indices.
sentences = [
    torch.tensor([4, 1, 7, 2]),
    torch.tensor([8, 2]),
    torch.tensor([5, 9, 3, 6, 8, 1])
]

# A simple model you might have trained earlier (from previous questions)
class LinearRegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)

    def forward(self, x):
        return self.linear(x)

In [263]:
import torch

# Starter tensors
a = torch.arange(24).reshape(2, 3, 4).float()
b = torch.randn(2, 3, 4)