In [10]:
# Conceptual: Cross-Entropy vs. KL Divergence
# Answer:The Cross-Entropy 
# $H(P, Q)$ is $H(P, Q) = -\sum P(x) \log Q(x)$.
# The KL Divergence $D_{KL}(P || Q)$ is $\sum P(x) \log\left(\frac{P(x)}{Q(x)}\right)$.
# If you expand the KL Divergence formula:DKL​(P∣∣Q)=∑P(x)logP(x)−∑P(x)logQ(x)DKL​(P∣∣Q)=−H(P)+H(P,Q)
# This means $H(P, Q) = D_{KL}(P || Q) + H(P)$, where $H(P)$ is the entropy of the true distribution $P$.
# The punchline: In classification, our true distribution $P$ is a one-hot vector (e.g., [0, 0, 1, 0]). 
# The entropy of this distribution, $H(P)$, is a constant 0.Therefore, for classification, 
# minimizing the Cross-Entropy is mathematically identical to minimizing the KL Divergence, 
# and Cross-Entropy is simpler to compute.
# Assume a batch size of 2, with 3 classes
import torch 
B, C = 2, 3
# Model output (logits)
logits_q = torch.tensor([
    [0.5, 0.2, 0.3],  # Sample 1
    [0.1, 0.8, 0.1]   # Sample 2
])
# Target output (logits)
logits_p = torch.tensor([
    [0.6, 0.1, 0.3],  # Sample 1
    [0.1, 0.8, 0.1]   # Sample 2
])

log_probs_Q = torch.log_softmax(logits_q,dim = 1 )
log_probs_P = torch.log_softmax(logits_p, dim = 1)
kl_elements = log_probs_P - log_probs_Q
kl_per_sample = torch.sum(logits_p*kl_elements, dim=1)
torch.mean(kl_per_sample)

tensor(0.0183)

In [30]:
# RMS Norm 
N = 10 
features = 14
import torch.nn as nn 
X = torch.randn(N, features )
# normalise using RMS norm 
class RMSNorm(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.epsilon = 1e-6 
        self.hidden_size = hidden_size
        self.weights = nn.Parameter(torch.ones(hidden_size)) # this is the learnable part 

    def forward(self, X):
        # last dimention of X is hidden size, so its N, hidden size lets say 
        rms = torch.sqrt(torch.mean(X**2,dim=-1,keepdim=True) + self.epsilon)
        X = X/ rms
        X = self.weights*X
        
        return X

rmsnorm = RMSNorm(hidden_size=features)
y=  rmsnorm(X)

In [42]:
# layernorm 
# X is the same 
N = 14 
features = 19
X = torch.randn(N, features )
class LayerNorm(nn.Module):
    def __init__(self, hidden_size, epsilon):
        super().__init__()
        self.hidden_size = hidden_size
        self.epsilon = epsilon
        self.beta = nn.Parameter(torch.zeros(hidden_size))
        self.gamma = nn.Parameter(torch.ones(hidden_size))
    def forward(self,X):
        mean = torch.mean(X, dim= 1, keepdim=True)
        #variance = torch.var(X, dim = 1, keepdim=True)
        # torch.var() by default calculates the unbiased sample variance (using $N-1$ as the denominator).
        variance = torch.mean((X - mean)**2, dim=-1, keepdim=True)
        X = (X-mean)/torch.sqrt(variance+self.epsilon) 
        return self.gamma*X + self.beta


layernorm = LayerNorm(hidden_size=features, epsilon= 1e-6)
#layernorm(X)

In [47]:
# grouped query attention 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Synthetic data
torch.manual_seed(42)
batch_size = 3
seq_len = 4
d_model = 8
num_heads = 2

q = torch.rand(batch_size, seq_len, d_model)
k = torch.rand(batch_size, seq_len, d_model)
v = torch.rand(batch_size, seq_len, d_model)
print(q.shape)

device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

import torch
import torch.nn as nn
import torch.nn.functional as F
# lets do Multi Head, MutliQuery and Group Query 
# MHA: $N$ Query heads, $N$ Key heads, $N$ Value heads.
# MQA: $N$ Query heads, $1$ Key head, $1$ Value head.
# GQA: $N$ Query heads, $G$ Key heads, $G$ Value heads. 
# (Where $G$ is a small number, and $N$ is divisible by $G$).

torch.Size([3, 4, 8])


In [72]:
d_model = 512 
num_heads_q =8
num_heads_kv = 2 
head_dim = 64 
# 2 K/V heads (So, 4 Q heads will share one K/V head)
class GroupedQueryAttention(nn.Module):
    def __init__(self, d_model: int, num_heads_q: int, num_heads_kv: int, head_dim: int):
        super().__init__()
        self.d_model = d_model 
        self.num_heads_kv = num_heads_kv
        self.num_heads_q = num_heads_q 
        self.head_dim = head_dim
        self.q_dim = head_dim* num_heads_q
        kv_dim = head_dim*num_heads_kv
        self.num_groups = num_heads_q // num_heads_kv
        # we need learnable parameters, for WQ, WK, WV and WO 
        self.Wq = nn.Linear(d_model,self.q_dim, bias = False) 
        self.Wk = nn.Linear(d_model, kv_dim, bias = False)
        self.Wv = nn.Linear(d_model, kv_dim, bias = False)
        self.Wo = nn.Linear(self.q_dim, d_model ,bias = False)
    def forward(self,x, mask = False):
        q = self.Wq(x) # batch , seq_len, q_dim
        k = self.Wk(x) # batch , seq_len, kvdim
        v = self.Wv(x) # batch , seq_len, kvdim
        batch = x.shape[0]
        q = q.view(batch, -1, self.num_heads_q,head_dim  ).transpose(1,2)
        v = v.view(batch, -1, self.num_heads_kv,head_dim  ).transpose(1,2)
        k = k.view(batch, -1, self.num_heads_kv,head_dim  ).transpose(1,2)
        if self.num_groups>1:
            k = k.repeat_interleave(self.num_groups, dim=1)
            v = v.repeat_interleave(self.num_groups,dim=1)
        # now all of them are in batch , num_heads_q , seq_len, head_dim 
            

        scores = F.scaled_dot_product_attention(q, k, v , attn_mask=None).transpose(1,2).contiguous().view(batch, -1,self.q_dim )
        output = self.Wo(scores)
        return output
X = torch.rand(10, 20, 512 )
d_model = 512 
num_heads_q =8
num_heads_kv = 2 
head_dim = 64 
f = GroupedQueryAttention(d_model, num_heads_q, num_heads_kv, head_dim)
y = f.forward(X)

In [82]:
d_model = 512 
num_heads_q =8
num_heads_kv = 2 
head_dim = 64 
# 2 K/V heads (So, 4 Q heads will share one K/V head)
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, head_dim: int):
        super().__init__()
        self.d_model = d_model 
       
        self.head_dim = head_dim
        self.n_heads = d_model//head_dim
        self.Wq = nn.Linear(d_model,d_model ,bias = False) 
        self.Wk = nn.Linear(d_model, d_model, bias = False)
        self.Wv = nn.Linear(d_model, d_model, bias = False)
        self.Wo = nn.Linear(d_model, d_model ,bias = False)
    def forward(self,x, mask = False):
        q = self.Wq(x) # batch , seq_len, d_model
        k = self.Wk(x) # batch , seq_len, d_model
        v = self.Wv(x) # batch , seq_len, d_model
        batch = x.shape[0]
        q = q.view(batch, -1, self.n_heads, head_dim  ).transpose(1,2)
        v = v.view(batch, -1, self.n_heads,head_dim  ).transpose(1,2)
        k = k.view(batch, -1, self.n_heads,head_dim  ).transpose(1,2)
        # if self.num_groups>1:
        #     k = k.repeat_interleave(self.num_groups, dim=1)
        #     v = v.repeat_interleave(self.num_groups,dim=1)
        # now all of them are in batch , num_heads_q , seq_len, head_dim 
            

        scores = F.scaled_dot_product_attention(q, k, v , attn_mask=None).transpose(1,2).contiguous().view(batch, -1,self.d_model )
        output = self.Wo(scores)
        return output
X = torch.rand(10, 20, 512 )
d_model = 512 
head_dim = 64 
f = MultiHeadAttention(d_model, head_dim)
y = f.forward(X)
y.shape

torch.Size([10, 20, 512])

In [84]:
xb = torch.tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
                [25, 17, 27, 10,  0, 21,  1, 54]])



In [118]:

block_size = 32
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [143]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    def generate(self,idx,  max_new_tokens=100):
        # idx is the geenrations so far or the prompt B, 
        for _ in range(max_new_tokens):
            logits, loss = self(idx) 
            logits = logits[:, -1, : ]
            probs = F.softmax(logits, dim = -1)
            idx_next = torch.multinomial(probs, num_samples = 1 )
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx 

In [144]:
yb

tensor([[41, 59, 40, 47, 52, 43,  8,  0,  0, 23, 21, 26, 19,  1, 17, 16, 35, 13,
         30, 16,  1, 21, 34, 10,  0, 37, 53, 59,  1, 41, 39, 60],
        [ 1, 57, 50, 39, 60, 43, 57,  6,  1, 21,  1, 41, 39, 52,  1, 58, 43, 50,
         50,  1, 63, 53, 59,  1, 52, 43, 61, 57,  6,  7,  7,  1],
        [47, 44, 43, 57, 58, 43, 42, 11,  0, 35, 46, 47, 41, 46,  6,  1, 58, 46,
         53, 59, 45, 46,  1, 58, 46, 53, 59,  1, 61, 53, 59, 50]])

In [145]:
xb, yb = get_batch("train")
vocab_size= 65
model = BigramLanguageModel(vocab_size)
# yb = torch.tensor([[43, 58,  5, 57,  1, 46, 43, 39],
#         [53, 56,  1, 58, 46, 39, 58,  1],
#         [58,  1, 58, 46, 39, 58,  1, 46],
#         [17, 27, 10,  0, 21,  1, 54, 39]])
logits, loss = model(xb,yb)
y = model.generate(xb, 100 )


In [146]:
# pytorch optimisation object 
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-3 )
for i in range(10000):
    xb,yb  = get_batch("train")
    logits, loss = model(xb,yb )
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

context = torch.zeros((1, 1), dtype=torch.long, device=device)


2.3859660625457764


In [150]:
print(decode(model.generate(context, max_new_tokens=2000)[0].tolist()))



plave athalprowfilaileno BAhezyothicod'he at cohe hrveldekod apt Ihe as o t; ig nd, Whes, th nd;
Sthee manyoincedsare, inkiourext memo averel'd m p
We?X;
S:
Whiomake t was ierast:

'lsth te ond s wot w w!; ce;
TCa ar I lll he o wn on on,
OPny; Apondithe byowhoyeckisot Vo s ne u cof is co ntofougm I fue?
OLHAMENNS:
FzThy ffon:
Hot t mnt wo n Yote?
HESathe cr:
INNGwel s
SSS myomocad,
Ked ofuthedsende ng
KI:
ARY g seRLe o-LARTIORoretrtithe?
NS:
Th wrtowrifous fes'se.
Thamedes thes s yousho gr pitir je Y t ale; atwnolmy, tu cepithiel glese-ltim'd ded or ivincewisth:
Bu hugeyowbe
Byech IZve wan hirpres m cheat'd OPINe ss:
Ylenournde avequt, pa y p, spldyoure thed

Loffoukkericeprmea shrennow.
Fad ow t bacerom breatthers ot IUSCOGLean g u iogo'th a, he. amom;
Fin y boloute tulf a itou llal if:
Bzandroveathou telve; bu ais WUENGLUET:
Ay, avetcestit yo?--hes s Iowhato-the t s:
Myofe sl.
ANThad afus

nou?
Boof fime ady
HEShonorse sh t f it by s, ory mpte fa t ke torw:
Trear.

yond?
O:

LI nogu

In [173]:
B, T , C = 4,8, 2
torch.manual_seed(42)
x = torch.randn(B,T,C)
xbow = torch.zeros((B,T, C))
for b in range(B):
    for t in range(T):
        x_prev = x[b,:t+1]
        xbow[b,t]= torch.mean(x_prev, 0 )
a = torch.tril(torch.ones(T, T))
a = a / a.sum(1, keepdim= True)
y = a@x
torch.allclose(xbow, y)

True

In [177]:
# 3 
tril = torch.tril(torch.ones(T, T)) 
wei = torch.zeros(T, T )
wei = wei.masked_fill(tril==0, float("-inf"))
wei = F.softmax(wei, dim = -1 )
xbow3 = wei@x

In [204]:
# lora matrix 
d = 10 
k = 10 
import numpy as np 
W_rank = 2
W = torch.randn(d, W_rank) @ torch.randn(W_rank, k)
W_rank = np.linalg.matrix_rank(W)
U, S, V = torch.svd(W)
Ur = U[:, :W_rank]
Sr = torch.diag(S[:W_rank])
Vr = V[:, :W_rank].t()
B = Ur@Sr
A = Vr
d = 10
bias = torch.randn(d)
x = torch.randn(d)
y = (B@A)@x + bias 
y2 = W@x + bias 
print(W.numel())
print(B.numel()+ A.numel())
torch.allclose(y, y2)

100
40


True

In [199]:
# 

tensor([-0.0651, -6.2567,  2.2067,  4.6670,  3.5105,  1.7308,  2.3720, -1.6852,
         2.8738, -0.4953])

In [201]:
y

tensor([-0.0651, -6.2567,  2.2067,  4.6670,  3.5105,  1.7308,  2.3720, -1.6852,
         2.8738, -0.4953])

In [259]:
# Tasks 1

# Create a tensor x of shape (2, 3, 4).

# Make y by summing over the last dim to shape (2, 3).

# Make z so it broadcasts with y to add back into x without copying.

# Given a with shape (10, 1), turn it into (1, 10) three different ways.

# Take a batch image tensor imgs of shape (N, C, H, W) and permute it to (N, H, W, C).

# Show why in-place ops can break autograd using +=.

# Move a tensor to GPU if available, otherwise CPU.

# Show the difference between view and reshape when the underlying storage isn’t contiguous.

x = torch.randn(2,3 , 4)
y = torch.sum(x, dim = -1)
print(y.shape) # 2,3
y1 = y.unsqueeze(2)
print(y1.shape)
z = x+y1
print(z.shape)
a = torch.randn(10,1)
b = a.permute(1,0)
print(b.shape)
c = a.reshape(a.shape[1], a.shape[0])
d = a.view(-1, a.shape[0])
print(c.shape, d.shape)

A = torch.randn(10, 3, 4, 8 )
B = A.transpose(1,2)
print(B.shape)
# in place operations can break autograd 
X = torch.randn(12, 3, requires_grad=True)
Y = X**2 
Y+=1
Z = torch.mean(Y+2)

Z.backward()
print(X.grad)
# this works and gives an output of shape 12 3 
# XX = torch.randn(12, 3, requires_grad=True)
# YY = XX**2 
# YY+=1
# ZZ = 2
# ZZ+= torch.mean(YY+2)
# ZZ.backward()
# XX.grad

# Show the difference between view and reshape when the underlying storage isn’t contiguous.
a_vector = torch.randn(10, 3)
# this is true 
a_vector.is_contiguous()
b_vector = a_vector.transpose(1, 0 )
print(b_vector.is_contiguous()) # this is false 
#print(b_vector.view(-1, 30 ))# this needs the vector to be conitguous 
print(b_vector.reshape(30))

torch.Size([2, 3])
torch.Size([2, 3, 1])
torch.Size([2, 3, 4])
torch.Size([1, 10])
torch.Size([1, 10]) torch.Size([1, 10])
torch.Size([10, 4, 3, 8])
tensor([[-0.0427,  0.0239, -0.0104],
        [ 0.1114,  0.0105,  0.0509],
        [-0.0551,  0.0556,  0.0309],
        [ 0.0179,  0.0057,  0.0958],
        [-0.0476,  0.0057, -0.0829],
        [ 0.0097, -0.1204, -0.0503],
        [ 0.0721, -0.0390,  0.0262],
        [-0.0077,  0.0022,  0.0396],
        [-0.0360, -0.0171, -0.0558],
        [-0.0221, -0.0195, -0.0934],
        [ 0.0897, -0.1019, -0.0475],
        [ 0.0301,  0.0503,  0.0614]])
False
tensor([ 2.1315, -2.3444,  0.7151,  1.5471, -1.7347,  1.0947, -1.1606, -0.8675,
         0.7789, -1.3507,  0.4524, -0.3931,  1.0412, -1.0035,  1.5404,  1.3737,
        -2.1651, -0.3115, -1.1131,  0.9503,  0.3048,  1.2917,  0.1542,  1.2265,
        -0.1243, -0.9948, -0.9691,  0.8433,  1.1119, -1.4676])


In [264]:
x = torch.tensor([2.0], requires_grad=True)
x = x +1
y = x * x
y += 1  # modifies y in place
z = y * x
z.backward()
x.grad

  x.grad


In [267]:
x = torch.randn(4, requires_grad=True)
y = x**2
z = y.mean()
z.backward()

In [None]:
# show that the checkpoint works
from torch.utils.checkpoint import checkpoint
def checkpoint_function(x):# this will throw away the computations fro activations if enabled 
    print("throwing checkpoints")
    return x*2
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10,10)
    def forward(self, x):
        print("this iis the forward passs")
        x = checkpoint(checkpoint_function, x, use_reentrant=False)  # if set to false u can use the activations from thsi again, means there is no checkpoitning  
        # by default this is true, but to checkpoiutn we set this to True # so we run the forward again if we can enteer again 
        return self.linear(x)
model = MyModel()
x = torch.randn(4, 10, requires_grad=True)
y = model(x)
print(y)
y.sum().backward()

this iis the forward passs
throwing checkpoints
tensor([[ 0.7021,  0.4966, -1.7743, -1.1046, -0.0101,  0.4191, -0.6398, -0.6989,
         -0.7336, -0.1511],
        [-1.3401, -0.6799,  1.6131,  0.7541,  0.7405,  0.9105,  0.5754,  0.5524,
         -0.4698,  0.5182],
        [ 1.0117, -0.9354,  0.4439,  1.4630,  0.2563, -0.6287,  0.2358, -0.2923,
          0.8597, -1.0530],
        [ 0.5218, -0.1538, -0.5312,  0.1068,  0.0176, -0.6218, -0.2450,  0.0115,
         -0.9111,  0.3211]], grad_fn=<AddmmBackward0>)


In [345]:
# Q1. Create a 3-layer MLP in PyTorch (no Sequential) with ReLU activations, input dim 16, hidden 32, output 10. 
# Forward should take (batch, 16) and return (batch, 10).

# Q2. Write a training loop for 5 epochs using MSELoss and Adam optimizer. Use random tensors for both input and target. Print loss every epoch.
torch.manual_seed(42)
class MLP(nn.Module):

    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(16, 32)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(32, 10 )
        self.batchnorm = nn.BatchNorm1d(10)
        self.dropout  = nn.Dropout(0.9)
    def forward(self, x):
        # shape of forward is batch , 16 
        layer1 = self.l1(x) # shape is batch , 32 
        relu = self.relu(layer1)
        layer2 = self.l2(relu )
        layer_after_drop = self.dropout(layer2)
        layer_after_drop = self.batchnorm(layer_after_drop)
        return layer_after_drop
batch = 10 
X  = torch.randn(batch, 16 )
y = torch.randn(batch ,10 )
print(X.shape)
n_epochs = 5
loss = nn.MSELoss()
model = MLP()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3 )
model.train()
for _ in range(n_epochs):
    
    logits = model(X)
    
    loss_ = loss(logits, y) 
    for param in model.parameters():
        loss_+= torch.norm(param)**2
        
    optimizer.zero_grad()
    loss_.backward() 
    
    # clip the gradients 
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
    
    optimizer.step()
    

torch.Size([10, 16])


In [346]:
import torch, torch.nn as nn
torch.manual_seed(0)

x = torch.randn(10000, 32)  # large batch to smooth randomness
drop = nn.Dropout(p=0.5)

# Train mode: dropout active, survivors scaled by 1/(1-p)=2
drop.train()
yt = drop(x)
print("train mean ≈", yt.mean().item(), "train std ≈", yt.std().item())

# Eval mode: dropout off
drop.eval()
ye = drop(x)
print("eval  mean ≈", ye.mean().item(), "eval  std ≈", ye.std().item())


train mean ≈ -0.0027171927504241467 train std ≈ 1.4131661653518677
eval  mean ≈ -0.0038609839975833893 eval  std ≈ 1.0004689693450928


In [369]:
# autograd 
class Autograd(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x**3 
    @staticmethod
    def backward(ctx,grad_output):
        x, = ctx.saved_tensors
        return 3*grad_output*x**2

# Use the function
x = torch.randn(3, 4, requires_grad=True)
y = Autograd.apply(x)
y.sum().backward()

print("x:", x)
print("Analytical gradient (autograd):", x.grad)

x: tensor([[-6.4725e-01,  9.2750e-02, -4.7973e-02, -1.0900e+00],
        [ 1.8613e-01, -9.8721e-05, -7.0357e-01, -1.0379e+00],
        [ 7.3243e-01,  3.9538e-01,  4.6574e-01,  1.9137e-01]],
       requires_grad=True)
Analytical gradient (autograd): tensor([[1.2568e+00, 2.5808e-02, 6.9044e-03, 3.5644e+00],
        [1.0393e-01, 2.9237e-08, 1.4850e+00, 3.2317e+00],
        [1.6094e+00, 4.6897e-01, 6.5073e-01, 1.0987e-01]])


In [None]:
# autograd 
def sigmoid(x):
    return 1/ (1+torch.exp(-x))
class Swish(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x*sigmoid(x)
    @staticmethod
    def backward(ctx,grad_output):
        x, = ctx.saved_tensors
        return grad_output*(sigmoid(x)+ x*sigmoid(x)*(1-sigmoid(x)) )

# Use the function
x = torch.randn(3, 4, requires_grad=True)
y = Swish.apply(x)
print(y.shape)
y.sum().backward()

print("x:", x)
print("Analytical gradient (autograd):", x.grad)
class MLP(nn.Module):
    def __init__(self, ):
        super().__init__()
        self.layer1 = nn.Linear(8,16)
        self.layer2 =  Swish()
        self.layer3 = nn.Linear(16,3)
    def forward(self, x):
        y = self.layer1(x)
        y = self.layer3(self.layer2.apply(y))
        return y 
x = torch.randn(10, 8)
model = MLP()
loss = nn.CrossEntropyLoss() # this takes the logits 
output = model(x)
loss_ = loss()

torch.Size([3, 4])
x: tensor([[ 0.3738,  3.2050,  1.3382, -1.0091],
        [ 0.1184, -2.1165,  0.0848, -1.8846],
        [-0.6045, -0.8623,  0.0271, -0.0036]], requires_grad=True)
Analytical gradient (autograd): tensor([[ 0.6826,  1.0811,  1.0125,  0.0696],
        [ 0.5591, -0.0956,  0.5424, -0.0839],
        [ 0.2152,  0.1169,  0.5135,  0.4982]])


  self.layer2 =  Swish()


torch.Size([10, 3])

In [385]:
import torch

# 1. Create two random tensors of shape (64, 100)
A = torch.randn(64, 100)
B = torch.randn(64, 100)
C = A[:, :20 ]
D = B[:, -30:]
E = torch.cat([C, D], dim=1)

In [388]:
import torch

x = torch.randn(8, 3, 224, 224)  # batch of 8 RGB images
print(x.shape)  # torch.Size([8, 3, 224, 224])
print(x.dtype, x.device)


torch.Size([8, 3, 224, 224])
torch.float32 cpu


In [None]:
# Q7. Minimal custom layer (10 pts)
# Write a tiny nn.Module called ScaledLinear that:

# takes in_features, out_features, scale in __init__

# computes y = scale * (x @ W.T + b)
# Properly register parameters, set a Kaiming-uniform init for W, zeros for b. 
# Keep device/dtype-safe (i.e., use register_buffer for scale if it’s a float).

class ScaledLinear(nn.Module):
    def __init__(self,in_features, out_features, scale):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale 
        self.weight = nn.Parameter(torch.randn(in_features, out_features))
        self.bias = nn.Parameter(torch.zeros(out_features))
        self.register_buffer(self.scale)
        #self.Weight = nn.Linear(in_features, out_features, bias = True)
    def forward(self, X):
        # assume X is hte shape  N, in_features
        y = X@self.weight + self.bias 
        return self.scale*y 
    

In [391]:
# Write a correct training step using torch.cuda.amp.autocast and GradScaler. 
# Assume model, optimizer, scaler, images, targets, criterion exist.
with torch.cuda.amp.autocast :
    optimizer.zero_grad(set_to_none=True)
    logits = model(x)
    loss  = ce_loss(logits, labels)
    # we scale the gradients
    scaler.scale(loss)
    loss.backward()
    scaler.update()
    optimizer.step()
    


TypeError: 'type' object does not support the context manager protocol

In [392]:
import torch
torch._logging.set_logs(graph_code=True)

In [394]:
def foo(x, y):
    a = torch.sin(x)
    b = torch.cos(y)
    return a + b
opt_foo1 = torch.compile(foo)

In [396]:
opt_foo1(torch.randn(3, 3), torch.randn(3, 3))

V1019 14:10:15.066000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [0/0] [__graph_code] TRACED GRAPH
V1019 14:10:15.066000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [0/0] [__graph_code]  ===== __compiled_fn_1_cdd1ceef_e6fd_4bb3_bfe0_6e4dc55a0b69 =====
V1019 14:10:15.066000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [0/0] [__graph_code]  /Users/pchiniya/Desktop/hoverboard_workspace/mine/.conda/lib/python3.11/site-packages/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
V1019 14:10:15.066000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [0/0] [__graph_code]     def forward(self, L_x_: "f32[3, 3][3, 1]cpu", L_y_: "f32[3, 3][3, 1]cpu"):
V1019 14:10:15.066000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [0/0] [__graph_code]         l_x_ = L_x_
V1019 14:10:15.066000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [0/0] [__graph_code]         l_y_ = L_y_
V1019 14:10:15.066000 13271 site-packages/torch/_dynam

tensor([[1.1524, 1.4718, 0.6141],
        [0.1689, 0.6164, 0.0785],
        [1.0145, 1.9452, 1.0423]])

In [397]:
@torch.compile
def opt_foo2(x, y):
    a = torch.sin(x)
    b = torch.cos(y)
    return a + b


print(opt_foo2(torch.randn(3, 3), torch.randn(3, 3)))

V1019 14:14:57.784000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [1/0] [__graph_code] TRACED GRAPH
V1019 14:14:57.784000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [1/0] [__graph_code]  ===== __compiled_fn_3_e5554d5c_6eca_4911_9ce2_d3aceeaf93d5 =====
V1019 14:14:57.784000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [1/0] [__graph_code]  /Users/pchiniya/Desktop/hoverboard_workspace/mine/.conda/lib/python3.11/site-packages/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
V1019 14:14:57.784000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [1/0] [__graph_code]     def forward(self, L_x_: "f32[3, 3][3, 1]cpu", L_y_: "f32[3, 3][3, 1]cpu"):
V1019 14:14:57.784000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [1/0] [__graph_code]         l_x_ = L_x_
V1019 14:14:57.784000 13271 site-packages/torch/_dynamo/output_graph.py:1667] [1/0] [__graph_code]         l_y_ = L_y_
V1019 14:14:57.784000 13271 site-packages/torch/_dynam

tensor([[-0.6722,  1.7498, -0.2148],
        [ 0.1570, -0.7484,  0.1241],
        [ 0.9981,  1.6794, -0.2474]])


In [447]:
import torch 
import torch.nn as nn 
from torch.utils.checkpoint import checkpoint # for activation checkpoitning 
from torch.nn.utils import clip_grad_norm_ # for clipping before update 
from torch.amp import autocast, GradScaler
class ToyBlock(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, in_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x
class ToyModel(nn.Module):
    def __init__(self, in_dim=128, hidden_dim=256):
        super().__init__()
        self.big_block = ToyBlock(in_dim, hidden_dim)
        self.head = nn.Linear(in_dim, 10)

    def forward(self, x):
        h = self.big_block(x)
        return self.head(h)
device = "cpu"
accum_steps = 4       # gradient accumulation factor
max_grad_norm = 1.0   # clipping threshold
n_batches = 20        # simulate 20 batches
model = ToyModel().to(device=device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scaler = GradScaler()
criterion = nn.CrossEntropyLoss()
model.train()
optimizer.zero_grad(set_to_none=True)

X = torch.randn(64, 128, device=device)
Y = torch.randint(0, 10, (64,), device=device)




In [452]:
def run_block(x):
    return model(x)
# checkpoint recomputes activations in backward, saves memory
with autocast(device_type="cpu"):
    logits  = checkpoint(run_block, X, use_reentrant=False)
    loss = criterion(logits, Y)
    
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
clip_grad_norm_(model.parameters(), max_grad_norm)

scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)

In [None]:
n_samples = 100 
n_features = 64
X = torch.randn((n_samples, n_features))
k = 4 
n_iter = 20
# start with k random centroids 
centroids = X[torch.randperm(n_samples)[:k]] # k, n_features #O(n)
# get the closest points to these centroids 
centroids = centroids.unsqueeze(1)
print(centroids.shape)
for i in range(n_iter):
    old_centroids = centroids.clone()
    euclidean_distances = torch.sqrt(torch.sum((X-centroids)**2, dim = -1)) # k X N 
    # so now we need to get labels for each datapoitn like where do they belong
    # we need to know what data points belong to cluster 0 , 1, 2, 3 
    cluster_indices = torch.argmin(euclidean_distances, dim = 0)
    for i in range(k):
        points = X[cluster_indices==i]
        # poitns will be a tensor of x, features 
        centroids[i] = torch.mean(points, dim = 0 )
    if torch.allclose(old_centroids, centroids):
        break 


torch.Size([4, 1, 64])


In [531]:
def knn_predict(X_train, y_train, X_test, k=3):
    # x train is of shape N, d 
    # y train fro shape N 
    # X_test is for B, d 
    X_test = X_test.unsqueeze(1) # so its now B, 1, d 
    euclidean_distances = torch.sum( (X_train - X_test)**2, dim = 2 )
    # B, N
    topk_labels,topk_indices = torch.topk(euclidean_distances, k=k, dim = 1, largest = False)
    # so this is B,k 
    # so for this the label for each is final mod for dim 1
    final_labels, _ = torch.mode(y_train[topk_indices])
    return final_labels


In [532]:
N, B, d = 100, 10, 64
X_train = torch.randn(N, d )
y_train = torch.randint(0,2, size=(N,))
X_test = torch.randn(B, d)

y = knn_predict(X_train, y_train, X_test, k=3)

In [None]:
def linear_forward(X, W, b):
    # (N, in_features)
    # (in_features, out_features)
    # (out_features,)
    y = X@W+b
    return y 

tensor([1, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [None]:
import math
def scaled_dot_product_attention(Q, K, V, mask=None):
    # given Q, K, V vectores we implemetn scaled dot product attneitn 
    dk = K.shape[-1]
    scores = torch.matmul(Q, K.transpose(1,2)) #N, seq_len, seq_len 
    
    scores = scores/ math.sqrt(dk)
    if mask is not None:
        scores= scores.masked_fill(mask==0, float("-inf"))
    # N, seqlen, seqlen scores 
    # V is N,
    scores = nn.functional.softmax(scores, dim = -1)
    print(scores.shape)
    weights = torch.matmul(scores, V) 
    return weights , scores

    


In [582]:
seq_len_q = seq_len_k = seq_len_v = 100
d_k = d_v = d_q = 64
N = 32
Q = torch.randn((N, seq_len_q, d_k))
K = torch.randn((N, seq_len_k, d_k))
V = torch.randn((N, seq_len_k, d_v))
mask = torch.tril(torch.randn(N, 1, 1, seq_len_k))
mask_decoder = torch.tril(torch.randn((N, 1, seq_len_q, seq_len_k)))
#scaled_dot_product_attention(Q, K, V, mask)

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model ,h ):
        super().__init__() # to import form the super class 
        self.d_model = d_model 
        self.h = h 
        assert d_model% h ==0 
        self.head_dim  = d_model// h 
        self.W_q= nn.Linear(d_model, d_model, bias = False)
        self.W_k =nn.Linear(d_model, d_model, bias = False)
        self.W_v = nn.Linear(d_model, d_model, bias = False)
        self.W_o = nn.Linear(d_model, d_model, bias = False)
    def forward(self, query, key , value, mask = None):
        # calculate the projects fro the learnable matrices for Qk V 
        query = self.W_q(query)
        key = self.W_k(key)
        value = self.W_v(value)
        batch , seq, dk = key.shape

        # split into heads for parallel compuations 
        query = query.view(batch, seq, self.h, self.head_dim ).transpose(1, 2)
        value  = value.view(batch, seq, self.h, self.head_dim ).transpose(1, 2)
        key = key.view(batch, seq, self.h, self.head_dim ).transpose(1, 2)

        # now these can go in to the function above 
        outputs, scores = scaled_dot_product_attention(query, key , value , mask )
        # batch, seq, self.h, self.head_dim
        outputs = self.W_o(outputs.transpose(1,2).contiguous().view(batch, seq, self.d_model))
        return outputs, scores

In [None]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.h = h 
        self.dff = d_ff 
        assert d_model% h ==0 
        self.dropout = nn.Dropout(dropout)
        self.mha = MultiHeadAttention(d_model, h)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.FFN = nn.Sequential(nn.Linear(d_model, d_ff),
                                 nn.ReLU(),
                                   nn.Linear(d_ff, d_model))
        self._init_weights()
    def _init_weights(self):
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0 )
                
    def forward(self, x, mask=None):
        # in encoder the inputs will be the same 
        # N, S, d_model 
        x_norm = self.layernorm1(x)
        attention_output, attention_weights = self.mha(x_norm, x_norm, x_norm, mask )
        x = x+ self.dropout(attention_output)

        # second vlock for layernorm 
        residual = x 
        ffn_output = self.FFN(self.layernorm2(x))
        x = residual + self.dropout(ffn_output)
        return x 

In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self,n_layers,  d_model, h, d_ff,vocab_size, max_seq_len, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.h = h 
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.dff = d_ff 
        assert d_model% h ==0 
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        self.layers = nn.ModuleList([TransformerEncoderLayer(d_model, h, d_ff,dropout) 
                                    for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)
        
        self.layernorm1 = nn.LayerNorm(d_model)
                
    def forward(self, x, mask=None):
        # in encoder the inputs will be the same 
        # N, seq_len 
        N, S = x.shape
        positions = torch.arange(0,S, device=x.device)
        pos = self.pos_emb(positions)
        token_emb = self.tok_emb(x)

        x_ = pos+ token_emb
        x = self.dropout(x_)
        for layer in self.layers:
            x = layer(x, mask )
        x = self.layernorm1(x)
        return x 

In [None]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.h = h 
        self.dff = d_ff 
        assert d_model% h ==0 
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.self_attn = MultiHeadAttention(d_model, h)
        self.cross_attention = MultiHeadAttention(d_model, h)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)
        self.FFN = nn.Sequential(nn.Linear(d_model, d_ff),
                                 nn.ReLU(),
                                   nn.Linear(d_ff, d_model))
                
    def forward(self, x, enc_output, lookahead_mask, padding_mask=None):
        # in encoder the inputs will be the same 
        # N, S, d_model 
        x_norm = self.layernorm1(x)
        attention_output, _ = self.self_attn(x_norm, x_norm, x_norm, lookahead_mask )
        x = x+self.dropout1(attention_output)

        x_norm2 = self.layernorm2(x)
        cross_attention_output, _ = self.cross_attention(x_norm2,enc_output, enc_output, padding_mask )
        x = x + self.dropout2(cross_attention_output)
        
        x_norm3 = self.layernorm3(x)
        ffn_output = self.FFN(x_norm3)
        x = x + self.dropout3(ffn_output)
        return x 

In [None]:
from torch.utils.checkpoint import checkpoint
class TransformerDecoder(nn.Module):
    def __init__(self,n_layers,  d_model, h, d_ff,vocab_size, max_seq_len, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.h = h 
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.dff = d_ff 
        assert d_model% h ==0 
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        self.layers = nn.ModuleList([TransformerDecoderLayer(d_model, h, d_ff,dropout) 
                                    for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)
        
        self.layernorm1 = nn.LayerNorm(d_model)
                
    def forward(self, x, enc_output, lookahead_mask, padding_mask=None):
        # in encoder the inputs will be the same 
        # N, seq_len 
        N, S = x.shape
        positions = torch.arange(0,S, device=x.device)
        pos = self.pos_emb(positions)
        token_emb = self.tok_emb(x)

        x_ = pos+ token_emb
        x = self.dropout(x_)
        for layer in self.layers:
            x = checkpoint(lambda input_x: layer(input_x, enc_output, lookahead_mask, padding_mask),x, use_reentrant=False)
        x = self.layernorm1(x)
        return x 

In [607]:
class MyReLUFunction(torch.autograd.Function):
    # this must inherit from a autograd class 
    # we have to call the super to handle the upper class functions 
    # super().__init__
    # this a static method which should be defined to have no leanrable parametrs
    @staticmethod
    def forward(ctx, input):
        # ctx is hte context object to keep track of the inputs 
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)
    @staticmethod
    def backward(ctx, grad_input):
        inputs = ctx.saved_tensors
        if inputs >0:
            return  grad_input*1 
        else:
            return 0 
    
    
gradient = MyReLUFunction()
gradient.apply(torch.tensor([1,2,3 , -1]))
# You don't call super().__init__() because torch.autograd.Function has no state.
# ctx is th etemoirtary satte that the autograd passes from forward to backward 
#The .apply() method is magically created by the base class (torch.autograd.Function).
#You inherit .apply() just by subclassing. It's the "magic" that connects your functions to PyTorch's computation graph.

  gradient = MyReLUFunction()


tensor([1, 2, 3, 0])

In [612]:
x = torch.tensor([-2.0, 0.5, 3.7, 10.0])
torch.clamp(x, min=0,max = 9 )
# → tensor([0.0, 0.5, 3.7, 5.0])


tensor([0.0000, 0.5000, 3.7000, 9.0000])

In [622]:
# register hooks 
# The hook you attach with tensor.register_hook(hook_fn) is a gradient hook.
# This is different from a module hook, which you attach to an nn.Module (like model.layer1.register_forward_hook(...)). Those can fire on the forward pass to inspect activations.

In [623]:
import torch
import torch.nn as nn

# 1. The Hook Function
def print_grad_hook(grad):
    """
    A simple hook function that prints the stats of a gradient tensor.
    """
    if grad is not None:
        print(f"--- HOOK FIRED ---")
        print(f"Gradient Mean: {grad.mean():.2e}")
        print(f"Gradient Std:  {grad.std():.2e}")
        print(f"Gradient Norm: {grad.norm():.2e}")
        print(f"------------------")
    else:
        print("--- HOOK FIRED (Grad is None) ---")

# --- Scenario Setup ---
model = nn.Linear(10, 10)
x = torch.randn(5, 10, requires_grad=True) # Input
target = torch.randn(5, 10)               # Target
criterion = nn.MSELoss()

# --- The Solution ---

# 3. The Training Step (Forward Pass)
y = model(x) # y is the tensor we want to inspect

# 2. Attach the Hook
# We attach the hook directly to the 'y' tensor.
# It will fire when the gradient *for 'y'* is computed.
y.register_hook(print_grad_hook)

# 3. Trigger the Hook (Loss & Backward Pass)
loss = criterion(y, target)
loss.backward()

--- HOOK FIRED ---
Gradient Mean: -9.08e-03
Gradient Std:  5.06e-02
Gradient Norm: 3.60e-01
------------------


In [624]:
# collate function 


In [638]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence

class ImbalancedTextDataset(Dataset):
    def __init__(self):
        # 10 samples total
        self.texts = [
            torch.tensor([101, 4, 5, 102]),                 # class 0
            torch.tensor([101, 6, 7, 8, 9, 10, 102]),       # class 0
            torch.tensor([101, 11, 12, 102]),               # class 0
            torch.tensor([101, 13, 14, 15, 16, 102]),       # class 0
            torch.tensor([101, 17, 18, 19, 20, 21, 22, 102]), # class 0
            torch.tensor([101, 23, 24, 102]),               # class 0
            torch.tensor([101, 25, 26, 27, 28, 102]),       # class 0
            torch.tensor([101, 29, 30, 102]),               # class 0
            
            torch.tensor([101, 100, 200, 300, 102]),       # class 1 (rare)
            torch.tensor([101, 400, 500, 102]),           # class 1 (rare)
        ]
        self.labels = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

    def __len__(self):
        return len(self.texts) 
    def __getitem__(self, index):
        return (self.texts[index], self.labels[index])
dataset = ImbalancedTextDataset()
def pad_collate_fn(batch):
    # batch , 2 
    texts = [item[0] for item in batch]
    labels =  [item[1] for item in batch]
    padded_texts = pad_sequence(texts, batch_first=True, padding_value =0 )
    labels = torch.stack(labels)
    return padded_texts, labels
labels = dataset.labels

class_counts = torch.bincount(labels)
class_weights = 1/class_counts
sample_weights = class_weights[labels]

In [641]:
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

In [649]:
# SGD with momentum 
class momentumSGD:
    def __init__(self,params, lr, momentum=0.9):
        
        self.lr = lr 
        self.momentum = momentum
        self.params = list(params) 
        self.velocity_buffers = []
        for param in self.params:
            self.velocity_buffers.append(torch.zeros_like(param.data))
        # veloctity buffers is no of layers x param shape 
    def step(self):
        # update logic 
        with torch.no_grad():
            for i  in range(len(self.params)):
                param = self.params[i]
                if param.grad is not None:
                    gt = param.grad 
                    v_t_minus_1 = self.velocity_buffers[i]
                    v_t = self.momentum*v_t_minus_1 + gt
                    self.velocity_buffers[i] = v_t
                    param.data -= self.lr*v_t
        
    def zero_grad(self, set_to_none = True):
        for param in self.params:
            if param.grad is not None:
                if set_to_none:
                    param.grad = None 
                else:
                    param.grad.zero_()
                





In [650]:
# SGD with momentum 
class Adam:
    def __init__(self,params, lr, beta1=0.9, beta2=0.99,eps=1e-6):
        
        self.lr = lr 
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.params = list(params) 
        self.momentum_buffer = []
        self.second_moment_buffer = []
        self.t = 0 
        for param in self.params:
            self.momentum_buffer.append(torch.zeros_like(param.data))
            self.second_moment_buffer.append(torch.zeros_like(param.data))
            
        # veloctity buffers is no of layers x param shape 
    def step(self):
        # update logic 
        self.t +=1
        with torch.no_grad():
            for i  in range(len(self.params)):
                param = self.params[i]
                if param.grad is not None:
                    gt = param.grad 
                    mt_1 = self.momentum_buffer[i]
                    vt_1 = self.second_moment_buffer[i]

                    mt = self.beta1*mt_1 + (1-self.beta1)*gt
                    vt = self.beta2*vt_1 + (1-self.beta2)*(gt**2)
                    self.momentum_buffer[i] = mt 
                    self.second_moment_buffer[i] = vt 
                    mt = mt/(1-self.beta1**self.t)
                    vt = vt/(1-self.beta2**self.t)
                    
                    param.data -= self.lr* mt /(vt**0.5+ self.eps)
        
    def zero_grad(self, set_to_none = True):
        for param in self.params:
            if param.grad is not None:
                if set_to_none:
                    param.grad = None 
                else:
                    param.grad.zero_()

In [660]:
# moe 
class MOE_Layer(nn.Module):
    def __init__(self,d_model, dff, n_experts ):
        super().__init__()
        self.d_model = d_model 
        self.dff = dff 
        self.n_experts = n_experts
        self.gating_network = nn.Linear(d_model, n_experts)
        self.experts = nn.ModuleList([nn.Sequential(
            nn.Linear(d_model, dff), nn.ReLU(),
            nn.Linear(dff,d_model)) for _ in range(n_experts)])
    
    def forward(self, X):
        # X is the output from attention 
        # shape is N,seq_len , d_model
        # this passes through the gating network 
        # output will be N, seq_len, E
        gate_output = self.gating_network(X)
        # select which experts to actiiivate 
        router_weights = nn.functional.softmax(gate_output, dim=-1) # N, seq_len
        # N, seq_len
        output = torch.zeros_like(X)
        for i in range(self.n_experts):
            
            expert_output = self.experts[i](X) # N seq_len, d_model
            w = router_weights[:, :, i] # N, seq_len
            w = w.unsqueeze(-1)
            output += w*expert_output
            # N, seq_len  N seq_len d_model 
        return output 
X = torch.randn(100, 10, 64)
moe_layer = MOE_Layer(64, 64*4, 4)
out = moe_layer(X)
out.shape

torch.Size([100, 10, 64])

In [None]:
# weighted sum MoE
# sparse top k routing 
# instead of softmax use topk and choose only specific indices 
# moe 
class Topk_MOE_Layer(nn.Module):
    def __init__(self,d_model, dff, n_experts ):
        super().__init__()
        self.d_model = d_model 
        self.dff = dff 
        self.n_experts = n_experts
        self.gating_network = nn.Linear(d_model, n_experts)
        self.experts = nn.ModuleList([nn.Sequential(
            nn.Linear(d_model, dff), nn.ReLU(),
            nn.Linear(dff,d_model)) for _ in range(n_experts)])
    
    def forward(self, X, k ):
        # X is the output from attention 
        # shape is N,seq_len , d_model
        # this passes through the gating network 
        # output will be N, seq_len, E
        gate_output = self.gating_network(X)
        top_k_logits, top_k_indices = torch.topk(gate_output, dim = -1, k=k)
        # N, seq_len k 
        print(top_k_logits.shape)
        router_weights = F.softmax(top_k_logits, dim = -1)
        # N, seq_len
        output = torch.zeros_like(X)
        all_expert_outputs = []
        for expert in self.experts:
            all_expert_outputs.append(expert(X))
        all_expert_outputs = torch.stack(all_expert_outputs, dim =2)
        



        # for i in top_k_indices:
        #     expert_output = self.experts[i](X) # N seq_len, d_model
        #     w = router_weights[:, :, i] # N, seq_len
        #     w = w.unsqueeze(-1)
        #     output += w*expert_output
        #     # N, seq_len  N seq_len d_model 
        # return output 
X = torch.randn(100, 10, 64)
moe_layer = Topk_MOE_Layer(64, 64*4, 4)
out = moe_layer(X, 4)
out.shape

torch.Size([100, 10, 4])


TypeError: only integer tensors of a single element can be converted to an index