In [76]:
#Implement L2-regularized logistic regression trained with minibatch gradient descent.
# lets say we have some data 
import torch 
import torch.nn as nn 
torch.manual_seed(42)
n_samples = 100 
features = 64 
X = torch.randn(n_samples, features)
y = torch.randint(0,2, size=(n_samples, ),dtype = torch.int64)
in_features = features
out_features = 2 
W = torch.randn(in_features, out_features , requires_grad=True)
b = torch.randn(out_features, requires_grad=True)
l2_lambda = 1e-2  # true L2 strength (on weights only)
ce_loss = nn.CrossEntropyLoss()
# now we want to train a logistic regression with L2 norm and with minibatch gradient descent 
def fit_logistic_regression(X,y):
    logits = torch.matmul(X,W)+ b 
    # we can compute the loss here as well 
    loss_total = ce_loss(logits, y) +  0.5*l2_lambda*(W**2).sum()
    return logits, loss_total
n_epochs = 10 
lr = 0.01
mini_batch = 20
for _ in range(n_epochs):
    for i in range(0,len(X),mini_batch ):
        X_batch = X[i:i+mini_batch]
        y_batch = y[i:i+mini_batch]
        _, loss = fit_logistic_regression(X_batch, y_batch)
        loss.backward()
        
        with torch.no_grad():
            W-= lr*W.grad
            b-= lr*b.grad
        W.grad = None
        b.grad = None
print(loss)


tensor(3.5740, grad_fn=<AddBackward0>)


In [92]:
####Ques. 2 
'''Input: X shape (N, D), labels y shape (N,) in {0,1}.
Hidden layer: 32 units, ReLU activation.
Output layer: 1 unit, sigmoid activation.
Loss: binary cross-entropy.
Optimizer: plain SGD, manually updating parameters.
Regularization: L2 penalty on all weights (not biases).
Mini-batch training, obviously.
Print loss every few epochs. '''
N_samples = 1000
D = 64
X = torch.randn(N_samples,D)
y = torch.randint(0, 2, size= (N_samples , ), dtype=torch.float32)
W1 = torch.randn(D, 32, requires_grad=True)
b1 = torch.randn(32 ,requires_grad=True)
relu = nn.ReLU()
W2 =  torch.randn(32, 1,  requires_grad=True)
b2 =  torch.randn( 1,  requires_grad=True)
ce_loss = nn.BCEWithLogitsLoss()
lambda_2 = 0.01
lambda_1 = 0.02
def forward(X, y):
    y1 = X@W1+b1 
    y1_relu = relu(y1)
    y2 = y1_relu@W2 +b2 
    y2 = y2.squeeze(1)
    #y2_sigmoid = 1/(1+torch.exp(-y2))
    return y2

n_epochs = 10
mini_batch = 8 
lr = 0.0001
for _ in range(n_epochs):
    for i in range(0, N_samples, mini_batch):
        y_batch = y[i:i+mini_batch]
        X_batch = X[i:i+mini_batch]
        y_logits  = forward(X_batch, y_batch )
        loss = ce_loss(y_logits, y_batch) + lambda_1*0.5* (W1**2).sum() + lambda_2*0.5* (W2**2).sum()
        loss.backward()
        with torch.no_grad():
            W1 -= lr*W1.grad
            b1 -= lr*b1.grad
            W2 -= lr*W2.grad
            b2 -= lr*b2.grad
    print(loss)

tensor(30.5306, grad_fn=<AddBackward0>)
tensor(31.7651, grad_fn=<AddBackward0>)
tensor(19.3271, grad_fn=<AddBackward0>)
tensor(16.7114, grad_fn=<AddBackward0>)
tensor(13.6852, grad_fn=<AddBackward0>)
tensor(6.4430, grad_fn=<AddBackward0>)
tensor(3.8677, grad_fn=<AddBackward0>)
tensor(2.1974, grad_fn=<AddBackward0>)
tensor(5.1975, grad_fn=<AddBackward0>)
tensor(10.7520, grad_fn=<AddBackward0>)


In [132]:
import torch
import torch.nn as nn
import torch.optim as optim
N = 1000
torch.manual_seed(42)
X = torch.randn(N, 3) # N, 3 
noise = 0.1*torch.randn(N,1) 
#y = 3x1 - 2x2 + 0.5x3^2 + noise
Y = 3*X[:,0] - 2*X[:,1] + 0.5*X[:,2]**2 +noise  # N, 1, shape 
class Model(nn.Module):
    def __init__(self ):
        super().__init__()
        self.model = nn.Sequential(nn.Linear(3, 32), nn.ReLU(), nn.Linear(32, 16), nn.ReLU(), nn.Linear(16,1))
        self.loss = torch.nn.MSELoss()
        
    def forward(self, x, y ):
        # shape  of x is (N, 3 )
        logits = self.model(x)
        # shape (N,1 )
        loss = self.loss(logits, y ) # shape is same as y and mean of that scalar output 
        return logits, loss 

n_train = int(0.7*N)
n_val= int(0.85*N)
x_train, y_train= X[:n_train] ,  Y[:n_train]
x_test, y_test = X[n_train:n_val] ,  Y[n_train:n_val]
x_val , y_val = X[n_val:] ,  Y[n_val:]
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)
n_epochs = 5
mini_batch=  100
model = Model()
best_val = float("inf")
optimizer = torch.optim.Adam(model.parameters(),lr= 0.001, weight_decay=0.01 )
train_loss_total = 0 
for epoch in range(n_epochs):
    model.train()
    train_loss = 0 
    for i in range(0, n_train,mini_batch ):
        x_batch, y_batch = x_train[i:i+mini_batch],  y_train[i:i+mini_batch]
        batch_logits, batch_loss = model(x_batch, y_batch)
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        if (epoch + 1) % 2 == 0:
            print(f"Epoch {epoch+1}, Loss: {batch_loss.item():.4f}")
        train_loss += batch_loss.item()*x_batch.size(0)
    train_loss_total += train_loss
    model.eval()
    with torch.no_grad():
        val_preds, val_loss  = model(x_val, y_val)
    improved = val_loss.item() < best_val - 1e-7
    if improved:
        best_val = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), "best_model.pt")
    else:
        break 
print(f"Best epoch: {best_epoch}, Best val MSE: {best_val:.4f}")


torch.Size([700, 3])
torch.Size([150, 3])
torch.Size([150, 3])
Epoch 2, Loss: 13.1744
Epoch 2, Loss: 13.1910
Epoch 2, Loss: 13.1722
Epoch 2, Loss: 13.1712
Epoch 2, Loss: 13.1738
Epoch 2, Loss: 13.1747
Epoch 2, Loss: 13.1500
Epoch 4, Loss: 13.1185
Epoch 4, Loss: 13.1293
Epoch 4, Loss: 13.1164
Epoch 4, Loss: 13.1157
Epoch 4, Loss: 13.1195
Epoch 4, Loss: 13.1201
Epoch 4, Loss: 13.1082
Best epoch: 4, Best val MSE: 13.1055


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [136]:
class MyDropout(nn.Module):
    def __init__(self,p):
        super().__init__()
        self.p = p 
        self.scale = 1/(1-self.p)
        self.mask = None
        
    def forward(self,x):
        if  not self.training: 
            return x 
    
        mask = (torch.rand_like(x)>self.p)
        return mask*x*self.scale
    

In [160]:
class MyRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size =input_size
        self.hidden_size = hidden_size
        # (sequence_length, batch_size, input_size)
        self.hidden_to_hidden = nn.Linear(hidden_size, hidden_size)
        self.input_to_hidden = nn.Linear(input_size, hidden_size)
        self.initial_hidden = torch.randn(batch_size, hidden_size)
        self.tanh = torch.nn.Tanh()
    def forward(self, x,h_prev ):
        
        hidden_out = self.hidden_to_hidden(h_prev)
        x_out = self.input_to_hidden(x)
        
        return  self.tanh(hidden_out + x_out )
seq_len = 10 
batch_size = 2
input_size = 5 
hidden_size = 10 
x = torch.randn(seq_len, batch_size, input_size)
print(x.shape)
rnn = MyRNNCell(input_size,hidden_size )
h_prev = torch.randn(batch_size, hidden_size)
outputs = []
for i in range(seq_len):
    x_t = x[i, :, :]
    print(h_prev.shape)
    h_prev = rnn(x_t, h_prev )
    outputs.append(h_prev)
outputs = torch.stack(outputs, dim=0)


torch.Size([10, 2, 5])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10])


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, d_model):
        super().__init__()
        # we need ot deine the QKV matrices we project to 
        self.num_heads = num_heads
        self.dk = d_model/ num_heads
        self.d_model = d_model 
        self.WQ = nn.Linear(self.d_model, self.d_model) # projections are applied before any head splits 
        self.WK = nn.Linear(self.d_model, self.d_model) # projections are applied before any head splits 
        self.WV = nn.Linear(self.d_model, self.d_model) # projections are applied before any head splits 
        self.WO = nn.Linear(self.d_model, self.d_model) # projections are applied before any head splits 
    def forward(self, q, k , v):
        query = self.WQ(q)
        key = self.WK(k)
        value = self.WV(v) # batch_size, seq_len, d_model 
        # now we can split the head and call the attention 
        batch_size = q.shape[0]
        # split the headds 
        query = torch.view(batch_size, -1, self.num_heads, self.dk).transpose(1,2)
        value = torch.view(batch_size, -1, self.num_heads, self.dk).transpose(1,2)
        key = torch.view(batch_size, -1, self.num_heads, self.dk).transpose(1,2)



In [2]:
import torch
def masked_batch_cosine(a: torch.Tensor, b: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """
    a: (B, N, D), b: (B, M, D), mask: (B, N, M) boolean, where True means "include this pair".
    Returns: (B,) mean cosine similarity over masked pairs per batch.
    If a batch has 0 valid pairs, return 0 for that batch (not NaN), with proper gradient behavior.
    """
    B= 10 
    N = 10; M = 8 
    D = 76
    a = torch.tensor(B, N, D)
    b = torch.tensor(B,M,D)
    mask = torch.randint(0, 2, size = (B, N, M ))



In [19]:
B= 10 
N = 10; M = 8 
D = 76
a = torch.randn(B, N, D)
b = torch.randn(B,M,D)
mask = torch.randint(0, 2, size = (B, N, M ))
matrix_multiplication = torch.matmul(a, b.transpose(1, 2))
a_mod = torch.norm(a)
b_mod = torch.norm(b)
matrix_multiplication = matrix_multiplication /(a_mod*b_mod)
y = matrix_multiplication[mask]

In [7]:
mask.shape

torch.Size([10, 10, 8])

In [20]:
def predict_knn(X_train: torch.Tensor, y_train: torch.Tensor, x_test: torch.Tensor, k: int) -> int:
    """
    Predicts the class label for a single test point x_test.

    Args:
        X_train: The training data features (shape: [num_train_samples, num_features])
        y_train: The training data labels (shape: [num_train_samples])
        x_test: The single test point to classify (shape: [num_features])
        k: The number of neighbors to consider
    
    Returns:
        The predicted class label (an integer)
    """

In [78]:
X_train = torch.tensor([
    [1.0, 1.0],  # Sample 0
    [1.5, 2.0],  # Sample 1
    [5.0, 5.0],  # Sample 2
    [4.5, 5.5]   # Sample 3
]) # 4 x 2 
y_train = torch.tensor([0, 0, 1, 1])
#x_test = torch.tensor([1.2, 1.3]) # 1 x 2  
N = 2
x_test = torch.randn(N, 2)#(num_test_samples, num_features)
print(x_test)
k = 3
# k nearest neghbors, the class of new poitn is majority of these 3 neighbors 
# calculate distance to all the X_Train poitns 
#distances = torch.sqrt(torch.sum((X_train - x_test)**2, dim=1))

# distance = X_train - x_test
# top_k_values, top_k_indices = torch.topk(distances, k = k,largest = False )
# label, _ = torch.mode(y_train[top_k_indices])
# print(label)
x_test  =  torch.unsqueeze(x_test, 1)

distances  = torch.sqrt(torch.sum((X_train - x_test)**2, dim = 2)) # num test, num_train, featuers 
top_k_values, top_k_indices = torch.topk(distances, k = k,largest = False )
y_train[top_k_indices]

tensor([[ 0.7821, -1.3115],
        [ 0.3817,  0.6157]])


tensor([[0, 0, 1],
        [0, 0, 1]])

In [82]:
X_train = torch.tensor([
    [1.0, 1.0],  # 0
    [1.5, 2.0],  # 0
    [5.0, 5.0],  # 1
    [4.5, 5.5],  # 1
    [1.1, 1.1]   # 0
])
y_train = torch.tensor([0, 0, 1, 1, 0])

# Batch of 2 test points
X_test_batch = torch.tensor([
    [1.2, 1.3],  # Should be 0
    [4.9, 5.1]   # Should be 1
])
k = 3
# to broadcast the distances to be xtest, xtrain, features shape 
X_test_batch = X_test_batch.unsqueeze(1)
distances = torch.sum((X_train -X_test_batch)**2, dim = 2 ) # sum along the features dimentions 
top_k_distances, top_k_indices = torch.topk(distances, k = k ,largest =False)
k_nearest_labels = y_train[top_k_indices]
predicted_labels = torch.mode(k_nearest_labels, dim=1).values # dimentsion along the x train 
predicted_labels

tensor([0, 1])

In [94]:
# weightted KNN
num_classes = y_train.max().item() + 1
# only instad of taking mode now, you mutlipy by weights and take the final one 
weights = 1.0 / (top_k_distances + 1e-6)
num_test = X_test_batch.shape[0]
scores = torch.zeros(num_test, num_classes)
scores.scatter_add_(dim = 1, index=k_nearest_labels, src=weights)
predicted_labels = torch.argmax(scores, dim=1)

In [95]:
predicted_labels

tensor([0, 1])

In [141]:
# K means 
X= torch.randn(100, 3)
k= 4 
max_iters = 10 

centroids = X[torch.randperm(X.shape[0])[:k]]
X = X.unsqueeze(1) # k, 3 goes to k, 1, 3 
# find closest points to this centroids 
centroids.shape
for _ in range(max_iters):
    old_centroids = centroids.clone()
    distances  = torch.sqrt(torch.sum((X- centroids)**2 , dim = 2))# here we broadcast to k , N, 3 # we sum across last dim to get N, k 
    # for each N poitns we have distance to each  K centroids
    # assign each poitn to a centroids 
    labels = torch.argmin(distances, dim = 1 )
    # shape i sN 
    # these are the new labels 
    # new centroids are the cneters of these poitns 
    for i in range(k):
        centroids[i] = torch.mean(X[labels==i]) # N, 3 
    if torch.allclose(old_centroids, centroids):
        break 
    

In [137]:
centroids

tensor([[ 0.2544,  0.2544,  0.2544],
        [ 0.8059,  0.8059,  0.8059],
        [-0.5770, -0.5770, -0.5770],
        [ 2.0863,  0.2419, -0.8668]])

In [150]:
!pip install pandas 

Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m9.6 MB/s[0m  [33m0:00:01[0meta [36m0:00:01[0m
[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]2m2/3[0m [pandas]
[1A[2KSuccessfully installed pandas-2.3.3 pytz-2025.2 tzdata-2025.2


In [154]:
import torch 
import pandas as pd 
epochs_no_improve = 0 
from torch.utils.data import DataLoader
class MyCSVDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file_path):
        super().__init__()
        df = pd.read_csv(csv_file_path)
        self.features = torch.tensor(df.iloc[:, 0:4].values, dtype = torch.float32)
        self.labels =  torch.tensor(df.iloc[:, 4].values,dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        return (self.features[index], self.labels[index])
my_dataset = MyCSVDataset("some_path.csv")
num_epochs = 10 
train_loader = DataLoader(dataset=my_dataset, batch_size=32, shuffle=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
ce_loss = nn.CrossEntropyLoss()
val_loader = DataLoader(dataset=my_dataset, batch_size=8, shuffle=True)
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0 
    # train_loader handles all the batching and shuffling automatically!
    for features_batch, labels_batch in train_loader:
        preds = model(features_batch) 
        loss = ce_loss(preds, labels_batch)
        total_train_loss += loss.item()
        optimizer.zero_grad(set_to_none=False) # true is faster 
        loss.backward()
        optimizer.step()
    model.eval() # FIX: Put model in evaluation mode
    total_val_loss = 0.0
    with torch.no_grad():
        for val_features, val_labels in val_loader:
        
            val_preds =  model(val_features)
            val_loss = ce_loss(val_preds, val_labels)
            total_val_loss+= (val_loss.item())
    avg_val_loss  = total_val_loss/len(val_loader)
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0 
    else:
        epochs_no_improve+=1
    if epochs_no_improve >= 4:
        print(f"Early stopping at epoch {epoch+1} as val loss did not improve.")
        break # Stop training






FileNotFoundError: [Errno 2] No such file or directory: 'some_path.csv'

In [None]:
#kernel_size and stride.
import torch.nn as nn 
batch_size = 8 
channels = 3 
in_height = 20 
in_width = 40
X = torch.randn(batch_size, channels, in_height, in_width)
class MyMaxPool2d(nn.Module):
    def __init__(self, kernel_size, stride):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride 
    def forward(self, X):
        (batch_size, channels, in_height, in_width) = X.shape
        out_height = (in_height - self.kernel_size)// self.stide +1 
        out_width  = (in_width - self.kernel_size)// self.stide +1 
        output = torch.zeros(batch_size, channels, out_height, out_width, 
                             dtype=x.dtype, device=x.device)
        for b in range(batch_size):
            for c in range(channels):
                for h_out in range(out_height):
                    for w_out in range(out_width):
                        h_start = h_out * self.stride
                        w_start = w_out * self.stride 
                        h_end = h_start+  self.kernel_size
                        w_end  = w_start+  self.kernel_size
                        window = x[b, c, h_start:h_end, w_start:w_end]
                        max_val = torch.max(window)
                        output[b, c, h_out, w_out] = max_val
        return output 