A3. seeing models as computational graph 

In [6]:
import torch 

AUTOGRAD

In [7]:
import math 
def dz_dx(x):
    return 2 * x * math.cos(x**2)


In [8]:
dz_dx(4)

-7.661275842587077

In [9]:
x = torch.tensor(4.0, requires_grad=True)
y = x**2 

In [10]:
x

tensor(4., requires_grad=True)

In [11]:
y

tensor(16., grad_fn=<PowBackward0>)

In [12]:
z = torch.sin(y)

In [13]:
z

tensor(-0.2879, grad_fn=<SinBackward0>)

In [14]:
# # Recompute y and z before backward to restore the computation graph
y = x**2
z = torch.sin(y)
z.backward()

In [15]:
x.grad

tensor(-7.6613)

In [16]:
y.grad

  y.grad


In [17]:
#inputs 

x = torch.tensor(6.7) # input feature 
y = torch.tensor(0.0) # true label 


w = torch.tensor(1.0) # weight
b = torch.tensor(0.0) # bias


In [18]:
# binary cross entropy loss for scalar
def binary_cross_entropy(prediction, target):
    epsilon = 1e-7 # small constant to avoid log(0)
    prediction = torch.clamp(prediction, epsilon, 1 - epsilon) # clamp is used to limit the values within a range
    return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))


In [19]:
# forward pass 
z = w * x + b # linear transformation 
y_pred = torch.sigmoid(z) # activation function 

# compute binary cross entrpy loss
loss = binary_cross_entropy(y_pred, y)

In [20]:
loss

tensor(6.7012)

In [21]:
# Derivatives:
# 1. dL/d(y_pred): Loss with respect to the prediction (y_pred)
dloss_dy_pred = (y_pred - y)/(y_pred*(1-y_pred))

# 2. dy_pred/dz: Prediction (y_pred) with respect to z (sigmoid derivative)
dy_pred_dz = y_pred * (1 - y_pred)

# 3. dz/dw and dz/db: z with respect to w and b
dz_dw = x  # dz/dw = x
dz_db = 1  # dz/db = 1 (bias contributes directly to z)

dL_dw = dloss_dy_pred * dy_pred_dz * dz_dw
dL_db = dloss_dy_pred * dy_pred_dz * dz_db

In [22]:
print(f"Manual Gradient of loss w.r.t weight (dw): {dL_dw}")
print(f"Manual Gradient of loss w.r.t bias (db): {dL_db}")

Manual Gradient of loss w.r.t weight (dw): 6.691762447357178
Manual Gradient of loss w.r.t bias (db): 0.998770534992218


In [23]:
x = torch.tensor(6.7, requires_grad=True)
y = torch.tensor(0.0, requires_grad= True)

w = torch.tensor(1.0, requires_grad=True) # weight
b = torch.tensor(0.0, requires_grad=True) # bias

In [24]:
x

tensor(6.7000, requires_grad=True)

In [25]:
w, b

(tensor(1., requires_grad=True), tensor(0., requires_grad=True))

In [26]:
# forward pass 
z = w * x + b # linear transformation 
y_pred = torch.sigmoid(z) # activation function 

# compute binary cross entrpy loss
loss = binary_cross_entropy(y_pred, y)

In [27]:
print(loss)

tensor(6.7012, grad_fn=<NegBackward0>)


In [28]:
# Recompute forward pass to restore computation graph
z = w * x + b
y_pred = torch.sigmoid(z)
loss = binary_cross_entropy(y_pred, y)
loss.backward()

In [29]:
print(w.grad)
print(b.grad)

tensor(6.6918)
tensor(0.9988)


In [30]:
x = torch.tensor([1.0 , 2.0 , 3.0], requires_grad=True)

In [31]:
y = (x**2).mean()
y

tensor(4.6667, grad_fn=<MeanBackward0>)

In [32]:
y.backward()

In [33]:
x.grad

tensor([0.6667, 1.3333, 2.0000])

In [34]:
# clearing gradient 
x = torch.tensor(2.0 , requires_grad=True)
x

tensor(2., requires_grad=True)

In [35]:
y = x**2
y

tensor(4., grad_fn=<PowBackward0>)

In [36]:
y.backward()

In [37]:
x.grad

tensor(4.)

In [38]:
x.grad.zero_()

tensor(0.)

## iterator funciton in python . 
## next function in python . 

In [39]:
my_list = [1,2,3,4]
my_iterator = iter(my_list)

print(next(my_iterator))
print(next(my_iterator))

1
2


In [40]:
import random 

def roll_dice():
    return random.randint(1, 6)

#create an iterator that calls roll_dice untill it returns 6
dice_rolls = iter(roll_dice, 6)

for roll in dice_rolls:
    print(roll)
# this will print random untill 6 is rolled

5
4
1
4
2
3


## Understanding the difference between Embeddings layer and Linear layers 

embeddings layers are compuational efficent 

In [None]:
# suppose we have following 3 training examples 
# which may represnt token IDs in a LLM context 
idx = torch.tensor([2,3,1])

# The number of rows in the embedding matrix can be determined
# by obtaining the largest token ID + 1.
# If the highest token ID is 3, then we want 4 rows, for the possible
# token IDs 0, 1, 2, 3
num_idx = max(idx) + 1 #max(idx) -> max value

# the desired embeddings dimension is an hyper parameter
out_dim = 5 

implementing an simple embedding layer 


In [42]:
# we random seed for reproducibility since
# weights in embedding layers are initilaized with 
# small random values 
torch.manual_seed(123)

embedding = torch.nn.Embedding(num_idx, out_dim)

In [43]:
embedding.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  1.5810],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015],
        [ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953]], requires_grad=True)

In [44]:
embedding(torch.tensor([2]))

tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315]],
       grad_fn=<EmbeddingBackward0>)

In [45]:
idx = torch.tensor([2,3,1])
embedding(idx)

tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],
       grad_fn=<EmbeddingBackward0>)

converting tokenIDs into one hot encoding

In [None]:
onehot = torch.nn.functional.one_hot(idx) #unlike classes like (nn.linear, nn.Conv2D), 
onehot                                   #funciton in torch.functional do not maintain internal state or learnable parameters like weights and biases


tensor([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0]])

next we initaialize Linear layer to carry out matrix multiplication X.W(transpose)

In [52]:
torch.manual_seed(123)
# nn.layer(in_features, out_features, bias=True)
linear = torch.nn.Linear(num_idx, out_dim, bias=False) #this layer computes y=xW(transpose)+b
linear.weight #weight matrix of shape (out_features, in_features)

Parameter containing:
tensor([[-0.2039,  0.0166, -0.2483,  0.1886],
        [-0.4260,  0.3665, -0.3634, -0.3975],
        [-0.3159,  0.2264, -0.1847,  0.1871],
        [-0.4244, -0.3034, -0.1836, -0.0983],
        [-0.3814,  0.3274, -0.1179,  0.1605]], requires_grad=True)

Note that the LINEAR layer in Pytorch is also intialized with small random weights; to directly compare it to the EMBEDDING layer above , we have to use same small random weights, which is why we reassign them here:

In [55]:
linear.weight = torch.nn.Parameter(embedding.weight.T)

now we can use linear layer on one hot encoded representation of the inputs:

In [56]:
linear(onehot.float())

tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]], grad_fn=<MmBackward0>)

this is exactly same as we got when we used embeding layer

In [58]:
embedding(idx)

tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],
       grad_fn=<EmbeddingBackward0>)

## Nested loops : Pairwise comparison 

In [None]:
for hour in range(0,24):
    for minute in range(0,60):
        print(f"{hour:02d}:{minute:02d}")

00:00
00:01
00:02
00:03
00:04
00:05
00:06
00:07
00:08
00:09
00:10
00:11
00:12
00:13
00:14
00:15
00:16
00:17
00:18
00:19
00:20
00:21
00:22
00:23
00:24
00:25
00:26
00:27
00:28
00:29
00:30
00:31
00:32
00:33
00:34
00:35
00:36
00:37
00:38
00:39
00:40
00:41
00:42
00:43
00:44
00:45
00:46
00:47
00:48
00:49
00:50
00:51
00:52
00:53
00:54
00:55
00:56
00:57
00:58
00:59
01:00
01:01
01:02
01:03
01:04
01:05
01:06
01:07
01:08
01:09
01:10
01:11
01:12
01:13
01:14
01:15
01:16
01:17
01:18
01:19
01:20
01:21
01:22
01:23
01:24
01:25
01:26
01:27
01:28
01:29
01:30
01:31
01:32
01:33
01:34
01:35
01:36
01:37
01:38
01:39
01:40
01:41
01:42
01:43
01:44
01:45
01:46
01:47
01:48
01:49
01:50
01:51
01:52
01:53
01:54
01:55
01:56
01:57
01:58
01:59
02:00
02:01
02:02
02:03
02:04
02:05
02:06
02:07
02:08
02:09
02:10
02:11
02:12
02:13
02:14
02:15
02:16
02:17
02:18
02:19
02:20
02:21
02:22
02:23
02:24
02:25
02:26
02:27
02:28
02:29
02:30
02:31
02:32
02:33
02:34
02:35
02:36
02:37
02:38
02:39
02:40
02:41
02:42
02:43
02:44
02:45
02:4

In [None]:
players = ["yash", "lakshy", "Ishaan", "Gaurav"]

for player1 in players:
    for player2 in players:
        print(f"{player1} vs {player2}")

yash vs yash
yash vs lakshy
yash vs Ishaan
yash vs Gaurav
lakshy vs yash
lakshy vs lakshy
lakshy vs Ishaan
lakshy vs Gaurav
Ishaan vs yash
Ishaan vs lakshy
Ishaan vs Ishaan
Ishaan vs Gaurav
Gaurav vs yash
Gaurav vs lakshy
Gaurav vs Ishaan
Gaurav vs Gaurav


## torch.nn.module 

In [1]:
#implementation without nn.module
import torch

input_size = 2
hidden_size = 4
output_size = 1

w1 = torch.randn(input_size, hidden_size, requires_grad = True)
b1 = torch.randn(hidden_size, requires_grad = True)
w2 = torch.randn(hidden_size, output_size, requires_grad= True)
b2 = torch.randn(output_size, requires_grad = True)

def forward(x):
    
    hidden = torch.matmul(x, w1) + b1
    hidden_activation = torch.relu(hidden)

    output = torch.matmul(hidden_activation, w2) + b2
    return output

x = torch.tensor([[1.0, 2.0]]) #batch of one input wiht 2 features 
y = forward(x)
print("output without nn.module:", y)


output without nn.module: tensor([[-8.3074]], grad_fn=<AddBackward0>)


In [18]:
# Implementation with torchh.nn.module

import torch
import torch.nn as nn

input_size = 2
hidden_size = 4
output_size = 1

class SimpleNN(nn.Module):              # inheriting the nn.Module
    def __init__(self, input_size, hidden_size, output_size):  # constructor
        super(SimpleNN, self).__init__()                       # envoking the parent class(nn.Module) constructor with the help of super()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        hidden = self.hidden(x)
        hidden_activation = torch.relu(hidden)
        output = self.output(hidden_activation)
        return output

model = SimpleNN(input_size, hidden_size, output_size)

x = torch.tensor([[1.0,2.0]])
y = model(x)
print("output with the torch.nn: ", y)

output with the torch.nn:  tensor([[0.1165]], grad_fn=<AddmmBackward0>)


1.> super(SimpleNN, self).__init__() : this is use for initiallization of the nn.module(calls the __init__ method of the parent class(nn.Module))

2.> super() is an inbuilt function that provides a way to access methods and properties of a parent or sibling class from within a child (subclass)

3.> self (constructor) : It represnets the instance(object) of the class being used. it acts as a pointer or refreence to the current object that is created from the class 

In [10]:
## Common layers in torch.nn ##

import torch 
import torch.nn as nn

class Model(nn.Module):

    def __init__(self, num_features):
        super(). __init__()
        self.linear = nn.Linear(num_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, features):
        out = self.linear(features)
        out = self.sigmoid(out)
        return out


In [11]:
#create dataset with 10 samples and 5 features
features = torch.randn(10, 5)
#create model instance
model = Model(features.shape[1])
# forward pass
# model.forward(features)
model(features)

tensor([[0.5838],
        [0.4856],
        [0.5555],
        [0.5633],
        [0.4824],
        [0.4630],
        [0.5721],
        [0.5589],
        [0.5632],
        [0.4683]], grad_fn=<SigmoidBackward0>)

In [14]:
model.linear.weight
model.linear.bias

Parameter containing:
tensor([0.1246], requires_grad=True)

In [17]:
from torchinfo import summary 

summary(model, input_size=(10,5))

Layer (type:depth-idx)                   Output Shape              Param #
Model                                    [10, 1]                   --
├─Linear: 1-1                            [10, 1]                   6
├─Sigmoid: 1-2                           [10, 1]                   --
Total params: 6
Trainable params: 6
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [20]:
import torch
import torch.nn as nn 

class Model(nn.Module):

    def __init__(self, num_features):
        super().__init__()
        self.linear1 = nn.Linear(num_features, 3)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(3, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, features):
        out = self.linear1(features)
        out = self.relu(out)
        out = self.linear2(out)
        out = self.sigmoid(out)
        return out 
    
# create dataset with 10 samples an 5 features
features = torch.randn(10, 5)
#create model instance 
model = Model(features.shape[1])
# forward pass
model(features)

tensor([[0.4911],
        [0.3792],
        [0.4684],
        [0.5009],
        [0.4806],
        [0.5012],
        [0.4070],
        [0.4893],
        [0.4675],
        [0.4991]], grad_fn=<SigmoidBackward0>)

In [21]:
model.linear1.weight 

Parameter containing:
tensor([[-0.2807, -0.0019, -0.1647,  0.0266,  0.1987],
        [ 0.0672, -0.1692, -0.2085,  0.0874, -0.1970],
        [ 0.3311, -0.1621,  0.1110,  0.0038, -0.2571]], requires_grad=True)

In [22]:
model.linear2.weight 

Parameter containing:
tensor([[-0.0522, -0.5020, -0.5402]], requires_grad=True)

In [23]:
model.linear1.bias

Parameter containing:
tensor([0.3340, 0.0607, 0.1546], requires_grad=True)

In [24]:
model.linear2.bias

Parameter containing:
tensor([0.0368], requires_grad=True)

In [25]:
from torchinfo import summary 
summary(model, input_size=(10,5))

Layer (type:depth-idx)                   Output Shape              Param #
Model                                    [10, 1]                   --
├─Linear: 1-1                            [10, 3]                   18
├─ReLU: 1-2                              [10, 3]                   --
├─Linear: 1-3                            [10, 1]                   4
├─Sigmoid: 1-4                           [10, 1]                   --
Total params: 22
Trainable params: 22
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [None]:
# another approach by using sequentiall container 

import torch
import torch.nn as nn

class Model(nn.Module):

    def __init__(self, num_features):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(num_features, 3),
            nn.ReLU(),
            nn.Linear(3,1),
            nn.Sigmoid()
        )
    def forward(self, features):
        out = self.network(features)
        return out 
# create dataset with 10 sample and 5 features
features = torch.randn(10, 5)
# create model instance 
model = Model(features.shape[1])
# forward pass
model(features)



tensor([[0.5977],
        [0.6049],
        [0.5956],
        [0.4674],
        [0.6049],
        [0.6049],
        [0.5850],
        [0.6049],
        [0.5409],
        [0.5783]], grad_fn=<SigmoidBackward0>)

In [30]:
model.network[0].weight

Parameter containing:
tensor([[ 0.0607, -0.1247,  0.1379,  0.2983,  0.3690],
        [-0.1642, -0.1074, -0.0650, -0.1598,  0.1487],
        [ 0.1833, -0.0237,  0.0553, -0.3360,  0.2641]], requires_grad=True)

In [33]:
model.network[0].bias

Parameter containing:
tensor([ 0.1783, -0.2730, -0.4188], requires_grad=True)

In [32]:
model.network[2].weight

Parameter containing:
tensor([[-0.1337, -0.3417, -0.5530]], requires_grad=True)

In [34]:
model.network[2].bias

Parameter containing:
tensor([0.4258], requires_grad=True)

## nn.Parameter

* torch.nn.Parameter is a subclass of torch.Tensor, designed for holding parameters in the model that should be considered during the training.

* when a tensor is wrapped with torch.nn.Parameter, it automatically becomes the part of the model's parameters, and thus it will be updated when backpropagation is applied during training.

In [11]:
import torch
import torch.nn as nn

class MyLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super(MyLinear, self).__init__()
        # define weights and bias parameters
        self.weights = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features))
    
    def forward(self, x):
        #implementing a forward pass
        return torch.matmul(x, self.weights.t()) + self.bias

In [13]:
x = torch.randn(3,4) # batch_size = 3, in_features = 4

linear = MyLinear(in_features=4, out_features=2)

output = linear(x)

print("Input: \n", x)
print("output: \n", output)

Input: 
 tensor([[-0.5461, -0.9793, -0.6256, -0.6881],
        [ 0.2745, -1.0656,  0.6502,  1.7235],
        [ 0.9593,  0.1801, -0.0964, -1.7881]])
output: 
 tensor([[-2.9095, -1.0992],
        [ 2.9338,  1.4810],
        [-4.1624, -1.1269]], grad_fn=<AddBackward0>)


SLICING OF TENSOR IN PYTHON 

In [None]:
# Slicing 1D tensor using -> [start, stop, step]

import torch 
x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
sliced_x = x[2:7:2]
print(sliced_x)

In [None]:
# Slicing Multi dimensional tensor 
# Applying slicing to each dimension seprated by commmas

import torch
y = torch.tensor([[1,2,3],
                  [4,5,6],
                  [7,8,9]])

# select rows from index 0 upto (but not including) 2, and all columns 
sliced_y = y[0:2,:]
print(sliced_y)

# select all rows, and columns form index 1 upto (but not including) 3
sliced_y_cols = y[0:,1:3]
print(sliced_y_cols)

In [5]:
z = torch.triu(torch.ones(6,6), diagonal=1)
print(z)
z[3:4, :4] # array[start_row:end_row, start_column:end_column]
# 3:4 -> specifies of rows to select
# start index (3), end index (4)(go up to but not include row index 4)
# column slicing [:4] -> specifes the range of columns to select

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])


tensor([[0., 0., 0., 0.]])

In [1]:
##### torch.arange(....) ######
import torch

tensor1 = torch.arange(5)
print(tensor1)

# torch.arange(start=0, end=None, step=1, dtype=None, device=None)

tensor2 = torch.arange(1, 10, 2)
print(tensor2)

tensor3 = torch.arange(5, dtype=torch.float32)
print(tensor3)

tensor([0, 1, 2, 3, 4])
tensor([1, 3, 5, 7, 9])
tensor([0., 1., 2., 3., 4.])


In [10]:
##### torch.unsqueeze() ######

x = torch.tensor([[1,2,3,4], [4,5,6,7], [7,8,9,10]])
print(x.shape)
print(x)

y = x.unsqueeze(0) # add a new dimension at index 0
print(y.shape)
print(y)

z = x.unsqueeze(1) 
print(z.shape)
print(z)

torch.Size([3, 4])
tensor([[ 1,  2,  3,  4],
        [ 4,  5,  6,  7],
        [ 7,  8,  9, 10]])
torch.Size([1, 3, 4])
tensor([[[ 1,  2,  3,  4],
         [ 4,  5,  6,  7],
         [ 7,  8,  9, 10]]])
torch.Size([3, 1, 4])
tensor([[[ 1,  2,  3,  4]],

        [[ 4,  5,  6,  7]],

        [[ 7,  8,  9, 10]]])
