<a href="https://colab.research.google.com/github/Sanghita-C/mle-python-stack/blob/main/PyTorch_Basic_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#PyTorch

I followed a 1 hour tutorial to brush up Pytorch. Found this amazing youtube video by Zachary Huang. This notebook is simply me following along that video

https://www.youtube.com/watch?v=r1bquDz5GGA

##Tensor Basics

In [36]:
import torch

In [37]:
# You have a Python list and you try tp create a tensor out of it

data = [[1,2,3],[4,5,6]]
my_tensor = torch.tensor(data)

print(my_tensor)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [38]:
# you know a shape , but not the values yet - very useful for initializing model weights

shape = (2,3)

ones = torch.ones(shape)
zeros = torch.zeros(shape)
random = torch.randn(shape)

print(f"random tensor: \n {random} \n")
print(f"ones tensor: \n {ones} \n")

random tensor: 
 tensor([[-0.8462,  0.8574,  0.1167],
        [-0.1595, -0.6895,  0.4240]]) 

ones tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 



In [39]:
# You can also create tensors by mimicing another tensor - to follow the exact shape and dtype

template = torch.tensor([[1,2],[3,4]])

mimic_tensor = torch.rand_like(template, dtype=torch.float)

print(f"Template tensor \n{template}\n")
print("Randn_like tensor: \n", mimic_tensor)

Template tensor 
tensor([[1, 2],
        [3, 4]])

Randn_like tensor: 
 tensor([[0.2838, 0.0106],
        [0.6098, 0.6571]])


In [40]:
# critical properties of a tensor - shape , type and device

tensor = torch.randn(3,4)

print(f" Shape of tensor: {tensor.shape}")
print(f"Datatype : {tensor.dtype}")
print(f"Device : {tensor.device}")


 Shape of tensor: torch.Size([3, 4])
Datatype : torch.float32
Device : cpu


Note: The default datatype in pytorch is float - not integer : because DL heavily depends on maing tiny step changes. you can't do those changes if your matric dtype is int. So by default it. is always float

##AutoGrad:

AutoGrad is used for automatic differentiation.
It uses computational graph to keep track of all changes made to a variable. But for that you need **requires_grad** to be set to true.

In [41]:
x = torch.tensor([[1. ,2. ], [3., 4.]])
w = torch.tensor([[1.0],[2.0]], requires_grad = True)

print(f"Data tensor requires grad: {x.requires_grad}")
print(f"Weight tensor requires grad: {w.requires_grad}")

Data tensor requires grad: False
Weight tensor requires grad: True


In [42]:
# demo for grad_fn - something that kind of shows the history of the tensor using by refering to the computation graph.

x = torch.tensor([[1. ,2. ], [3., 4.]], requires_grad = True)
y = torch.tensor([[5. ,6. ], [7., 8.]], requires_grad = True)

z = x + y
print(z.grad_fn)

<AddBackward0 object at 0x78e1286631f0>


## Star (*) vs At (@)

In [43]:
a = torch.tensor([[1,2],[3,4]])
b = torch.tensor([[10,20],[30,40]])

element_wise_produt = a * b
print(element_wise_produt)

tensor([[ 10,  40],
        [ 90, 160]])


In [44]:
# @ powers neural network

m1 = torch.tensor([[1,2,3],[4,5,6]])
m2 = torch.tensor([[10,20],[30,40],[50,60]])

matrix_product = m1 @ m2
print(matrix_product)

tensor([[220, 280],
        [490, 640]])


## Reduction and dim

In [45]:
# default behavious = collapse the whole tensor

scores = torch.randn(2,3)

avg = scores.mean()
print(avg)

tensor(-0.1652)


In [46]:
# mean along specific directions

avg_along_col = scores.mean(dim = 0) # rows are collapsed
avg_along_row = scores.mean(dim = 1) # columns are collapsed

print(avg_along_col)
print(avg_along_row)


tensor([-0.5838, -0.4759,  0.5642])
tensor([-0.8129,  0.4826])


In [47]:
# arg_max

print(scores)
print(scores.argmax())
print(torch.argmax(scores)) # index of the overall maximum value in the tensor

tensor([[-0.8477, -2.1317,  0.5408],
        [-0.3199,  1.1800,  0.5876]])
tensor(4)
tensor(4)


In [48]:
print(scores.argmax(dim =0)) # index of the maximum value in each column - collapsing rows
print(scores.argmax(dim =1)) # index of the maximum value in each row - collapsing columns

tensor([1, 1, 1])
tensor([2, 1])


In [49]:
#torch.gather

'''
 What if you want :
 - from row0, get the element at coulmn 1
 - from row2, get the element at column 0
 '''
data = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
indices_to_select = torch.tensor([[0],[2],[0]])
selected_indices = torch.gather(data, dim=1, index = indices_to_select)
print(selected_indices)

tensor([[1],
        [6],
        [7]])


## Forward Pass using raw tensor

In [50]:
# linear regression

N = 10
# Each datapoint has 1 input feature and 1 output value
D_in = 1
D_out = 1

#Creating input data
X = torch.randn(N, D_in)

#Create our true target labels Y using the eqution Y =2*x +1 and add some noise

true_w = torch.tensor([[2.0]])
true_b = torch.tensor(1.0)

y_true = X @ true_w + true_b +torch.randn(N, D_out)*0.1


W = torch.randn(D_in,D_out, requires_grad = True)
b = torch.randn(1, requires_grad = True)

print(f"True weights: {true_w} \nTrue bias: {true_b}")
print(f"Initial weights: {W} \nInitial bias: {b}")



True weights: tensor([[2.]]) 
True bias: 1.0
Initial weights: tensor([[-0.0722]], requires_grad=True) 
Initial bias: tensor([0.5781], requires_grad=True)


In [51]:
#Forward Pass
Y_hat = X @W + b
print(Y_hat)

tensor([[0.6346],
        [0.5998],
        [0.5715],
        [0.5710],
        [0.6304],
        [0.5357],
        [0.6359],
        [0.7218],
        [0.6149],
        [0.5974]], grad_fn=<AddBackward0>)


In [52]:
#Backward pass
loss = torch.mean((Y_hat - y_true)**2)
print(loss)

tensor(2.0695, grad_fn=<MeanBackward0>)


In [53]:
# now we need to reduce the loss - Autgrad does the heavy lifting for us
# Auograd will calculate gradients for all the variables for which requires_grad = True

loss.backward()

In [54]:
print(f"Gradients for weights: {W.grad} \nGradients for bias: {b.grad}")

Gradients for weights: tensor([[-2.2678]]) 
Gradients for bias: tensor([1.0998])


In [55]:
W.grad.zero_()
b.grad.zero_() #we will come to grad.zero_ in the next cell

tensor([0.])

## Training using Gradient Descent ‚Åâ

- Torch.no_grad() : This command tells Autograd not to track certain calculations that we want to manually track

- .grad.zero_() : Resets gradients after each iteration

In [56]:
learning_rate, epochs = 0.01,100

W,b = torch.randn(D_in,D_out,requires_grad=True), torch.randn(1,requires_grad = True)

for epoch in range(epochs):
  #Forward pass and loss
  y_hat = X @ W + b
  loss = torch.mean((y_hat - y_true)**2)

  #Backward pass
  loss.backward()

  #update weights
  with torch.no_grad():
    W -= learning_rate*W.grad
    b -= learning_rate*b.grad

    W.grad.zero_()
    b.grad.zero_()

  if epoch % 10 == 0:
    print(f"Epoch {epoch} loss: {loss}")

Epoch 0 loss: 1.3811923265457153
Epoch 10 loss: 0.8123468160629272
Epoch 20 loss: 0.4787147045135498
Epoch 30 loss: 0.28299403190612793
Epoch 40 loss: 0.16813963651657104
Epoch 50 loss: 0.10070731490850449
Epoch 60 loss: 0.06108848378062248
Epoch 70 loss: 0.0377860926091671
Epoch 80 loss: 0.02405869960784912
Epoch 90 loss: 0.01595306769013405


## Moving to professional PyTorch

We updated W and b on our own. But for large neural networks, we have billions of parameters. It is not possible to manually update them, that's where PyTorch libraries come into play

###  Linear Layer

This is what performs the Y = X @ w + b

In [57]:
D_in = 1
D_out = 1

linear_layer = torch.nn.Linear(in_features = D_in, out_features = D_out)

print("the weight of the model",linear_layer.weight)
print("the bias of the model",linear_layer.bias)

y_hat_nn = linear_layer(X)

print(f"Output of nn.Linear (first 3 rows): \n {y_hat_nn[:3]}")

the weight of the model Parameter containing:
tensor([[0.2127]], requires_grad=True)
the bias of the model Parameter containing:
tensor([-0.9449], requires_grad=True)
Output of nn.Linear (first 3 rows): 
 tensor([[-1.1112],
        [-1.0087],
        [-0.9255]], grad_fn=<SliceBackward0>)


###Activation Function

In [58]:
relu = torch.nn.ReLU()
sample_data = torch.tensor([[-1.0], [0.0], [1.0]])

print(f"Input data: \n {sample_data}")

print(f"Output of ReLU: \n {relu(sample_data)}")


Input data: 
 tensor([[-1.],
        [ 0.],
        [ 1.]])
Output of ReLU: 
 tensor([[0.],
        [0.],
        [1.]])


In [59]:
gelu = torch.nn.GELU()
sample_data = torch.tensor([[-1.0], [0.0], [1.0]])

print(f"Input data: \n {sample_data}")

print(f"Output of GELU: \n {gelu(sample_data)}")

Input data: 
 tensor([[-1.],
        [ 0.],
        [ 1.]])
Output of GELU: 
 tensor([[-0.1587],
        [ 0.0000],
        [ 0.8413]])


In [60]:
#softmax

softmax = torch.nn.Softmax(dim =-1)

logits = torch.tensor([[1.0,3.0,0.5, 1.5],[-1.0,2.0,1.0,0.0]])

probabilities = softmax(logits)

print(f"Logits: \n {logits}")
print(f"Probabilities: \n {probabilities}")

Logits: 
 tensor([[ 1.0000,  3.0000,  0.5000,  1.5000],
        [-1.0000,  2.0000,  1.0000,  0.0000]])
Probabilities: 
 tensor([[0.0939, 0.6942, 0.0570, 0.1549],
        [0.0321, 0.6439, 0.2369, 0.0871]])


### Word Embeddings


In [61]:
vocab_size = 10
embedding_dim = 3

embedding_layer = torch.nn.Embedding(vocab_size, embedding_dim)



###Layer Norm

Prevents values from exploding or vanish

It rescales all the values into a stable range


In [62]:
# if the input vector that needs to be stabilized has feature dimension 3, then we create norm layer of dimension 3
norm_layer = torch.nn.LayerNorm(normalized_shape = 3)
input_features= torch.tensor([[1.0,2.0,3.0],[4.0,5.0,6.0]])

normalized_features = norm_layer(input_features)
print(f"Input features: \n {input_features}")
print(f"Normalized features: \n {normalized_features}")



Input features: 
 tensor([[1., 2., 3.],
        [4., 5., 6.]])
Normalized features: 
 tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], grad_fn=<NativeLayerNormBackward0>)


###Dropout

Prevents overfitting by randomly tuning neurons to zero during training

In [63]:
dropout_layer = torch.nn.Dropout(p=0.5)
input_layer = torch.ones(1,10)

#Activate Dropout Layer for training
dropout_layer.train()
output_train = dropout_layer(input_layer) # the input also gets scaled

#Switch off drop out during test
dropout_layer.eval()
output_test = dropout_layer(input_layer)


print(f"Input layer: \n {input_layer}")
print(f"Output train: \n {output_train}")
print(f"Output test: \n {output_test}")

Input layer: 
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
Output train: 
 tensor([[0., 0., 2., 0., 2., 0., 0., 2., 0., 0.]])
Output test: 
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])


##Professional Pytorch

Main elements are -:

- nn.module : for struturing all the layers of neural network
- nn.optim : to automate the learning


In [64]:
import torch.nn as nn

class LinearRegressionModule ( nn.Module):
  def __init__(self, in_feature, out_feature):
    super().__init__() # very important
    self.linear_layer = nn.Linear(in_feature, out_feature) # as we learnt earlier - this will do the  y = wx + b



  def forward(self, x):
    y_hat = self.linear_layer(x)
    return y_hat


model = LinearRegressionModule(1,1)
print(model)


LinearRegressionModule(
  (linear_layer): Linear(in_features=1, out_features=1, bias=True)
)


In [65]:
#Optimizer module
import torch.optim as optim
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
loss_fn = torch.nn.MSELoss()


In [67]:
epochs = 100
D_in = 1
D_out = 1
N = 10
X = torch.randn(N, D_in)
y_true = 2* X + 1 + torch.randn(N, D_out)*0.1

for epoch in range(epochs):
  #Forward pass
  y_hat = model(X)

  # Loss Calculation
  loss = loss_fn(y_hat,y_true)

  #Training code
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  if epoch % 10 == 0:
    print(f"Epoch {epoch} loss: {loss}")

Epoch 0 loss: 3.3351001739501953
Epoch 10 loss: 2.8976895809173584
Epoch 20 loss: 2.4963088035583496
Epoch 30 loss: 2.133594512939453
Epoch 40 loss: 1.8100875616073608
Epoch 50 loss: 1.524714469909668
Epoch 60 loss: 1.2753255367279053
Epoch 70 loss: 1.059229850769043
Epoch 80 loss: 0.8735106587409973
Epoch 90 loss: 0.7151986956596375


In [70]:
print(model.parameters())
print(model.linear_layer.weight)
print(model.linear_layer.bias)

<generator object Module.parameters at 0x78e12a80d620>
Parameter containing:
tensor([[1.4388]], requires_grad=True)
Parameter containing:
tensor([0.7673], requires_grad=True)
