# Automatic Differntiation/ Auto-grad

In [3]:
import numpy as np
import torch
import matplotlib.pyplot as plt

In [4]:
x = torch.ones([3,2], requires_grad = True)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [5]:
y = x + 5
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [6]:
# Linear Regression
z = y*y + 1
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [7]:
t = torch.sum(z)
print(t)

tensor(222., grad_fn=<SumBackward0>)


In [8]:
t.backward()

In [9]:
print(x.grad) #d(t)/d(x)

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


# Sigmoid Function

In [10]:
x = torch.ones([3,2], requires_grad= True)
y = x+5
r = 1/(1 + torch.exp(-y))
print(r)

s = torch.sum(r)
s.backward()
print(x.grad) #d(s)/d(x)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


In [11]:
# Directly calling r.backward(arg.) with a argument placed in it.
x = torch.ones([3,2], requires_grad= True)
y = x+5
r = 1 / (1 + torch.exp(-y))
print(r)

# we have avoided the line <<<s = torch.sum(r)>>>
a = torch.ones([3,2])
r.backward(a)
print(x.grad)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


# Loss Function with Auto-Grad

In [12]:
x = torch.randn([20, 1], requires_grad= True) # Actual Input
y = 3*x -2 # Actual Output

In [13]:
print(x)
print(y)

tensor([[ 1.5954],
        [-1.2187],
        [-0.4944],
        [ 0.0032],
        [ 1.0180],
        [ 1.0031],
        [ 0.4573],
        [-1.2611],
        [ 0.0989],
        [ 1.2099],
        [-1.0815],
        [ 0.2515],
        [ 0.4745],
        [ 0.9378],
        [ 0.9828],
        [-0.4619],
        [-1.7401],
        [-0.2351],
        [-1.1070],
        [ 1.1544]], requires_grad=True)
tensor([[ 2.7862],
        [-5.6562],
        [-3.4832],
        [-1.9905],
        [ 1.0539],
        [ 1.0094],
        [-0.6281],
        [-5.7833],
        [-1.7032],
        [ 1.6297],
        [-5.2446],
        [-1.2455],
        [-0.5764],
        [ 0.8134],
        [ 0.9483],
        [-3.3856],
        [-7.2204],
        [-2.7052],
        [-5.3209],
        [ 1.4633]], grad_fn=<SubBackward0>)


In [14]:
# Forward pass
w = torch.tensor([1.], requires_grad= True)
b = torch.tensor([1.], requires_grad= True)

y_hat = w*x + b #Predicted model

loss = torch.sum((y_hat - y)**2)

In [15]:
print(loss)

tensor(235.9511, grad_fn=<SumBackward0>)


In [16]:
# Backward Propagation
loss.backward()

In [17]:
#d(loss) / d(w) ........ & ......... d(loss)/d(b)
print(w.grad, b.grad)

tensor([-65.4734]) tensor([113.6518])


# Learning Loop in GPU ⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡

In [18]:
learning_rate = 0.01 # Alpha

w = torch.tensor([1.], requires_grad= True) # Randomly(not really -> 1.) assigning the value of the w => Weight
b = torch.tensor([1.], requires_grad= True) # Randomly(not really -> 1.) assigning the value of the b => bias

print(w.item(), b.item())

# This for-loop will works as epoches, here number or epohes = 10 !!!!
for i in range(10):
    x = torch.randn([20, 1]) # Actual i/p
    y = 3*x - 2 # Actual o/p

    # Forward Propagation
    y_hat = w*x + b # Predicted o/p

    loss = torch.sum((y_hat - y)**2) # total loss = Squared Error

    # Backward Propagation
    loss.backward()

    # Standard/ Basic Gradient Descent
    with torch.no_grad():
        # torch.no_grad() is used to interrupt the forward propagation, and execute following part.
        # This will update the value of w and b, basic gradient descent stuff.
        w -= learning_rate * (w.grad)
        b -= learning_rate * (b.grad)
        
        # w.grad and b.grad is set to zero, so that in the next itretion we can get a fresh w.grad and b.grad
        w.grad.zero_()
        b.grad.zero_()

    print(w.item(), b.item())


1.0 1.0
1.5085163116455078 -0.14959490299224854
2.0427613258361816 -0.905558705329895
2.363485097885132 -1.3303828239440918
2.6522881984710693 -1.693071961402893
2.7809579372406006 -1.839675784111023
2.825223445892334 -1.8972423076629639
2.896878957748413 -1.917272925376892
2.9337754249572754 -1.961525797843933
2.968257188796997 -1.973510980606079
2.9771547317504883 -1.9779293537139893


As we can see that the value of w and b is converging to the actual value: **y = 3*x - 2**, w_actual = 3 and b_actual = -2

## GPU

In [19]:
import time

In [20]:
%%time
learning_rate = 0.001
N = 10000000
epoches = 200

w = torch.rand([N], requires_grad= True)
b = torch.ones([1], requires_grad= True)

# print(torch.mean(w).item(), b.item())

for i in range(epoches):
    x = torch.rand([N])
    y = torch.dot(3*(torch.ones([N])), x) - 2

    y_hat = (torch.dot(w,x) + b)
    loss = ((y_hat - y)**2)

    loss.backward()

    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad

        w.grad.zero_()
        b.grad.zero_()

    # print(torch.mean(w), b.item())

CPU times: total: 13.5 s
Wall time: 14.1 s


## After GPU Activation

In [21]:
!nvidia-smi

Tue Aug 27 00:55:13 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.81                 Driver Version: 560.81         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   42C    P0             24W /  130W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [22]:
cuda0 = torch.device("cuda:0")

In [24]:
%%time
learning_rate = 0.001
N = 10000000
epoches = 2000

w = torch.rand([N], requires_grad= True, device = cuda0)
b = torch.ones([1], requires_grad= True, device = cuda0)

# print(torch.mean(w).item(), b.item())

for i in range(epoches):
    x = torch.rand([N], device= cuda0)
    y = torch.dot(3*(torch.ones([N], device = cuda0)), x) - 2

    y_hat = (torch.dot(w,x) + b)
    loss = ((y_hat - y)**2)

    loss.backward()

    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad

        w.grad.zero_()
        b.grad.zero_()

   # print(torch.mean(w), b.item())

CPU times: total: 109 ms
Wall time: 5.08 s
