# Automatic Differntiation/ Auto-grad

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt

In [2]:
x = torch.ones([3,2], requires_grad = True)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [3]:
y = x + 5
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [4]:
# Linear Regression
z = y*y + 1
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [5]:
t = torch.sum(z)
print(t)

tensor(222., grad_fn=<SumBackward0>)


In [6]:
t.backward()

In [7]:
print(x.grad) #d(t)/d(x)

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


# Sigmoid Function

In [8]:
x = torch.ones([3,2], requires_grad= True)
y = x+5
r = 1/(1 + torch.exp(-y))
print(r)

s = torch.sum(r)
s.backward()
print(x.grad) #d(s)/d(x)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


In [9]:
# Directly calling r.backward(arg.) with a argument placed in it.
x = torch.ones([3,2], requires_grad= True)
y = x+5
r = 1 / (1 + torch.exp(-y))
print(r)

# we have avoided the line <<<s = torch.sum(r)>>>
a = torch.ones([3,2])
r.backward(a)
print(x.grad)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


# Loss Function with Auto-Grad

In [10]:
x = torch.randn([20, 1], requires_grad= True) # Actual Input
y = 3*x -2 # Actual Output

In [11]:
print(x)
print(y)

tensor([[ 0.3864],
        [-0.9655],
        [-0.3749],
        [-0.3060],
        [-1.1073],
        [-0.9955],
        [-1.8654],
        [ 0.7612],
        [-0.9324],
        [ 0.0593],
        [-0.2563],
        [ 1.2054],
        [ 0.4411],
        [-0.4439],
        [-0.3408],
        [-0.4004],
        [-1.1543],
        [ 0.3900],
        [-0.4887],
        [ 0.3466]], requires_grad=True)
tensor([[-0.8409],
        [-4.8966],
        [-3.1248],
        [-2.9181],
        [-5.3220],
        [-4.9864],
        [-7.5962],
        [ 0.2836],
        [-4.7973],
        [-1.8222],
        [-2.7688],
        [ 1.6163],
        [-0.6767],
        [-3.3316],
        [-3.0224],
        [-3.2013],
        [-5.4629],
        [-0.8301],
        [-3.4660],
        [-0.9603]], grad_fn=<SubBackward0>)


In [12]:
# Forward pass
w = torch.tensor([1.], requires_grad= True)
b = torch.tensor([1.], requires_grad= True)

y_hat = w*x + b #Predicted model

loss = torch.sum((y_hat - y)**2)

In [13]:
print(loss)

tensor(302.4803, grad_fn=<SumBackward0>)


In [14]:
# Backward Propagation
loss.backward()

In [15]:
#d(loss) / d(w) ........ & ......... d(loss)/d(b)
print(w.grad, b.grad)

tensor([-86.2306]) tensor([144.1665])


# Learning Loop in GPU ⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡

In [16]:
learning_rate = 0.01 # Alpha

w = torch.tensor([1.], requires_grad= True) # Randomly(not really -> 1.) assigning the value of the w => Weight
b = torch.tensor([1.], requires_grad= True) # Randomly(not really -> 1.) assigning the value of the b => bias

print(w.item(), b.item())

# This for-loop will works as epoches, here number or epohes = 10 !!!!
for i in range(10):
    x = torch.randn([20, 1]) # Actual i/p
    y = 3*x - 2 # Actual o/p

    # Forward Propagation
    y_hat = w*x + b # Predicted o/p

    loss = torch.sum((y_hat - y)**2) # total loss = Squared Error

    # Backward Propagation
    loss.backward()

    # Standard/ Basic Gradient Descent
    with torch.no_grad():
        # torch.no_grad() is used to interrupt the forward propagation, and execute following part.
        # This will update the value of w and b, basic gradient descent stuff.
        w -= learning_rate * (w.grad)
        b -= learning_rate * (b.grad)
        
        # w.grad and b.grad is set to zero, so that in the next itretion we can get a fresh w.grad and b.grad
        w.grad.zero_()
        b.grad.zero_()

    print(w.item(), b.item())


1.0 1.0
1.4259353876113892 -0.1710425615310669
1.8226149082183838 -0.8780316710472107
2.209345817565918 -1.2151975631713867
2.5231480598449707 -1.4404618740081787
2.642162799835205 -1.6120680570602417
2.8532190322875977 -1.8350183963775635
2.9238667488098145 -1.8994003534317017
2.97902774810791 -1.9512786865234375
2.9925806522369385 -1.9729222059249878
2.994602680206299 -1.9833199977874756


As we can see that the value of w and b is converging to the actual value: **y = 3*x - 2**, w_actual = 3 and b_actual = -2

## GPU

In [17]:
import time

In [37]:
%%time
learning_rate = 0.001
N = 10000000
epoches = 200

w = torch.rand([N], requires_grad= True)
b = torch.ones([1], requires_grad= True)

# print(torch.mean(w).item(), b.item())

for i in range(epoches):
    x = torch.rand([N])
    y = torch.dot(3*(torch.ones([N])), x) - 2

    y_hat = (torch.dot(w,x) + b)
    loss = ((y_hat - y)**2)

    loss.backward()

    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad

        w.grad.zero_()
        b.grad.zero_()

    # print(torch.mean(w), b.item())

CPU times: user 24 s, sys: 243 ms, total: 24.2 s
Wall time: 24.2 s


## After GPU Activation

In [20]:
!nvidia-smi

Fri Jan  1 11:03:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
cuda0 = torch.device("cuda:0")

In [38]:
%%time
learning_rate = 0.001
N = 10000000
epoches = 200

w = torch.rand([N], requires_grad= True, device = cuda0)
b = torch.ones([1], requires_grad= True, device = cuda0)

# print(torch.mean(w).item(), b.item())

for i in range(epoches):
    x = torch.rand([N], device= cuda0)
    y = torch.dot(3*(torch.ones([N], device = cuda0)), x) - 2

    y_hat = (torch.dot(w,x) + b)
    loss = ((y_hat - y)**2)

    loss.backward()

    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad

        w.grad.zero_()
        b.grad.zero_()

   # print(torch.mean(w), b.item())

CPU times: user 298 ms, sys: 209 ms, total: 507 ms
Wall time: 513 ms
