<a href="https://colab.research.google.com/github/Renan-Domingues/LearnPytorchWithExamples/blob/main/LearnPytorchWithExamples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a tutorial with the fundamental concepts of PyTorch through self-contained examples.

At its core, PyTorch provides two main features:

- An n-dimensional  Tensor, similar to numpy but can run on GPUs
- Automatic defferentiation for buiding and training neural networks


# Tensors

### Warming-up: Numpy

Before the PyTorch, lets use numpy.


In [None]:
import numpy as np
import math

# Create random input and putput data

x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# randomly initialize weights

a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
  # forward pass: compute predicted y
  y_pred = a + b * x + c * x ** 2 + d * x ** 3

  # Compute and print loss
  loss = np.square(y_pred - y).sum()
  if t % 100 == 99:
    print(t, loss)

  # backprop to compute gradients of a, b, c, d with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_a = grad_y_pred.sum()
  grad_b = (grad_y_pred * x). sum()
  grad_c = (grad_y_pred * x ** 2).sum()
  grad_d = (grad_y_pred * x ** 3).sum()

  # update weights
  a -= learning_rate * grad_a
  b -= learning_rate * grad_b
  c -= learning_rate * grad_c
  d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x² + {b} x³')

99 2680.9887533506426
199 1794.694432379336
299 1203.1063492954818
399 808.0077427351555
499 543.9811738882942
599 367.43509774515235
699 249.3080634416786
799 170.21579770622597
899 117.22199694962104
999 81.68870710618067
1099 57.84476336658441
1199 41.83200674878525
1299 31.069525056444405
1399 23.829669602368508
1499 18.955157491727675
1599 15.670208940008227
1699 13.4543904817235
1799 11.958294124548189
1899 10.947143538656022
1999 10.263053121884399
Result: y = 0.025291679606859557 + 0.8279868325864196 x + -0.004363235587548523 x² + 0.8279868325864196 x³


### PyTorch Tensors
Numpy cannot utilize GPU, dont make track of the graph and gradient, so for now on we will use Tensors

In [None]:
import torch
import math
dtype = torch.float
device = torch.device("cpu")

x = torch.linspace(-math.pi, math.pi, 200, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x² + {d.item()} x³')

99 479.69915771484375
199 442.11376953125
299 421.3863525390625
399 404.37969970703125
499 388.6007080078125
599 373.54669189453125
699 359.10113525390625
799 345.2232666015625
899 331.88739013671875
999 319.0716857910156
1099 306.75555419921875
1199 294.91925048828125
1299 283.5438232421875
1399 272.61181640625
1499 262.1056823730469
1599 252.00857543945312
1699 242.30447387695312
1799 232.97801208496094
1899 224.01431274414062
1999 215.3992919921875
Result: y = -0.8872179388999939 + -0.3107714056968689 x + 0.15163396298885345 x² + 0.07145480811595917 x³


# Autograd

### Pytorch: Tensors and autograd

if x is a tensor that has "x.requires_grad=True" then "x.grad" is another Tensor holding the gradient od x with respect to some scalar value.

now we are no longer manually implement the backward pass through the network

In [None]:
import torch
import math

dtype = torch.float
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype)
y = torch.sin(x)

a = torch.randn((), dtype=dtype, requires_grad=True)
b = torch.randn((), dtype=dtype, requires_grad=True)
c = torch.randn((), dtype=dtype, requires_grad=True)
d = torch.randn((), dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(2000):
  y_pred = a + b * x + c * x ** 2 + d * x ** 3

  loss = (y_pred - y).pow(2).sum()
  if t % 100 == 99:
    print(t, loss.item())

    loss.backward()

    with torch.no_grad():
      a -= learning_rate * a.grad
      b -= learning_rate * b.grad
      c -= learning_rate * c.grad
      d -= learning_rate * d.grad

      # manually zero the gradients
      a.grad = None
      b.grad = None
      c.grad = None
      d.grad = None

  print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x² + {d.item()} x³')


Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 x + 2.004030466079712 x² + -1.1853593587875366 x³
Result: y = 0.12150567024946213 + 0.042225487530231476 

### Pytorch:  Defining new autograd functions

In PyTorch we can easily define our own autograd operator by defining a subclass of torch.autograd.Function and implementing the forward and backward functions.

In [None]:
import torch
import math

class LegendrePolynominal3(torch.autograd.Function):

  @staticmethod
  def forward(ctx, input):
    """
    In the forward pass we receive a Tensor containing the input and return
    a Tensor containing the output. ctx is a context object that can be used
    to stash information for backward computation. You can cache arbitrary
    objects for use in the backward pass using the ctx.save_for_backward method.
    """

    ctx.save_for_backward(input)
    return 0.5 * (5 * input ** 3 - 3 * input)

  @staticmethod
  def backward(ctx, grad_output):
    """
    In the backward pass we receive a Tensor containing the gradient of the loss
    with respect to the output, and we need to compute the gradient of the loss
    with respect to the input.
    """
    input, = ctx.saved_tensors
    return grad_output * 1.5 * (5 * input ** 2 - 1)

dtype = torch.float
device = torch.device("cpu")

x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

learning_rate = 5e-6
for t in range(2000):
  P3 = LegendrePolynominal3.apply

  y_pred = a + b * P3(c + d * x)

  loss = (y_pred - y).pow(2).sum()
  if t % 100 == 99:
    print(t, loss.item())

  loss.backward()

  with torch.no_grad():
    a -= learning_rate * a.grad
    b -= learning_rate * b.grad
    c -= learning_rate * c.grad
    d -= learning_rate * d.grad

    # Manually zero the gradients after updating weights
    a.grad = None
    b.grad = None
    c.grad = None
    d.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')






99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.978511810302734
599 37.403133392333984
699 28.206867218017578
799 21.97318458557129
899 17.7457275390625
999 14.877889633178711
1099 12.93176555633545
1199 11.610918045043945
1299 10.71425724029541
1399 10.10548210144043
1499 9.692105293273926
1599 9.411375999450684
1699 9.220745086669922
1799 9.091285705566406
1899 9.003361701965332
1999 8.943641662597656
Result: y = -6.71270206087371e-10 + -2.208526849746704 * P3(-3.392665037793563e-10 + 0.2554861009120941 x)


# nn module

### Pytorch: nn

When building neural networks we frequently think of arranging the computation into layers, some of which have learnable parameters which will be optimized during learning.

The nn package defines a set of Modules, which are roughly equivalent to neural network layers. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The nn package also defines a set of useful loss functions that are commonly used when training neural networks.

In [None]:
import torch
import math

# Create a tensor to hold input and outputs

x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):
  y_pred = model(xx)

  loss = loss_fn(y_pred, y)
  if t % 100 == 99:
    print(t, loss.item())

  model.zero_grad()

  loss.backward()

  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad

linear_layer = model[0] # I can access the firs layer of a model like accessing the first layer of a list

print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x² + {linear_layer.weight[:, 2].item()} x³')


99 972.0076904296875
199 648.0205078125
299 433.11383056640625
399 290.5321044921875
499 195.9141845703125
599 133.1103515625
699 91.4130859375
799 63.721778869628906
899 45.326629638671875
999 33.10335159301758
1099 24.978477478027344
1199 19.576078414916992
1299 15.982598304748535
1399 13.591495513916016
1499 11.999807357788086
1599 10.939809799194336
1699 10.233579635620117
1799 9.762859344482422
1899 9.448921203613281
1999 9.239453315734863
Result: y = -0.008809790946543217 + 0.8384766578674316 x + 0.0015198362525552511 x² + -0.09073247015476227 x³


### Pytorch Optim

This is not a huge burden for simple optimization algorithms like stochastic gradient descent, but in practice we often train neural networks using more sophisticated optimizers like AdaGrad, RMSProp, Adam, etc.

In [None]:
import torch
import math

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):
  y_pred = model(xx)
  loss = loss_fn(y_pred, y)
  if t % 100 == 99:
    print(t, loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x² + {linear_layer.weight[:, 2].item()} x³')


99 35534.1171875
199 33202.74609375
299 31628.5
399 30383.78125
499 29333.671875
599 28415.50390625
699 27594.2265625
799 26847.92578125
899 26161.82421875
999 25525.416015625
1099 24930.9140625
1199 24372.359375
1299 23845.0703125
1399 23345.296875
1499 22869.97265625
1599 22416.5625
1699 21982.931640625
1799 21567.26953125
1899 21168.0234375
1999 20783.85546875
Result: y = 0.06080242618918419 + -0.4521670937538147 x + -0.32679933309555054 x² + -0.13871225714683533 x³


# PyTorch Custm nn Modules

In [None]:
import torch
import math

class Polynominal3(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.a = torch.nn.Parameter(torch.randn(()))
    self.b = torch.nn.Parameter(torch.randn(()))
    self.c = torch.nn.Parameter(torch.randn(()))
    self.d = torch.nn.Parameter(torch.randn(()))
  def forward(self, x):
    return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

  def string(self):
    return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x² + {self.d.item()} x³'

x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

model = Polynominal3()

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

for t in range(2000):
  y_pred = model(x)
  loss = criterion(y_pred, y)
  if t % 100 == 99:
    print(t, loss.item())

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

print(f'Result: {model.string()}')


99 3697.9228515625
199 2466.67333984375
299 1647.1220703125
399 1101.3792724609375
499 737.8074951171875
599 495.4854736328125
699 333.89849853515625
799 226.09246826171875
899 154.12892150878906
999 106.06394958496094
1099 73.94207763671875
1199 52.461517333984375
1299 38.087791442871094
1399 28.463207244873047
1499 22.013843536376953
1599 17.689090728759766
1699 14.78676700592041
1799 12.837495803833008
1899 11.5272216796875
1999 10.645703315734863
Result: y = -0.024976816028356552 + 0.8220748901367188 x + 0.004308916162699461 x² + -0.08839945495128632 x³


# PyTorch: Control Flow + Weight Sharing

In [None]:
import random
import torch
import math
import torch.nn as nn

class DynamicNet(nn.Module):
  def __init__(self):
    super().__init__()
    self.a = nn.Parameter(torch.randn(()))
    self.b = nn.Parameter(torch.randn(()))
    self.c = nn.Parameter(torch.randn(()))
    self.d = nn.Parameter(torch.randn(()))
    self.e = nn.Parameter(torch.randn(()))

  def forward(self, x):
    y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
    for exp in range(4, random.randint(4, 6)):
      y =  y + self.e * x ** exp
    return y

  def string(self):
    return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x² + {self.d.item()} x³ + {self.e.item()} x⁴ ? + {self.e.item()} x⁵ ?'

x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

model = DynamicNet()

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)

for t in range(3000):
  y_pred = model(x)

  loss = criterion(y_pred, y)
  if t % 2000 == 1999:
    print(t, loss.item())

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

print(f'Result: {model.string()}')


1999 3092.886962890625
Result: y = -0.514687716960907 + -0.4589218497276306 x + 0.10279680043458939 x² + 0.0984167531132698 x³ + -0.0025536692701280117 x⁴ ? + -0.0025536692701280117 x⁵ ?
