# Defining and Training Neural Networks

What we will implement:
- How to initialize a NN
- Forward pass
- Backward pass
- Optimization of the network parameters

## PyTorch: <code>nn</code>

The <code>nn</code> package defines a set of Modules (i.e. neural network layers).

Each module receives an input and produces an output.

The <code>nn</code> package also defines losses. 

In [1]:
# Import libs
import torch
import torch.nn as nn

import math
from IPython import display

In [2]:
# Build the training data: 2000 evenly spaced inputs covering [-pi, pi]
# and, as regression target, the sine of each input.
x = torch.linspace(start=-math.pi, end=math.pi, steps=2000)
y = x.sin()

In [3]:
# In this example sin(x) is approximated by a cubic polynomial in x, i.e. by
# a function that is linear in the features (x, x^2, x^3), so a single linear
# layer is enough. Build the feature tensor: column k holds x raised to p[k].
p = torch.tensor([1, 2, 3])
xx = torch.pow(x.unsqueeze(dim=-1), p)
print(xx.shape)

torch.Size([2000, 3])


In [4]:
# Assemble the model from building blocks of the nn package. nn.Sequential is
# itself a Module: it holds the Modules it is given and feeds the output of
# each one into the next. nn.Linear computes its output from its input with a
# linear function and owns the learnable weight and bias Tensors; here it maps
# the 3 polynomial features to 1 output value.
# nn.Flatten(0, 1) then flattens the output of the linear layer into a 1D
# tensor, so that it matches the shape of `y`.
model = nn.Sequential(
    nn.Linear(in_features=3, out_features=1),
    nn.Flatten(start_dim=0, end_dim=1),
)

In [5]:
# Popular loss functions are also provided by the nn package. We use Mean
# Squared Error (MSE); reduction='sum' sums the squared errors over all
# samples instead of averaging them.
loss_fn = nn.MSELoss(reduction='sum')

# Step size used by the optimizer for every parameter update.
learning_rate = 1e-6

# Build the optimizer. model.parameters() passes to SGD the learnable
# parameters (the torch.nn.Parameter members of the model), i.e. the Tensors
# that optimizer.step() will update.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [6]:
for t in range(2000):

    # Forward pass: compute predicted y by passing the features to the model.
    # Module objects override the __call__ operator so you can call them like
    # functions. When doing so you pass a Tensor of input data to the Module
    # and it produces a Tensor of output data.
    y_pred = model(xx)

    # Compute and print the loss. We pass Tensors containing the predicted and
    # true values of y, and the loss function returns a Tensor containing the
    # loss. With reduction='sum' this is (y_pred - y).pow(2).sum().
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (the learnable weights of the
    # model). This is needed because, by default, gradients are accumulated in
    # buffers (i.e. not overwritten) whenever .backward() is called; see the
    # docs of torch.autograd.backward for more details.
    # The optimizer was built from model.parameters(), so this clears exactly
    # the gradients that model.zero_grad() would clear: one call is enough.
    optimizer.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to all the
    # learnable parameters of the model. Internally, the parameters of each
    # Module are stored in Tensors with requires_grad=True, so this call will
    # compute gradients for all learnable parameters in the model.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()

    # Alternative: update the weights using gradient descent MANUALLY, in place
    # of optimizer.step(). Each parameter is a Tensor, so we can access its
    # gradient directly:
    #
    #     with torch.no_grad():
    #         for param in model.parameters():
    #             param -= learning_rate * param.grad

99 978.2300415039062
199 649.8440551757812
299 432.6981506347656
399 289.11004638671875
499 194.16184997558594
599 131.37692260742188
699 89.86022186279297
799 62.40728759765625
899 44.253944396972656
999 32.24992752075195
1099 24.312265396118164
1199 19.06338882446289
1299 15.592564582824707
1399 13.2974271774292
1499 11.779789924621582
1599 10.776226043701172
1699 10.112586975097656
1799 9.673775672912598
1899 9.3836030960083
1999 9.191720008850098


In [7]:
# An nn.Sequential can be indexed like a list: item 0 is its first layer.
linear_layer = model[0]

# A linear layer keeps its parameters in `weight` and `bias`. The weight has
# one row (one output) and three columns, one per feature (x, x^2, x^3).
print(
    f'Result: y = {linear_layer.bias.item()}'
    f' + {linear_layer.weight[0, 0].item()} x'
    f' + {linear_layer.weight[0, 1].item()} x^2'
    f' + {linear_layer.weight[0, 2].item()} x^3'
)

Result: y = 0.0002835117047652602 + 0.8379294872283936 x + -4.891062417300418e-05 x^2 + -0.0906546413898468 x^3


In [13]:
# Has the network actually learned something? Compare the prediction with the
# true sine on one sample; x[500] is approximately -pi/2, where sin is -1.
print(xx[500])
predicted = model(xx)[500].item()
expected = torch.sin(x)[500]
print("%.6f %.6f" % (predicted, expected))

tensor([-1.5700,  2.4649, -3.8700])
-0.964564 -1.000000


## Custom models

In [14]:
class SinModel(nn.Module):
    """Custom module: one linear layer followed by a flatten.

    The network maps a batch of feature rows, e.g. (x, x^2, x^3), to one
    scalar per row and returns the results as a 1D tensor.

    Args:
        in_features: number of input features per sample. Defaults to 3,
            which corresponds to the features (x, x^2, x^3).
    """

    def __init__(self, in_features: int = 3):
        """
        In the constructor we instantiate all the layers of the NN
        """
        super().__init__()
        # Alternatively, each layer could be stored in its own attribute
        # (e.g. self.l1 = nn.Linear(in_features, 1) and
        # self.flt = nn.Flatten(0, 1)) and applied one after the other in
        # forward().
        self.model = nn.Sequential(
            nn.Linear(in_features, 1),
            nn.Flatten(0, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return self.model(x)

In [17]:
# Construct our model by instantiating the class defined above.
# Note: this rebinds the name `model`, so from here on `model` refers to a
# SinModel instance and no longer to the nn.Sequential model created earlier.
model = SinModel()

In [21]:
# Set up the loss function and the optimizer. model.parameters() passes to
# SGD the learnable parameters (the torch.nn.Parameter members of the model).
criterion = nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass: predict y from the features and measure the error.
    y_pred = model(xx)
    loss = criterion(y_pred, y)

    # Report the loss once every 100 iterations.
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass, then update the weights.
    loss.backward()
    optimizer.step()

99 9.038326263427734
199 8.972274780273438
299 8.926005363464355
399 8.893573760986328
499 8.870832443237305
599 8.854877471923828
699 8.843677520751953
799 8.835809707641602
899 8.830282211303711
999 8.826396942138672
1099 8.823665618896484
1199 8.821743965148926
1299 8.820390701293945
1399 8.819438934326172
1499 8.818768501281738
1599 8.818296432495117
1699 8.817963600158691
1799 8.817728042602539
1899 8.8175630569458
1999 8.8174467086792


In [22]:
# Has the network actually learned something? Compare the prediction with the
# true sine on one sample; x[500] is approximately -pi/2, where sin is -1.
print(xx[500])
predicted = model(xx)[500].item()
expected = torch.sin(x)[500]
print("%.6f %.6f" % (predicted, expected))

tensor([-1.5700,  2.4649, -3.8700])
-0.983484 -1.000000


In [None]:
# Ex1: write a model (using custom modules) whose output y is a linear function of (x, x^2, x^3, x^4)
# and that approximates the cosine function

In [None]:
# Ex2: write a model (using custom modules) whose output y is a linear function of (x, x^2, x^3)
# and that approximates the function -5 + 2*x + (3/4)*x^2 + 7*x^3