**6.1 Layers and Modules**

In [None]:
import torch
from torch import nn
from torch.nn import functional as F

In [None]:
# Define an MLP with 1) an __init__ that declares the layers and 2) a forward function
# Architecture: input -> hidden layer with 256 units -> ReLU -> output layer with 10 units
class MLP(nn.Module):
    """Two-layer perceptron: 256-unit hidden layer -> ReLU -> 10-unit output layer.

    Both layers are ``nn.LazyLinear``, so their input widths are inferred
    from the first batch that is passed through the network.
    """

    def __init__(self):
        super().__init__()
        # The attribute names double as the parameter-name prefixes
        # ("hidden.weight", "output.bias", ...).
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, X):
        """Return the 10 output values for every row of ``X``."""
        activations = F.relu(self.hidden(X))
        return self.output(activations)


In [None]:
# Build the network with nn.Sequential: 256-unit hidden layer, ReLU,
# then a 10-unit output layer.
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

# Random batch of 2 examples with 20 features each.
X = torch.rand(2, 20)
# One forward pass; the output has one row per example and 10 columns.
net(X).shape

torch.Size([2, 10])

In [None]:
mlp = MLP()
# Call the module itself rather than mlp.forward(X): nn.Module.__call__
# runs any registered hooks and then dispatches to forward().
mlp(X).shape

torch.Size([2, 10])

In [None]:
# nn.Module provides add_module(name, module) to register a child module.
# Register every module that is passed in, using its position index as the name,
# then apply the registered modules in order in the forward pass.
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Every positional argument is registered as a child module under its
    position index ("0", "1", ...) and applied in that order by ``forward``.
    """

    def __init__(self, *args):
        super().__init__()
        for index, module in enumerate(args):
            # add_module stores the layer in self._modules, which is how the
            # parent module keeps track of the layer's parameters.
            self.add_module(str(index), module)

    def forward(self, X):
        """Feed ``X`` through every registered module in registration order."""
        # Iterate self._modules instead of self.children(): children() yields
        # each distinct module only once, so a layer that was passed in twice
        # (parameter sharing) would be applied a single time.
        for module in self._modules.values():
            if module is None:  # children() skipped None entries as well
                continue
            X = module(X)
        return X

In [None]:
# Same three-layer stack, this time built with the hand-written MySequential.
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net(X).shape

torch.Size([2, 10])

In [None]:
# Repeat of the previous cell: a freshly constructed MySequential gives the
# same output shape.
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net(X).shape

torch.Size([2, 10])

1) Implement a module that takes two modules as arguments, say net1 and net2, and returns the concatenated output of both networks in the forward propagation. This is also called a parallel module.
2) Assume that you want to concatenate multiple instances of the same network. Implement a factory function that generates multiple instances of the same module and build a larger network from it.

In [None]:
class ParallelModule(nn.Module):
    """Run two modules on the same input and join their outputs.

    The outputs of ``module1`` and ``module2`` are concatenated along
    dimension 1.
    """

    def __init__(self, module1, module2):
        super().__init__()
        self.m1, self.m2 = module1, module2

    def forward(self, X):
        """Return the outputs of ``m1`` and ``m2`` on ``X``, concatenated along dim 1."""
        branch_outputs = [branch(X) for branch in (self.m1, self.m2)]
        return torch.concat(branch_outputs, dim=1)

In [None]:
class Factory(nn.Module):
    """Create several independent instances of one module and run them on the same input.

    Args:
        module: Callable (typically an ``nn.Module`` subclass) that returns a
            new module each time it is called.
        instance_number: Number of instances to create.
        *args: Positional arguments forwarded to ``module`` for every instance.
    """

    def __init__(self, module, instance_number, *args):
        super().__init__()
        # nn.ModuleList (the original spelling nn.Module_List does not exist)
        # registers every instance as a child module, so their parameters are
        # visible through parameters() and state_dict().
        self.module_list = nn.ModuleList(
            [module(*args) for _ in range(instance_number)]
        )

    def forward(self, X):
        """Apply every instance to ``X`` and concatenate the results along dim 0."""
        outs = [instance(X) for instance in self.module_list]
        return torch.concat(outs, dim=0)

**6.2 Parameter Management**

In [None]:
import torch
from torch import nn

In [None]:
# Small MLP used for the parameter-access examples: 8-unit hidden layer,
# ReLU, single output unit.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1)
)

# Batch of 2 examples with 4 features each; running it through the network
# gives the lazy layers their concrete weight shapes.
X = torch.rand((2,4))
net(X).shape

torch.Size([2, 1])

In [None]:
# Parameters of the first layer (index 0), keyed by name: 'weight' and 'bias'.
net[0].state_dict()

OrderedDict([('weight',
              tensor([[-0.1861, -0.0162, -0.2997,  0.2197],
                      [ 0.3840, -0.2031, -0.4833, -0.4304],
                      [-0.3437,  0.2530, -0.4603, -0.2165],
                      [-0.2484,  0.4672, -0.1788, -0.4103],
                      [-0.3861,  0.1321,  0.4490,  0.1312],
                      [ 0.3881, -0.3586, -0.1096,  0.3437],
                      [ 0.1393, -0.4766, -0.2616,  0.1280],
                      [ 0.2667, -0.2040, -0.4299, -0.2025]])),
             ('bias',
              tensor([ 0.3671,  0.1426, -0.0596,  0.2485, -0.0377, -0.0431, -0.3125, -0.2779]))])

In [None]:
# For the output layer (index 2): the type of its bias attribute, then the
# underlying bias and weight tensors accessed through .data.
type(net[2].bias), net[2].bias.data, net[2].weight.data

(torch.nn.parameter.Parameter,
 tensor([0.2256]),
 tensor([[ 0.3089,  0.0087,  0.3364, -0.1094,  0.2066,  0.0778,  0.0350, -0.2416]]))

In [None]:
# (name, shape) of every parameter in the network; each name is prefixed with
# the index of the layer inside the Sequential container.
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [None]:
# We need to give the shared layer a name so that we can refer to its
# parameters
shared = nn.LazyLinear(8)
# The same `shared` instance is placed at positions 2 and 4 of the network.
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.LazyLinear(1))

# Run a forward pass first so that the lazy layers' weights are initialized
# before they are read below.
net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
# Change a single weight entry through net[2] only.
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


Use the NestMLP model defined in Section 6.1 and access the parameters of the various layers.
Construct an MLP containing a shared parameter layer and train it. During the training process, observe the model parameters and gradients of each layer.
Why is sharing parameters a good idea?

SyntaxError: invalid syntax (ipython-input-363198985.py, line 1)