In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

 Within the nn package, there is a class called Module, and it is the base class for all of neural network modules which includes layers.

This means that all of the layers in PyTorch extend the nn.Module class and inherit all of PyTorch’s built-in functionality within the nn.Module class. In OOP this concept is known as inheritance.

## PyTorch nn.Modules Have A forward() Method
When we pass a tensor to our network as input, the tensor flows forward though each layer transformation until the tensor reaches the output layer. This process of a tensor flowing forward though the network is known as a forward pass.

## Extending PyTorch’s nn.Module Class

In [2]:
class Network:
    def __init__(self):
        self.layer = None

    def forward(self, t):
        t = self.layer(t)
        return t

This is a good start, but the class hasn’t yet extended the nn.Module class. To make our Network class extend nn.Module, we must do two additional things:

Specify the nn.Module class in parentheses on line 1.

Insert a call to the super class constructor on line 3 inside the constructor.

In [3]:
class Network(nn.Module):
    def __init__(self):
        super.__init__()
        self.layer = None

    def forward(self, t):
        t = self.layer(t)
        return t

## Define The Network’s Layers As Class Attributes

In [4]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        # implement the forward pass
        return t

## Learnable Parameters

In [5]:
network = Network()

In [6]:
print(network)

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)


### Accessing the nework's layers

In [7]:
network.conv1

Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))

In [8]:
network.conv2

Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))

In [9]:
network.fc1

Linear(in_features=192, out_features=120, bias=True)

In [10]:
network.fc2

Linear(in_features=120, out_features=60, bias=True)

In [11]:
network.out

Linear(in_features=60, out_features=10, bias=True)

### Accessing the network's weights

In [12]:
network.conv1.weight

Parameter containing:
tensor([[[[ 0.0565, -0.0085, -0.0731, -0.1604,  0.0631],
          [-0.1369,  0.0301,  0.1682,  0.1925,  0.0433],
          [-0.0471,  0.0155,  0.0630,  0.0913, -0.1415],
          [-0.1438, -0.0906, -0.1931,  0.1034,  0.1658],
          [-0.1366, -0.1732,  0.1457, -0.0589, -0.1808]]],


        [[[-0.1862, -0.0447, -0.0768, -0.0053, -0.1874],
          [ 0.0168, -0.0018, -0.1452,  0.0638, -0.1108],
          [-0.0698, -0.1965, -0.1463, -0.0899, -0.1410],
          [-0.1080, -0.0968, -0.0606,  0.1662,  0.1950],
          [ 0.1109, -0.0469, -0.1869,  0.0969,  0.1418]]],


        [[[-0.0398, -0.1087, -0.1663,  0.1179, -0.1993],
          [-0.1948, -0.1440,  0.1426, -0.0327,  0.0691],
          [ 0.1114, -0.1082, -0.1781, -0.0055, -0.0818],
          [ 0.0022, -0.1272,  0.0708, -0.1735,  0.1079],
          [ 0.1137, -0.1520,  0.1764, -0.1550,  0.1003]]],


        [[[-0.0207, -0.1446, -0.0057,  0.1126,  0.1983],
          [-0.1816,  0.1322, -0.1681,  0.0434, -0.1221

## Weight tensor shape

### Convolutional Layers

In [13]:
network.conv1

Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))

In [14]:
network.conv1.weight.shape

torch.Size([6, 1, 5, 5])

In [15]:
network.conv2

Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))

In [16]:
network.conv2.weight.shape

torch.Size([12, 6, 5, 5])

Our tensors are rank-4 tensors. The first axis represents the number of filters. The second axis represents the depth of each filter which corresponds to the number of input channels being convolved.

The last two axes represent the height and width of each filter. We can pull out any single filter by indexing into the weight tensor’s first axis.

### Linear Layers

In [17]:
network.fc1

Linear(in_features=192, out_features=120, bias=True)

In [18]:
network.fc1.weight.shape

torch.Size([120, 192])

In [19]:
network.fc2

Linear(in_features=120, out_features=60, bias=True)

In [20]:
network.fc2.weight.shape

torch.Size([60, 120])

## Accessing the network parameters

In [21]:
for name, param in network.named_parameters():
    print(name, '\t\t', param.shape)

conv1.weight 		 torch.Size([6, 1, 5, 5])
conv1.bias 		 torch.Size([6])
conv2.weight 		 torch.Size([12, 6, 5, 5])
conv2.bias 		 torch.Size([12])
fc1.weight 		 torch.Size([120, 192])
fc1.bias 		 torch.Size([120])
fc2.weight 		 torch.Size([60, 120])
fc2.bias 		 torch.Size([60])
out.weight 		 torch.Size([10, 60])
out.bias 		 torch.Size([10])


## Transform using a Matrix

In [22]:
in_features = torch.tensor([1,2,3,4], dtype=torch.float32)

weight_matrix = torch.tensor([
    [1,2,3,4],
    [2,3,4,5],
    [3,4,5,6]
], dtype=torch.float32)

weight_matrix.matmul(in_features)

tensor([30., 40., 50.])

## Transform using a pytorch linear layer

In [23]:
fc = nn.Linear(in_features=4, out_features=3, bias=True)

In [24]:
fc(in_features)

tensor([-2.7660,  1.2724, -2.4813], grad_fn=<AddBackward0>)

In [25]:
fc.weight = nn.Parameter(weight_matrix)

In [26]:
fc(in_features)

tensor([29.8724, 40.0469, 49.5870], grad_fn=<AddBackward0>)

In [27]:
fc = nn.Linear(in_features=4, out_features=3, bias=False)
fc.weight = nn.Parameter(weight_matrix)
fc(in_features)

tensor([30., 40., 50.], grad_fn=<SqueezeBackward3>)

#### The values are different because of the difference in bias.

## Callable layers

#### Note that we were able to call a layer as if it were a function

What makes this possible is that PyTorch module classes implement another special Python function called "\_\_call\_\_()". If a class implements the "\_\_call\__\()" method, the special call method will be invoked anytime the object instance is called.

This fact is an important PyTorch concept because of the way the \_\_call\_\_() method interacts with the forward() method for our layers and networks.

Instead of calling the forward() method directly, we call the object instance. After the object instance is called, the \_\_call\_\_() method is invoked under the hood, and the \_\_call\_\_() in turn invokes the forward() method. This applies to all PyTorch neural network modules, namely, networks and layers.

In [29]:
# torch/nn/modules/module.py (version 1.0.1)

def __call__(self, *input, **kwargs):
    for hook in self._forward_pre_hooks.values():
        hook(self, input)
    if torch._C._get_tracing_state():
        result = self._slow_forward(*input, **kwargs)
    else:
        result = self.forward(*input, **kwargs)
    for hook in self._forward_hooks.values():
        hook_result = hook(self, input, result)
        if hook_result is not None:
            raise RuntimeError(
                "forward hooks should never return any values, but '{}'"
                "didn't return None".format(hook))
    if len(self._backward_hooks) > 0:
        var = result
        while not isinstance(var, torch.Tensor):
            if isinstance(var, dict):
                var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
            else:
                var = var[0]
        grad_fn = var.grad_fn
        if grad_fn is not None:
            for hook in self._backward_hooks.values():
                wrapper = functools.partial(hook, self)
                functools.update_wrapper(wrapper, hook)
                grad_fn.register_hook(wrapper)
    return result

#### The extra code that PyTorch runs inside the \_\_call\_\_() method is why we never invoke the forward() method directly. If we did, the additional PyTorch code would not be executed. As a result, any time we want to invoke our forward() method, we call the object instance. This applies to both layers, and networks because they are both PyTorch neural network modules.

## Implementing the forward() method

### Input layer #1

In [30]:
# t=t

### Hidden convolutional layer: layer#2 and layer#3

In [31]:
# (2) hidden conv layer
#t = self.conv1(t)
#t = F.relu(t)
#t = F.max_pool2d(t, kernel_size=2, stride=2)

In [32]:
# (3) hidden conv layer
# t = self.conv2(t)
# t = F.relu(t)
# t = F.max_pool2d(t, kernel_size=2, stride=2)

Note that though every module of the nn has a forward function, we needn't call it explicitly because of the \_\_call()\_\_ method.

### Hidden linear layers: layer#4 and layer#5

In [33]:
# (4) hidden linear layer
# t = t.reshape(-1, 12 * 4 * 4)
# t = self.fc1(t)
# t = F.relu(t)

In [34]:
# (5) hidden linear layer
# t = self.fc2(t)
# t = F.relu(t)

### Output layer#6

In [35]:
# (6) output layer
# t = self.out(t)

Inside the network we usually use relu() as our non-linear activation function, but for the output layer, whenever we have a single category that we are trying to predict, we use softmax(). The softmax function returns a positive probability for each of the prediction classes, and the probabilities sum to 1.

However, in our case, we won't use softmax() because the loss function that we'll use, F.cross_entropy(), implicitly performs the softmax() operation on its input, so we'll just return the result of the last linear transformation.

## Final implementation

In [60]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)
    def forward(self, t):
        # (1) input layer
        t = t

        # (2) hidden conv layer
        t = self.conv1(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (3) hidden conv layer
        t = self.conv2(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (4) hidden linear layer
        t = t.reshape(-1, 12 * 4 * 4)
        t = self.fc1(t)
        t = F.relu(t)

        # (5) hidden linear layer
        t = self.fc2(t)
        t = F.relu(t)

        # (6) output layer
        t = self.out(t)
        #t = F.softmax(t, dim=1)

## Predicting with the network: Forward pass

In [61]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7ff00e223a90>

### Passing a single image to network

In [62]:
network = Network()

In [63]:
import torchvision
import torchvision.transforms as transforms
train_set = torchvision.datasets.FashionMNIST(
    root='./data'
    ,train=True
    ,download=True
    ,transform=transforms.Compose([
        transforms.ToTensor()
    ])
)

In [64]:
sample = next(iter(train_set)) 
image, label = sample 
image.shape

torch.Size([1, 28, 28])

In [65]:
image.unsqueeze(0).shape

torch.Size([1, 1, 28, 28])

In [66]:
pred = network(image.unsqueeze(0)) # image shape needs to be (batch_size × in_channels × H × W)

In [67]:
print(pred)

None


In [57]:
pred.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
label

In [None]:
pred.argmax(dim=1)