In [2]:
import os
from torch import nn
from torchvision import datasets, transforms
import torch
from torch.utils.data import DataLoader

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


Define the Neural Network by subclassing nn.Module

Implement the operations on the input data in the `forward` method.

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two Linear+ReLU layers, Linear head.

    With the default arguments the network is identical to the original
    hard-coded version (784 -> 512 -> 512 -> 10), so ``NeuralNetwork()``
    keeps working unchanged. The sizes are exposed as parameters so the same
    class can be reused for other input resolutions or label counts.

    Args:
        in_features: Number of values per sample after flattening
            (default ``28 * 28``).
        hidden_features: Width of both hidden layers (default ``512``).
        num_classes: Number of output logits per sample (default ``10``).
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # nn.Flatten keeps dim 0 (the batch) and collapses every later dim.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            # No activation after the last layer: the network returns raw logits.
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return unnormalised logits for a batch of inputs.

        Args:
            x: Batched tensor whose non-batch dimensions multiply out to
                ``in_features`` (e.g. ``(N, 28, 28)`` with the defaults).

        Returns:
            Tensor of shape ``(N, num_classes)``.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits



In [5]:
# Build the network, then move its parameters onto the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [6]:
# One random 28x28 "image" with a leading batch dimension of 1,
# created directly on the same device as the model.
X = torch.rand((1, 28, 28), device=device)
X

tensor([[[0.4309, 0.3866, 0.9199, 0.6738, 0.5324, 0.6543, 0.7047, 0.1817,
          0.5209, 0.1916, 0.6448, 0.7683, 0.1728, 0.6307, 0.2201, 0.6140,
          0.3486, 0.2821, 0.8802, 0.3718, 0.3747, 0.8448, 0.7826, 0.1113,
          0.1735, 0.4839, 0.0096, 0.5959],
         [0.4412, 0.4205, 0.1117, 0.3860, 0.6082, 0.7323, 0.0991, 0.5928,
          0.1278, 0.3668, 0.9016, 0.7525, 0.2069, 0.4935, 0.0573, 0.8584,
          0.5149, 0.7826, 0.8102, 0.9803, 0.7765, 0.1051, 0.8351, 0.1980,
          0.7310, 0.3803, 0.9461, 0.9864],
         [0.3298, 0.3257, 0.2081, 0.2162, 0.6965, 0.3753, 0.0387, 0.9535,
          0.7369, 0.6591, 0.9137, 0.6023, 0.1439, 0.3817, 0.5558, 0.3662,
          0.2674, 0.0190, 0.2688, 0.5340, 0.9470, 0.4960, 0.0852, 0.6926,
          0.0653, 0.3890, 0.6029, 0.4240],
         [0.5949, 0.3079, 0.4993, 0.2983, 0.3208, 0.9593, 0.2615, 0.6153,
          0.3768, 0.8649, 0.0612, 0.5778, 0.6882, 0.3451, 0.6764, 0.9164,
          0.0869, 0.2258, 0.4438, 0.9103, 0.3527, 0.4065,

In [9]:
# Compute the logits in this cell so it also works on a fresh kernel:
# this cell is positioned before the first cell that assigns `logits`,
# so a top-to-bottom run would otherwise fail with a NameError.
logits = model(X)
logits

tensor([[ 0.0640,  0.0018,  0.0910,  0.0696, -0.0322, -0.0187,  0.0259,  0.0091,
          0.0514,  0.0253]], grad_fn=<AddmmBackward0>)

In [8]:
# Forward pass: calling the module runs `forward` and returns raw logits.
logits = model(X)
# Normalise along dim=1 (the class dimension) so each row sums to 1.
pred = nn.functional.softmax(logits, dim=1)
pred

tensor([[0.1035, 0.0973, 0.1063, 0.1041, 0.0940, 0.0953, 0.0997, 0.0980, 0.1022,
         0.0996]], grad_fn=<SoftmaxBackward0>)

In [11]:
# Index of the highest-probability class for each row of `pred`.
y_pred = pred.argmax(dim=1)
y_pred

tensor([2])

Layers

What happens to data as it passes through layers

In [13]:
# Mini-batch of three random 28x28 images.
input_img = torch.rand((3, 28, 28))

# Flatten everything after the batch dimension: (3, 28, 28) -> (3, 784).
flat_img = nn.Flatten(start_dim=1, end_dim=-1)(input_img)
flat_img.shape

torch.Size([3, 784])

In [15]:
# Linear layer mapping each flattened 784-value image to 50 hidden features.
layer_1 = nn.Linear(in_features=28 * 28, out_features=50)
hidden_1 = layer_1(flat_img)
hidden_1.shape

torch.Size([3, 50])

Applying non-linearity with ReLU

In [17]:
# ReLU zeroes the negative activations and leaves positive ones unchanged.
# Re-running this cell is harmless because ReLU applied twice gives the same result.
hidden_1 = nn.ReLU(inplace=False)(hidden_1)
hidden_1

tensor([[0.0807, 0.4021, 0.0000, 0.0000, 0.0000, 0.0000, 0.4829, 0.0000, 0.0960,
         0.0000, 0.2729, 0.0000, 0.0883, 0.0000, 0.1603, 0.0000, 0.1913, 0.1415,
         0.0000, 0.3949, 0.1296, 0.0000, 0.2271, 0.2227, 0.2195, 0.0473, 0.0000,
         0.0959, 0.2223, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1646,
         0.1907, 0.5544, 0.0000, 0.0000, 0.1303, 0.1831, 0.0000, 0.0000, 0.0000,
         0.0000, 0.6671, 0.0000, 0.0731, 0.0000],
        [0.0730, 0.2559, 0.0000, 0.0000, 0.0000, 0.0000, 0.4693, 0.1549, 0.3645,
         0.0000, 0.0000, 0.0000, 0.2338, 0.0000, 0.4889, 0.0000, 0.0000, 0.0677,
         0.0000, 0.7933, 0.3388, 0.0062, 0.5563, 0.2714, 0.3681, 0.0000, 0.0000,
         0.3401, 0.0000, 0.0000, 0.0000, 0.1547, 0.0000, 0.0000, 0.0000, 0.3498,
         0.0000, 0.3643, 0.0000, 0.0000, 0.0000, 0.1349, 0.0000, 0.0000, 0.0000,
         0.0000, 0.5113, 0.0000, 0.0175, 0.0000],
        [0.0000, 0.1875, 0.0000, 0.0000, 0.0000, 0.0000, 0.5675, 0.1027, 0.1116,
         

nn.Sequential is an ordered container of modules

In [18]:
# Modules run in the order listed; each one receives the previous one's output.
seq_mods = nn.Sequential(
    nn.Flatten(),                               # (N, 28, 28) -> (N, 784)
    layer_1,                                    # reuses the Linear(784, 50) defined earlier
    nn.ReLU(),
    nn.Linear(in_features=50, out_features=10), # 50 hidden features -> 10 outputs
)

The last layer returns logits in the range (-inf, inf). nn.Softmax scales these to the range [0, 1]; the `dim` argument specifies the dimension along which the probabilities sum to 1.

In [19]:
# Softmax module that normalises along dim=1, so each row becomes a probability distribution.
softmax = nn.Softmax(dim=1)
# NOTE(review): `logits` is not produced by `seq_mods` in this notebook; it is whatever an
# earlier cell left behind (`logits = model(X)`). Confirm that is the intended input here.
pred_probs = softmax(logits)

Model params: weights & biases

Subclassing nn.Module makes all parameters accessible

In [28]:
# List every learnable tensor registered on the model: its qualified name,
# its full shape, and a preview of its first two rows/entries.
for name, param in model.named_parameters():
    print(f"{name} {param.shape} {param[:2]}")

linear_relu_stack.0.weight torch.Size([512, 784]) tensor([[ 0.0343, -0.0215,  0.0252,  ...,  0.0135, -0.0194, -0.0216],
        [-0.0146, -0.0254, -0.0012,  ...,  0.0282,  0.0168,  0.0287]],
       grad_fn=<SliceBackward0>)
linear_relu_stack.0.bias torch.Size([512]) tensor([0.0243, 0.0313], grad_fn=<SliceBackward0>)
linear_relu_stack.2.weight torch.Size([512, 512]) tensor([[-0.0220,  0.0330, -0.0284,  ...,  0.0270, -0.0205,  0.0334],
        [-0.0163, -0.0312, -0.0359,  ..., -0.0224,  0.0084,  0.0006]],
       grad_fn=<SliceBackward0>)
linear_relu_stack.2.bias torch.Size([512]) tensor([-0.0369, -0.0174], grad_fn=<SliceBackward0>)
linear_relu_stack.4.weight torch.Size([10, 512]) tensor([[ 0.0062, -0.0336, -0.0198,  ...,  0.0386, -0.0246,  0.0079],
        [ 0.0331, -0.0091, -0.0257,  ...,  0.0195,  0.0084, -0.0075]],
       grad_fn=<SliceBackward0>)
linear_relu_stack.4.bias torch.Size([10]) tensor([-0.0439, -0.0087], grad_fn=<SliceBackward0>)
