In [1]:
import torch
from torch import nn
from torchvision import datasets, transforms

In [2]:
# Preprocessing applied to every image, in order:
#   1. convert the image to a tensor,
#   2. standardise the single channel as (x - 0.5) / 0.5.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

In [3]:
# Number of samples per mini-batch handed to the DataLoader.
batch_size = 64

In [4]:
# Training split of Fashion-MNIST; downloaded into the given root directory
# if it is not already there, with `transform` applied to every image.
trainset = datasets.FashionMNIST(
    root='~/.pytorch/F_MNIST_data/',
    train=True,
    download=True,
    transform=transform,
)

# Iterates over the training set in shuffled mini-batches of `batch_size`.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    shuffle=True,
    batch_size=batch_size,
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /Users/nikolavetnic/.pytorch/F_MNIST_data/FashionMNIST/raw
Processing...



Done!


In essence `__init__` defines the component parts and `forward()` chains them together.

To add nonlinear capabilities to the `nn.Linear()` layers (which are basically linear functions of the form $xW + b$), the `nn.ReLU()` activation is used. The output of the final layer is fed to the `nn.Softmax()` activation, since a single output class with a higher probability than all the others is needed (while all output probabilities should sum to 1).

Parameter `dim=1` in `self.softmax = nn.Softmax(dim=1)` ensures that `Softmax` is taken across the columns of the output, i.e. over the 10 class scores of each sample, so that every row of the output sums to 1.

EDIT 1: Subsequently `Softmax` is changed to `LogSoftmax()`.

EDIT 2: The following cell contains two different ways to implement the same class.

In [30]:
import torch.nn.functional as F

class FashionNetwork(nn.Module):
    """Fully connected classifier for flattened 28x28 images (784 inputs).

    Layer sizes are 784 -> 256 -> 128 -> 10. Both hidden layers are followed
    by ReLU, and the 10 output scores are passed through log-softmax, so the
    network returns log-probabilities.
    """

    # Alternative, module-based way to write the same network, kept for
    # reference. Note that this variant additionally applies dropout after
    # each hidden activation.
    #
    #     def __init__(self):
    #         super().__init__()
    #         self.hidden1 = nn.Linear(784, 256)
    #         self.hidden2 = nn.Linear(256, 128)
    #         self.output = nn.Linear(128, 10)
    #         # self.softmax = nn.Softmax(dim=1)
    #         self.log_softmax = nn.LogSoftmax(dim=1)
    #         self.activation = nn.ReLU()
    #         self.drop = nn.Dropout(p=0.25)
    #
    #     def forward(self, x):
    #         x = self.hidden1(x)
    #         x = self.activation(x)
    #         x = self.drop(x)
    #         x = self.hidden2(x)
    #         x = self.activation(x)
    #         x = self.drop(x)
    #         x = self.output(x)
    #         # output = self.softmax(x)
    #         output = self.log_softmax(x)
    #         return output

    def __init__(self):
        """Create the two hidden layers and the output layer."""
        super().__init__()
        self.hidden1 = nn.Linear(784, 256)
        self.hidden2 = nn.Linear(256, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        """Map a batch of flattened images to per-class log-probabilities.

        Args:
            x: Tensor of shape (batch, 784).

        Returns:
            Tensor of shape (batch, 10) whose rows are log-probabilities
            (each row exponentiates to a distribution summing to 1).
        """
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        # dim=1 normalises over the class scores of each sample. Passing it
        # explicitly avoids PyTorch's deprecated implicit-dimension lookup
        # and the UserWarning that comes with it.
        return F.log_softmax(self.output(x), dim=1)

In [31]:
# Build the network and print its layer structure.
model = FashionNetwork()
print(model)

FashionNetwork(
  (hidden1): Linear(in_features=784, out_features=256, bias=True)
  (hidden2): Linear(in_features=256, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=10, bias=True)
)


*Jibin Mathew, PyTorch Artificial Intelligence Fundamentals (2020), p34*: 

"...In this recipe, we replaced softmax with log softmax so that we could then use the log of probabilities over probabilities, which has nice theoretic interpretations. There are various reasons for doing this, including improved numerical performance and gradient optimization. These advantages can be extremely important when training a model that can be computationally challenging and expensive. Furthermore, it has a high penalizing effect when it is not predicting the correct class.

We therefore use negative log likelihood when dealing with log softmax, as softmax is not compatible. It is useful in classification between n number of classes. The log would ensure that we are not dealing with very small values between 0 and 1, and negative values would ensure that a logarithm of probability that is less than 1 is nonzero. Our goal would be to reduce this negative log loss error function. In PyTorch, the loss function is called a criterion, and so we named our loss function `criterion`."

In [32]:
# Negative log-likelihood loss; applied to the model's log-softmax output in the training loop.
criterion = nn.NLLLoss()

*Jibin Mathew, PyTorch Artificial Intelligence Fundamentals (2020), p35*:

"...Optimizers can be thought of as fiddling with the model weights to get the best possible model based on the difference in prediction from the model and the actual output, and the loss function acts as a guide by indicating when the optimizer is going right or wrong."

In [33]:
from torch import optim

# Adam optimizer over all model parameters with a learning rate of 3e-3.
optimizer = optim.Adam(model.parameters(), lr=3e-3)
# Display the optimizer's hyper-parameter defaults as the cell output.
optimizer.defaults

{'lr': 0.003,
 'betas': (0.9, 0.999),
 'eps': 1e-08,
 'weight_decay': 0,
 'amsgrad': False}

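With `image.view(image.shape[0], -1)` each batch tensor is reshaped from 64x1x28x28 (64 images in a batch, where each image has a single channel of 28x28 pixels) to 64x784. Using `image.shape[0]` instead of a hard-coded 64 keeps the reshape correct for a final batch that holds fewer images.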

In [34]:
# Number of full passes over the training set.
epoch = 10

# A named loop variable is used instead of `_`, because assigning to `_`
# in a notebook overrides IPython's "last output" variable.
for epoch_idx in range(epoch):
    # Sum of the per-batch losses for the current pass.
    running_loss = 0
    for image, label in trainloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Flatten every image in the batch into a single row vector.
        image = image.view(image.shape[0], -1)
        pred = model(image)
        loss = criterion(pred, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean per-batch loss once the pass is complete.
    print(f'Training loss: {running_loss/len(trainloader):.4f}')



Training loss: 0.4975
Training loss: 0.3842
Training loss: 0.3515
Training loss: 0.3264
Training loss: 0.3126
Training loss: 0.2970
Training loss: 0.2864
Training loss: 0.2767
Training loss: 0.2701
Training loss: 0.2608


*Jibin Mathew, PyTorch Artificial Intelligence Fundamentals (2020), p39*:

"...Dropouts can be intuitively understood as creating a large number of ensemble models, learning to capture various features under one big definition of a model."