In [3]:
import torch
import torchvision
from IPython import display

## Softmax Regression

### Model
$$\mathbf{o}=\mathbf{W}\mathbf{x}+\mathbf{b}$$
$$\hat{\mathbf{y}} = \mathrm{softmax}(\mathbf{o})\quad \text{where}\quad \hat{y}_j = \frac{\exp(o_j)}{\sum_k \exp(o_k)}.$$

### Loss function
Here we still want to maximize the likelihood of $P(\mathbf{Y}|\mathbf{X})$, which is equivalent to minimizing the negtive log-likelihood
$$-\log P(\mathbf{Y} \mid \mathbf{X}) = \sum_{i=1}^n -\log P(\mathbf{y}^{(i)} \mid \mathbf{x}^{(i)})
= \sum_{i=1}^n l(\mathbf{y}^{(i)}, \hat{\mathbf{y}}^{(i)}),$$
    And for each pair of $<\mathbf{y}, \mathbf{\hat y}>$ over $q$ classes, the negtive log-likelihood is
$$l(\mathbf{y}, \hat{\mathbf{y}}) = - \sum_{j=1}^q y_j \log \hat{y}_j.$$

So the total loss function is 
$$L(\mathbf{Y}, \mathbf{\hat Y})=-\frac{1}{n}\sum_{i=1}^n\sum_{j=1}^q y_j \log \hat{y}_j$$

## Implemantation from scratch 

### load the dataset
Here we are using the Fashion-MNIST dataset.

#### download dataset

In [46]:
# get a transformer to transfer the PIL.Image type into a torch.FloatTensor type, and the range of the tensor is [0, 1.0]
trans = torchvision.transforms.ToTensor()

mnist_train = torchvision.datasets.FashionMNIST(root="./data", train=True, transform=trans, download=True)
mnist_test = torchvision.datasets.FashionMNIST(root="./data", train=False, transform=trans, download=True)

In [47]:
print(mnist_train)
print(mnist_test)

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: ToTensor()
Dataset FashionMNIST
    Number of datapoints: 10000
    Root location: ./data
    Split: Test
    StandardTransform
Transform: ToTensor()


The `mnist_train` object and `mnist_test` objects are both `torch.utils.data.Dataset` objects. 

In [9]:
print(isinstance(mnist_train, torch.utils.data.Dataset),isinstance(mnist_test, torch.utils.data.Dataset))

True True


Thus we can access them through index(key).

In [50]:
print(mnist_train[0][0].shape)

torch.Size([1, 28, 28])


#### Retrieve text labels
Here we define a function to retrieve the corresponding text label given the numerical label.

In [16]:
def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt',
        'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

#### Read a minibatch

In [51]:
batch_size = 256

train_iter = iter(torch.utils.data.DataLoader(mnist_train, batch_size = batch_size, shuffle=True, num_workers=4))

#### Put all things together

In [43]:
def load_data_fashion_mnist(batch_size, resize=None):  #@save
    """Download the Fashion-MNIST dataset and then load it into memory."""
    trans = [torchvision.transforms.ToTensor()]
    if resize:
        trans.insert(0, torchvision.transforms.Resize(resize))
    trans = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root="./data",
                                                    train=True,
                                                    transform=trans,
                                                    download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root="./data",
                                                   train=False,
                                                   transform=trans,
                                                   download=True)
    return (torch.utils.data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=4),
            torch.utils.data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=4))


### Initializing model parameters

In [59]:
num_inputs = 784
num_outputs = 10
W = torch.normal(0,0.01, size=(num_inputs,num_outputs), requires_grad=True)
b = torch.zeros(num_outputs, requires_grad=True)

### Defining the softmax operation
The softmax function is:
$$\mathrm{softmax}(\mathbf{X})_{ij} = \frac{\exp(\mathbf{X}_{ij})}{\sum_k \exp(\mathbf{X}_{ik})}.$$

In [32]:
def softmax(X):
    X_exp = torch.exp(X)
    partition = torch.sum(X_exp, axis=1, keepdim=True)
    return X_exp/partition

### Defining the model

In [39]:
def soft_rg(X):
    return softmax(torch.matmul(X.reshape(-1, W.shape[0]),W)+b)

### The cross-entropy loss function
Recall that the cross entropy loss is defined as:
$$l(\mathbf{y}, \hat{\mathbf{y}}) = - \sum_{j=1}^q y_j \log \hat{y}_j.$$

And among the q classes, there is only one $y_j$ bigger than 0. So we just need to pick the corresponding ${\hat y}_j$ for which $j$ we have $y_j>0$. And multiplying them to get the final cross entropy loss.

In [38]:
## this function calculates the cross-entropy loss for each pair of <y_hat,y>
def cross_entropy(y_hat, y):
    return -torch.log(y_hat[range(len(y)), y])

### Defining the accuracy

In [40]:
def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())

### Training

In [61]:
num_epochs=10
lr = 0.1
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=None)
def sgd(params, lr, batch_size):
    with torch.no_grad():
        for param in params:
            param -= lr*param.grad/batch_size
            param.grad.zero_()
            
for epoch in range(num_epochs):
    for X,y in train_iter:
        y_hat = soft_rg(X)
        ce_loss = cross_entropy(y_hat,y)
        ce_loss.sum().backward()
        sgd([W,b],lr,batch_size)
    with torch.no_grad():
        y_hat = soft_rg(X)
        ce_loss = cross_entropy(y_hat, y)
        acct = accuracy(y_hat, y
        print(f'epoch {epoch + 1}, loss {float(ce_loss.mean()):f}, accuracy {float(acct):f}')

epoch 1, loss 0.397758, accuracy 84.000000
epoch 2, loss 0.311017, accuracy 86.000000
epoch 3, loss 0.404468, accuracy 83.000000
epoch 4, loss 0.365813, accuracy 85.000000
epoch 5, loss 0.514253, accuracy 78.000000
epoch 6, loss 0.307236, accuracy 85.000000
epoch 7, loss 0.397973, accuracy 86.000000
epoch 8, loss 0.397311, accuracy 81.000000
epoch 9, loss 0.398390, accuracy 82.000000
epoch 10, loss 0.439022, accuracy 83.000000


## Concise Implementation

###  load dataset

In [64]:
batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)

### Initializing model parameters

In [68]:
soft_reg = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(num_inputs, num_outputs))

def init_weights(m):
    if type(m)==torch.nn.Linear:
        m.weight.data.normal_(0,0.01)

soft_reg.apply(init_weights)

Sequential(
  (0): Flatten()
  (1): Linear(in_features=784, out_features=10, bias=True)
)

### Cross entropy loss

In [69]:
loss = torch.nn.CrossEntropyLoss()

### Optimizer

In [71]:
trainer = torch.optim.SGD(soft_reg.parameters(), lr=0.1)

### Training

In [73]:
num_epochs = 10

for epoch in range(num_epochs):
    for X,y in train_iter:
        l = loss(soft_reg(X), y)
        trainer.zero_grad()
        l.backward()
        trainer.step()
    with torch.no_grad():
        y_hat = soft_reg(X)
        l = loss(y_hat, y)
        acct = accuracy(y_hat, y)
        print(f'epoch {epoch + 1}, loss {float(ce_loss.mean()):f}, accuracy {float(acct):f}')

epoch 1, loss 0.439022, accuracy 79.000000
epoch 2, loss 0.439022, accuracy 85.000000
epoch 3, loss 0.439022, accuracy 81.000000
epoch 4, loss 0.439022, accuracy 79.000000
epoch 5, loss 0.439022, accuracy 81.000000
epoch 6, loss 0.439022, accuracy 78.000000
epoch 7, loss 0.439022, accuracy 85.000000
epoch 8, loss 0.439022, accuracy 77.000000
epoch 9, loss 0.439022, accuracy 76.000000
epoch 10, loss 0.439022, accuracy 91.000000
