In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [29]:
# Seed the global RNG so that Restart & Run All reproduces the same random
# logits (and the same random targets drawn from it afterwards).
SEED = 0
torch.manual_seed(SEED)

batch_size, class_count = 5, 3
x = torch.randn(batch_size, class_count)  # raw scores (logits): one row per sample, one column per class

In [30]:
x.shape

torch.Size([5, 3])

In [31]:
x

tensor([[-2.9086,  0.3386, -1.4360],
        [-0.7140, -0.5427, -0.4925],
        [ 2.5223,  1.0760, -0.3562],
        [-0.0757,  0.6754, -0.0623],
        [-0.6869, -0.3563,  0.8194]])

In [32]:
# One ground-truth class index per sample, drawn from the half-open range [0, class_count).
target = torch.randint(0, class_count, (batch_size,), dtype=torch.long)

In [33]:
target

tensor([0, 2, 1, 1, 0])

## softmax + negative-likelihood

In [51]:
def softmax(x):
    """Normalize the last dimension of ``x`` into a probability distribution.

    Computes ``exp(x_i) / sum_j exp(x_j)`` along the last axis. The per-row
    maximum is subtracted before exponentiating: this does not change the
    mathematical result, but it keeps ``exp`` from overflowing to ``inf``
    (which would turn the ratio into ``nan``) when the logits are large.

    Args:
        x: Tensor of logits; the last dimension indexes the classes.

    Returns:
        Tensor with the same shape as ``x`` whose last dimension sums to 1.
    """
    # exp(x - m) / sum(exp(x - m)) == exp(x) / sum(exp(x)) for any constant m.
    shifted = x - x.max(dim=-1, keepdim=True).values
    exps = shifted.exp()  # computed once and reused for numerator and denominator
    return exps / exps.sum(dim=-1, keepdim=True)

def nl(input, target):
    """Mean negative log of the probability assigned to each target class.

    Args:
        input: Tensor of probabilities, indexed as ``input[row, class]``.
        target: Tensor of class indices, one per row of ``input``.

    Returns:
        Scalar tensor: ``-mean(log(input[i, target[i]]))`` over all rows ``i``.
    """
    rows = range(target.size(0))
    # Pair row i with column target[i] to pick the probability of the true class.
    true_class_prob = input[rows, target]
    mean_log_prob = true_class_prob.log().mean()
    return -mean_log_prob

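**softmax**: *normalize* each row (one vector of logits per sample in the batch) into a probability distribution using `exp()`: $\mathrm{softmax}(x)_i = \dfrac{e^{x_i}}{\sum_j e^{x_j}}$, i.e. `x.exp() / x.exp().sum(-1).unsqueeze(-1)`.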

In [39]:
x.exp()

tensor([[ 0.0546,  1.4029,  0.2379],
        [ 0.4897,  0.5812,  0.6111],
        [12.4571,  2.9328,  0.7003],
        [ 0.9271,  1.9648,  0.9396],
        [ 0.5032,  0.7003,  2.2692]])

In [36]:
x.exp().sum(-1).unsqueeze(-1)

tensor([[ 1.6953],
        [ 1.6820],
        [16.0903],
        [ 3.8314],
        [ 3.4727]])

In [37]:
x.exp()/(x.exp().sum(-1)).unsqueeze(-1)

tensor([[0.0322, 0.8275, 0.1403],
        [0.2911, 0.3455, 0.3633],
        [0.7742, 0.1823, 0.0435],
        [0.2420, 0.5128, 0.2452],
        [0.1449, 0.2016, 0.6535]])

In [41]:
pred = softmax(x)

In [42]:
pred

tensor([[0.0322, 0.8275, 0.1403],
        [0.2911, 0.3455, 0.3633],
        [0.7742, 0.1823, 0.0435],
        [0.2420, 0.5128, 0.2452],
        [0.1449, 0.2016, 0.6535]])

In [43]:
range(target.shape[0])

range(0, 5)

In [44]:
[range(target.shape[0]), target]

[range(0, 5), tensor([0, 2, 1, 1, 0])]

In [45]:
pred[range(target.shape[0]), target]

tensor([0.0322, 0.3633, 0.1823, 0.5128, 0.1449])

In [46]:
pred[[0,1,2,3,4],[2,1,1,0,2]]

tensor([0.1403, 0.3455, 0.1823, 0.2420, 0.6535])

target: [0,2,1,1,0]

`pred[range(target.shape[0]), target]` selects, for every row of `pred` in the batch, the predicted probability of that row's target class (row `i`, column `target[i]`).

In [47]:
-pred[range(target.shape[0]), target].log()

tensor([3.4365, 1.0124, 1.7022, 0.6679, 1.9318])

In [48]:
-pred[range(target.shape[0]), target].log().mean()

tensor(1.7502)

In [52]:
loss = nl(pred, target)

In [53]:
loss

tensor(1.7502)

**negative likelihood**:

*log-likelihood*: the log of the predicted probability of the target class, $\log p_{\text{target}}$.

*negative*: a probability satisfies $0 < p \le 1$, so its log is $\le 0$; negating it gives a non-negative value that can be minimized.

*loss*: the mean of the negative log-probabilities over the batch.

## log_softmax + negative-log-likelihood

In [54]:
def log_softmax(x):
    """Log of the softmax of ``x`` along the last dimension.

    Uses the identity ``log(softmax(x))_i = x_i - log(sum_j exp(x_j))``. The
    log-sum-exp term is evaluated with ``Tensor.logsumexp`` rather than a
    literal ``exp().sum().log()`` chain, which overflows to ``inf`` for large
    logits and underflows to ``log(0) = -inf`` for very negative ones.

    Args:
        x: Tensor of logits; the last dimension indexes the classes.

    Returns:
        Tensor of log-probabilities with the same shape as ``x``.
    """
    return x - x.logsumexp(dim=-1, keepdim=True)

def nll(input, target):
    """Mean negative value of ``input`` at each row's target class.

    Intended for log-probabilities (the output of ``log_softmax``), so no
    ``log`` is applied here.

    Args:
        input: Tensor indexed as ``input[row, class]``.
        target: Tensor of class indices, one per row of ``input``.

    Returns:
        Scalar tensor: ``-mean(input[i, target[i]])`` over all rows ``i``.
    """
    batch_rows = range(target.size(0))
    # Pair row i with column target[i] to pick the entry of the true class.
    picked = input[batch_rows, target]
    return -picked.mean()

In [56]:
# Same pipeline as the softmax + nl section, but in log space:
# log_softmax already returns log-probabilities, so nll only indexes and averages.
pred = log_softmax(x)
loss = nll(pred, target)

In [57]:
loss

tensor(1.7502)

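The loss is the same as in the previous section, so
*softmax + nl* == *log_softmax + nll*: it does not matter whether the log is taken inside the loss (`nl`) or inside the normalization step (`log_softmax`).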

## F.log_softmax + F.nll_loss

In [58]:
pred = F.log_softmax(x, dim=-1)  # dim=-1: normalize across the last axis, i.e. over the classes of each sample
loss = F.nll_loss(pred, target)  # expects log-probabilities; averages over the batch by default

In [59]:
loss

tensor(1.7502)

## F.cross_entropy

In [60]:
loss = F.cross_entropy(x, target)  # pass the raw logits: cross_entropy applies log_softmax + nll_loss internally, so x must NOT be softmaxed first

In [61]:
loss

tensor(1.7502)