# Tutorial Series One: BCE, Cross Entropy and Focal Loss

## Meaning

## BCE: binary cross entropy
## NLL: negative log-likelihood loss
## Focal loss: a generalization of cross entropy

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Problem Scenario

### Normally the output of a model is an unbounded number. For a classification problem we need to put a suitable loss on top of that output so that the model can be trained.

In [79]:
# Toy problem size: 5 samples, 2 classes.
batch_size = 5
n_classes = 2
# Stand-in for raw model outputs: one row of scores per sample,
# drawn uniformly from [0, 1).
logits = torch.rand((batch_size, n_classes))

In [80]:
# Turn the logits into class probabilities with softmax over the last
# dimension: a larger logit maps to a larger probability.
probs = logits.softmax(dim=-1)

In [81]:
probs

tensor([[0.4863, 0.5137],
        [0.5810, 0.4190],
        [0.4922, 0.5078],
        [0.5650, 0.4350],
        [0.4202, 0.5798]])

In [82]:
## softmax implementation
def cus_softmax(x):
    """Naive softmax over the last dimension of a (batch, n_classes) tensor.

    Each row is exponentiated and divided by its own sum. The row sums are
    turned back into a column with ``unsqueeze(1)`` so they broadcast across
    the classes. The raw logits are exponentiated directly, with no shift.
    """
    exps = x.exp()
    row_totals = exps.sum(-1).unsqueeze(1)
    return exps / row_totals

In [83]:
cus_softmax(logits)

tensor([[0.4863, 0.5137],
        [0.5810, 0.4190],
        [0.4922, 0.5078],
        [0.5650, 0.4350],
        [0.4202, 0.5798]])

### Potential problem: the exp operation can overflow for large logits

In [85]:
# One integer class label in [0, n_classes) per sample. The size comes from
# batch_size (not a repeated literal 5) so the labels always line up with
# the rows of `logits`.
y = torch.randint(high=n_classes, size=(batch_size,))
y

tensor([1, 1, 0, 1, 1])

### softmax + NLL (softmax followed by negative log-likelihood)

In [86]:
def cus_cross_entropy(inputs, targets):
    """Cross entropy computed from class probabilities.

    Args:
        inputs: tensor of shape (batch, n_classes) holding probabilities
            (e.g. the output of softmax), one row per sample.
        targets: integer tensor of shape (batch,) with the index of the
            true class for each sample.

    Returns:
        Scalar tensor: the mean of ``-log(p_true_class)`` over the batch.
    """
    # Index with the arguments rather than notebook globals, so the function
    # works for any batch size and any probability tensor passed in.
    rows = torch.arange(inputs.size(0), device=inputs.device)
    return -inputs[rows, targets].log().mean()

In [87]:
cus_cross_entropy(probs,y)

tensor(0.7245)

### log_softmax + nll

In [88]:
def cus_log_softmax(x):
    """Numerically stable log-softmax over the last dimension.

    Uses the identity ``log_softmax(x) = x - logsumexp(x)`` and evaluates the
    log-sum-exp on logits shifted by their per-row maximum. The shift cancels
    mathematically but keeps every exponent <= 0, so ``exp`` cannot overflow.

    Args:
        x: tensor of logits; the classes are on the last dimension.

    Returns:
        Tensor with the same shape as ``x`` holding log-probabilities.
    """
    # keepdim=True keeps the reduced axis so broadcasting works for any rank,
    # not just 2-D inputs.
    shifted = x - x.max(dim=-1, keepdim=True)[0]
    return shifted - shifted.exp().sum(dim=-1, keepdim=True).log()

In [92]:
def nll(inputs, targets):
    """Negative log-likelihood loss on log-probabilities.

    Args:
        inputs: tensor of shape (batch, n_classes) holding log-probabilities
            (e.g. the output of a log-softmax).
        targets: integer tensor of shape (batch,) with the index of the
            true class for each sample.

    Returns:
        Scalar tensor: minus the mean log-probability of the true classes.
    """
    # Pick the true-class entry of every row using the arguments themselves
    # rather than notebook globals.
    rows = torch.arange(inputs.size(0), device=inputs.device)
    return -inputs[rows, targets].mean()

In [93]:
nll(cus_log_softmax(logits),y)

tensor(0.7245)

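### The reasons for doing this are:
1. Less computation: in log(softmax(x)) the log cancels the exp of the numerator, so log_softmax(x) = x - log(sum(exp(x))) and no separate log pass over the probabilities is needed.
2. Better numerical stability. Why? Computing softmax first can underflow a probability to 0 (so its log becomes -inf) or overflow exp for large logits; working in log space with the log-sum-exp form avoids taking the log of such a value.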

In [94]:
# The cells above reproduce what F.cross_entropy computes: log_softmax followed by the negative log-likelihood loss
F.cross_entropy(logits,y.reshape(-1))

tensor(0.7245)

We intentionally chose a case with 2 classes so that we can compare directly with binary cross entropy, which is essentially the same thing.

In [96]:
logits

tensor([[0.6431, 0.6980],
        [0.9518, 0.6247],
        [0.1736, 0.2046],
        [0.3819, 0.1205],
        [0.5808, 0.9027]])

In [98]:
probs

tensor([[0.4863, 0.5137],
        [0.5810, 0.4190],
        [0.4922, 0.5078],
        [0.5650, 0.4350],
        [0.4202, 0.5798]])

In [106]:
# Treat column 1 of `probs` as the probability of the positive class and
# compare it with the labels cast to float, as binary_cross_entropy requires.
F.binary_cross_entropy(input=probs[:, 1], target=y.to(torch.float32))

tensor(0.7245)

### Notice that the only difference between these two scenarios is that one uses two output neurons and the other uses only one

In [107]:
F.binary_cross_entropy??

In [108]:
F.binary_cross_entropy_with_logits??

In [None]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` of a tensor."""
    denominator = 1 + (-x).exp()
    return 1 / denominator

### Focal Loss

In brief: $-y \log(p) \;\rightarrow\; -y\,(1 - p)^{\gamma} \log(p)$, where $p$ is the predicted probability (prob).

In [111]:
# binary cases
x = torch.rand(1)
print(x)
F.logsigmoid(x)

tensor([0.0707])


tensor([-0.6584])

In [130]:
s_logits = torch.rand(batch_size,1)
s_logits

tensor([[0.0934],
        [0.0798],
        [0.2099],
        [0.1974],
        [0.8598]])

In [134]:
log_probs = F.logsigmoid(s_logits)
log_probs

tensor([[-0.6475],
        [-0.6541],
        [-0.5937],
        [-0.5993],
        [-0.3529]])

In [135]:
log_probs.exp()

tensor([[0.5233],
        [0.5199],
        [0.5523],
        [0.5492],
        [0.7026]])

In [None]:
y*

In [127]:
F.logsigmoid(x)

tensor([-0.6584])

In [112]:
x - torch.log(1+x.exp())

tensor([-0.6584])

In [122]:
logits

tensor([[0.6431, 0.6980],
        [0.9518, 0.6247],
        [0.1736, 0.2046],
        [0.3819, 0.1205],
        [0.5808, 0.9027]])

In [126]:
logits.reciprocal()                            

tensor([[1.5550, 1.4326],
        [1.0506, 1.6007],
        [5.7602, 4.8872],
        [2.6182, 8.2975],
        [1.7218, 1.1078]])

In [113]:
def focal_loss(input, target, OHEM_percent=None, gamma=2):
    """Binary focal loss computed directly from logits.

    The per-element binary cross entropy is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the probability the model assigns to the correct label,
    so well-classified elements contribute less to the loss.

    Args:
        input: tensor of raw logits, shape (batch, n_outputs).
        target: tensor of 0/1 labels with exactly the same shape as ``input``.
        OHEM_percent: if ``None``, average over every element. Otherwise keep
            only this fraction of the largest losses in each row (online hard
            example mining) and average those.
        gamma: focusing exponent; ``0`` reduces to plain binary cross entropy
            with logits. Defaults to 2.

    Returns:
        Scalar tensor with the (optionally hard-example-mined) mean loss.

    Raises:
        AssertionError: if ``input`` and ``target`` differ in shape.
    """
    assert target.size() == input.size()

    # Stable binary cross entropy with logits: subtracting max_val keeps the
    # exponents non-positive so exp() cannot overflow.
    max_val = (-input).clamp(min=0)
    loss = input - input * target + max_val + ((-max_val).exp() + (-input - max_val).exp()).log()

    # (target * 2 - 1) maps labels {0, 1} to signs {-1, +1}, so this is
    # log(1 - p_t) for every element.
    invprobs = F.logsigmoid(-input * (target * 2 - 1))
    loss = (invprobs * gamma).exp() * loss

    if OHEM_percent is None:
        return loss.mean()

    # Size the top-k from the actual number of outputs per row instead of a
    # hard-coded class count, and keep k within the valid range [1, n].
    n_candidates = loss.size(1)
    k = min(n_candidates, max(1, int(n_candidates * OHEM_percent)))
    OHEM, _ = loss.topk(k=k, dim=1, largest=True, sorted=True)
    return OHEM.mean()

In [121]:
(-torch.tensor(0.5)).clamp(min=0)

tensor(0.)

In [115]:
focal_loss(logits,y.reshape(-1,1))

AssertionError: 

In [116]:
logits.shape

torch.Size([5, 2])