# Day 6

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

In [2]:
# For reproducibility
torch.manual_seed(1)

<torch._C.Generator at 0x10ff54650>

## Discrete Probability Distribution
이산적인 확률 분포를 일컬음 <br>
pdf 의 경우 함수의 면적이 합률 값을 의미했다.

이산적인 확률 분포는 값의 확률을 구할 수 있다.

## Softmax
convert numbers to probalities with softmax<br>
가위를 냈을 때 주먹을 낼 확률? P(주먹|가위) = ? <br>
가위를 냈을 때 보를 낼 확률? P(보|가위) = ?<br>
이런식으로 확률 분포를 근사하고 싶다.<br>

softmax 는 예측이 맞을 땐 값이 작은 값이 되고(cost 값은 0에 가까워질수록 좋음) 예측이 틀리게 되면 값을 엄청 크게 주어서 일종의 벌(?)을 주는 형태로 수식이 만들어져있다.<br>

$$ P(class=i) = \frac{e^i}{e^1+e^2+e^3+...+e^i} = \frac{e^i}{\sum e^i} $$

In [4]:
z = torch.FloatTensor([1,2,3])

PyTorch has a `softmax` function <br>
max 값을 soft하게 뽑아주는 것이 `softmax`

z 의 max 값은 (0,0,1)이지만 그것을 부드럽게 뽑아주는 방법이다.

In [16]:
hypothesis = F.softmax(z, dim = 0)
print(hypothesis)

tensor([0.0900, 0.2447, 0.6652])


Since they are probabilities, they should add up to 1. Let's do a sanity check

In [17]:
hypothesis.sum()

tensor(1.)

## Corss Entropy Loss (Low-level)
$$ L = \frac{1}{N} \sum - y \log(\hat{y}) $$
y는 실제 확률 P(x) h hat은 예측한 확률 Q(x)

In [19]:
z = torch.rand(3,5, requires_grad= True)
hypothesis = F.softmax(z, dim=1) # dimention 두번째 걸로 수행 = 한 행의 값이 1이 되도록
print(hypothesis) # y hat

tensor([[0.1664, 0.1871, 0.1737, 0.2695, 0.2033],
        [0.2002, 0.1783, 0.2218, 0.1944, 0.2054],
        [0.1809, 0.2380, 0.2318, 0.1084, 0.2409]], grad_fn=<SoftmaxBackward0>)


class 수를 다섯 개, sample 수를 세 개로

In [25]:
# Random 하게 답 index 지정
# (2,0,0)인 경우 각 열의 2,0,0번째 값이 답이 된다
y = torch.randint(5,(3,)).long()
print(y)

tensor([2, 0, 0])


In [45]:
y_one_hot = torch.zeros_like(hypothesis) #(3,5), hypothesis와 같은 크기의 0으로 이루어진 matrix 생성
print('y.squeeze(1) \n', y.unsqueeze(1))
print('\ny_one_hot.scatter_(): \n', y_one_hot.scatter_(1,y.unsqueeze(1),1))
print('\ny: \n', y)

# y_one_hot 함수에 차원 1의 index, 즉 y.unsqueeze(1)위치에 값 1이 들어간다.
# unsqueeze를 하는 이유는 차원을 맞춰주기 위해서
y_one_hot.scatter_(1, y.unsqueeze(1),1) # y size: (3,) -> y.unsqueeze(1) size: (3,1)
# scatter 값을 넣으면 새롭게 메모리 할당하는 것이 아닌 교체된 값이 들어간다.

y.squeeze(1) 
 tensor([[2],
        [0],
        [0]])

y_one_hot.scatter_(): 
 tensor([[0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]])

y: 
 tensor([2, 0, 0])


tensor([[0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]])

In [63]:
print('y_one_hot = \n',y_one_hot)

print('\n-torch.log(hypothesis) = \n', -torch.log(hypothesis))

print('\n-torch.log(hypothesis)).sum(dim=1) = \n'
      , -torch.log(hypothesis).sum(dim=1))

print('\n-torch.log(hypothesis).sum(dim=1).shape\n'
      ,torch.log(hypothesis).sum(dim=1).shape)

print('\n-torch.log(hypothesis)).sum(dim=1).mean() = \n'
      , -torch.log(hypothesis).sum(dim=1).mean())

cost = (y_one_hot * -torch.log(hypothesis)).sum(dim=1).mean()
print('\ncost = ' ,cost)

y_one_hot = 
 tensor([[0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]])

-torch.log(hypothesis) = 
 tensor([[1.7935, 1.6760, 1.7504, 1.3114, 1.5929],
        [1.6086, 1.7244, 1.5062, 1.6381, 1.5826],
        [1.7096, 1.4354, 1.4617, 2.2223, 1.4236]], grad_fn=<NegBackward0>)

-torch.log(hypothesis)).sum(dim=1) = 
 tensor([8.1241, 8.0598, 8.2526], grad_fn=<NegBackward0>)

-torch.log(hypothesis).sum(dim=1).shape
 torch.Size([3])

-torch.log(hypothesis)).sum(dim=1).mean() = 
 tensor(8.1455, grad_fn=<NegBackward0>)

cost =  tensor(1.6895, grad_fn=<MeanBackward0>)


In [40]:
src = torch.arange(1, 11).reshape((2, 5))
print(src)
index = torch.tensor([[0, 1, 2, 0, 2]])
print(index)
torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src)

tensor([[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10]])
tensor([[0, 1, 2, 0, 2]])


tensor([[1, 0, 0, 4, 0],
        [0, 2, 0, 0, 0],
        [0, 0, 3, 0, 5]])

In [44]:
src = torch.arange(1, 16).reshape((3, 5))
index = torch.tensor([[0, 1, 2,3,4], [0, 1, 2,3,4],[0,1,2,3,4]])
torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src)

tensor([[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10],
        [11, 12, 13, 14, 15]])

## Cross-entropy Loss with torch.nn.functional

In [65]:
# Low level
torch.log(F.softmax(z,dim = 1))

tensor([[-1.7935, -1.6760, -1.7504, -1.3114, -1.5929],
        [-1.6086, -1.7244, -1.5062, -1.6381, -1.5826],
        [-1.7096, -1.4354, -1.4617, -2.2223, -1.4236]], grad_fn=<LogBackward0>)

In [67]:
# High level
F.log_softmax(z,dim=1)

tensor([[-1.7935, -1.6760, -1.7504, -1.3114, -1.5929],
        [-1.6086, -1.7244, -1.5062, -1.6381, -1.5826],
        [-1.7096, -1.4354, -1.4617, -2.2223, -1.4236]],
       grad_fn=<LogSoftmaxBackward0>)

In [70]:
# low level
cost = (y_one_hot * -torch.log(hypothesis)).sum(dim=1).mean()
print(cost)

# high level (NLL = Negative Log Likelihood)
cost = F.nll_loss(F.log_softmax(z, dim = 1), y)
print(cost)

tensor(1.6895, grad_fn=<MeanBackward0>)
tensor(1.6895, grad_fn=<NllLossBackward0>)


PyTorch also has `F.cross_entropy` that combines `F.log_softmax()` and `F.nll_loss()``

In [71]:
F.cross_entropy(z,y)

tensor(1.6895, grad_fn=<NllLossBackward0>)

## Training with Low-level Cross Entropy Loss

In [93]:
x_train = [[1,2,1,1],
           [2,1,3,2],
           [3,1,3,4],
           [4,1,5,5],
           [1,7,5,5],
           [1,2,5,6],
           [1,6,6,6],
           [1,7,7,7]] # size: (m,4)
y_train = [2,2,2,1,1,1,0,0] # size: (m,)
x_train = torch.FloatTensor(x_train)
y_train = torch.LongTensor(y_train) # discrete 하므로 long tensor 로 표현

print(x_train.shape)
print(y_train.shape)

torch.Size([8, 4])
torch.Size([8])


In [75]:
# 초기화
W = torch.zeros((4,3), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Optimizer 설정
optimizer = optim.SGD([W,b], lr = 0.01)

# nb_epochs
nb_epochs = 10000

for epoch in range(nb_epochs +1):

    # Cost 계산
    hypothesis = F.softmax(x_train.matmul(W) + b, dim = 1) # or .mm or @
    y_one_hot = torch.zeros_like(hypothesis)
    y_one_hot.scatter_(1, y_train.unsqueeze(1),1)
    cost = (y_one_hot * -torch.log(F.softmax(hypothesis, dim = 1))).sum(dim = 1).mean()

    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # 100번마다 로그 출력
    if epoch % 100 == 0:
        print('Epoch {:4d}/{} cost: {:.6f}'.format(
            epoch, nb_epochs, cost.item()))

Epoch    0/10000 cost: 1.098612
Epoch  100/10000 cost: 1.036919
Epoch  200/10000 cost: 1.010905
Epoch  300/10000 cost: 0.991541
Epoch  400/10000 cost: 0.974942
Epoch  500/10000 cost: 0.959928
Epoch  600/10000 cost: 0.946149
Epoch  700/10000 cost: 0.933492
Epoch  800/10000 cost: 0.921898
Epoch  900/10000 cost: 0.911302
Epoch 1000/10000 cost: 0.901625
Epoch 1100/10000 cost: 0.892785
Epoch 1200/10000 cost: 0.884700
Epoch 1300/10000 cost: 0.877291
Epoch 1400/10000 cost: 0.870486
Epoch 1500/10000 cost: 0.864220
Epoch 1600/10000 cost: 0.858436
Epoch 1700/10000 cost: 0.853084
Epoch 1800/10000 cost: 0.848117
Epoch 1900/10000 cost: 0.843497
Epoch 2000/10000 cost: 0.839189
Epoch 2100/10000 cost: 0.835160
Epoch 2200/10000 cost: 0.831385
Epoch 2300/10000 cost: 0.827838
Epoch 2400/10000 cost: 0.824499
Epoch 2500/10000 cost: 0.821348
Epoch 2600/10000 cost: 0.818368
Epoch 2700/10000 cost: 0.815545
Epoch 2800/10000 cost: 0.812865
Epoch 2900/10000 cost: 0.810315
Epoch 3000/10000 cost: 0.807886
Epoch 31

## Training with F.cross_entropy

In [77]:
# 초기화
W = torch.zeros((4,3), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Optimizer 설정
optimizer = optim.SGD([W,b], lr = 0.01)

# nb_epochs
nb_epochs = 1000

for epoch in range(nb_epochs +1):

    # H(x)
    z = x_train.matmul(W) + b
    
    # Cost 계산
    cost = F.cross_entropy(z, y_train)

    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # 100번마다 로그 출력
    if epoch % 100 == 0:
        print('Epoch {:4d}/{} cost: {:.6f}'.format(
            epoch, nb_epochs, cost.item()))

Epoch    0/1000 cost: 1.098612
Epoch  100/1000 cost: 0.850816
Epoch  200/1000 cost: 0.784908
Epoch  300/1000 cost: 0.744590
Epoch  400/1000 cost: 0.714646
Epoch  500/1000 cost: 0.690688
Epoch  600/1000 cost: 0.670780
Epoch  700/1000 cost: 0.653828
Epoch  800/1000 cost: 0.639125
Epoch  900/1000 cost: 0.626180
Epoch 1000/1000 cost: 0.614641


## High-level Implementation with nn.Module

In [94]:
class SoftmaxClassifierModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4,3) # Output이 3, 3개의 class 에 대한 확률값 return

    def forward(self,x):
        return self.linear(x) # |x| = (m,4) -> |linear(4,3)|= (m,3)

In [95]:
model = SoftmaxClassifierModel()

In [96]:
# Optimizer 설정
optimizer = optim.SGD(model.parameters(), lr = 0.01) # linear layer가 한 개 들어있음

# nb_epochs
nb_epochs = 1000

for epoch in range(nb_epochs +1):

    # H(x)
    prediction = model(x_train)
    
    # Cost 계산
    cost = F.cross_entropy(prediction, y_train)

    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # 100번마다 로그 출력
    if epoch % 100 == 0:
        print('Epoch {:4d}/{} cost: {:.6f}'.format(
            epoch, nb_epochs, cost.item()))

Epoch    0/1000 cost: 2.251004
Epoch  100/1000 cost: 0.771897
Epoch  200/1000 cost: 0.687554
Epoch  300/1000 cost: 0.644173
Epoch  400/1000 cost: 0.613487
Epoch  500/1000 cost: 0.589456
Epoch  600/1000 cost: 0.569720
Epoch  700/1000 cost: 0.553035
Epoch  800/1000 cost: 0.538632
Epoch  900/1000 cost: 0.525991
Epoch 1000/1000 cost: 0.514744


### Binary 뷴류 문제의 경우

### Binary Cross Entropy, 즉 Sigmoid를 사용하는 것이 맞고,

### 다중 분류 문제의 경우

### Cross Entropy, 즉 Softmax를 사용하는 것이 맞다.