<a href="https://colab.research.google.com/github/SLCFLAB/Fintech2022/blob/main/ML_day14_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequence to Sequence

Ref. 

[1] https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

[2] https://happy-jihye.github.io/nlp/nlp-6/

[3] https://github.com/ndb796/Deep-Learning-Paper-Review-and-Practice/blob/master/code_practices/Sequence_to_Sequence_with_Attention_Tutorial.ipynb

## Structure

![Seq2seq](https://github.com/bentrevett/pytorch-seq2seq/raw/49df8404d938a6edbf729876405558cc2c2b3013//assets/seq2seq1.png)

Sequence to Sequence (Seq2seq)는 위 그림처럼,
(1) 문장(Sentence)을 압축하여 하나의 임베딩(Embedding)인 z로 표현한 뒤,
(2) z에서 원하는 문장을 출력하도록 함.

(1)을 하는 데에 활용되는 모델은 Encoder라고 부르고, (2)를 하는 데에 활용되는 모델은 Decoder라고 부름.

## Code

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

### Encoder

In [None]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that summarises a source sequence in its final states.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Dimensionality of each token embedding.
        hid_dim: Number of features in every LSTM hidden/cell state.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings and between the
            stacked LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Exposed as attributes so a wrapper model can compare them with the decoder's.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        # Sub-modules are registered in the order embedding -> rnn -> dropout.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: Token ids, [src len, batch size].

        Returns:
            A ``(hidden, cell)`` pair, each [n layers, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)

        # The per-step outputs (top layer only) are not needed; only the final
        # hidden and cell states of every layer are handed on.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state

        return hidden, cell

### Decoder

In [None]:
class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and number
            of scores produced per prediction).
        emb_dim: Dimensionality of each token embedding.
        hid_dim: Number of features in every LSTM hidden/cell state.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings and between the
            stacked LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Exposed as attributes so a wrapper model can size its output buffer
        # and compare the state shapes with the encoder's.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        # Sub-modules are registered in the order embedding -> rnn -> fc_out -> dropout.
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Current token ids, [batch size].
            hidden: Previous hidden state, [n layers, batch size, hid dim].
            cell: Previous cell state, [n layers, batch size, hid dim].

        Returns:
            ``(prediction, hidden, cell)`` where prediction is
            [batch size, output dim] and the states keep their input shapes.
        """
        # The LSTM expects a leading sequence axis: [batch size] -> [1, batch size].
        step_tokens = input.unsqueeze(0)

        # [1, batch size] -> [1, batch size, emb dim]
        step_vectors = self.dropout(self.embedding(step_tokens))

        # One time step through the (unidirectional) LSTM:
        #   output = [1, batch size, hid dim]
        #   hidden = cell = [n layers, batch size, hid dim]
        output, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state

        # Drop the length-1 sequence axis and project onto the vocabulary.
        top_layer = output.squeeze(0)
        prediction = self.fc_out(top_layer)

        return prediction, hidden, cell

### Seq2Seq := Encoder + Decoder

In [None]:
import random
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence step by step.

    Args:
        encoder: Callable returning ``(hidden, cell)`` for a source batch;
            must expose ``hid_dim`` and ``n_layers``.
        decoder: Callable mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``; must expose ``output_dim``,
            ``hid_dim`` and ``n_layers``.
        device: Device on which the output buffer is allocated.

    Raises:
        AssertionError: If encoder and decoder disagree on the hidden size or
            the number of layers.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final states are fed to the decoder unchanged, so the
        # state shapes of both networks have to match.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Compute decoder predictions for every target position.

        Args:
            src: Source token ids, [src len, batch size].
            trg: Target token ids, [trg len, batch size]; row 0 is fed to the
                decoder as the first input (the <sos> tokens).
            teacher_forcing_ratio: Probability, per decoding step, of feeding
                the ground-truth token instead of the model's own prediction
                (e.g. 0.75 uses ground-truth inputs 75% of the time).

        Returns:
            Tensor [trg len, batch size, output dim]. Row 0 is never written
            and stays zero, because decoding starts at position 1.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate the prediction buffer directly on the target device rather
        # than building it on the CPU and copying it over.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        # The encoder's last hidden/cell states initialise the decoder.
        hidden, cell = self.encoder(src)

        # First decoder input: the <sos> tokens.
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            # Feed the current token plus previous states; get the scores for
            # position t and the updated states.
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output

            # One random draw per step keeps the RNG stream identical no matter
            # which branch is taken below.
            teacher_force = random.random() < teacher_forcing_ratio

            # Teacher forcing feeds the actual next token; otherwise feed the
            # highest-scoring prediction (argmax is only computed when needed).
            decoder_input = trg[t] if teacher_force else output.argmax(1)

        return outputs

## Training

In [None]:
# Hyper-parameters for the shape walkthrough. The vocabulary sizes are
# placeholders for the real vocabulary lengths noted next to them.
INPUT_DIM = 1000  # len(SRC.vocab)
OUTPUT_DIM = 2000 # len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Encoder and decoder share HID_DIM and N_LAYERS, as Seq2Seq requires.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# Echo the model so the notebook prints its module tree.
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1000, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2000, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=2000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
# 실제 src의 정보
# torch.Size([128, 31])
# tensor([   2,    4, 4334,   14,   22,   69,   25,   66,    5,    3,    1,    1,
#            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
#            1,    1,    1,    1,    1,    1,    1], device='cuda:0')

In [None]:
# Dummy batches of token ids (all ones), used only to walk through tensor shapes.
BATCH_SIZE = 5
SRC_LEN = 31
TRG_LEN = 20

# src = [src len, batch size]
src = torch.ones(SRC_LEN, BATCH_SIZE, dtype=torch.long, device=device)
# trg = [trg len, batch size]
trg = torch.ones(TRG_LEN, BATCH_SIZE, dtype=torch.long, device=device)

In [None]:
# Forward pass with the default teacher_forcing_ratio.
output = model(src, trg)
output.shape
# output = [trg len, batch size, output dim]: for every target position and every
# batch element, one unnormalized score (logit) per word of the 2000-word target
# vocabulary. Row 0 is left as zeros because decoding starts at position 1.

torch.Size([20, 5, 2000])

In [None]:
# Position 0 carries no prediction, so drop it from both tensors, then merge the
# time and batch axes: 2-D scores and 1-D targets are what the loss is given.
output_dim = output.size(-1)
output = output[1:, :, :].view(-1, output_dim)
trg = trg[1:, :].view(-1)

In [None]:
# Check the flattened shapes; with TRG_LEN = 20 and BATCH_SIZE = 5 the first
# axis is (20 - 1) * 5 = 95.
trg.shape, output.shape
#trg = [(TRG_LEN - 1) * BATCH_SIZE]
#output = [(TRG_LEN - 1) * BATCH_SIZE, output dim]

(torch.Size([95]), torch.Size([95, 2000]))

In [None]:
# When we get a batch of examples using an iterator we need to make sure that
# all of the source sentences are padded to the same length,
# the same with the target sentences.
# Luckily, torchText iterators handle this for us!
# Padded positions carry no information, so the loss skips them via ignore_index.
TRG_PAD_IDX = 1  # the index of the <pad> token, i.e. TRG.vocab.stoi[TRG.pad_token]
# e.g.
# print(TRG.vocab.stoi["abcabc"]) # unknown word: 0
# print(TRG.vocab.stoi[TRG.pad_token]) # padding: 1
# print(TRG.vocab.stoi["<sos>"]) # <sos>: 2
# print(TRG.vocab.stoi["<eos>"]) # <eos>: 3
# print(TRG.vocab.stoi["hello"])
# print(TRG.vocab.stoi["world"])
# NOTE: the dummy trg built earlier is all ones, i.e. all <pad>, so every target
# is ignored here and the mean-reduced loss evaluates to nan. Real batches
# contain non-pad tokens and give a finite loss.
nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)(output, trg)