# MT + Transformer

In [1]:
# !pip install datasets
# !pip install pythainlp

In [2]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F
from pythainlp.tokenize import word_tokenize
import random, math, time
from datasets import load_dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda


In [4]:
torch.cuda.get_device_name(0)

'Tesla T4'

In [5]:
torch.__version__

'1.13.1+cu116'

In [6]:
torchtext.__version__

'0.14.1'

## 1. ETL: Loading the dataset

**Note**: Here I chose to translate English to German, simply it is easier for myself, since I don't understand German so it is difficult for me to imagine a sentence during inference.

In [7]:
ds = load_dataset("thaisum")



  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'summary', 'type', 'tags', 'url'],
        num_rows: 358868
    })
    validation: Dataset({
        features: ['title', 'body', 'summary', 'type', 'tags', 'url'],
        num_rows: 11000
    })
    test: Dataset({
        features: ['title', 'body', 'summary', 'type', 'tags', 'url'],
        num_rows: 11000
    })
})

In [9]:
from sklearn.model_selection import train_test_split


In [10]:
from torchtext.vocab import build_vocab_from_iterator

In [11]:
ds["train"][0]['summary']

'วิษณุ ยันโรดแม็ปตามขั้นตอนเดิม เชื่อ สนช.หยุดพูดขยับเลือกตั้ง ปัดวิจารณ์ ยึดตามกรอบเวลา ย้ำเริ่มนับโรดแม็ปเมื่อ รธน.ประกาศใช้'

In [12]:
datasetss = ds["train"]['summary'][:20000]

In [13]:
train = ds["train"]['summary'][:16000]
valid = ds["train"]['summary'][len(train):18000]
test = ds["train"]['summary'][18000:20000]

In [14]:
def yeild_token(data_iter) :
  for aa in data_iter :
    yield word_tokenize(aa)

In [15]:
tokenized_dataset_train = yeild_token(train)
tokenized_dataset_val = yeild_token(valid)
tokenized_dataset_test = yeild_token(test)

In [16]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

In [17]:
vocab_transform = build_vocab_from_iterator(yeild_token(datasetss), 
                                  min_freq=5,
                                  specials=special_symbols,
                                  special_first=True)
        
vocab_transform.set_default_index(UNK_IDX)   
print(len(vocab_transform))                         
print(vocab_transform.get_itos()[:25])

11254
['<unk>', '<pad>', '<sos>', '<eos>', ' ', 'ที่', 'ใน', 'ไม่', 'และ', 'ของ', 'มี', 'การ', 'ให้', 'เป็น', 'ได้', '-', 'ไป', 'จาก', 'ว่า', 'จะ', 'มา', 'คน', 'ไทย', 'กับ', 'ปี']


In [18]:
vocab_transform

Vocab()

In [19]:
from torch.utils.data   import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab_transform['<pad>'] 
text_pipeline = lambda word: vocab_transform(word_tokenize(word))

def collate_batch(batch):
    text_list = []
    for _text in batch:
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    return pad_sequence(text_list, padding_value=pad_idx, batch_first=True)

In [20]:
batch_size = 4

train_loader = DataLoader(train, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)
test_loader  = DataLoader(test, batch_size=batch_size,
                             shuffle=True, collate_fn=collate_batch)

In [21]:
next(iter(test_loader))

tensor([[ 1606,   108,  2312,  9678,  1435,     4,     0,     4,  3517,   412,
           450,  1703,     4,   841,  5004,     4,   959,  1873,   744,     0,
           999,  1370,  2634,  1125,  7191,    15, 10891,     4,    49,  2269,
          1962,     4,  1780,     4,    82,     4,   363,    55,     4,    69,
             4,   405,     4,     8,  3337,  4533,   473,     4,    27,     4,
           133,     4],
        [ 1800,   654,    82,     4,     0,     4,    83,   905,     8,     0,
            15,   875,     0,     5,    37,    58,    16,   194,  2079,    49,
          2364,  7080,     4,  7842,   292,     4,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [  309,     7,  1475,   217,     4,   108,   952,  1481,  7276,     9,
          2745,     4,   908,    16,    13,  9812,     5,  1402,     4,   186,
    

## 5. Design the model


### Mutli Head Attention Layer

$$ \text{Attention}(Q, K, V) = \text{Softmax} \big( \frac{QK^T}{\sqrt{d_k}} \big)V $$ 

This is similar to standard *dot product attention* but is scaled by $d_k$, which the paper states is used to stop the results of the dot products growing large, causing gradients to become too small.

$$ \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,...,\text{head}_h)W^O $$

$$\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) $$

$W^O$ is the linear layer applied at the end of the multi-head attention layer, `fc`. $W^Q, W^K, W^V$ are the linear layers `fc_q`, `fc_k` and `fc_v`.

In [22]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads #make sure it's divisible....

        self.fc_q = nn.Linear(hid_dim,hid_dim) 
        self.fc_k = nn.Linear(hid_dim,hid_dim) 
        self.fc_v = nn.Linear(hid_dim,hid_dim) 

        self.fc = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, q, k, v, mask = None):
        batch_size = q.shape[0]
        
        Q = self.fc_q(q)
        K = self.fc_k(k)
        V = self.fc_v(v)
        
        #Q, K, V = [b, l, h]
        #reshape them into head_dim
        #reshape them to [b, n_headm, l, head_dim]

        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        #Q, K, V = [b, m_head, l, head_dim]

        #e = QK/sqrt(dk)
        e =  torch.matmul(Q, K.permute(0,1,3,2)) / self.scale
        #e = [b, n_heads, ql, kl]
        
        # torch.Size([64, 8, 50, 50])
        # torch.Size([64, 1, 1, 50, 256])

        if mask is not None:
            e = e.masked_fill(mask == 0, -1e10)

        a = torch.softmax(e, dim=-1)
        #a = [batch size, n_heads, ql, kl]
                    
        #eV
        x = torch.matmul(self.dropout(a),V)
        #x : [b, n_heads, ql, head_di]

        x = x.permute(0, 2, 1, 3).contiguous()
        #x: [b, ql, n_heads, head_dim]

        #concat them together
        x = x.view(batch_size, -1, self.hid_dim)
        #x = [b, ql, h]

        x = self.fc(x)
        #x = [b, ql, h]

        return x, a

### Position-wise Feedforward Layer

In [23]:
class PositionwiseFeedforwardLayer(nn.Module):
    
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc1 = nn.Linear(hid_dim, pf_dim)
        self.fc2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.fc2(self.dropout(torch.relu(self.fc1(x))))

### Decoder

In [24]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.norm_ff = nn.LayerNorm(hid_dim) #second yellow box
        self.norm_maskedatt = nn.LayerNorm(hid_dim) #first red box
        
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.ff = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, trg_mask):
        #trg      : [b, l, h]
        #enc_src  : [b, sl, h]
        #trg_mask : [b, 1, tl, tl]
        #src_mask : [b, 1, 1, sl]

        #1st box : mask multi, add & norm
        _trg, attention = self.self_attention(trg, trg, trg, trg_mask) #Q, K, V
        _trg    = self.dropout(_trg)
        _trg    = trg + _trg
        trg     = self.norm_maskedatt(_trg)

        #2rd box : ff, add & norm
        _trg    = self.ff(trg)
        _trg    = self.dropout(_trg)
        _trg    = trg + _trg
        trg     = self.norm_ff(_trg)

        return trg, attention

### Decoder Layer

<img src = "../figures/transformer-decoder.png" >


In [25]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads,
                 pf_dim, dropout, device, src_pad_idx,trg_pad_idx, max_length = 100):
        super().__init__()
        self.pos_emb = nn.Embedding(max_length, hid_dim)
        self.trg_emb = nn.Embedding(output_dim, hid_dim)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.dropout = nn.Dropout(dropout)
        self.layers = nn.ModuleList(
                            [
                            DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device)
                            for _ in range(n_layers)
                            ]
                            )
        self.fc = nn.Linear(hid_dim, output_dim)
        self.device = device
        
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
        #src = [batch size, src len]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        #src_mask = [batch size, 1, 1, src len]
        return src_mask
    
    def make_trg_mask(self, trg):
        trg_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        #trg_mask : [batch size, 1, 1, trg len]
        
        trg_len = trg_mask.shape[-1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device =self.device)).bool() #lower triangle
        #trg_sub_mask = [trg len, trg len]
        trg_mask = trg_mask & trg_sub_mask 
        #trg_mask : [batch size, 1, trg len, trg len]
        return trg_mask     
    def decode(self, trg, method='beam-search'):
        
        
        if method == 'beam-search':
            return self.beam_decode(trg)
        else:
            return self.greedy_decode(trg)     
    def greedy_decode(self, trg): # Get the best prop? maybe
        
        prediction= self.forward(trg)
        prediction = prediction.squeeze(0)
        prediction = prediction.argmax(1) 

        return prediction
    def forward(self, x):
        #src : = [batch size, trg len]
        #enc_src : hidden state from encoder = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]

        batch_size = x.shape[0]
        trg_len = x.shape[1]
        
        src_mask = self.make_trg_mask(x)

        #pos
        pos = torch.arange(0,trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        #pos = [batch size, trg len]

        pos_emb = self.pos_emb(pos) #[batch size, trg len, hid dim]
        trg_emb = self.trg_emb(x) #[batch size, trg len, hid dim]

        x = pos_emb + trg_emb * self.scale #[batch size, trg len, hid dim]
        x = self.dropout(x)
        
        for layer in self.layers: #output, hidden
            trg, attention = layer(x, src_mask)
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]

        output = self.fc(trg)
        #output = [batch size, trg len, output dim]

        return output, attention

In [26]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.norm_ff = nn.LayerNorm(hid_dim) #second yellow box
        self.norm_maskedatt = nn.LayerNorm(hid_dim) #first red box
        
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.ff = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, trg_mask):
        #trg      : [b, l, h]
        #enc_src  : [b, sl, h]
        #trg_mask : [b, 1, tl, tl]
        #src_mask : [b, 1, 1, sl]

        #1st box : mask multi, add & norm
        _trg, attention = self.self_attention(trg, trg, trg, trg_mask) #Q, K, V
        _trg    = self.dropout(_trg)
        _trg    = trg + _trg
        trg     = self.norm_maskedatt(_trg)

        #2rd box : ff, add & norm
        _trg    = self.ff(trg)
        _trg    = self.dropout(_trg)
        _trg    = trg + _trg
        trg     = self.norm_ff(_trg)

        return trg, attention

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        self.h        = hiddenstate  #define the hidden state
        self.prevNode = previousNode  #where does it come from
        self.wordid   = wordId  #the numericalized integer of the word
        self.logp     = logProb  #the log probability
        self.len      = length  #the current length; first word starts at 1

    def eval(self, alpha=0.7):
        # the score will be simply the log probability penaltized by the length 
        # we add some small number to avoid division error
        # read https://arxiv.org/abs/1808.10006 to understand how alpha is selected
        return self.logp / float(self.len + 1e-6) ** (alpha)
    
    #this is the function for comparing between two beamsearchnodes, whether which one is better
    #it is called when you called "put"
    def __lt__(self, other):
        return self.len < other.len

    def __gt__(self, other):
        return self.len > other.len

In [29]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [30]:
output_dim  = len(vocab_transform)
hid_dim = 256
dec_layers = 12
dec_heads = 8
dec_pf_dim = 512
dec_dropout = 0.1

SRC_PAD_IDX = PAD_IDX
TRG_PAD_IDX = PAD_IDX

model = Decoder(output_dim, 
              hid_dim, 
              dec_layers, 
              dec_heads, 
              dec_pf_dim, 
              dec_dropout, 
              device,SRC_PAD_IDX,TRG_PAD_IDX).to(device)
model.apply(initialize_weights)

Decoder(
  (pos_emb): Embedding(100, 256)
  (trg_emb): Embedding(11254, 256)
  (dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0): DecoderLayer(
      (norm_ff): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (norm_maskedatt): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=256, out_features=256, bias=True)
        (fc_k): Linear(in_features=256, out_features=256, bias=True)
        (fc_v): Linear(in_features=256, out_features=256, bias=True)
        (fc): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): PositionwiseFeedforwardLayer(
        (fc1): Linear(in_features=256, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): Decoder

In [31]:
import torch.optim as optim

lr = 0.001

#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX) #combine softmax with cross entropy
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 12,124,150 trainable parameters


In [32]:
def get_batch(data, seq_len, idx):
    #this data is from get_data()
    #train_data.shape # [batch_size, number of batches....]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, seq_len, target

In [33]:
def train(model, loader, optimizer, criterion, clip, seq_len):
    
    model.train()
    epoch_loss = 0
    
    # drop all batches that are not a multiple of seq_len
    num_batches = loader.shape[-1]
    loader = loader[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = loader.shape[-1]

    # hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        # hidden = model.detach_hidden(hidden)

        src, _, trg = get_batch(loader, seq_len, idx) #src, target: [batch size, seq len]
        src, trg = src.to(device), trg.to(device)
        batch_size = src.shape[0]
        
        output, attentions = model(src)

        #trg    = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        output = output.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        trg = trg.reshape(-1)

        loss = criterion(output, trg)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) #prevent gradient explosion - clip is basically 
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [34]:
type(vocab_transform)

torchtext.vocab.vocab.Vocab

In [35]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]
    decoded_batch_list = []
    # hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            # hidden = model.detach_hidden(hidden)
            src, src_len, trg = get_batch(data, seq_len, idx)
            src, trg = src.to(device), trg.to(device) #src,trg = batch_size,seq len
            batch_size= src.shape[0]

            #prediction, hidden = model(src, 0, target)
            prediction, attention = model(src)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            trg = trg.reshape(-1)
 
            #decoding using beam_search as example (you don't need to put here, because beam_search is for intference)
            # decoded_batch = model.decode(src, src_len, trg, attention, method='beam-search')
            # #len(decoded_batch) = 64
            # #len(decoded_batch[0]) = 1 = number of sentence generated, i.e., topk            
            # decoded_batch_list.append(decoded_batch)

            #remove the first output (SOS) and then reshape both output and trg so we can calculate loss
            # output_dim = prediction.shape[-1]
            # prediction = prediction[1:].view(-1, output_dim)
            # trg = trg[1:].view(-1)
            #trg    = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(prediction, trg)
            epoch_loss += loss.item() * seq_len
        
        #this is optional; you don't have to; printing first three samples of the first batch
        # print("print samples from first decode batch")
        # for sentence_index in decoded_batch_list[0][:3]:
        #     decode_text_arr = [vocab_transform.lookup_token(i) for i in sentence_index[0]]
        #     decode_sentence = " ".join(decode_text_arr[1:-1])  #no need sos and eos
        #     print("pred target : {}".format(decode_sentence))

    return epoch_loss / num_batches

In [36]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [37]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example.append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)          
    return data

In [38]:
batch_size = 450
train_data = get_data(tokenized_dataset_train, vocab_transform, batch_size)
valid_data = get_data(tokenized_dataset_val, vocab_transform, batch_size)
test_data  = get_data(tokenized_dataset_test, vocab_transform, batch_size)

In [39]:
len(train_data)

450

In [40]:
from tqdm import tqdm
best_valid_loss = float('inf')
num_epochs = 20 #<----for the sake of brevity
clip       = 0.25
seq_len    = 50

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.45, patience=0.001)

save_path = f'/content/drive/MyDrive/NLP/transformer/new_model/{model.__class__.__name__}.pt'

train_losses = []
valid_losses = []

for epoch in range(num_epochs):
   
    start_time = time.time()

    train_loss = train(model, train_data, optimizer, criterion, clip, seq_len)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)

    #for plotting
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



Epoch: 01 | Time: 0m 10s
	Train Loss: 7.090 | Train PPL: 1199.936
	 Val. Loss: 6.257 |  Val. PPL: 521.779




Epoch: 02 | Time: 0m 10s
	Train Loss: 5.969 | Train PPL: 391.143
	 Val. Loss: 5.697 |  Val. PPL: 298.060




Epoch: 03 | Time: 0m 10s
	Train Loss: 5.383 | Train PPL: 217.619
	 Val. Loss: 5.379 |  Val. PPL: 216.753




Epoch: 04 | Time: 0m 10s
	Train Loss: 5.005 | Train PPL: 149.118
	 Val. Loss: 5.211 |  Val. PPL: 183.304




Epoch: 05 | Time: 0m 10s
	Train Loss: 4.743 | Train PPL: 114.820
	 Val. Loss: 5.115 |  Val. PPL: 166.451




Epoch: 06 | Time: 0m 10s
	Train Loss: 4.541 | Train PPL:  93.814
	 Val. Loss: 5.064 |  Val. PPL: 158.186




Epoch: 07 | Time: 0m 10s
	Train Loss: 4.378 | Train PPL:  79.651
	 Val. Loss: 5.041 |  Val. PPL: 154.695




Epoch: 08 | Time: 0m 10s
	Train Loss: 4.237 | Train PPL:  69.182
	 Val. Loss: 5.036 |  Val. PPL: 153.857




Epoch: 09 | Time: 0m 11s
	Train Loss: 4.113 | Train PPL:  61.142
	 Val. Loss: 5.040 |  Val. PPL: 154.463




Epoch: 10 | Time: 0m 11s
	Train Loss: 3.969 | Train PPL:  52.926
	 Val. Loss: 5.035 |  Val. PPL: 153.711




Epoch: 11 | Time: 0m 11s
	Train Loss: 3.901 | Train PPL:  49.436
	 Val. Loss: 5.048 |  Val. PPL: 155.658




Epoch: 12 | Time: 0m 11s
	Train Loss: 3.823 | Train PPL:  45.721
	 Val. Loss: 5.053 |  Val. PPL: 156.447




Epoch: 13 | Time: 0m 11s
	Train Loss: 3.782 | Train PPL:  43.913
	 Val. Loss: 5.052 |  Val. PPL: 156.337




Epoch: 14 | Time: 0m 11s
	Train Loss: 3.764 | Train PPL:  43.137
	 Val. Loss: 5.054 |  Val. PPL: 156.660




Epoch: 15 | Time: 0m 11s
	Train Loss: 3.756 | Train PPL:  42.767
	 Val. Loss: 5.055 |  Val. PPL: 156.800




Epoch: 16 | Time: 0m 11s
	Train Loss: 3.753 | Train PPL:  42.642
	 Val. Loss: 5.055 |  Val. PPL: 156.844




Epoch: 17 | Time: 0m 11s
	Train Loss: 3.751 | Train PPL:  42.556
	 Val. Loss: 5.055 |  Val. PPL: 156.867




Epoch: 18 | Time: 0m 11s
	Train Loss: 3.749 | Train PPL:  42.472
	 Val. Loss: 5.055 |  Val. PPL: 156.881




Epoch: 19 | Time: 0m 11s
	Train Loss: 3.749 | Train PPL:  42.475
	 Val. Loss: 5.056 |  Val. PPL: 156.886




Epoch: 20 | Time: 0m 11s
	Train Loss: 3.749 | Train PPL:  42.472
	 Val. Loss: 5.056 |  Val. PPL: 156.889


In [41]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    # hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src)

            # print(prediction.shape)
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [42]:
vocab_path = '/content/drive/MyDrive/NLP/transformer/new_model/vocab.pt'
torch.save(vocab_transform, vocab_path)

In [43]:
vvv = torch.load(vocab_path)

In [44]:
output_dim  = len(vocab_transform)
hid_dim = 256
dec_layers = 12
dec_heads = 8
dec_pf_dim = 512
dec_dropout = 0.1

SRC_PAD_IDX = PAD_IDX
TRG_PAD_IDX = PAD_IDX

model = Decoder(output_dim, 
              hid_dim, 
              dec_layers, 
              dec_heads, 
              dec_pf_dim, 
              dec_dropout, 
              device,SRC_PAD_IDX,TRG_PAD_IDX).to(device)
model.apply(initialize_weights)

Decoder(
  (pos_emb): Embedding(100, 256)
  (trg_emb): Embedding(11254, 256)
  (dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0): DecoderLayer(
      (norm_ff): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (norm_maskedatt): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=256, out_features=256, bias=True)
        (fc_k): Linear(in_features=256, out_features=256, bias=True)
        (fc_v): Linear(in_features=256, out_features=256, bias=True)
        (fc): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): PositionwiseFeedforwardLayer(
        (fc1): Linear(in_features=256, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): Decoder

In [46]:
prompt = 'ผมไปกินข้าว'
max_seq_len = 10
seed = 0
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/transformer/new_model/Decoder.pt'))
temperatures = [0.3] 

for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, word_tokenize, 
                          vocab_transform, device, seed)
    print(str(temperature)+'\n'+''.join(generation)+'\n')

0.3
ผมไปกินข้าว แต่ก็ไม่ได้เลยว่ามันก็ไม่



In [47]:
# import pickle
# from torchtext.vocab import Vocab

# # define file path for loading
# file_path = 'vocab.pkl'

# # load the Vocab object
# with open(file_path, 'rb') as f:
#     vocab = pickle.load(f)