https://www.kaggle.com/code/columbine/seq2seq-pytorch

In [1]:
import time, random, math, string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [2]:
# Translation table that deletes every ASCII punctuation character. Built once at
# definition time instead of being rebuilt on every tokenizer call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def tokenizer(x):
    """Strip ASCII punctuation from ``x`` and split it on whitespace.

    Args:
        x: the text to tokenize; it is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        list[str]: the whitespace-separated tokens (empty list for blank input).
    """
    # str.split() with no separator already ignores leading/trailing whitespace.
    return str(x).translate(_PUNCTUATION_TABLE).split()


def reverse_tokenizer(x):
    """Tokenize ``x`` with ``tokenizer`` and return the tokens in reverse order."""
    return tokenizer(x)[::-1]

# Source field: sentences are tokenized in reverse word order.
SRC = Field(
    tokenize=reverse_tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

# Target field: same settings, but tokens keep their natural order.
TRG = Field(
    tokenize=tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

# Load the Multi30k splits; the '.de' files feed SRC and the '.en' files feed TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

downloading training.tar.gz


.data\multi30k\training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 1.52MB/s]


downloading validation.tar.gz


.data\multi30k\validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 629kB/s]


downloading mmt_task1_test2016.tar.gz


.data\multi30k\mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 611kB/s]


In [3]:
# Report how many examples each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of test examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of test examples: 1000


In [4]:
# Dump the attributes of the first training example to inspect its tokenized fields.
print(vars(train_data.examples[0]))

{'src': ['büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes']}


In [5]:
# Build both vocabularies from the training split only.
# min_freq=2 is the frequency cutoff handed to torchtext -- presumably tokens seen
# fewer than twice fall back to the unknown token; confirm against the torchtext docs.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Vocabulary sizes (these later size the embedding tables and the output layer).
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7805
Unique tokens in target (en) vocabulary: 5940


In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128
# Backward-compatible alias for the original, misspelled constant name.
BATHC_SIZE = BATCH_SIZE

# We use a BucketIterator instead of the standard Iterator because it creates batches in a
# way that minimizes the amount of padding in both the source and target sentences.
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [7]:
class Encoder(nn.Module):
    """Embed a source sequence and summarise it with a multi-layer LSTM.

    Args:
        input_dim: number of rows of the embedding table (source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to the LSTM.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so Seq2Seq can check encoder/decoder compatibility.
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return the final LSTM state.

        Args:
            src: token indices, [sen_len, batch_size].

        Returns:
            (hidden, cell), each [n_layers, batch_size, hid_dim]. The per-step
            LSTM outputs are discarded.
        """
        # [sen_len, batch_size] -> [sen_len, batch_size, emb_dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)

        # Only the final (hidden, cell) state is handed on to the decoder.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        return hidden, cell

In [8]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per sequence in, vocabulary scores out.

    Args:
        output_dim: target vocabulary size (embedding rows and output features).
        emb_dim: size of each token embedding.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to the LSTM.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim, self.emb_dim = output_dim, emb_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=self.n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: current token indices, [batch_size].
            hidden: previous hidden state, [n_layers, batch_size, hid_dim].
            cell: previous cell state, [n_layers, batch_size, hid_dim].

        Returns:
            (prediction, hidden, cell) where prediction is [batch_size, output_dim]
            and hidden/cell are the updated states with the same shapes as the inputs.
        """
        # Add a length-1 sequence axis: [batch_size] -> [1, batch_size]
        step_tokens = input.unsqueeze(0)

        # [1, batch_size] -> [1, batch_size, emb_dim]
        step_embedded = self.dropout(self.embedding(step_tokens))

        # rnn_out is [1, batch_size, hid_dim]: one time step from the LSTM.
        rnn_out, (hidden, cell) = self.rnn(step_embedded, (hidden, cell))

        # Drop the length-1 sequence axis before projecting to vocabulary scores.
        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, hidden, cell

In [9]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: module returning ``(hidden, cell)`` for a source batch; must
            expose ``hid_dim`` and ``n_layers``.
        decoder: module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``; must expose ``hid_dim``, ``n_layers``
            and ``output_dim``.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder, self.decoder, self.device = encoder, decoder, device

        # The encoder's final state is fed straight into the decoder, so the
        # state shapes of the two must agree.
        assert encoder.hid_dim == decoder.hid_dim, \
            'hidden dimensions of encoder and decoder must be equal.'
        assert encoder.n_layers == decoder.n_layers, \
            'n_layers of encoder and decoder must be equal.'

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps conditioned on ``src``.

        Args:
            src: source token indices, [sen_len, batch_size].
            trg: target token indices, [sen_len, batch_size]; row 0 is used as
                the first decoder input.
            teacher_forcing_ratio: probability, drawn independently at every
                step, of feeding the ground-truth token instead of the model's
                own top prediction as the next input.

        Returns:
            Tensor [trg_len, batch_size, decoder.output_dim]; index 0 along the
            first axis is never written and stays all zeros.
        """
        n_steps, n_sequences = trg.shape[:2]
        vocab_size = self.decoder.output_dim

        # Buffer that collects the decoder predictions of every step.
        outputs = torch.zeros(n_steps, n_sequences, vocab_size, device=self.device)

        # The encoder's final state initialises the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row (the <sos> tokens).
        step_input = trg[0]
        for t in range(1, n_steps):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores

            # One coin flip per step decides where the next input comes from.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)

        return outputs

In [10]:
# First initialize our model.
# Vocabulary sizes: INPUT_DIM sizes the encoder embedding table, OUTPUT_DIM sizes the
# decoder embedding table and its output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding sizes for encoder and decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# HID_DIM and N_LAYERS are shared by both sides; Seq2Seq asserts that they match.
HID_DIM = 512
N_LAYERS = 2
# Dropout probabilities for encoder and decoder.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Wrap both halves and move the whole model to the selected device.
model = Seq2Seq(encoder, decoder, device).to(device)

In [11]:
def init_weights(m):
    """Initialise the parameters owned directly by ``m`` from U(-0.08, 0.08).

    Meant to be passed to ``Module.apply`` (``model.apply(init_weights)``).
    ``apply`` already calls this function on every submodule, so only the
    parameters registered directly on ``m`` are touched here. Walking the
    parameters recursively would re-initialise each one once per ancestor
    module.

    Args:
        m: the module whose own (non-recursive) parameters are initialised in place.
    """
    for param in m.parameters(recurse=False):
        # nn.init functions run without gradient tracking, so the parameter can
        # be passed directly instead of going through ``.data``.
        nn.init.uniform_(param, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7805, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5940, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5940, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [12]:
def count_parameters(model):
    """Return the total element count of all parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,922,356 trainable parameters


In [13]:
# Adam over all model parameters; no hyperparameters are passed, so the optimizer's
# defaults apply.
optimizer = optim.Adam(model.parameters())

# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Passing the pad index as ignore_index excludes padded target positions from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [62]:
# Scratch check: row at index 2 of the source tensor of one freshly drawn training batch.
next(iter(train_iter)).src[2]

tensor([   5,    5,   73,    0,   76,  243,  893,  151, 3121,   23,  388,   13,
           5,    7,    5,    5,    5,  288,  250,   12,   12,  142,   84,    5,
          18,  107, 3193,  134,    8,  958, 1199,   15,    5,   53,   13,  866,
         151, 1605,   38,   31, 3012,  117,   12,   17,    5,  387,   26, 4035,
          61,  110,   57,    4,  430,    9,  807,    8,    8, 1190,  342,   73,
          87,   13,   15,  272,   12,  191,    7,   32,    0,    5,   13,    5,
         589,   36,  963, 2096,  142,   17,    8,    7,   17,    5,    4,   22,
         592,   12,    9,   42,   12,    5,    0,   36,  737,  669,   12,  183,
         769,   18,   46,    5,    5, 1808,   18,  494, 1465, 6669,   21,  269,
           4,   98,   71, 1076,  222,    8, 1190,    5,    0, 1122, 1094,    5,
          12,   20,   31,   10, 5713, 2801,  133,  676])

In [58]:
# Scratch check: shape of the target tensor of one freshly drawn training batch.
next(iter(train_iter)).trg.shape

torch.Size([24, 128])

In [52]:
# Scratch check: run one batch's source tensor through the encoder on its own.
# `tmp` holds the (hidden, cell) pair returned by Encoder.forward.
tmp = encoder(next(iter(train_iter)).src)
# Shape of the hidden state.
tmp[0].shape

torch.Size([2, 128, 512])

In [53]:
# Shape of the cell state (second element of the encoder's return value).
tmp[1].shape

torch.Size([2, 128, 512])

In [59]:
# Smoke-test the full forward pass on ONE batch. Each next(iter(train_iter)) call
# builds a new iterator and can yield a different batch, so src and trg must be
# taken from the same batch object to stay paired.
batch = next(iter(train_iter))
model(batch.src, batch.trg).shape

torch.Size([27, 128, 5940])

In [72]:
# Source tokens of the first example yielded when iterating the training dataset.
next(iter(train_data)).src

['büsche',
 'vieler',
 'nähe',
 'der',
 'in',
 'freien',
 'im',
 'sind',
 'männer',
 'weiße',
 'junge',
 'zwei']

In [73]:
# Target tokens of the first example yielded when iterating the training dataset.
next(iter(train_data)).trg

['two', 'young', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes']

In [78]:
# Peek at the first batch only: print both tensor shapes and the row at index 1 of
# the source and target tensors, then stop.
for x in train_iter:
    print(x.src.shape)
    print(x.trg.shape)
    print(x.src[1])
    print(x.trg[1])
    break

torch.Size([25, 128])
torch.Size([26, 128])
tensor([ 887,   85, 2736,  205,  972,  428,   88,  707, 4768,  298,  346,   54,
        1631,  776,  153,  167,   52, 1684,  152,  221,  260,  561, 2481, 1797,
          89,   95,   61,  332,   88,  208,  236,   70,  356,  106,  709,    0,
         231,  720,   19,   55,   98,    0, 1304,   58,    0,   10,  593,  672,
        1154,   70,  102, 3200,  307,   19,  154,    0, 6233, 1396,   32,   90,
         187,  307, 6558,  721,   19, 4093,    0,  109, 1211,   21, 2150,  154,
        1811,  949,    0, 3997,  437, 1143,    0,   70,   29,    0,  113,   88,
         693,  573,  577,  699,  347, 2982,  408,  176,    0,  127, 3047, 5946,
          26,    0, 2447,  167,   19, 3434,    0,  139, 2904,    0,  280,   88,
        1305,  219, 1286,   35,   51,  231,  727,    0,   89,  417,  146, 2536,
         389,  181,   70,  643,   32,  136,    0,  142])
tensor([  19,   98,    4,   32,    4,   43,    4,    4,   14,   44,    4,    4,
          19,    4,

In [82]:
!python -m spacy download de_core_news_md > logs.txt


[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: C:\Users\Febrin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [83]:
import spacy
import math
import re
# https://www.kaggle.com/code/tuannguyenvananh/machine-translation-with-multi30k-de-en

# Load the 'de_core_news_md' spaCy pipeline; its tokenizer is used by tokenize_de.
spacy_de = spacy.load('de_core_news_md')

# Matches one ASCII punctuation character or a newline. The punctuation set goes
# through re.escape because it contains regex metacharacters: unescaped, its "\]"
# pair is read inside the character class as an escaped "]", so a backslash in the
# text would never be matched.
_PUNCT_OR_NEWLINE_RE = re.compile(f'[{re.escape(string.punctuation)}\n]')


def preprocessing_text(text):
    """Lowercase ``text``, trim it, and delete ASCII punctuation and newlines.

    Args:
        text: the string to normalise.

    Returns:
        str: the lowercased text with surrounding whitespace stripped and every
        ``string.punctuation`` character and newline removed. Other inner
        whitespace is kept as is.
    """
    text = text.lower().strip()
    return _PUNCT_OR_NEWLINE_RE.sub('', text)

def tokenize_de(text):
    """Normalise ``text`` with ``preprocessing_text`` and split it with the spaCy tokenizer.

    Returns:
        list[str]: the ``.text`` of every token produced by ``spacy_de.tokenizer``.
    """
    cleaned = preprocessing_text(text)
    tokens = []
    for tok in spacy_de.tokenizer(cleaned):
        tokens.append(tok.text)
    return tokens