## Machine Translation

### 必要なパッケージをインストール

In [None]:
!pip install portalocker

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [None]:
# Download spaCy's small German pipeline (de_core_news_sm).
# NOTE(review): presumably this is the model the 'de' spaCy tokenizer created later resolves to,
# and an English pipeline is assumed to be installed already - confirm on a fresh runtime.
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### データ準備

In [None]:
from torchtext.datasets import Multi30k

In [None]:
# Load the Multi30k training split as German/English caption pairs
data = Multi30k(split='train', language_pair=('de', 'en')) # yields pairs like [(German caption 1, English caption 1), (German caption 2, English caption 2), ...]
data = list(data) # materialise the pairs: they are split and iterated several times below

In [None]:
from sklearn.model_selection import train_test_split
# Split the data: keep 10% of all pairs for training, then take 10% of the
# remaining pairs as the validation set (the other 90% of `remaining` is discarded via `_`).
# random_state is fixed so the split is reproducible.
train_data, remaining = train_test_split(data, train_size=0.1, random_state=0)
_, val_data = train_test_split(remaining, test_size=0.1, random_state=0)

In [None]:
print(len(train_data), len(val_data))

2900 2611


In [None]:
# tokenizerと辞書作成
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
token_transform = {} # language code -> tokenizer callable
vocab_transform = {} # language code -> vocabulary that maps tokens to integer indices
SRC_LANGUAGE = 'de' # source side of each pair (German)
TGT_LANGUAGE = 'en' # target side of each pair (English)
specials=['<unk>', '<pad>', '<bos>', '<eos>'] # special tokens registered in every vocabulary

def yield_tokens(data_iter, language):
    """
    Lazily tokenize one side of every (source, target) pair.

    data_iter : iterable of samples, each indexable as sample[0] (SRC_LANGUAGE text)
                and sample[1] (TGT_LANGUAGE text)
    language  : which side to tokenize; must be SRC_LANGUAGE or TGT_LANGUAGE

    yields : for each sample, whatever token_transform[language] returns for the
             selected text (one tokenized sentence per sample)
    """
    # Position of each language inside a sample: source first, target second.
    column_of = dict([(SRC_LANGUAGE, 0), (TGT_LANGUAGE, 1)])
    for pair in data_iter:
        tokenize = token_transform[language]
        text = pair[column_of[language]]
        yield tokenize(text)

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # spaCy-backed tokenizer for this language code
    token_transform[ln] = get_tokenizer('spacy', language=ln)
    # Vocabulary is built from every pair in `data` (not only train_data), plus the special tokens
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(data, ln), specials=specials)
    # Because the vocabulary covers all of `data`, unknown tokens are not expected in this notebook;
    # any token that is missing anyway is mapped to the '<unk>' index.
    vocab_transform[ln].set_default_index(vocab_transform[ln]['<unk>'])



In [None]:
import torch
# Building the DataLoader
# Before sentences can be batched, each one goes through three steps
# (shown here on the English side of the first training pair):
# 1. tokenization
print(token_transform[TGT_LANGUAGE](train_data[0][1]))
# 2. numericalization (token -> vocabulary index)
print([vocab_transform[TGT_LANGUAGE][token] for token in token_transform[TGT_LANGUAGE](train_data[0][1])])
# 3. conversion to a tensor
print(torch.tensor([vocab_transform[TGT_LANGUAGE][token] for token in token_transform[TGT_LANGUAGE](train_data[0][1])]))

['Two', 'people', 'are', 'walking', 'on', 'a', 'striped', 'path', '.']
[19, 22, 17, 42, 9, 4, 198, 297, 5]
tensor([ 19,  22,  17,  42,   9,   4, 198, 297,   5])


In [None]:
def data_propcess(data_iter):
    """
    Convert raw (source, target) sentence pairs into pairs of index tensors.

    Args:
        data_iter: iterable of (src, tgt) string pairs; src is tokenized and
            numericalized with the SRC_LANGUAGE tokenizer/vocabulary, tgt with
            the TGT_LANGUAGE ones.

    Returns:
        list of (src_tensor, tgt_tensor) tuples of 1-D int64 tensors.
        No <bos>/<eos> markers are added here.
    """
    def encode(text, language):
        tokens = token_transform[language](text)
        indices = [vocab_transform[language][token] for token in tokens]
        # dtype is pinned: torch.tensor([]) would otherwise be float32, so a
        # sentence with zero tokens would produce a tensor that cannot be
        # concatenated with integer markers or fed to an embedding layer.
        return torch.tensor(indices, dtype=torch.long)

    return [(encode(src, SRC_LANGUAGE), encode(tgt, TGT_LANGUAGE)) for (src, tgt) in data_iter]

In [None]:
train_data_tensor = data_propcess(train_data)
val_data_tensor = data_propcess(val_data)

In [None]:
train_data_tensor[:3]

[(tensor([ 21,  42,  77,  11,   6, 259, 237,   4]),
  tensor([ 19,  22,  17,  42,   9,   4, 198, 297,   5])),
 (tensor([   5,   12,   70,   11,   13, 3256,   15,  428,   10,   26,  189,    4]),
  tensor([   6,   12,    9,    4, 1347, 1226,  173,    4,  267,  328,    5])),
 (tensor([    5,    12,    10,  2012,    70, 13529]),
  tensor([   6,   12,  581, 1066,   14,   27,  570,  359,    9,    5]))]

In [None]:
# padding
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Turn a list of (src, tgt) index tensors into two padded batch tensors.

    Every sequence is wrapped as <bos> + sequence + <eos> using the indices of
    its own language's vocabulary, then the sequences of each side are
    right-padded with that language's <pad> index to a common length.

    Returns:
        (src_batch, tgt_batch), each laid out batch-first.
    """
    src_vocab = vocab_transform[SRC_LANGUAGE]
    tgt_vocab = vocab_transform[TGT_LANGUAGE]

    def wrap(sequence, vocab):
        # Surround one sequence with the begin/end-of-sentence markers.
        start = torch.tensor([vocab["<bos>"]])
        end = torch.tensor([vocab["<eos>"]])
        return torch.cat([start, sequence, end], dim=0)

    # Single pass over the batch, wrapping both sides of every pair.
    wrapped = [(wrap(src_ids, src_vocab), wrap(tgt_ids, tgt_vocab)) for src_ids, tgt_ids in batch]
    src_sequences = [pair[0] for pair in wrapped]
    tgt_sequences = [pair[1] for pair in wrapped]

    padded_src = pad_sequence(src_sequences, batch_first=True, padding_value=src_vocab['<pad>'])
    padded_tgt = pad_sequence(tgt_sequences, batch_first=True, padding_value=tgt_vocab['<pad>'])
    return padded_src, padded_tgt

# Batches of 16 pairs; collate_fn adds <bos>/<eos> and pads each batch.
# Only the training loader is shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_data_tensor, batch_size=16, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_data_tensor, batch_size=16, collate_fn=collate_fn, shuffle=False)

In [None]:
src, tgt = next(iter(train_loader))
print(src.shape, tgt.shape)

torch.Size([16, 28]) torch.Size([16, 26])


In [None]:
torch.tensor([1]).shape

torch.Size([1])

### モデル構築

In [None]:
import torch.nn as nn
class Model(nn.Module):
    """
    Shared building block: embedding layer -> recurrent layer -> linear layer.

    The class only creates the layers; subclasses define forward().

    Args:
        vocab_size: number of rows of the embedding table (ignored when
            embedding_matrix is given).
        embedding_dim: embedding width (ignored when embedding_matrix is given;
            the matrix's own width is used instead).
        hidden_size: hidden size of the recurrent layer.
        output_size: output features of the final linear layer.
        embedding_matrix: optional 2-D tensor used to initialise the embedding
            weights; the weights stay trainable (freeze=False).
        num_layers: number of stacked recurrent layers.
        rnn_type: one of 'RNN', 'LSTM', 'GRU'.
        bidirectional: whether the recurrent layer is bidirectional.

    Raises:
        ValueError: if rnn_type is not one of the supported names.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, embedding_matrix=None, num_layers=1, rnn_type='LSTM', bidirectional=False):

        super().__init__()
        self.hidden_size = hidden_size
        self.num_directions = 2 if bidirectional else 1

        # Embedding layer (vocab_size x embedding_dim)
        if embedding_matrix is not None:
            # Initialise the weights from the given matrix
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Take the width from the layer that was actually built, so a pretrained
        # matrix whose width differs from `embedding_dim` still matches the RNN input.
        input_size = self.embedding.embedding_dim

        rnn_classes = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if rnn_type not in rnn_classes:
            raise ValueError('Unsupported RNN type. Choose from ["LSTM", "RNN", "GRU"]')
        self.rnn = rnn_classes[rnn_type](input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)

        # A bidirectional RNN concatenates both directions, doubling the feature size
        self.fc = nn.Linear(hidden_size*self.num_directions, output_size)

In [None]:
# encoderとdecoder作成
class Encoder(Model):
    """Unidirectional LSTM encoder that summarises a source batch as its final LSTM state."""

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super().__init__(
            vocab_size,
            embedding_dim,
            hidden_size,
            hidden_size,  # output_size of the inherited fc layer, which forward() never calls
            num_layers=num_layers,
            rnn_type='LSTM',
            bidirectional=False,
        )

    def forward(self, src):
        """Embed `src`, run the LSTM and return only its final (hidden, cell) state."""
        # The per-step output sequence is discarded; only the final state is passed on.
        _, final_state = self.rnn(self.embedding(src))
        hidden, cell = final_state
        return hidden, cell

class Decoder(Model):
    """
    Unidirectional LSTM decoder that predicts one target token per call.

    Attributes:
        output_size: size of the prediction vector (the target vocabulary size).
    """
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super().__init__(vocab_size, embedding_dim, hidden_size, vocab_size, num_layers=num_layers, rnn_type='LSTM', bidirectional=False)
        # Assigned after the parent constructor: nn.Module expects its own
        # __init__ to have run before attributes are set on the instance.
        self.output_size = vocab_size

    def forward(self, input, hidden, cell):
        """
        Run a single decoding step.

        Args:
            input: token indices for the current step, shape [batch_size].
            hidden, cell: LSTM state from the previous step (or from the encoder).

        Returns:
            (prediction, hidden, cell) where prediction has shape
            [batch_size, vocab_size] and hidden/cell are the updated LSTM state.
        """
        input = input.unsqueeze(1) # [batch_size] -> [batch_size, 1]
        embedded = self.embedding(input) # [batch_size, 1] -> [batch_size, 1, emb_dim]
        output_seq, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output_seq: [batch_size, 1, hidden_size]
        # hidden: [num_layers, batch_size, hidden_size]
        # cell: [num_layers, batch_size, hidden_size]
        prediction = self.fc(output_seq.squeeze(1))
        # prediction: [batch_size, vocab_size]
        return prediction, hidden, cell

In [None]:
import random
class Seq2Seq(nn.Module):
    """
    Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    `device` is the device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.device = device
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing=0.5):
        """
        Args:
            src: source batch passed to the encoder.
            tgt: target batch, [batch_size, tgt_len]; column 0 is fed as the first decoder input (<bos>).
            teacher_forcing: probability, per decoding step, of feeding the
                ground-truth token instead of the model's own greedy prediction.

        Returns:
            Tensor [batch_size, tgt_len, vocab_size]; position 0 is left as zeros
            because nothing is predicted for the <bos> slot.
        """
        n_rows, n_steps = tgt.shape[0], tgt.shape[1]

        # Encode the source once; its final state seeds the decoder.
        hidden, cell = self.encoder(src)
        n_classes = self.decoder.output_size

        logits = torch.zeros(n_rows, n_steps, n_classes, device=self.device)
        step_input = tgt[:, 0]  # <bos>

        for step in range(1, n_steps):
            step_logits, hidden, cell = self.decoder(step_input, hidden, cell)
            # step_logits: [batch_size, vocab_size]
            logits[:, step] = step_logits

            use_ground_truth = random.random() < teacher_forcing
            greedy_choice = step_logits.argmax(1)  # greedy search
            if use_ground_truth:
                step_input = tgt[:, step]
            else:
                step_input = greedy_choice
        return logits

### 学習ループ

In [None]:
# 学習ループ
def train(model, train_loader, val_loader, optimizer, criterion, num_epochs, model_save_path):
    """
    Train `model` and save a checkpoint whenever the validation loss improves.

    Args:
        model: module called as model(src, tgt) during training and
            model(src, tgt, 0) during validation (teacher forcing disabled);
            it must return scores shaped [batch_size, tgt_len, tgt_vocab_size].
        train_loader, val_loader: sized iterables of (src, tgt) batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ([N, vocab_size] scores, [N] target indices).
        num_epochs: number of passes over train_loader.
        model_save_path: directory for checkpoints; created if missing. Files are
            named 'seq2seq_<epoch>' with a 0-based epoch index.

    Returns:
        None. Per-epoch losses are printed.
    """
    import os  # local import so the function does not depend on a notebook-level import

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(model_save_path, exist_ok=True)

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for src, tgt in train_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            optimizer.zero_grad()

            outputs = model(src, tgt) # [batch_size, tgt_len, tgt_vocab_size]

            # Position 0 corresponds to <bos> and is excluded from the loss
            output_size = outputs.shape[-1]
            outputs = outputs[:, 1:].reshape(-1, output_size) # => [batch_size * (tgt_len - 1), tgt_vocab_size]
            tgt = tgt[:, 1:].reshape(-1)
            loss = criterion(outputs, tgt)

            loss.backward()

            optimizer.step()
            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

        # Compute the validation loss. No gradients are needed here, so autograd
        # is switched off to avoid building a graph for every decoding step.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for src, tgt in val_loader:
                src = src.to(device)
                tgt = tgt.to(device)

                outputs = model(src, tgt, 0) # teacher forcing is disabled for validation
                output_size = outputs.shape[-1]
                outputs = outputs[:, 1:].reshape(-1, output_size)
                tgt = tgt[:, 1:].reshape(-1)
                loss = criterion(outputs, tgt)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Val Loss: {avg_val_loss:.4f}")

        # Save the model. The per-batch average is tracked so the value reported
        # here is on the same scale as the "Val Loss" line above.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), os.path.join(model_save_path, f'seq2seq_{epoch}'))
            print(f'Model saved with validation loss: {best_val_loss:.4f}')

In [None]:
# Hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_vocab_size = len(vocab_transform[SRC_LANGUAGE])
tgt_vocab_size = len(vocab_transform[TGT_LANGUAGE])
embedding_dim = 300
hidden_size = 512
num_layers = 2
num_epochs = 10

# Encoder and decoder share hidden_size and num_layers because the encoder's
# final LSTM state is passed directly to the decoder as its initial state.
enc = Encoder(src_vocab_size, embedding_dim, hidden_size, num_layers)
dec = Decoder(tgt_vocab_size, embedding_dim, hidden_size, num_layers)
model = Seq2Seq(enc, dec, device)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters())
# Target positions equal to the <pad> index are ignored by the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab_transform[TGT_LANGUAGE]['<pad>'])

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so checkpoints are stored on Drive.
drive_path = '/content/drive'
drive.mount(drive_path)
# Directory used for model checkpoints (note that the value ends with '/').
model_save_path = f'{drive_path}/MyDrive/models/machine_translation_models/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train(model, train_loader, val_loader, optimizer, criterion, num_epochs, model_save_path=model_save_path)

Epoch 1/10, Average Training Loss: 4.5489
Val Loss: 5.0947
Model saved with validation loss: 835.5343
Epoch 2/10, Average Training Loss: 4.3223
Val Loss: 5.1255
Epoch 3/10, Average Training Loss: 4.1514
Val Loss: 4.9929
Model saved with validation loss: 818.8406
Epoch 4/10, Average Training Loss: 3.9867
Val Loss: 4.9433
Model saved with validation loss: 810.7090
Epoch 5/10, Average Training Loss: 3.8281
Val Loss: 4.9343
Model saved with validation loss: 809.2225
Epoch 6/10, Average Training Loss: 3.6722
Val Loss: 4.9725
Epoch 7/10, Average Training Loss: 3.5369
Val Loss: 4.9882
Epoch 8/10, Average Training Loss: 3.4056
Val Loss: 4.9373
Epoch 9/10, Average Training Loss: 3.2591
Val Loss: 5.0002
Epoch 10/10, Average Training Loss: 3.1248
Val Loss: 5.1264


### モデルのロード

In [None]:
# Rebuild the architecture with the same hyperparameters used for training,
# then load the saved weights into it.
enc = Encoder(src_vocab_size, embedding_dim, hidden_size, num_layers)
dec = Decoder(tgt_vocab_size, embedding_dim, hidden_size, num_layers)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(enc, dec, device).to(device)
# Checkpoint files are named with the 0-based epoch index, so 4 is the file written
# during "Epoch 5/10" - the last save reported in the recorded training log.
epoch = 4
# map_location places the stored tensors on `device` when loading
model.load_state_dict(torch.load(f'{model_save_path}/seq2seq_{epoch}', map_location=torch.device(device)))

<All keys matched successfully>

### 推論

In [None]:
def translate_sentence(model, setence, device, max_len=50):
    """
    Translate one source-language sentence with greedy decoding.

    Args:
        model: object exposing `encoder(src) -> (hidden, cell)` and
            `decoder(input, hidden, cell) -> (prediction, hidden, cell)`.
        setence: raw source sentence (string). The misspelled parameter name is
            kept so existing calls keep working.
        device: device the input tensors are moved to.
        max_len: maximum number of decoding steps.

    Returns:
        str: the generated target tokens joined by single spaces, without
        <bos>/<eos>. Decoding stops at the first <eos> or after max_len steps.

    Note:
        Autograd is disabled while decoding. The model's train/eval mode is not
        changed here; call model.eval() first if the model contains layers such
        as dropout.
    """
    src_vocab = vocab_transform[SRC_LANGUAGE]
    tgt_vocab = vocab_transform[TGT_LANGUAGE]
    eos_index = tgt_vocab['<eos>']

    # 1. tokenize the sentence that was passed in, not a notebook-level global
    tokenized = token_transform[SRC_LANGUAGE](setence)
    # 2. numericalize and surround with <bos> / <eos>
    numericalized = [src_vocab['<bos>']] \
                    + [src_vocab[token] for token in tokenized] \
                    + [src_vocab['<eos>']]

    # 3. tensor with a leading batch dimension of 1
    src_tensor = torch.LongTensor(numericalized).unsqueeze(0).to(device)

    translated_tokens = []
    with torch.no_grad():
        # Encoder forward pass
        hidden, cell = model.encoder(src_tensor)

        # <bos> is the first decoder input
        decoder_input = torch.LongTensor([tgt_vocab['<bos>']]).to(device)

        # Generate the sentence one token at a time
        for _ in range(max_len):
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)
            # output: [1, vocab_size]; greedy search picks the highest score
            best_guess = output.argmax(1).item()

            # Stop as soon as <eos> is generated
            if best_guess == eos_index:
                break
            translated_tokens.append(tgt_vocab.lookup_token(best_guess))
            decoder_input = torch.LongTensor([best_guess]).to(device)
    return ' '.join(translated_tokens)

In [None]:
# A boat with several men on it is being pulled to the shore by a large team of horses.
sentence = "Ein Boot mit mehreren Männern darauf wird von einem großen Pferdegespann ans Ufer gezogen."
translation = translate_sentence(model, sentence, device)
print(translation)

A young player in a a a a a to a the other in the . .


In [None]:
# NOTE(review): in the visible cells `best_guess` is only assigned as a local variable inside
# translate_sentence, so on a fresh kernel this cell presumably raises NameError. It looks like
# leftover debugging state - confirm, then remove it or look up an explicit index instead.
vocab_transform[TGT_LANGUAGE].lookup_token(best_guess)

'A'

In [None]:
vocab_transform[SRC_LANGUAGE]['<eos>']

3

### Beam Search

In [None]:
def translate_sentence_beam_search(model, setence, device, max_len=50, k=10, alpha=0.7):
    """
    Translate one source-language sentence with length-normalised beam search.

    Args:
        model: object exposing `encoder(src) -> (hidden, cell)` and
            `decoder(input, hidden, cell) -> (prediction, hidden, cell)`.
        setence: raw source sentence (string). The misspelled parameter name is
            kept so existing calls keep working.
        device: device the input tensors are moved to.
        max_len: maximum number of decoding steps.
        k: beam width (must be >= 1).
        alpha: exponent of the length normalisation; a hypothesis of length L is
            scored as (sum of token log-probabilities) / L**alpha.

    Returns:
        (beam, beam_log):
            beam: up to k tuples (score, word_index_list, hidden, cell), best
                first; word_index_list starts with the <bos> index.
            beam_log: one list per executed step holding the scores of the
                hypotheses kept at that step.

    Raises:
        ValueError: if k is smaller than 1.

    Note:
        Decoding stops early once every kept hypothesis ends with <eos>, so
        beam_log can have fewer than max_len entries. Autograd is disabled.
    """
    if k < 1:
        raise ValueError('k must be >= 1')

    src_vocab = vocab_transform[SRC_LANGUAGE]
    tgt_vocab = vocab_transform[TGT_LANGUAGE]
    bos_index = tgt_vocab['<bos>']
    eos_index = tgt_vocab['<eos>']

    # 1. tokenize the sentence that was passed in, not a notebook-level global
    tokenized = token_transform[SRC_LANGUAGE](setence)
    # 2. numericalize and surround with <bos> / <eos>
    numericalized = [src_vocab['<bos>']] \
                    + [src_vocab[token] for token in tokenized] \
                    + [src_vocab['<eos>']]

    # 3. tensor with a leading batch dimension of 1
    src_tensor = torch.LongTensor(numericalized).unsqueeze(0).to(device)

    beam_log = []
    with torch.no_grad():
        # Encoder forward pass
        hidden, cell = model.encoder(src_tensor)

        # Each hypothesis carries its own decoder state
        beam = [(0, [bos_index], hidden, cell)]
        for _ in range(max_len):
            # Nothing left to expand once every hypothesis has produced <eos>
            if all(words[-1] == eos_index for _, words, _, _ in beam):
                break

            all_candidates = []
            for score, word_list, hidden, cell in beam:
                if word_list[-1] == eos_index:
                    # Finished hypotheses are carried over unchanged
                    all_candidates.append((score, word_list, hidden, cell))
                    continue

                decoder_input = torch.tensor([word_list[-1]]).to(device)
                output, next_hidden, next_cell = model.decoder(decoder_input, hidden, cell)
                # output: [1, vocab_size]
                log_probs = torch.log_softmax(output, dim=-1).view(-1)

                # All extensions of one hypothesis share the same length, so their
                # scores are ordered like their log-probabilities: only the k most
                # probable tokens can survive the global top-k selection below.
                top_log_probs, top_indices = log_probs.topk(min(k, log_probs.numel()))
                length = len(word_list)
                for log_prob, index in zip(top_log_probs.tolist(), top_indices.tolist()):
                    # Undo the previous normalisation, add the new token, renormalise
                    next_score = (score * (length**alpha) + log_prob) / ((length + 1)**alpha)
                    all_candidates.append((next_score, word_list + [index], next_hidden, next_cell))

            all_candidates.sort(key=lambda x: x[0], reverse=True)
            beam = all_candidates[:k]

            # Log the kept scores as a concrete list
            beam_log.append([cand[0] for cand in beam])
    return beam, beam_log

In [None]:
# A boat with several men on it is being pulled to the shore by a large team of horses.
sentence = "Ein Boot mit mehreren Männern darauf wird von einem großen Pferdegespann ans Ufer gezogen."
beam, beam_log = translate_sentence_beam_search(model, sentence, device)

In [None]:
len(beam)

10

In [None]:
# Print every hypothesis kept in the final beam as a list of tokens.
for b in beam:
    word_list = b[1]  # element 1 of each beam entry is the list of token indices (starts with <bos>)
    translated_sentence = [vocab_transform[TGT_LANGUAGE].lookup_token(word_id) for word_id in word_list]
    print(translated_sentence)

['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'on', 'a', 'a', 'in', 'the', 'other', 'in', 'the', 'background', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'on', 'a', 'a', 'in', 'the', 'other', 'in', 'a', '.', '.', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'a', 'to', 'a', 'a', 'in', 'the', 'other', 'in', 'the', '.', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'a', 'with', 'a', 'man', 'in', 'the', 'other', 'in', 'the', '.', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'on', 'a', 'a', 'in', 'the', 'other', 'in', 'the', 'background', '.', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'on', 'a', 'a', 'in', 'the', 'other', 'in', 'the', '.', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'a', 'to', 'a', 'a', 'in', 'the', 'other', 'in', 'a', '.', '.', '.', '<eos>']
['<bos>', 'An', 'older', 'man', 'is', 'a', 'a', 'on', 'a', 'a', 'in', 'the', 'other', 'in', 'a', '.', '.', '<eos>']
['<bos>', 