# Language Translation

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader

import torchtext

from torchsummary import summary

import spacy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import time
import math
from PIL import Image
import glob
from IPython.display import display

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


HYPERPARAMETERS

In [3]:
BATCH_SIZE = 64
LR = 1e-6
NUM_EPOCHES = 20

## Preprocessing

In [4]:
nlp_english = spacy.load("en")

In [5]:
nlp_german = spacy.load("de")

In [6]:
def tokenizer_english(text):
    return [token.text for token in nlp_english.tokenizer(text)]

In [7]:
def tokenizer_german(text):
    return [token.text for token in nlp_german.tokenizer(text)]

In [8]:
tokenizer_english("Hi guys, my name Jeff")

['Hi', 'guys', ',', 'my', 'name', 'Jeff']

In [9]:
tokenizer_german("I dont know any German")

['I', 'dont', 'know', 'any', 'German']

In [10]:
ENGLISH = torchtext.data.Field(tokenize=tokenizer_english, lower=True, init_token="<sos>", eos_token="<eos>")

In [11]:
GERMAN = torchtext.data.Field(tokenize=tokenizer_german, lower=True, init_token="<sos>", eos_token="<eos>")

In [12]:
train, validation, test = torchtext.datasets.Multi30k.splits(exts=(".de", ".en"), fields=(GERMAN, ENGLISH))

In [13]:
ENGLISH.build_vocab(train, max_size=10000, min_freq=1)

In [14]:
GERMAN.build_vocab(train, max_size=10000, min_freq=1)

In [15]:
print("ENGLISH vocab_size: ", len(ENGLISH.vocab))
print("GERMAN vocab_size: ", len(GERMAN.vocab))

ENGLISH vocab_size:  9799
GERMAN vocab_size:  10004


In [16]:
train_dataloader, validation_dataloader, test_dataloader = torchtext.data.BucketIterator.splits(
    (train, validation, test),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

In [17]:
for batch_idx, data in enumerate(train_dataloader):
    print(batch_idx)
    print(data.src.size())
    print(data.trg.size())
    break

0
torch.Size([18, 64])
torch.Size([22, 64])


In [18]:
def german2english(model, german_sentence, device="cpu", max_len=100):
    tokens = [token.text.lower() for token in nlp_german(german_sentence)]
    tokens = ["<sos>"] + tokens + ["<eos>"]
    
    indexes = [GERMAN.vocab.stoi[token] for token in tokens]
    indexes_tensor = torch.LongTensor(indexes).unsqueeze(1).to(device)
    
    english_sentence = [ENGLISH.vocab.stoi["<sos>"]]
    
    for i in range(max_len):
        trg = torch.LongTensor(english_sentence).unsqueeze(1).to(device)

        with torch.no_grad():
            word = model(indexes_tensor, trg)

        top = word.argmax(2)[-1, :].item()
        english_sentence.append(top)

        if top == ENGLISH.vocab.stoi["<eos>"]:
            break

    english_sentence = [ENGLISH.vocab.itos[word] for word in english_sentence]
    
    return english_sentence[1:-1]

## Model

In [19]:
from models.transformer import Transformer

In [20]:
source_vocab_size = len(GERMAN.vocab)
target_vocab_size = len(ENGLISH.vocab)
embed_size = 50
num_head = 5
num_ff = 300
encoder_layers = 2
decoder_layers = 2

In [21]:
model = Transformer(source_vocab_size, target_vocab_size, embed_size, num_head, num_ff, encoder_layers, decoder_layers, device=device).to(device)
model

Transformer(
  (encoder_embed): Embedding(10004, 50)
  (decoder_embed): Embedding(9799, 50)
  (encoder_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=50, out_features=50, bias=True)
    )
    (linear1): Linear(in_features=50, out_features=300, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=300, out_features=50, bias=True)
    (norm1): LayerNorm((50,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((50,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttenti

In [22]:
sample_in_x = torch.rand(BATCH_SIZE, 500).type(torch.LongTensor).to(device)
sample_in_y = torch.rand(BATCH_SIZE, 400).type(torch.LongTensor).to(device)
sample_out = model(sample_in_x, sample_in_y)

print("Dimenstions of Input Source Vector: ", sample_in_x.size())
print("Dimenstions of Input Target Vector: ", sample_in_y.size())
print("Dimenstions of Predicted Vector: ", sample_out.size())

Dimenstions of Input Source Vector:  torch.Size([64, 500])
Dimenstions of Input Target Vector:  torch.Size([64, 400])
Dimenstions of Predicted Vector:  torch.Size([64, 400, 9799])


## Training

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index = ENGLISH.vocab.stoi["<pad>"])

In [24]:
for epoch in range(NUM_EPOCHES):
    model.train()
    
    epoch_train_loss = 0
    epoch_test_loss = 0
    
    for batch_idx, data in enumerate(train_dataloader):
        x = data.src.to(device)
        y = data.trg.to(device)
        
        y_pred = model(x, y[:-1, :])
        y_pred = y_pred.reshape(-1, y_pred.size(-1))
        y = y[1:, :].reshape(-1)
        
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_train_loss += loss.item()
        
    with torch.no_grad():
        model.eval()
        
        for batch_idx, data in enumerate(validation_dataloader):
            x = data.src.to(device)
            y = data.trg.to(device)
        
            y_pred = model(x, y[:-1, :])
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            y = y[1:, :].reshape(-1)
        
            loss = criterion(y_pred, y)
        
            epoch_test_loss += loss.item()
    
    epoch_train_loss = epoch_train_loss / len(train_dataloader.dataset)
    epoch_test_loss = epoch_test_loss / len(validation_dataloader.dataset)
    
    print("-------------------------------------------------")
    print("Epoch: {} Train mean loss: {:.8f}".format(epoch, epoch_train_loss))
    print("       {} Test  mean loss: {:.8f}".format(epoch, epoch_test_loss))
    print("-------------------------------------------------")

-------------------------------------------------
Epoch: 0 Train mean loss: 0.14624718
       0 Test  mean loss: 0.14650946
-------------------------------------------------
-------------------------------------------------
Epoch: 1 Train mean loss: 0.14480896
       1 Test  mean loss: 0.14490047
-------------------------------------------------
-------------------------------------------------
Epoch: 2 Train mean loss: 0.14331298
       2 Test  mean loss: 0.14328783
-------------------------------------------------
-------------------------------------------------
Epoch: 3 Train mean loss: 0.14183115
       3 Test  mean loss: 0.14172112
-------------------------------------------------
-------------------------------------------------
Epoch: 4 Train mean loss: 0.14039767
       4 Test  mean loss: 0.14025022
-------------------------------------------------
-------------------------------------------------
Epoch: 5 Train mean loss: 0.13908836
       5 Test  mean loss: 0.13890658
------

## Test

In [25]:
german2english(model, "Es freut mich, dich kennenzulernen", device=device) #Translates to: I'm pleased to meet you              #https://www.mondly.com/german-phrases-expressions

['a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a']