# English quenya translation using a transformer - Data collection

## Main page

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import nltk
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn
import torch.nn.functional as F
from model import Transformer
from sklearn.model_selection import train_test_split

url = "https://eldamo.org/content/phrase-indexes/phrases-q.html"
r = requests.get(url)

In [2]:
soup = BeautifulSoup(r.content, 'html.parser')

In [3]:
rows_english = soup.select('li')

In [4]:
rows_english[50].text

'A anamelda na ar ilyan  “A is dearest of all”\n               '

In [5]:
quenya_sentences =[]
for row in rows_english:
    quenya_sentences.append(row.text)

In [6]:
sentences = quenya_sentences[15:]

In [7]:
sentences_english = []
sentences_quenya = []
for sentence in sentences:
    compteur = 0
    for i in range(len(sentence)):
        if sentence[i]=="“":
            beginning = i+1
        if sentence[i]=="”":
            ending = i
            last_ending = ending
            compteur+=1
            sentences_english.append(sentence[beginning:ending])
            if compteur==1:
                sentences_quenya.append(sentence[:beginning])
            else:
                sentences_quenya.append(sentence[last_ending:beginning])

In [8]:
len(sentences_english)

414

In [9]:
len(sentences_quenya)

414

## Poems & prayers found on the website

In [10]:
list_url = ["https://eldamo.org/content/words/word-2555725393.html", "https://eldamo.org/content/words/word-2245526111.html",
            "https://eldamo.org/content/words/word-671674147.html", "https://eldamo.org/content/words/word-311699583.html",
            "https://eldamo.org/content/words/word-2920398593.html", "https://eldamo.org/content/words/word-3295893985.html",
            "https://eldamo.org/content/words/word-2124111669.html", "https://eldamo.org/content/words/word-4161205007.html",
            "https://eldamo.org/content/words/word-436003197.html", "https://eldamo.org/content/words/word-2774144071.html",
            "https://eldamo.org/content/words/word-3330342599.html", "https://eldamo.org/content/words/word-1216507117.html",
            "https://eldamo.org/content/words/word-2721399773.html", "https://eldamo.org/content/words/word-1235857611.html"]
for url in list_url:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows_english = soup.select('td')
    for i in range(0,len(rows_english)):
        if i%2==0:
            sentences_quenya.append(rows_english[i].text)
        else:
            sentences_english.append(rows_english[i].text)

In [11]:
len(sentences_english)

653

In [12]:
len(sentences_quenya)

653

## Dictionary pulled from Eldamo

In [13]:
url = "https://eldamo.org/content/vocabulary-indexes/vocabulary-words-nq.html?neo"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
rows = soup.select('dt')

In [14]:
words = []
for row in rows:
    words.append(row.text)
    

In [15]:
words_english = []
words_quenya = []
for word in words:
    compteur=0
    for i in range(len(word)):
        if ((word[i]==" ") | (word[i]=="-"))&(compteur==0)&(i!=0):
            ending_quenya = i
            words_quenya.append(word[:ending_quenya])
            compteur+=1
        if word[i]=="“":
            beginning = i+1 
        if word[i]=="”":
            ending = i
            words_english.append(word[beginning:ending])
            break
            
            

In [16]:
len(words_english)

4921

In [17]:
len(words_quenya)

4921

In [18]:
sentences_english = np.concatenate([sentences_english, words_english])

In [19]:
sentences_quenya = np.concatenate([sentences_quenya, words_quenya])

In [20]:
list_english = []
list_quenya = []
for sentence in sentences_english:
    list_english.append(list(sentence))
for sentence in sentences_quenya:
    list_quenya.append(list(sentence))

In [21]:
m = 0
for i in range(len(sentences_english)):
    if len(list_english[i])>m: 
        m = len(list_english[i])
    if len(list_quenya[i])>m:
        m = len(list_quenya[i])

In [22]:
m

146

## Tokenization

In [23]:
char2index = {}
index2char = {}
counter = 1
for i in range(len(sentences_english)):
    sent_english = sentences_english[i]
    sent_quenya = sentences_quenya[i]
    for w in sent_english:
        if w not in char2index:
            counter+=1
            char2index[w] = counter
            index2char[counter] = w
    for w in sent_quenya:
        if w not in char2index:
            counter+=1
            char2index[w] = counter
            index2char[counter] = w
char2index['<EOS>'] = counter+1
index2char[counter+1] = '<EOS>'

In [24]:
data_en = np.zeros([len(list_english), m+1])
data_quenya = np.zeros([len(list_english), m+1])
data_quenya.shape

(5574, 147)

In [25]:
for i in range(len(list_english)):
    for j in range(len(list_english[i])):
        data_en[i,j] = char2index[list_english[i][j]]
    data_en[i,len(list_english[i])] = char2index['<EOS>']
    for j in range(len(list_quenya[i])):
        data_quenya[i,j] = char2index[list_quenya[i][j]]
    data_quenya[i,len(list_quenya[i])] = char2index['<EOS>']

In [26]:
a = np.zeros(147)

In [27]:
L=[]
for d in range(len(data_quenya)):
    if np.array_equal(a,data_quenya[d]):
        L.append(d)

In [28]:
data_en = np.delete(data_en,L,0)
data_quenya = np.delete(data_quenya,L,0)

In [29]:
data_quenya.shape

(5574, 147)

# Model definition

In [30]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [31]:
VOC_SIZE = len(char2index)+2
EMB_SIZE = 512
NHEAD = 4
HID_DIM = 1024
BATCH_SIZE = 40
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4

In [32]:
transformer = Transformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, HID_DIM, 0.01, VOC_SIZE)

In [33]:
transformer = transformer.to(DEVICE)

In [34]:
en_train, en_test, qu_train, qu_test = train_test_split(data_en, data_quenya, test_size = 0.2)

In [35]:
en_train = torch.Tensor(en_train).long()
en_test = torch.Tensor(en_test).long()
qu_train = torch.Tensor(qu_train).long()
qu_test = torch.Tensor(qu_test).long()

In [36]:
train_dataset = TensorDataset(en_train, qu_train)
test_dataset = TensorDataset(en_test, qu_test)

In [37]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Masks creation

In [38]:
def positionalEncoding(length, embed_dim):
    angles = np.array([[pos/10000**(2*(i//2)) for i in range(embed_dim)] for pos in range(length)])
    pos_encoding = np.zeros((length, embed_dim))
    pos_encoding[:,0::2] = np.sin(angles[:,0::2])
    pos_encoding[:,1::2] = np.cos(angles[:,1::2])
    return torch.from_numpy(pos_encoding)

In [39]:
def create_look_ahead_mask(size):
    mask = torch.triu(torch.ones(size, size), diagonal=1).to(torch.bool)
    return mask

In [40]:
def padding_mask(x):
    return x==0

In [41]:
pos_encoding = positionalEncoding(m+1, EMB_SIZE).to(DEVICE)
attn_mask = create_look_ahead_mask(m+1).to(DEVICE)


## Training

In [None]:
criterion = torch.nn.CrossEntropyLoss()
transformer.train()
num_epochs = 10
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-4)
torch.autograd.set_detect_anomaly(True) 
for epoch in range(num_epochs):
    # Iterate over the batches in the train_dataloader
    for batch_idx, batch in enumerate(train_dataloader):
        # Get the batch of input sentences and labels
        sentences = batch[0].to(DEVICE)
        labels = batch[1].to(DEVICE)
        src_padding_mask = padding_mask(sentences).to(DEVICE)
        tgt_padding_mask = padding_mask(labels).to(DEVICE)
        # Reset the gradients
        
        optimizer.zero_grad()
        # Forward pass
        outputs = transformer(sentences, labels, src_padding_mask, tgt_padding_mask, attn_mask, pos_encoding)
        
        # Compute the loss
        loss = criterion(outputs.view(-1,VOC_SIZE,m+1), labels)
        torch.nn.utils.clip_grad_norm_(transformer.parameters(), 0.5)

        # Backward pass
        loss.backward()
        # Update the weights
        optimizer.step()

        # Print the loss every 100 batches
        if (batch_idx + 1) % 5 == 0:
            print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, batch_idx+1, len(train_dataloader), loss.item()))

Epoch [1/10], Batch [5/112], Loss: 4.7412


## Inference

In [None]:
transformer.train(mode=False)

In [None]:
input_sentence = np.zeros(m+1)
output_sentence = np.zeros(m+1)
generated_sentence = []
my_sentence = "This is a test sentence"
for i in range(len(my_sentence)):
    input_sentence[i] = char2index[my_sentence[i]]
input_sentence[len(my_sentence)]=char2index['<EOS>']
input_sentence = torch.Tensor(input_sentence).long().to(DEVICE)
output_sentence = torch.Tensor(output_sentence).long().to(DEVICE)
output_sentence[0]=6

In [None]:
src_padding_mask = padding_mask(input_sentence)
s=1
generated_letter = 6
while (len(generated_sentence))<10 and (generated_letter!=char2index['<EOS>']):
    tgt_padding_mask = padding_mask(output_sentence)
    tgt_padding_mask[0]=False
    print(output_sentence)
    last_output = transformer(input_sentence, output_sentence, src_padding_mask, tgt_padding_mask, attn_mask, pos_encoding)
    last_output = torch.argmax(last_output, -1)
    generated_letter = last_output[s]
    generated_sentence.append(generated_letter)
    output_sentence[s] = generated_letter
    s+=1


In [None]:

L=[]
for c in output_sentence:
    if c.item()!=0:
        L.append(index2char[c.item()])
print(L)
''.join(L)

In [None]:
char2index[' ']