In this project, we will use an RNN architecture to build a Machine Translation model.

It will use as a corpus wikipedia dumps.

Either the source or the target will be English. We will, in our case, try English to French Translation.

In [2]:
#imports
!pip3 install numpy
!pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu117
#or any nightly version so long as pytorch > 1.11 https://pytorch.org/
!pip3 install gensim transformers d2l==1.0.0a1.post0

#In pytorch functional.py, change PILLOW_VERSION to __version__
#there are two places to change

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cu117
Collecting sympy
  Using cached sympy-1.11.1-py3-none-any.whl (6.5 MB)
Collecting mpmath>=0.19
  Using cached mpmath-1.2.1-py3-none-any.whl (532 kB)
Installing collected packages: mpmath, sympy
Successfully installed mpmath-1.2.1 sympy-1.11.1


Dataset

In [8]:
#Test samples location and preprocessing
import os
import torch
from d2l import torch as d2l

class MTFraEng(d2l.DataModule):  #@save
    def _download(self):
        d2l.extract(d2l.download(
            d2l.DATA_URL+'fra-eng.zip', self.root,
            '94646ad1522d915e7b0f9296181140edcf86a4f5'))
        with open(self.root + '/fra-eng/fra.txt', encoding='utf-8') as f:
            return f.read()

@d2l.add_to_class(MTFraEng)  #@save
def _preprocess(self, text):
    # Replace non-breaking space with space
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    # Insert space between words and punctuation marks
    no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text.lower())]
    return ''.join(out)

@d2l.add_to_class(MTFraEng)  #@save
def _tokenize(self, text, max_examples=None):
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        if max_examples and i > max_examples: break
        parts = line.split('\t')
        if len(parts) == 2:
            # Skip empty tokens
            src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
            tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt

@d2l.add_to_class(MTFraEng)  #@save
def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
    super(MTFraEng, self).__init__()
    self.save_hyperparameters()
    self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
        self._download())


@d2l.add_to_class(MTFraEng)  #@save
def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
    def _build_array(sentences, vocab, is_tgt=False):
        pad_or_trim = lambda seq, t: (
            seq[:t] if len(seq) > t else seq + ['<pad>'] * (t - len(seq)))
        sentences = [pad_or_trim(s, self.num_steps) for s in sentences]
        if is_tgt:
            sentences = [['<bos>'] + s for s in sentences]
        if vocab is None:
            vocab = d2l.Vocab(sentences, min_freq=2)
        array = torch.tensor([vocab[s] for s in sentences])
        valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
        return array, vocab, valid_len
    src, tgt = self._tokenize(self._preprocess(raw_text),
                              self.num_train + self.num_val)
    src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
    tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
    return ((src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:]),
            src_vocab, tgt_vocab)

@d2l.add_to_class(MTFraEng)  #@save
def get_dataloader(self, train):
    idx = slice(0, self.num_train) if train else slice(self.num_train, None)
    return self.get_tensorloader(self.arrays, train, idx)

data = MTFraEng(batch_size=5)
    
src, tgt, src_valid_len, label = next(iter(data.train_dataloader()))
print('source:', src.type(torch.int32))
print('decoder input:', tgt.type(torch.int32))
print('source len excluding pad:', src_valid_len.type(torch.int32))
print('label:', label.type(torch.int32))

@d2l.add_to_class(MTFraEng)  #@save
def build(self, src_sentences, tgt_sentences):
    raw_text = '\n'.join([src + '\t' + tgt for src, tgt in zip(
        src_sentences, tgt_sentences)])
    arrays, _, _ = self._build_arrays(
        raw_text, self.src_vocab, self.tgt_vocab)
    return arrays

#src, tgt, _,  _ = data.build(['hi .'], ['salut .'])
#print('source:', data.src_vocab.to_tokens(src[0].type(torch.int32)))
#print('target:', data.tgt_vocab.to_tokens(tgt[0].type(torch.int32)))

source: tensor([[ 69, 175,   2,   3,   4,   4,   4,   4,   4],
        [ 86,  56,   2,   3,   4,   4,   4,   4,   4],
        [ 94, 175,   2,   3,   4,   4,   4,   4,   4],
        [ 28, 150,   2,   3,   4,   4,   4,   4,   4],
        [ 59,   9,   2,   3,   4,   4,   4,   4,   4]], dtype=torch.int32)
decoder input: tensor([[  3,   6,   0,   4,   5,   5,   5,   5,   5],
        [  3, 108, 183,  58, 113, 144,   2,   4,   5],
        [  3, 109, 210, 135,   2,   4,   5,   5,   5],
        [  3, 204,  31,   0,   4,   5,   5,   5,   5],
        [  3,   6,   6,   0,   4,   5,   5,   5,   5]], dtype=torch.int32)
source len excluding pad: tensor([4, 4, 4, 4, 4], dtype=torch.int32)
label: tensor([[  6,   0,   4,   5,   5,   5,   5,   5,   5],
        [108, 183,  58, 113, 144,   2,   4,   5,   5],
        [109, 210, 135,   2,   4,   5,   5,   5,   5],
        [204,  31,   0,   4,   5,   5,   5,   5,   5],
        [  6,   6,   0,   4,   5,   5,   5,   5,   5]], dtype=torch.int32)


Word embedding

# We will use three different types of word embeddings:
# 1. Word2Vec
# 2. GloVe
# 3. FastText


In [1]:
"""## Word2Vec"""
import os

# Python program to generate word vectors using Word2Vec
 
# importing all necessary modules
 
import gensim
from gensim.models import Word2Vec

In [3]:
#keep in mind you have to launch the notebook inside the git folder to make this work (second one)
from inspect import getsourcefile
import sys
print(os.path.dirname(getsourcefile(lambda:0)))
print(sys.path[0])
print(os.path.abspath(sys.argv[0]))

C:\Users\gille\AppData\Local\Temp\ipykernel_16216
c:\Users\gille\OneDrive\Desktop\web\webtextanalysis
c:\Users\gille\miniconda3\envs\thesis\lib\site-packages\ipykernel_launcher.py


In [53]:
import numpy as np

def save_split():
    data = MTFraEng(batch_size=5)
    with open("samples/source.txt", "w") as f:
        for i in range(0, data.num_train):
            for word in data.arrays[0][i].numpy() :
                f.write(str(word) + " ")
            f.write("\n")
            
    with open("samples/target.txt", "w") as f:
        for i in range(0, data.num_train):
            for word in data.arrays[1][i].numpy() :
                f.write(str(word) + " ")
            f.write("\n")
            
def load_source():
    return np.loadtxt("samples/source.txt", dtype=str)

def load_target():
    return np.loadtxt("samples/target.txt", dtype=str)

def word_to_token(word, src=True):
    if src :
        return data.src_vocab[word]
    else :
        return data.tgt_vocab[word]

def token_to_word(token, src=True):
    if src :
        return data.src_vocab.to_tokens(token)
    else :
        return data.tgt_vocab.to_tokens(token)

def test_similarity(model, word1, word2, model_name, src=True):
    print("Cosine similarity between '" + word1 + "' and '"+ word2 +"' - " + model_name + " : " + str(model.similarity(word_to_token(word1, src), word_to_token(word2, src))))


In [48]:
data = MTFraEng(batch_size=5)

In [54]:
#print a few samples
for i in range(5):
    print(token_to_word(data.arrays[0][i].numpy(), True))

for i in range(5):
    print(token_to_word(data.arrays[1][i].numpy(), False))

['go', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['hi', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['run', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['run', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['who', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'va', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'salut', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'cours', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'courez', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'qui', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [55]:
save_split();

In [57]:

source_text = load_source()
one_line_source = source_text.reshape([np.prod(source_text.shape)])

#format to be accepted by Word2Vec
one_line_source = [str(i).split() for i in one_line_source]

print(one_line_source[:5])
#print in words 
print([token_to_word(int(i[0]), src=True) for i in one_line_source[:5]])



target_text = load_target()
one_line_target = target_text.reshape([np.prod(target_text.shape)])

#format to be accepted by Word2Vec
one_line_target = [str(i).split() for i in one_line_target]

print(one_line_target[:5])
#print in words 
print([token_to_word(int(i[0]), src=False) for i in one_line_target[:5]])

[['59'], ['2'], ['3'], ['4'], ['4']]
['go', '.', '<eos>', '<pad>', '<pad>']
[['3'], ['201'], ['0'], ['4'], ['5']]
['<bos>', 'va', '!', '<eos>', '<pad>']


In [58]:
"""## Word2Vec"""

print(one_line_source[:5])
 
# Create CBOW model
w2v_model_cbow = gensim.models.Word2Vec(one_line_source, min_count = 3,
                              vector_size = 100, window = 5)
 
# Print results
test_similarity(w2v_model_cbow.wv, 'hi', '.', "CBOW")
     
test_similarity(w2v_model_cbow.wv, 'hi', 'go', "CBOW")
 
# Create Skip Gram model
w2v_model_skip = gensim.models.Word2Vec(one_line_source, min_count = 3, vector_size = 100,
                                             window = 5, sg = 1)
 
# Print results
test_similarity(w2v_model_skip.wv, 'hi', '.', "SkipGram")
     
test_similarity(w2v_model_skip.wv, 'hi', 'go', "SkipGram")

[['59'], ['2'], ['3'], ['4'], ['4']]
Cosine similarity between 'hi' and '.' - CBOW : -0.025094546
Cosine similarity between 'hi' and 'go' - CBOW : -0.14607012
Cosine similarity between 'hi' and '.' - SkipGram : -0.025094546
Cosine similarity between 'hi' and 'go' - SkipGram : -0.14607012


In [34]:
#save the models
w2v_model_cbow.wv.save_word2vec_format("models/w2v_cbow.txt", binary=False)
w2v_model_skip.wv.save_word2vec_format("models/w2v_skip.txt", binary=False)

In [59]:
"""## GloVe"""

# coding: utf-8
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
#once we have the tokenized file, we can call the glove model

####CALL FROM BASH glove_run.py

In [62]:
#Only do this once
source_file = sys.path[0] + '\\models\\source_glove.txt'
target_file = sys.path[0] + '\\models\\target_glove.txt'
# Load the model, can take a bit of time
source_glove_model = KeyedVectors.load_word2vec_format(source_file, binary=False)
target_glove_model = KeyedVectors.load_word2vec_format(source_file, binary=False)

In [64]:

# Test the model
test_similarity(source_glove_model, 'hi', '.', "GloVe", src=True)
test_similarity(source_glove_model, 'hi', 'go', "GloVe", src=True)

test_similarity(target_glove_model, 'bonjour', '.', "GloVe", src=False)
test_similarity(target_glove_model, 'bonjour', 'cours', "GloVe", src=False)

Cosine similarity between 'hi' and '.' - GloVe : -0.9790179
Cosine similarity between 'hi' and 'go' - GloVe : 0.9545123
Cosine similarity between 'bonjour' and '.' - GloVe : 0.9993236
Cosine similarity between 'bonjour' and 'cours' - GloVe : -0.99489284


#FastText

In [69]:
"""## FastText"""
from gensim.models import FastText

fast_model = FastText(vector_size=100, window=5, min_count=3)
fast_model.build_vocab(corpus_file='models/source_glove.txt')
fast_model.train(corpus_file='models/source_glove.txt', epochs=100, total_examples=fast_model.corpus_count, total_words=fast_model.corpus_total_words)

test_similarity(fast_model.wv,'hi', '.', "FastText", src=True)
test_similarity(fast_model.wv,'hi', 'go', "FastText", src=True)

Cosine similarity between 'hi' and '.' - FastText : 0.10711907
Cosine similarity between 'hi' and 'go' - FastText : 0.0017310102


RNN

In [None]:
# Now we can create the RNN model that will translate from english to french using one of the previous embeddings

from torch import nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(RNN, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [None]:
#From this model we can create a loss function and an optimizer

def loss_function(tag_scores, gold_tags):
    loss_function = nn.NLLLoss()
    loss = loss_function(tag_scores, gold_tags)
    return loss

In [None]:
#Now we can train the model
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNN(100, 128, 100, 100).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

n_epoch = 100

for epoch in range(n_epoch):
    for sentence, tags in data:
        sentence = torch.tensor(sentence, dtype=torch.long).to(device)
        tags = torch.tensor(tags, dtype=torch.long).to(device)

        model.zero_grad()

        tag_scores = model(sentence)

        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
        
    #here we can use the test data to evaluate the model
    
    losses = torch.zeros(len(test_data))
    
    for sentence, tags in test_data:
        sentence = torch.tensor(sentence, dtype=torch.long).to(device)
        tags = torch.tensor(tags, dtype=torch.long).to(device)

        tag_scores = model(sentence)

        loss = loss_function(tag_scores, tags)
        
        losses.append(loss.item())
        
    print("Epoch " + str(epoch) + " : " + str(losses.mean()))
    print("Std : " + str(losses.std()))
        

    print("Epoch: {}/{}.............".format(epoch, n_epoch), end=" ")
    print("Loss: {:.4f}".format(loss.item()))

To this model we can now try to add contextual embeddings

In [None]:
#for contextual embedding we will use BERT

import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

#we use bert and we will train it on the data
bert_model = BertModel.from_pretrained('bert-base-uncased')
#we will use the tokenizer to tokenize the sentences
sentences = ["I love machine learning", "I love coding in python"]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]