In this project, we will use an RNN architecture to build a Machine Translation model.

It will use Wikipedia dumps as its corpus.

Either the source or the target language will be English. In our case, we will try English-to-French translation.

In [None]:
#imports
!pip3 install numpy
!pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu117
#or any nightly version so long as pytorch > 1.11 https://pytorch.org/
!pip3 install gensim transformers d2l==1.0.0a1.post0

#In pytorch functional.py, change PILLOW_VERSION to __version__
#there are two places to change

Dataset

In [None]:
#Test samples location and preprocessing

#cell almost entirely from https://d2l.ai/chapter_recurrent-modern/machine-translation-and-dataset.html
import os
import torch
from d2l import torch as d2l

class MTFraEng(d2l.DataModule):  #@save
    """Data module built on the ``fra-eng`` archive.

    Only the download step lives in the class body; the remaining methods
    are attached further down with ``d2l.add_to_class``.
    """
    def _download(self):
        """Fetch and extract ``fra-eng.zip`` and return ``fra.txt`` as one string."""
        archive = d2l.download(
            d2l.DATA_URL + 'fra-eng.zip', self.root,
            '94646ad1522d915e7b0f9296181140edcf86a4f5')
        d2l.extract(archive)
        corpus_path = self.root + '/fra-eng/fra.txt'
        with open(corpus_path, encoding='utf-8') as handle:
            return handle.read()

@d2l.add_to_class(MTFraEng)  #@save
def _preprocess(self, text):
    """Lower-case ``text`` and isolate every punctuation mark with spaces.

    A character counts as punctuation when it is neither alphanumeric nor
    whitespace. Using the Unicode character classes instead of a hand-written
    alphabet keeps accented letters that were missing from the old list
    (for example 'ç', 'ê', 'ù', 'ï') and digits inside their word instead of
    splitting the word around them.

    Args:
        text: raw corpus text.

    Returns:
        The lower-cased text with a single space inserted before and after
        each punctuation mark wherever a space is not already present.
    """
    # Replace non-breaking space with space
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    # Lower-case once up front so that neighbour look-ups index the very
    # string being iterated (lower() may change the length of some strings)
    text = text.lower()

    def is_punctuation(char):
        return not (char.isalnum() or char.isspace())

    last = len(text) - 1
    out = []
    for i, char in enumerate(text):
        if is_punctuation(char):
            piece = char
            # Insert space between words and punctuation marks
            if i > 0 and text[i - 1] != ' ':
                piece = ' ' + piece
            # Insert space after punctuation marks
            if i < last and text[i + 1] != ' ':
                piece = piece + ' '
            out.append(piece)
        else:
            out.append(char)
    return ''.join(out)

@d2l.add_to_class(MTFraEng)  #@save
def _tokenize(self, text, max_examples=None):
    """Split tab-separated sentence pairs into token lists.

    Args:
        text: preprocessed corpus, one ``source<TAB>target`` pair per line.
        max_examples: maximum number of lines to read; a falsy value
            (``None`` or 0) means no limit.

    Returns:
        ``(src, tgt)``: two parallel lists of token lists, each token list
        terminated by ``'<eos>'``. Lines that do not contain exactly one tab
        are skipped (but still count towards ``max_examples``).
    """
    src, tgt = [], []
    for i, line in enumerate(text.split('\n')):
        # i lines have already been consumed, so stop as soon as the limit
        # is reached (the former `i > max_examples` read one line too many)
        if max_examples and i >= max_examples:
            break
        parts = line.split('\t')
        if len(parts) != 2:
            continue
        # Skip empty tokens
        src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
        tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
    return src, tgt

@d2l.add_to_class(MTFraEng)  #@save
def __init__(self, batch_size=10, num_steps=15, num_train=162000):  #15, 162000
    """Download the corpus and build the tensors and vocabularies.

    Args:
        batch_size: stored as a hyperparameter (presumably used by the
            data loaders - confirm against d2l.DataModule).
        num_steps: sentence length after padding/trimming, read by
            ``_build_arrays`` as ``self.num_steps``.
        num_train: line limit handed to ``_tokenize`` by ``_build_arrays``.
    """
    # The explicit two-argument super() is required: this function is defined
    # outside the class body and attached afterwards.
    super(MTFraEng, self).__init__()
    # Presumably copies the constructor arguments onto self - TODO confirm
    self.save_hyperparameters()
    self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
        self._download())


@d2l.add_to_class(MTFraEng)  #@save
def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
    """Turn raw text into index tensors plus the vocabularies used.

    Returns:
        ``((src, decoder_input, src_valid_len, labels), src_vocab, tgt_vocab)``
        where ``decoder_input`` is the target array without its last column
        and ``labels`` is the target array without its first column.
    """
    def _fit_length(tokens, length):
        # Trim sentences that are too long, pad the ones that are too short
        if len(tokens) > length:
            return tokens[:length]
        return tokens + ['<pad>'] * (length - len(tokens))

    def _build_array(sentences, vocab, is_tgt=False):
        fitted = [_fit_length(s, self.num_steps) for s in sentences]
        if is_tgt:
            fitted = [['<bos>'] + s for s in fitted]
        if vocab is None:
            vocab = d2l.Vocab(fitted, min_freq=2)
        array = torch.tensor([vocab[s] for s in fitted])
        # Number of non-padding positions in every row
        valid_len = (array != vocab['<pad>']).type(torch.int64).sum(1)
        return array, vocab, valid_len

    cleaned = self._preprocess(raw_text)
    src, tgt = self._tokenize(cleaned, self.num_train)
    src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
    tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
    decoder_input = tgt_array[:, :-1]
    labels = tgt_array[:, 1:]
    arrays = (src_array, decoder_input, src_valid_len, labels)
    return arrays, src_vocab, tgt_vocab

@d2l.add_to_class(MTFraEng)  #@save
def build(self, src_sentences, tgt_sentences):
    """Encode extra sentence pairs with the vocabularies already built.

    The pairs are joined into the same ``source<TAB>target`` line format as
    the corpus and run through ``_build_arrays``; only the arrays are returned.
    """
    lines = []
    for src, tgt in zip(src_sentences, tgt_sentences):
        lines.append('\t'.join((src, tgt)))
    raw_text = '\n'.join(lines)
    built = self._build_arrays(raw_text, self.src_vocab, self.tgt_vocab)
    return built[0]

#src, tgt, _,  _ = data.build(['hi .'], ['salut .'])
#print('source:', data.src_vocab.to_tokens(src[0].type(torch.int64)))
#print('target:', data.tgt_vocab.to_tokens(tgt[0].type(torch.int64)))

@d2l.add_to_class(MTFraEng)  #@save
def get_dataloader(self, train, seed=0, maxi=158140):
    """Return a loader over one split of the first ``maxi`` examples.

    The first ``maxi`` rows are permuted with a generator seeded by ``seed``
    and cut 60% / 20% / 20% into train / valid / test.

    Args:
        train: split selector - 1 is test, 2 is valid, anything else
            (normally 0) is train. Note that ``True == 1`` in Python, so
            pass the integer codes.
        seed: seed of the permutation; the same seed yields the same splits.
        maxi: number of examples to draw the splits from.

    Returns:
        Whatever ``self.get_tensorloader`` returns for the selected rows.

    Raises:
        ValueError: if ``maxi`` exceeds the number of rows actually stored.
    """
    # Validate against the real row count. The previous check used
    # self.num_train, which this method overwrites below, so one call with a
    # small maxi made every later call with a larger maxi fail.
    if maxi > self.arrays[0].shape[0]:
        raise ValueError("maxi must be less than the length of the dataset")

    # Kept for callers that read these attributes.
    # NOTE(review): num_test uses 0.3 although the test split below is 20%.
    self.num_train = maxi
    self.num_test = int(maxi * 0.3)

    # The permutation only contains indices < maxi, so the arrays themselves
    # never need to be truncated.
    idx = torch.randperm(generator=torch.Generator().manual_seed(seed), n=maxi)
    train_end = int(maxi * 0.6)
    valid_end = int(maxi * 0.8)

    #0 is train, 1 is test, 2 is valid
    is_train_split = False
    if train == 1:
        idx = idx[valid_end:]
    elif train == 2:
        idx = idx[train_end:valid_end]
    else:
        is_train_split = True
        idx = idx[:train_end]
    # Assumes get_tensorloader forwards its second argument as the DataLoader
    # shuffle flag (d2l.DataModule behaviour - confirm). Passing the raw
    # split code shuffled test/valid but never the training split.
    return self.get_tensorloader(self.arrays, is_train_split, idx)
    

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

In [None]:

data = MTFraEng()

In [None]:
print(len(data.get_dataloader(train=1)))
print(len(data.get_dataloader(train=2)))
print(len(data.get_dataloader(train=0)))

Word embedding

# We will use three different types of word embeddings:
# 1. Word2Vec
# 2. GloVe
# 3. FastText


In [None]:
"""## Word2Vec"""
import gensim

In [None]:
#keep in mind you have to launch the notebook inside the git folder to make this work (second one)
from inspect import getsourcefile
import sys
print(os.path.dirname(getsourcefile(lambda:0)))
print(sys.path[0])
print(os.path.abspath(sys.argv[0]))

In [None]:
src, tgt, src_valid_len, label = next(iter(data.get_dataloader(train=0)))
print('source:', src[0].type(torch.int64))
print('decoder input:', tgt[0].type(torch.int64))

In [None]:
import numpy as np

def save_split():
    """Dump the encoded source and decoder-input arrays as text files.

    Writes ``samples/source.txt`` from ``data.arrays[0]`` and
    ``samples/target.txt`` from ``data.arrays[1]``: one sentence per line,
    every token id followed by a single space. Only the first
    ``data.num_train`` rows are written. The ``samples`` directory is created
    when missing, so the function also works on a fresh checkout.
    """
    import os

    data = MTFraEng(batch_size=5)
    os.makedirs("samples", exist_ok=True)

    outputs = (("samples/source.txt", data.arrays[0]),
               ("samples/target.txt", data.arrays[1]))
    for file_name, array in outputs:
        # tolist() converts the whole block once instead of going through
        # NumPy scalars element by element
        rows = array[:data.num_train].tolist()
        with open(file_name, "w") as f:
            f.writelines(" ".join(map(str, row)) + " \n" for row in rows)
            
def load_source():
    """Read ``samples/source.txt`` into a NumPy array of strings."""
    source_path = "samples/source.txt"
    return np.loadtxt(source_path, dtype=str)

def load_target():
    """Read ``samples/target.txt`` into a NumPy array of strings."""
    target_path = "samples/target.txt"
    return np.loadtxt(target_path, dtype=str)

def word_to_token(word, src=True):
    """Look ``word`` up in the source vocabulary of ``data`` (target one if ``src`` is falsy)."""
    vocab = data.src_vocab if src else data.tgt_vocab
    return vocab[word]

def token_to_word(token, src=True):
    """Map token id(s) back to word(s) with the source vocabulary of ``data`` (target one if ``src`` is falsy)."""
    vocab = data.src_vocab if src else data.tgt_vocab
    return vocab.to_tokens(token)

def test_similarity(model, word1, word2, model_name, src=True):
    """Print the cosine similarity of two words under an embedding model.

    Args:
        model: gensim keyed vectors trained on the token-id files, i.e. whose
            keys are the token ids written as strings.
        word1, word2: the words to compare.
        model_name: label shown in the printed line.
        src: look the words up in the source vocabulary (True) or in the
            target vocabulary (False).
    """
    # The models are keyed by the *string* form of the token id. An int that
    # is not a key is treated by gensim as a row index, which silently
    # compared two unrelated vectors.
    key1 = str(word_to_token(word1, src))
    key2 = str(word_to_token(word2, src))
    score = model.similarity(key1, key2)
    print("Cosine similarity between '" + word1 + "' and '"+ word2 +"' - " + model_name + " : " + str(score))


In [None]:
#print a few samples
for i in range(2500,2505):
    print(token_to_word(data.arrays[0][i].numpy(), True))

for i in range(2500,2505):
    print(token_to_word(data.arrays[1][i].numpy(), False))

In [None]:
if not os.path.exists(sys.path[0] + "/samples/source.txt") or not os.path.exists(sys.path[0] + "/samples/target.txt"):
    save_split()

In [None]:

source_text = load_source()

#format to be accepted by Word2Vec: one list of token-id strings per sentence.
#Each row of the loaded array is one (padded) sentence. Flattening the array
#turned every token into its own one-word sentence, which leaves Word2Vec
#without any context window to learn from.
one_line_source = np.atleast_2d(source_text).tolist()

print(one_line_source[:4])
#print in words (first word of each of these sentences)
print([token_to_word(int(i[0]), src=True) for i in one_line_source[:4]])



target_text = load_target()

#format to be accepted by Word2Vec: one list of token-id strings per sentence
one_line_target = np.atleast_2d(target_text).tolist()

print(one_line_target[:4])
#print in words (first word of each of these sentences)
print([token_to_word(int(i[0]), src=False) for i in one_line_target[:4]])

In [None]:
"""## Word2Vec"""
# Each model is only trained when its saved text file is missing; otherwise
# the variable is left undefined here and filled by the loading cell below.
if not os.path.exists(sys.path[0] + "/models/source_w2v_cbow.txt"):
    # Create CBOW model
    source_w2v_model_cbow = gensim.models.Word2Vec(one_line_source, min_count = 1,
                                vector_size = 100, window = 5).wv

if not os.path.exists(sys.path[0] + "/models/source_w2v_skip.txt"):
    # Create Skip Gram model
    source_w2v_model_skip = gensim.models.Word2Vec(one_line_source, min_count = 1, vector_size = 100,
                                                window = 5, sg = 1).wv
    
if not os.path.exists(sys.path[0] + "/models/target_w2v_cbow.txt"):
    # Create CBOW model
    target_w2v_model_cbow = gensim.models.Word2Vec(one_line_target, min_count = 1,
                                vector_size = 100, window = 5).wv

if not os.path.exists(sys.path[0] + "/models/target_w2v_skip.txt"):
    # Create Skip Gram model
    target_w2v_model_skip = gensim.models.Word2Vec(one_line_target, min_count = 1, vector_size = 100,
                                                window = 5, sg = 1).wv

In [None]:
#save the models
if not os.path.exists(sys.path[0] + "/models/source_w2v_cbow.txt"):
    source_w2v_model_cbow.save_word2vec_format(sys.path[0] + "/models/source_w2v_cbow.txt", binary=False)
    
if not os.path.exists(sys.path[0] + "/models/source_w2v_skip.txt"):
    source_w2v_model_skip.save_word2vec_format(sys.path[0] + "/models/source_w2v_skip.txt", binary=False)
    
if not os.path.exists(sys.path[0] + "/models/target_w2v_cbow.txt"):
    target_w2v_model_cbow.save_word2vec_format(sys.path[0] + "/models/target_w2v_cbow.txt", binary=False)

if not os.path.exists(sys.path[0] + "/models/target_w2v_skip.txt"):
    target_w2v_model_skip.save_word2vec_format(sys.path[0] + "/models/target_w2v_skip.txt", binary=False)

In [None]:
#load the models
source_w2v_model_cbow = gensim.models.KeyedVectors.load_word2vec_format(sys.path[0] + "/models/source_w2v_cbow.txt", binary=False)
source_w2v_model_skip = gensim.models.KeyedVectors.load_word2vec_format(sys.path[0] + "/models/source_w2v_skip.txt", binary=False)
target_w2v_model_cbow = gensim.models.KeyedVectors.load_word2vec_format(sys.path[0] + "/models/target_w2v_cbow.txt", binary=False)
target_w2v_model_skip = gensim.models.KeyedVectors.load_word2vec_format(sys.path[0] + "/models/target_w2v_skip.txt", binary=False)

In [None]:
test_similarity(source_w2v_model_cbow, 'hi', '.', "CBOW")
test_similarity(source_w2v_model_cbow, 'hi', 'run', "CBOW")

test_similarity(source_w2v_model_skip, 'hi', '.', "SkipGram")
test_similarity(source_w2v_model_skip, 'hi', 'run', "SkipGram")

# French words have to be looked up in the target vocabulary (src=False)
test_similarity(target_w2v_model_cbow, 'bonjour', '.', "CBOW", src=False)
test_similarity(target_w2v_model_cbow, 'bonjour', 'cours', "CBOW", src=False)

test_similarity(target_w2v_model_skip, 'bonjour', '.', "SkipGram", src=False)
test_similarity(target_w2v_model_skip, 'bonjour', 'cours', "SkipGram", src=False)

In [None]:
"""## GloVe"""

# coding: utf-8
from gensim.models import KeyedVectors

In [None]:
#once we have the tokenized file, we can call the glove model

####CALL FROM BASH glove_run.py

In [None]:
#Only do this once (depends on if windows or linux sometimes)
#source_file = sys.path[0] + '\\models\\source_glove.txt'
#target_file = sys.path[0] + '\\models\\target_glove.txt'

source_file = sys.path[0] + '/models/source_glove.txt'
target_file = sys.path[0] + '/models/target_glove.txt'
# Load the models, can take a bit of time
source_glove_model = KeyedVectors.load_word2vec_format(source_file, binary=False, no_header=True)
# The target model must come from target_file (it used to re-load source_file)
target_glove_model = KeyedVectors.load_word2vec_format(target_file, binary=False, no_header=True)

In [None]:

# Test the model
test_similarity(source_glove_model, 'hi', '.', "GloVe", src=True)
test_similarity(source_glove_model, 'hi', 'run', "GloVe", src=True)

test_similarity(target_glove_model, 'bonjour', '.', "GloVe", src=False)
test_similarity(target_glove_model, 'bonjour', 'cours', "GloVe", src=False)

#FastText

In [None]:
"""## FastText"""
from gensim.models import FastText

#if not saved yet we train it
# Train FastText on the source token-id file (one sentence per line) unless a
# saved model already exists; only the keyed vectors are kept afterwards.
if not os.path.exists(sys.path[0] + "/models/source_fast.txt"):
    source_fast_model = FastText(vector_size=100, window=5, min_count=1)
    source_fast_model.build_vocab(corpus_file=sys.path[0] + '/samples/source.txt')
    source_fast_model.train(corpus_file=sys.path[0] + '/samples/source.txt', epochs=10, total_examples=source_fast_model.corpus_count, total_words=source_fast_model.corpus_total_words)
    source_fast_model = source_fast_model.wv


In [None]:
# Same as for the source side: train FastText on the target token-id file
# unless a saved model already exists, then keep only the keyed vectors.
if not os.path.exists(sys.path[0] + "/models/target_fast.txt"):
    target_fast_model = FastText(vector_size=100, window=5, min_count=1)
    target_fast_model.build_vocab(corpus_file=sys.path[0] + '/samples/target.txt')
    target_fast_model.train(corpus_file=sys.path[0] + '/samples/target.txt', epochs=10, total_examples=target_fast_model.corpus_count, total_words=target_fast_model.corpus_total_words)
    target_fast_model = target_fast_model.wv

In [None]:
if not os.path.exists(sys.path[0] + "/models/source_fast.txt"):
    source_fast_model.save_word2vec_format(sys.path[0] + "/models/source_fast.txt", binary=False)
if not os.path.exists(sys.path[0] + "/models/target_fast.txt"):
    target_fast_model.save_word2vec_format(sys.path[0] + "/models/target_fast.txt", binary=False)

In [None]:
#if saved we load it
source_fast_model = KeyedVectors.load_word2vec_format(sys.path[0] + "/models/source_fast.txt", binary=False)
target_fast_model = KeyedVectors.load_word2vec_format(sys.path[0] + "/models/target_fast.txt", binary=False)

In [None]:
test_similarity(source_fast_model,'hi', '.', "FastText", src=True)
test_similarity(source_fast_model,'hi', 'run', "FastText", src=True)

test_similarity(target_fast_model,'bonjour', '.', "FastText", src=False)
test_similarity(target_fast_model,'bonjour', 'cours', "FastText", src=False)

RNN

In [None]:
# Now we can create the RNN model that will translate from english to french using one of the previous embeddings

from torch import nn
import torch.nn.functional as F

class RNN_encode(nn.Module):
    """Encoder: frozen word embeddings -> bidirectional LSTM -> two linear layers.

    Args:
        embedding_model_input: gensim keyed vectors for the source language.
            Their keys are expected to be token ids written as strings (this
            is how the sample files are written and how ``test`` decodes
            them); the rows are re-ordered so that row ``i`` holds the vector
            of token id ``i``.
        embedding_model_output: unused here, kept for a uniform constructor.
        hidden_size: width of the encoder output.
    """
    def __init__(self, embedding_model_input, embedding_model_output, hidden_size=200):
        super(RNN_encode, self).__init__()

        self.hidden_size = hidden_size

        embedding = self._vectors_by_token_id(embedding_model_input).to(device)
        # Rows of token ids without a vector are all-zero; clamp the norm so
        # that normalising them does not produce NaN
        norms = torch.norm(embedding, dim=1, keepdim=True).clamp_min(1e-12)
        self.embedding_in = (embedding / norms).to(device)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.embedding_dim_in = self.embedding_in.shape[1]
        # batch_first=True: forward() feeds (batch, sequence, features). With
        # the default layout the recurrence ran across the batch dimension.
        self.lstm_in = nn.LSTM(self.embedding_dim_in, self.embedding_dim_in, bidirectional=True, num_layers=2, dropout=0.01, batch_first=True)
        self.hidden_in = nn.Linear(self.embedding_dim_in, self.hidden_size)
        self.hidden_in2 = nn.Linear(self.hidden_size, self.hidden_size)

    @staticmethod
    def _vectors_by_token_id(keyed_vectors):
        """Return the embedding matrix with row i holding the vector of token id i.

        gensim stores rows in its own order, not by token id. Keys that are
        not decimal integers are ignored; if no key is an integer the matrix
        is returned in gensim's order (previous behaviour).
        """
        vectors = torch.tensor(keyed_vectors.vectors)
        keys = getattr(keyed_vectors, 'index_to_key', [])
        pairs = [(int(key), row) for row, key in enumerate(keys)
                 if str(key).isdecimal()]
        if not pairs:
            return vectors
        token_ids = torch.tensor([token_id for token_id, _ in pairs])
        rows = torch.tensor([row for _, row in pairs])
        table = torch.zeros(int(token_ids.max()) + 1, vectors.shape[1], dtype=vectors.dtype)
        table[token_ids] = vectors[rows]
        return table

    def forward(self, input_sentence):
        """Encode a (batch, sequence) tensor of token ids into (batch, sequence, hidden_size) log-probabilities."""
        token_ids = input_sentence.to(device).type(torch.int64)
        # Ids beyond the embedding table fall back to row 0
        in_range = token_ids < self.embedding_in.shape[0]
        token_ids = torch.where(in_range, token_ids, torch.zeros_like(token_ids))
        # Plain row lookup; same values as the former one-hot matmul without
        # materialising a (batch, sequence, vocab) tensor
        embeds = self.embedding_in[token_ids].type(torch.float)

        #encoder
        output_lstm_1, _ = self.lstm_in(embeds)

        #sum the two directions
        output_lstm_1 = output_lstm_1[:, :, :self.embedding_dim_in] + output_lstm_1[:, :, self.embedding_dim_in:] + embeds
        soft = F.log_softmax(output_lstm_1, dim=2)

        output_hidden_1 = self.hidden_in(soft)
        output_hidden_2 = self.hidden_in2(F.relu(output_hidden_1))

        return F.log_softmax(output_hidden_2, dim=2)
        
class RNN_decode(nn.Module):
    """Decoder: bidirectional LSTM -> two linear layers -> mixture of target embeddings.

    Args:
        embedding_model_input: unused here, kept for a uniform constructor.
        embedding_model_output: gensim keyed vectors for the target language.
            Their keys are expected to be token ids written as strings; the
            rows are re-ordered so that row ``i`` holds the vector of token
            id ``i``, which is how ``loss_function`` indexes the table.
        hidden_size: width of the encoder output fed into this module.
    """
    def __init__(self, embedding_model_input, embedding_model_output, hidden_size=200):
        super(RNN_decode, self).__init__()

        embedding, known_rows = self._vectors_by_token_id(embedding_model_output)
        embedding = embedding.to(device)
        # Rows of token ids without a vector are all-zero; clamp the norm so
        # that normalising them does not produce NaN
        norms = torch.norm(embedding, dim=1, keepdim=True).clamp_min(1e-12)
        self.embedding_out = (embedding / norms).to(device)
        # True for rows that really carry a vector
        self.known_rows = known_rows.to(device)

        self.hidden_size = hidden_size

        self.embedding_dim_out = self.embedding_out.shape[1]
        # batch_first=True: forward() feeds (batch, sequence, features). With
        # the default layout the recurrence ran across the batch dimension.
        self.lstm_out = nn.LSTM(self.hidden_size, self.hidden_size, bidirectional=True, num_layers=2, dropout=0.01, batch_first=True)
        self.hidden_out = nn.Linear(self.hidden_size, self.embedding_dim_out)
        self.hidden_out2 = nn.Linear(self.embedding_dim_out, self.embedding_dim_out)

    @staticmethod
    def _vectors_by_token_id(keyed_vectors):
        """Return (matrix, known) with matrix row i holding the vector of token id i.

        gensim stores rows in its own order, not by token id. Keys that are
        not decimal integers are ignored; if no key is an integer the matrix
        is returned in gensim's order (previous behaviour). ``known`` marks
        the rows that were filled from the model.
        """
        vectors = torch.tensor(keyed_vectors.vectors)
        keys = getattr(keyed_vectors, 'index_to_key', [])
        pairs = [(int(key), row) for row, key in enumerate(keys)
                 if str(key).isdecimal()]
        if not pairs:
            return vectors, torch.ones(vectors.shape[0], dtype=torch.bool)
        token_ids = torch.tensor([token_id for token_id, _ in pairs])
        rows = torch.tensor([row for _, row in pairs])
        table = torch.zeros(int(token_ids.max()) + 1, vectors.shape[1], dtype=vectors.dtype)
        table[token_ids] = vectors[rows]
        known = torch.zeros(table.shape[0], dtype=torch.bool)
        known[token_ids] = True
        return table, known

    def forward(self, hidden_sentence):
        """Map (batch, sequence, hidden_size) encoder states to (batch, sequence, embedding_dim) vectors."""
        #decoder
        output_lstm_2, _ = self.lstm_out(hidden_sentence)

        #sum the two directions
        output_lstm_2 = output_lstm_2[:, :, :self.hidden_size] + output_lstm_2[:, :, self.hidden_size:] + hidden_sentence
        soft1 = F.log_softmax(output_lstm_2, dim=2)
        output_hidden_1 = self.hidden_out(soft1)
        output_hidden_2 = self.hidden_out2(F.relu(output_hidden_1))

        #similarity
        out = torch.abs(output_hidden_2 @ self.embedding_out.transpose(0,1))
        # Placeholder rows (token ids without a vector) must not receive any
        # softmin weight
        out = out.masked_fill(~self.known_rows, float('inf'))
        soft = F.softmin(out, dim=2)
        return (soft @ self.embedding_out)

    def get_embedding(self):
        """Return the normalised target embedding table, indexed by token id."""
        return self.embedding_out
    
class RNN(nn.Module):
    """Encoder-decoder pair: token ids in, predicted target-embedding vectors out."""
    def __init__(self, embedding_model_input, embedding_model_output):
        super().__init__()
        embeddings = (embedding_model_input, embedding_model_output)
        self.encoder = RNN_encode(*embeddings)
        self.decoder = RNN_decode(*embeddings)

    def forward(self, input_sentence):
        """Run the encoder and feed its output to the decoder."""
        return self.decoder(self.encoder(input_sentence))

    def get_embedding(self):
        """Expose the decoder's target embedding table."""
        table = self.decoder.get_embedding()
        return table

In [None]:
#From this model we can create a loss function and an optimizer

def loss_function(predicted_sentence, target_sentence, pad, model):
    """Euclidean distance between the predictions and the target embeddings.

    Target ids that fall outside the model's embedding table are replaced by
    id 0 before the lookup. ``pad`` is accepted but not used.
    """
    table = model.get_embedding()
    ids = target_sentence.to(device)
    # Out-of-table ids fall back to row 0
    ids = torch.where(ids < table.shape[0], ids, torch.zeros_like(ids))
    expected = table[ids.type(torch.int64)]

    distance = torch.dist(predicted_sentence, expected, p=2)
    return distance.requires_grad_(True)

In [None]:
batch_size = 128
data = MTFraEng(batch_size=batch_size)

In [None]:
#Now we can train the model
import torch
import time

def train(embed_in, embed_out, n_epoch = 15, batch_size = 10, usage_size = 130000, lr = 0.01, neural = RNN):
    """Train a translation network and return its per-epoch evaluation loss.

    Args:
        embed_in: embedding model passed to ``neural`` as ``embedding_model_input``.
        embed_out: embedding model passed to ``neural`` as ``embedding_model_output``.
        n_epoch: number of passes over the training split.
        batch_size: only used to estimate the number of steps per epoch for
            the progress line; the batches themselves come from the global
            ``data`` object.
        usage_size: forwarded to ``data.get_dataloader`` as ``maxi``.
        lr: initial Adam learning rate; divided by 1.1 every 10 epochs.
        neural: model class to instantiate.

    Returns:
        ``(epoch_loss, model)``: a NumPy array holding the mean loss over the
        ``train=1`` split after every epoch, and the trained model on the CPU.
    """
    model = neural(embedding_model_input=embed_in, embedding_model_output=embed_out).to(device)

    lr1 = lr

    size_per_epoch = int(usage_size * 0.6 / batch_size) + 1

    print("Ready")

    pad = word_to_token('<pad>', src=False)

    # NOTE(review): `device` is a torch.device, so this comparison with a
    # plain string is presumably always False - confirm whether
    # device.type == 'cuda' was intended.
    capturable = device == 'cuda'
    # Built once: re-creating Adam every epoch threw away its running moment
    # estimates. The learning-rate decay is applied through param_groups.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr1, capturable=capturable)

    epoch_loss = np.zeros(n_epoch)
    for epoch in range(n_epoch):

        if epoch % 10 == 0 and epoch != 0:
            lr1 = lr1 / 1.1
            for group in optimizer.param_groups:
                group['lr'] = lr1

        model.train()
        counter = 0
        time_avg = 0
        for src, tgt, src_valid_len, label in data.get_dataloader(train=0, seed=0, maxi=usage_size):
            time_start = time.time()

            # Tensor.to returns the moved tensor, it does not work in place
            src = src.to(device)
            tgt = tgt.to(device)

            optimizer.zero_grad()

            tag_scores = model(src)

            loss = loss_function(tag_scores, tgt, pad, model)

            loss.backward()

            optimizer.step()

            #print("grad of lstm in"  + str(model.encoder.lstm_in.weight_hh_l0.grad))

            counter += 1
            time_avg = time_avg * 0.95 + (time.time() - time_start) * (size_per_epoch - counter) * 0.01
            print("New step : ", counter, "/", size_per_epoch, " loss : ", loss.item(), " estimated time :", time_avg, "grad :", model.encoder.lstm_in.weight_hh_l0.grad.mean().item(), end="\r")

        #here we can use the test data to evaluate the model
        model.eval()  # no dropout while evaluating
        with torch.no_grad() :
            # Collect one value per batch actually seen; a pre-sized buffer
            # diluted the mean with unused zero slots (or overflowed)
            batch_losses = []
            for src, tgt, src_valid_len, label in data.get_dataloader(train=1, seed=0, maxi=usage_size):
                src = src.to(device)
                tgt = tgt.to(device)

                tag_scores = model(src)

                loss = loss_function(tag_scores, tgt, pad, model)

                batch_losses.append(loss.item())

            epoch_loss[epoch] = np.mean(batch_losses) if batch_losses else float('nan')
            print("Epoch: {}/{}.............".format(epoch, n_epoch), end=" ")
            print("Loss: " + str(epoch_loss[epoch]) + ".............")

    return epoch_loss, model.to('cpu')
    

In [None]:
def test(model, embed_out, usage_size = 130000) :
    """Translate the first sentence of a test batch and print the result.

    Args:
        model: trained network; it is called on a batch moved to ``device``.
        embed_out: gensim keyed vectors of the target language, used to find
            the nearest token for every predicted vector. Their keys must be
            token ids written as strings.
        usage_size: forwarded to ``data.get_dataloader`` as ``maxi``.
    """
    #sample a sentence from the test set (split 1; train=False is split 0,
    #i.e. the training split)
    src, tgt, src_valid_len, label = next(iter(data.get_dataloader(train=1, seed=0, maxi=usage_size)))

    #translate the sentence
    sentence = src.to(device)

    # Inference only: no dropout and no autograd graph. The previous
    # train/eval mode of the model is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            tag_scores = model(sentence)
    finally:
        model.train(was_training)

    tags = []
    tags_token = []
    for i in range(tag_scores.shape[1]):
        # The nearest key is a token id stored as a string
        nearest_key = embed_out.similar_by_vector(tag_scores[0][i].cpu().detach().numpy(), topn=1)[0][0]
        tags_token.append(int(nearest_key))
        tags.append(token_to_word(tags_token[i], src=False))

    #print the original sentence
    print("Original sentence : ")
    for word in src[0]:
        print(token_to_word(word.item(), src=True), end=" ")
    print()
    for word in src[0]:
        print(word.item(), end=" ")
    print()

    #print the translated sentence
    print("Translated sentence : ")
    print(tags)
    print(tags_token)

    #print the target sentence
    print("Target sentence : ")
    for word in tgt[0]:
        print(token_to_word(word.item(), src=False), end=" ")
    print()
    for word in tgt[0]:
        print(word.item(), end=" ")
    print()

In [None]:
import matplotlib.pyplot as plt

epoch_loss_fast, fast_model_neural = train(source_fast_model, target_fast_model, n_epoch = 50, batch_size = batch_size, usage_size = 130000, lr = 0.0001)
torch.save(fast_model_neural.state_dict(), sys.path[0] + "/models/" + "fast_trained_model.pt")     

In [None]:
fast_model_neural.to(device)
test(fast_model_neural, target_fast_model, usage_size = 130000)
plt.plot(epoch_loss_fast)
plt.savefig(sys.path[0] + "/models/" + "fast_trained_model_loss.png")
fast_model_neural.to('cpu')

In [None]:
#same thing with the glove model

epoch_loss_glove, glove_model_neural = train(source_glove_model, target_glove_model, n_epoch = 50, batch_size = batch_size, usage_size = 130000, lr = 0.0001)
torch.save(glove_model_neural.state_dict(), sys.path[0] + "/models/" + "glove_trained_model.pt")

In [None]:
glove_model_neural.to(device)
test(glove_model_neural, target_glove_model, usage_size = 130000)
plt.plot(epoch_loss_glove)
plt.savefig(sys.path[0] + "/models/" + "glove_trained_model_loss.png")
glove_model_neural.to('cpu')

In [None]:
#same thing with the word2vec model
 
epoch_loss_word2vec, word2vec_model_neural = train(source_w2v_model_cbow, target_w2v_model_cbow, n_epoch = 50, batch_size = batch_size, usage_size = 130000, lr = 0.0001)
torch.save(word2vec_model_neural.state_dict(), sys.path[0] + "/models/" + "word2vec_trained_model.pt")

In [None]:
word2vec_model_neural.to(device)
test(word2vec_model_neural, target_w2v_model_cbow, usage_size = 130000)
plt.plot(epoch_loss_word2vec)
plt.savefig(sys.path[0] + "/models/" + "word2vec_trained_model_loss.png")
word2vec_model_neural.to('cpu')

In [None]:
raise (KeyboardInterrupt)

To this model we can now try to add contextual embeddings

In [None]:
#for contextual embedding we will use BERT

import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

#we use bert and we will train it on the data
bert_model = BertModel.from_pretrained('bert-base-uncased')
#we will use the tokenizer to tokenize the sentences
sentences = ["I love machine learning", "I love coding in python"]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [None]:
#we will make a function to get the embeddings of the sentences
def get_embeddings(sentences, src = True) :
    """Compute BERT-based vectors for sentences given as token ids.

    Args:
        sentences: token ids, converted back to words through
            ``token_to_word`` before being handed to the BERT tokenizer.
        src: whether the ids belong to the source vocabulary.

    Returns:
        A list with one tensor per entry of the stacked BERT output, each
        the sum over its last four slices.

    NOTE(review): this cell looks unfinished - see the notes below before
    relying on it.
    """
    print("sentences : ", sentences)
    sentences_in = [token_to_word(sent, src=src) for sent in sentences]
    print("sentences in : ", sentences_in)
    
    #we tokenize the sentences
    # NOTE(review): token_to_word presumably returns a list of words per
    # sentence while tokenizer.tokenize expects a string - confirm
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences_in]
    #we get the ids of the tokens
    indexed_tokens = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    #we get the attention masks
    segments_ids = [[1] * len(sentence) for sentence in tokenized_texts]
    #we get the tensors
    # NOTE(review): the extra pair of brackets adds a leading dimension, and
    # sentences of different tokenized length cannot form one tensor - confirm
    tokens_tensor = torch.tensor([[indexed_tokens]])
    segments_tensors = torch.tensor([[segments_ids]])
    #we get the embeddings
    with torch.no_grad():
        # NOTE(review): recent transformers versions return a ModelOutput
        # object rather than an (encoded_layers, pooled) tuple - verify
        # against the installed version
        encoded_layers, _ = bert_model(tokens_tensor, segments_tensors)
    #we get the embeddings of the last layer
    token_embeddings = torch.stack(encoded_layers, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)
    #we sum the last four slices of every entry
    token_vecs_sum = []
    for token in token_embeddings:
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)
    return token_vecs_sum

In [None]:
#we can modify the model to use the contextual embeddings

class BertRNN_encode(nn.Module):
    """Encoder variant that concatenates static embeddings with BERT vectors.

    NOTE(review): this class looks unfinished - see the notes in forward().
    """
    def __init__(self, embedding_model_input, embedding_model_output):
        super(BertRNN_encode, self).__init__()

        # Kept as the gensim model itself; vectors are looked up per word in forward()
        self.embedding_in = embedding_model_input

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.embedding_dim_in = embedding_model_input.vector_size
        # NOTE(review): no batch_first=True although forward() views its
        # input as (shape[0], shape[1], features) - confirm the intended layout
        self.lstm_in = nn.LSTM(self.embedding_dim_in, self.embedding_dim_in, bidirectional=True, dropout = 0.05, num_layers=2)
        self.hidden_in = nn.Linear(self.embedding_dim_in * 2, self.embedding_dim_in)


    def forward(self, input_sentence):
        #words_embeddings is a gensim model
        # Words missing from the model get the mean of all vectors.
        # NOTE(review): the lookup uses the integer id; the similarity helpers
        # suggest the model keys are strings - confirm
        embeds = torch.tensor(np.array([[self.embedding_in[int(word.item())] if int(word.item()) in self.embedding_in else self.embedding_in.vectors.mean(axis=0) for word in sentence] for sentence in input_sentence]), requires_grad=True).to(device)
        
        bert_embed = get_embeddings(input_sentence, src=True)
        print("bert embed : ", bert_embed)
        
        # NOTE(review): get_embeddings returns a Python list, which torch.cat
        # cannot concatenate with a tensor; after a concatenation the view
        # below to embedding_dim_in features would no longer fit either - confirm
        embeds = torch.cat((embeds, bert_embed), dim=1)
        
        #encoder
        output_lstm_1, _ = self.lstm_in(embeds.view(input_sentence.shape[0], input_sentence.shape[1], self.embedding_dim_in))
        output_hidden_1 = self.hidden_in(output_lstm_1.view(input_sentence.shape[0], input_sentence.shape[1], self.embedding_dim_in * 2))
        
        return torch.tanh(output_hidden_1)
    
class BertRNN(nn.Module) :
    """Encoder-decoder pair using the BERT-augmented encoder.

    Exposes the same ``forward`` / ``get_embedding`` interface as ``RNN`` so
    it can be used with ``loss_function`` and ``train``.
    """
    def __init__(self, embedding_model_input, embedding_model_output):
        super(BertRNN, self).__init__()

        self.encoder = BertRNN_encode(embedding_model_input, embedding_model_output)
        # The encoder emits embedding_dim_in features per token, so the
        # decoder's input width has to match it instead of the default 200
        self.decoder = RNN_decode(embedding_model_output, embedding_model_output, hidden_size=self.encoder.embedding_dim_in)
    
    def forward(self, input_sentence) :
        """Run the encoder and feed its output to the decoder."""
        output_hidden_1 = self.encoder(input_sentence)
        output_hidden_2 = self.decoder(output_hidden_1)
        return output_hidden_2

    def get_embedding(self):
        """Expose the decoder's target embedding table (read by loss_function)."""
        return self.decoder.get_embedding()

In [None]:
# now use the train function to train the model

# train() is the function that accepts `neural` and returns (losses, model);
# test() only prints a sample translation and returns nothing
epoch_loss_bert, bert_model_neural = train(source_glove_model, target_glove_model, batch_size = batch_size, usage_size = 130000, neural = BertRNN)
torch.save(bert_model_neural.state_dict(), sys.path[0] + "/models/" + "bert_trained_model.pt")

In [None]:
test(bert_model_neural, target_glove_model, usage_size = 130000)
plt.plot(epoch_loss_bert)
plt.savefig(sys.path[0] + "/models/" + "bert_trained_model_loss.png")

In [None]:
#now we use attention to improve the model

class AttentionRNN_encode(nn.Module):
   