In this project, we will use an RNN architecture to build a Machine Translation model.

It will use Wikipedia dumps as its corpus.

Either the source or the target language will be English. In our case, we will try English-to-French translation.

Dataset

In [None]:
#Test samples location and preprocessing

Word embedding

# We will use three different types of word embeddings:
# 1. Word2Vec
# 2. GloVe
# 3. FastText


In [1]:
"""## Word2Vec"""

import io
import os

# Python program to generate word vectors using Word2Vec
 
# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import download
import warnings
 
warnings.filterwarnings(action = 'ignore')
 
import gensim
from gensim.models import Word2Vec

In [3]:
# Diagnostic cell: prints three candidate "where am I running from" locations.
# Keep in mind that the notebook has to be launched from inside the git folder
# for this to work: the second value printed (sys.path[0]) is the one the
# following cells use as the base directory for the samples/ and models/ files.
from inspect import getsourcefile
import sys
# Directory of the file in which the lambda is defined
print(os.path.dirname(getsourcefile(lambda:0)))
# First entry of the module search path (used as the project root below)
print(sys.path[0])
# Absolute path of the script that started the process
print(os.path.abspath(sys.argv[0]))

C:\Users\gille\AppData\Local\Temp\ipykernel_10576
c:\Users\gille\OneDrive\Desktop\web\webtextanalysis
c:\Users\gille\miniconda3\envs\thesis\lib\site-packages\ipykernel_launcher.py


In [2]:
# Download NLTK's "punkt" package, which is needed for tokenizing sentences.
download("punkt")

[nltk_data] Downloading package punkt to D:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
def tokenize(path = os.path.join(sys.path[0], "samples", "alice.txt")):
    """Read a text file and split it into lower-cased, tokenized sentences.

    Args:
        path: Location of the UTF-8 text file to read. Defaults to
            ``samples/alice.txt`` under ``sys.path[0]``. The default is built
            with ``os.path.join`` so it does not depend on Windows-style
            ``\\`` separators.

    Returns:
        A list with one entry per sentence returned by ``sent_tokenize``;
        each entry is the list of that sentence's ``word_tokenize`` tokens,
        lower-cased.
    """
    with io.open(path, 'r', encoding='utf8') as sample:
        raw_text = sample.read()

    # Newlines are replaced with spaces so that line breaks in the file are
    # not passed on to the sentence tokenizer.
    flat_text = raw_text.replace("\n", " ")

    return [
        [token.lower() for token in word_tokenize(sentence)]
        for sentence in sent_tokenize(flat_text)
    ]

def test_similarity(model, word1, word2, model_name):
    print("Cosine similarity between '" + word1 + "' and '"+ word2 +"' - " + model_name + " : " + str(model.similarity(word1, word2)))

# Tokenize the sample text using tokenize()'s default path; the resulting list
# of tokenized sentences is reused by the following cells.
embedding_data = tokenize()

In [3]:
# Write every token of the corpus to one file, separated by single spaces
# (sentences are concatenated; no newline is written between them).
# The path is assembled with os.path.join instead of hard-coded "\\"
# separators, so the cell does not depend on Windows path syntax.
tokenised_path = os.path.join(sys.path[0], "samples", "alice_tokenised.txt")
with open(tokenised_path, 'w', encoding='utf8') as f:
    for line in embedding_data:
        f.write(" ".join(line) + " ")

In [14]:
"""## Word2Vec"""
 
# CBOW model trained on the tokenized corpus
w2v_model_cbow = Word2Vec(
    embedding_data,
    vector_size = 100,
    window = 5,
    min_count = 1,
)

# Similarity of 'alice' to each probe word according to the CBOW vectors
for probe_word in ('wonderland', 'machines'):
    test_similarity(w2v_model_cbow.wv, 'alice', probe_word, "CBOW")

# Skip Gram model: same settings, with sg = 1
w2v_model_skip = Word2Vec(
    embedding_data,
    vector_size = 100,
    window = 5,
    min_count = 1,
    sg = 1,
)

# Same comparison with the Skip Gram vectors
for probe_word in ('wonderland', 'machines'):
    test_similarity(w2v_model_skip.wv, 'alice', probe_word, "SkipGram")

Cosine similarity between 'alice' and 'wonderland' - CBOW : 0.97200704
Cosine similarity between 'alice' and 'machines' - CBOW : 0.8724395
Cosine similarity between 'alice' and 'wonderland' - SkipGram : 0.66590977
Cosine similarity between 'alice' and 'machines' - SkipGram : 0.83266664


In [4]:
"""## GloVe"""

# coding: utf-8
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [40]:
#Only do this once
# Convert the GloVe vectors file into the word2vec text format that is loaded
# further down. Paths are built with os.path.join instead of hard-coded "\\"
# separators, so the cell does not depend on Windows path syntax.
input_file = os.path.join(sys.path[0], 'models', 'alice_glove.txt')
output_file = os.path.join(sys.path[0], 'models', 'gensim_alice_glove.txt')
glove2word2vec(input_file, output_file)

(3343, 300)

In [19]:
#once we have the tokenized file, we can call the glove model

####CALL FROM BASH glove_run.py

CompletedProcess(args='python3.10 c:\\Users\\gille\\OneDrive\\Desktop\\web\\webtextanalysis/glove_run.py', returncode=1)

In [41]:
# Load the model, can take a bit of time.
# The path is built with os.path.join instead of hard-coded "\\" separators,
# so the cell does not depend on Windows path syntax.
output_file = os.path.join(sys.path[0], 'models', 'gensim_alice_glove.txt')
glove_model = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [42]:

# Test the model: compare 'alice' with each probe word using the GloVe vectors
for probe_word in ('wonderland', 'machines'):
    test_similarity(glove_model, 'alice', probe_word, "GloVe")

Cosine similarity between 'alice' and 'wonderland' - GloVe : -0.21529882
Cosine similarity between 'alice' and 'machines' - GloVe : -0.8179076


#FastText

In [None]:
"""## FastText"""

RNN

In [None]:
# Now we can create the RNN model that will translate from english to french using one of the previous embeddings

from torch import nn
import torch.nn.functional as F

class RNN(nn.Module):
    """Embedding -> LSTM -> linear model that emits per-position log-probabilities.

    Args:
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of entries in the embedding table.
        tagset_size: Number of output classes scored at each position.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning indices into embedding_dim-sized vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent layer: consumes the embeddings and produces hidden states
        # of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projection from hidden-state space to the tagset_size output scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Score every position of ``sentence``.

        Args:
            sentence: Index tensor fed to the embedding layer; presumably a
                1-D tensor of token ids (TODO confirm against the caller).

        Returns:
            Log-softmax scores with one row per element of ``sentence``.
        """
        embedded = self.word_embeddings(sentence)
        n_steps = len(sentence)

        # The LSTM receives the sequence as a batch of size 1.
        lstm_input = embedded.view(n_steps, 1, -1)
        hidden_states, _ = self.lstm(lstm_input)

        # Drop the batch axis again before projecting to the output scores.
        logits = self.hidden2tag(hidden_states.view(n_steps, -1))
        return F.log_softmax(logits, dim=1)

In [None]:
#From this model we can create a loss function and an optimizer

def loss_function(tag_scores, gold_tags):
    """Return the negative log-likelihood loss of ``tag_scores`` against ``gold_tags``.

    Args:
        tag_scores: Predicted scores, passed as the input of ``nn.NLLLoss``.
        gold_tags: Target class indices, passed as the target of ``nn.NLLLoss``.

    Returns:
        The loss tensor produced by ``nn.NLLLoss`` with its default settings.
    """
    criterion = nn.NLLLoss()
    return criterion(tag_scores, gold_tags)

In [None]:
#Now we can train the model
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Positional arguments are (embedding_dim, hidden_dim, vocab_size, tagset_size).
model = RNN(100, 128, 100, 100).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

n_epoch = 100

# NOTE(review): `data` and `test_data` are not defined in the visible cells;
# they are assumed to be iterables of (sentence, tags) pairs of integer
# indices -- TODO confirm and define them before this cell.
for epoch in range(n_epoch):
    # --- training pass: one optimizer step per sentence ---
    for sentence, tags in data:
        sentence = torch.tensor(sentence, dtype=torch.long).to(device)
        tags = torch.tensor(tags, dtype=torch.long).to(device)

        model.zero_grad()

        tag_scores = model(sentence)

        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()

    # --- evaluation pass on the test data ---
    # Per-sentence losses are gathered in a Python list and converted to a
    # tensor afterwards: torch tensors have no append() method, so appending
    # to a pre-allocated tensor raises AttributeError.
    test_losses = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for sentence, tags in test_data:
            sentence = torch.tensor(sentence, dtype=torch.long).to(device)
            tags = torch.tensor(tags, dtype=torch.long).to(device)

            tag_scores = model(sentence)

            loss = loss_function(tag_scores, tags)

            test_losses.append(loss.item())

    losses = torch.tensor(test_losses)

    print("Epoch " + str(epoch) + " : " + str(losses.mean()))
    print("Std : " + str(losses.std()))

    # `loss` here is the most recently computed loss (the last test sentence
    # when test_data is non-empty).
    print("Epoch: {}/{}.............".format(epoch, n_epoch), end=" ")
    print("Loss: {:.4f}".format(loss.item()))