In this project, we will use an RNN architecture to build a Machine Translation model.

It will use as a corpus wikipedia dumps.

Either the source or the target will be English. We will, in our case, try English to French Translation.

Dataset

In [2]:
#Test samples location and preprocessing
import numpy as np

data_path = "fra.txt"
data = open(data_path, "r", encoding='utf-8')

# Load data
source = []
target = []

for line in data:
    # Discard empty lines if any
    if line:
        l = line.split("\t")
        source.append(l[0])
        target.append(l[1])

# Close file
data.close()

def remove_duplicates(source, target):
    new_source = [source[0]]
    new_target = [target[0]]
    for i in range(1, len(source)):
        if source[i] != source[i-1]:
            new_source.append(source[i])
            new_target.append(target[i])
            
    return new_source, new_target

source, target = remove_duplicates(source, target)
print("Some dataset statistics:\n")

print("Number of sentences: {}\n".format(len(source)))

source_word_count = [len(sentence) for sentence in source]
target_word_count = [len(sentence) for sentence in target]

print("Total number of words (source): {}".format(np.sum(source_word_count)))
print("Total number of words (target): {}\n".format(np.sum(target_word_count)))

print("Average number of words per sentence (source): {}".format(np.mean(source_word_count)))
print("Average number of words per sentence (target): {}\n".format(np.mean(target_word_count)))

unique_source_words = []
for sentence in source:
    for word in sentence:
        if word not in unique_source_words:
            unique_source_words.append(word)
            
unique_target_words = []
for sentence in target:
    for word in sentence:
        if word not in unique_target_words:
            unique_target_words.append(word)
            
print("Number of unique words (source): {}".format(len(unique_source_words)))
print("Number of unique words (target): {}".format(len(unique_target_words)))


FileNotFoundError: [Errno 2] No such file or directory: 'fra.txt'

Word embedding

# We will use three different types of word embeddings:
# 1. Word2Vec
# 2. GloVe
# 3. FastText


In [6]:
"""## Word2Vec"""

import io
import os

# Python program to generate word vectors using Word2Vec
 
# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import download
import warnings
 
warnings.filterwarnings(action = 'ignore')
 
import gensim
from gensim.models import Word2Vec

In [7]:
#keep in mind you have to launch the notebook inside the git folder to make this work (second one)
from inspect import getsourcefile
import sys
print(os.path.dirname(getsourcefile(lambda:0)))
print(sys.path[0])
print(os.path.abspath(sys.argv[0]))

C:\Users\adrie\AppData\Local\Temp\ipykernel_10632
C:\Users\adrie\OneDrive\Bureau\webtextanalysis
C:\Users\adrie\miniconda3\envs\fondation\lib\site-packages\ipykernel_launcher.py


In [8]:
#We need to download the punkt package for tokenizing sentences
download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adrie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
def tokenize(path = sys.path[0] + "\\samples\\alice.txt"):
    #  Reads ‘alice.txt’ file
    with io.open(path, 'r',encoding='utf8') as sample :
        s = sample.read()
        
        # Replaces escape character with space
        f = s.replace("\n", " ")
        
        data = []
        
        # iterate through each sentence in the file
        for i in sent_tokenize(f):
            temp = []
            
            # tokenize the sentence into words
            for j in word_tokenize(i):
                temp.append(j.lower())
        
            data.append(temp)

    return data

def test_similarity(model, word1, word2, model_name):
    print("Cosine similarity between '" + word1 + "' and '"+ word2 +"' - " + model_name + " : " + str(model.similarity(word1, word2)))
    
def Fast_similarity(model, word1, word2):
    print("Cosine similarity between '" + word1 + "' and '"+ word2 +"' - " + "FastText" + " : " + str(model.wv.similarity(word1, word2)))

embedding_data = tokenize()

In [12]:
with open(sys.path[0] + "\\samples\\alice_tokenised.txt", 'w',encoding='utf8') as f:
    for line in embedding_data:
        f.write(" ".join(line) + " ")

In [13]:
"""## Word2Vec"""
 
# Create CBOW model
w2v_model_cbow = gensim.models.Word2Vec(embedding_data, min_count = 1,
                              vector_size = 100, window = 5)
 
# Print results
test_similarity(w2v_model_cbow.wv, 'alice', 'wonderland', "CBOW")
     
test_similarity(w2v_model_cbow.wv, 'alice', 'machines', "CBOW")
 
# Create Skip Gram model
w2v_model_skip = gensim.models.Word2Vec(embedding_data, min_count = 1, vector_size = 100,
                                             window = 5, sg = 1)
 
# Print results
test_similarity(w2v_model_skip.wv, 'alice', 'wonderland', "SkipGram")
     
test_similarity(w2v_model_skip.wv, 'alice', 'machines', "SkipGram")

Cosine similarity between 'alice' and 'wonderland' - CBOW : 0.9727886
Cosine similarity between 'alice' and 'machines' - CBOW : 0.8968569
Cosine similarity between 'alice' and 'wonderland' - SkipGram : 0.6006849
Cosine similarity between 'alice' and 'machines' - SkipGram : 0.81720406


In [14]:
"""## GloVe"""

# coding: utf-8
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [15]:
#Only do this once
input_file = sys.path[0] + '\\models\\alice_glove.txt'
output_file = sys.path[0] + '\\models\\gensim_alice_glove.txt'
glove2word2vec(input_file, output_file)

(3390, 300)

In [16]:
#once we have the tokenized file, we can call the glove model

####CALL FROM BASH glove_run.py

In [17]:
# Load the model, can take a bit of time
output_file = sys.path[0] + '\\models\\gensim_alice_glove.txt'
glove_model = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [18]:

# Test the model
test_similarity(glove_model, 'alice', 'wonderland', "GloVe")
test_similarity(glove_model, 'alice', 'machines', "GloVe")

Cosine similarity between 'alice' and 'wonderland' - GloVe : -0.18298031
Cosine similarity between 'alice' and 'machines' - GloVe : -0.8092102


#FastText

In [20]:
"""## FastText"""
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
from gensim.test.utils import datapath

model = FastText(vector_size=100, window=5, min_count=1)  # instantiate
model.build_vocab(corpus_iterable=embedding_data)
model.train(corpus_iterable=embedding_data, total_examples=len(embedding_data), epochs=100)  
Fast_similarity(model,'alice','machine')

Cosine similarity between 'alice' and 'machine' - FastText : -0.117291


RNN

In [None]:
# Now we can create the RNN model that will translate from english to french using one of the previous embeddings

from torch import nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(RNN, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [None]:
#From this model we can create a loss function and an optimizer

def loss_function(tag_scores, gold_tags):
    loss_function = nn.NLLLoss()
    loss = loss_function(tag_scores, gold_tags)
    return loss

In [None]:
#Now we can train the model
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNN(100, 128, 100, 100).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

n_epoch = 100

for epoch in range(n_epoch):
    for sentence, tags in data:
        sentence = torch.tensor(sentence, dtype=torch.long).to(device)
        tags = torch.tensor(tags, dtype=torch.long).to(device)

        model.zero_grad()

        tag_scores = model(sentence)

        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
        
    #here we can use the test data to evaluate the model
    
    losses = torch.zeros(len(test_data))
    
    for sentence, tags in test_data:
        sentence = torch.tensor(sentence, dtype=torch.long).to(device)
        tags = torch.tensor(tags, dtype=torch.long).to(device)

        tag_scores = model(sentence)

        loss = loss_function(tag_scores, tags)
        
        losses.append(loss.item())
        
    print("Epoch " + str(epoch) + " : " + str(losses.mean()))
    print("Std : " + str(losses.std()))
        

    print("Epoch: {}/{}.............".format(epoch, n_epoch), end=" ")
    print("Loss: {:.4f}".format(loss.item()))