<h1><center>VecMap multilingual embeddings using GloVe vectorizer</center></h1>

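This notebook trains GloVe embeddings on lemmatized Ancient Greek and French texts, maps both embedding spaces into a shared space with VecMap, and exports the result for gensim and the TensorFlow Projector. References here.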

In [1]:
import os
import gensim
from gensim.models import Word2Vec
import glob
import string
import unidecode
from glove import Corpus, Glove
import csv
from lxml import etree as ET
import lxml.html
import subprocess

## Preparing the sentence lists

In [2]:
# Root directory of the Greek XML files (tagged the Perseus way); it is searched
# recursively for *.xml files when the Greek sentence list is built.
author_gk="./xml/homer_gk"

In [3]:
# Root directory of the French XML files (home-made tagging); it is searched
# recursively for *.xml files when the French sentence list is built.
author_fr = "./xml/fr_translators"

In [4]:
# Build the Greek training data: one list of lemmas per <sentence> element,
# gathered from every XML file found under author_gk.
texts_gk = ""
lemmatized_sentences_gk = []
for xml_path in glob.iglob(author_gk + '/**/*.xml', recursive=True):
    # Blank text nodes are removed and entities are not resolved while parsing.
    xml_parser = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
    tree_gk = ET.parse(xml_path, xml_parser)
    # Every lemma attribute is kept as-is (punctuation included), in document order.
    lemmatized_sentences_gk.extend(
        list(sentence_node.xpath(".//word/@lemma"))
        for sentence_node in tree_gk.findall(".//sentence")
    )

In [5]:
# Build the French training data: one list of lemmas per <sentence> element,
# gathered from every XML file found under author_fr. Punctuation and
# single-space lemmas are left out.
texts_fr = ""
lemmatized_sentences_fr = list()
files = glob.iglob(author_fr + '/**/*.xml', recursive=True)
for filename in files:
    # Blank text nodes are removed and entities are not resolved while parsing.
    p = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
    tree_fr = ET.parse(filename, p)
    sentences_fr = tree_fr.findall(".//sentence")
    for sentence in sentences_fr:
        words_per_sentence = list()
        for word in sentence.xpath(".//word/@lemma"):
            # `word not in string.punctuation` is a substring test, so it also
            # rejects the empty string. The space check must compare by value:
            # the former `is not " "` compared object identity, which never
            # filtered anything and raises a SyntaxWarning on Python >= 3.8.
            if word not in string.punctuation and word != " ":
                words_per_sentence.append(word)
        lemmatized_sentences_fr.append(words_per_sentence)

<h2>Checking sentences</h2>

In [6]:
# Sanity check: show the first Greek sentence (as a list of lemmas) and the number of sentences.
print(lemmatized_sentences_gk[0])
print(len(lemmatized_sentences_gk))

['ἀνήρ', 'ἐγώ', 'ἐνέπω', ',', 'Μοῦσα', ',', 'πολύτροπος', ',', 'ὅς', 'μάλα', 'πολύς', 'πλάζω', ',', 'ἐπεί', 'Τροία', 'ἱερός', 'πτολίεθρον', 'πέρθω', '·']
15138


In [7]:
# Sanity check: show the first French sentence (as a list of lemmas) and the number of sentences.
print(lemmatized_sentences_fr[0])
print(len(lemmatized_sentences_fr))

['ce', 'être', 'ainsi', 'que', 'en', 'ces', 'lieu', 'prier', 'le', 'noble', 'et', 'patient', 'Ulysse', 'cependant', 'le', 'jeune', 'fille', 'sur', 'le', 'chariot', 'que', 'traîner', 'de', 'fort', 'mule', 'arriver', 'à', 'le', 'ville']
120154


<h2>Training GloVe models</h2>

In [8]:
# Number of components per word vector, shared by both GloVe models; the same value
# is written into the header line of the exported vector files.
nb_comp=300

In [9]:
# Greek corpus object; once fitted it provides the `matrix` and `dictionary` used below.
corpus_gk = Corpus()

In [10]:
# Fit the corpus on the Greek lemma lists with window=20
# (presumably the co-occurrence context size; confirm against the glove package docs).
corpus_gk.fit(lemmatized_sentences_gk, window=20)

In [11]:
# Greek GloVe model: nb_comp components per vector, learning rate 0.05.
glove_gk = Glove(no_components=nb_comp, learning_rate=0.05)

In [12]:
# Train the Greek model on the corpus matrix: 150 epochs, 10 threads, verbose output off.
# NOTE(review): no_threads=10 is hard-coded; adjust to the machine running the notebook.
glove_gk.fit(corpus_gk.matrix, epochs=150, no_threads=10, verbose=False)

In [13]:
# Attach the corpus dictionary to the model; the export cell below reads
# glove_gk.dictionary to find each word's row in glove_gk.word_vectors.
glove_gk.add_dictionary(corpus_gk.dictionary)

In [14]:
# French corpus object; once fitted it provides the `matrix` and `dictionary` used below.
corpus_fr = Corpus()

In [15]:
# Fit the corpus on the French lemma lists with window=20, the same setting as the Greek corpus.
corpus_fr.fit(lemmatized_sentences_fr, window=20)

In [16]:
# French GloVe model, configured exactly like the Greek one (nb_comp components, learning rate 0.05).
glove_fr = Glove(no_components=nb_comp, learning_rate=0.05)

In [17]:
# Train the French model on the corpus matrix: 150 epochs, 10 threads, verbose output off.
# NOTE(review): no_threads=10 is hard-coded; adjust to the machine running the notebook.
glove_fr.fit(corpus_fr.matrix, epochs=150, no_threads=10, verbose=False)

In [23]:
# Attach the corpus dictionary to the model; the export cell below reads
# glove_fr.dictionary to find each word's row in glove_fr.word_vectors.
glove_fr.add_dictionary(corpus_fr.dictionary)

<h2>Checking models</h2>

In [24]:
# Qualitative check: nearest neighbours of the lemma 'Ὀδυσσεύς' in the Greek model.
glove_gk.most_similar('Ὀδυσσεύς')

[('πολύμητις', 0.5707263838148661),
 ('ταλασίφρων', 0.5467660868558205),
 ('πολυμήχανος', 0.5328508330360958),
 ('τλήμων', 0.5301694650317548)]

In [26]:
# Qualitative check: nearest neighbours of the lemma 'Ulysse' in the French model.
glove_fr.most_similar('Ulysse')

[('Odysseus', 0.6425404979335774),
 ('ingénieux', 0.36945546442174043),
 ('patient', 0.3214320093424856),
 ('aviser', 0.30950651029393333)]

<h2>Creating VecMap compatible models</h2>

In [27]:
# Dump the Greek vectors as text for VecMap: a "<vocabulary size> <dimensions>"
# header line, then one "<word> <v1> ... <vn>" line per word. As before, no
# newline is written after the last line.
os.makedirs('./dumped', exist_ok=True)  # the output directory may not exist on a fresh checkout
# UTF-8 is set explicitly: the Greek words are non-ASCII, and the platform default
# encoding is not guaranteed to be able to represent them.
with open('./dumped/glove_vecs_gk.txt', "w", encoding="utf-8") as glove_vecs_gk:
    glove_vecs_gk.write(str(len(glove_gk.dictionary)) + " " + str(nb_comp) + "\n")
    # glove_gk.dictionary maps each word to its row index in glove_gk.word_vectors.
    glove_vecs_gk.write("\n".join(
        word + " " + " ".join(str(stat) for stat in glove_gk.word_vectors[row])
        for word, row in glove_gk.dictionary.items()
    ))
# The file is closed by the `with` block; no explicit close() is needed.

In [28]:
# Dump the French vectors as text for VecMap: a "<vocabulary size> <dimensions>"
# header line, then one "<word> <v1> ... <vn>" line per word. As before, no
# newline is written after the last line.
os.makedirs('./dumped', exist_ok=True)  # the output directory may not exist on a fresh checkout
# UTF-8 is set explicitly: accented French words are non-ASCII, and the platform
# default encoding is not guaranteed to be able to represent them.
with open('./dumped/glove_vecs_fr.txt', "w", encoding="utf-8") as glove_vecs_fr:
    glove_vecs_fr.write(str(len(glove_fr.dictionary)) + " " + str(nb_comp) + "\n")
    # Iterate the model's own dictionary (word -> row index in glove_fr.word_vectors),
    # the same mapping used for the header count, instead of mixing it with
    # corpus_fr.dictionary.
    glove_vecs_fr.write("\n".join(
        word + " " + " ".join(str(stat) for stat in glove_fr.word_vectors[row])
        for word, row in glove_fr.dictionary.items()
    ))
# The file is closed by the `with` block; no explicit close() is needed.

<h2>Mapping with VecMap</h2>

In [53]:
def mapping(source, target):
    """Map two embedding files into a shared space with VecMap.

    Runs ``./vecmap/map_embeddings.py`` through ``python3`` with the ``--cuda``
    and ``--semi_supervised`` options, using ``./dicts/train.dict`` as the seed
    dictionary. The mapped embeddings are written to
    ``./dumped/src_vecmapped_glove.emb`` and ``./dumped/trg_vecmapped_glove.emb``.

    Args:
        source: path of the source-language embedding file passed to VecMap.
        target: path of the target-language embedding file passed to VecMap.

    Raises:
        subprocess.CalledProcessError: if the VecMap process exits with a
            non-zero status. The exit status used to be ignored, so a failed
            run went unnoticed until a later cell read stale or missing files.
    """
    subprocess.check_call([
        'python3', './vecmap/map_embeddings.py',
        '--cuda', '--semi_supervised', './dicts/train.dict',
        source, target,
        './dumped/src_vecmapped_glove.emb', './dumped/trg_vecmapped_glove.emb',
    ])

In [54]:
# Greek embeddings are the source and French embeddings the target of the mapping.
mapping('./dumped/glove_vecs_gk.txt','./dumped/glove_vecs_fr.txt')

<h2>Making KeyedVectors compatible models from VecMap models</h2>

In [55]:
# Merge the two VecMap outputs into a single text file with a
# "<vocabulary size> <dimensions>" header, loadable as gensim KeyedVectors.
gk_vec_file = './dumped/src_vecmapped_glove.emb'
fr_vec_file = "./dumped/trg_vecmapped_glove.emb"

# The accumulator is deliberately NOT called `glob`: that name rebound the imported
# `glob` module to a list, so re-running the XML-loading cells afterwards failed
# on `glob.iglob`.
mapped_lines = []
for vec_file in (gk_vec_file, fr_vec_file):
    with open(vec_file, 'r', encoding='utf-8') as f_in:
        # Skip each file's own header line. Line endings are stripped and blank
        # lines dropped so that a missing final newline in the first file cannot
        # glue its last vector to the first vector of the second file.
        mapped_lines.extend(
            line.rstrip('\n') for line in f_in.readlines()[1:] if line.strip()
        )

with open('./dumped/bilingual_glove_vecs.txt', 'w', encoding='utf-8') as f_bil:
    # The header counts the vectors actually written, so it stays correct even
    # if the mapped files do not match the in-memory dictionaries.
    f_bil.write(str(len(mapped_lines)) + " " + str(nb_comp) + "\n")
    for line in mapped_lines:
        f_bil.write(line + "\n")
# Files are closed by their `with` blocks; no explicit close() is needed.

In [56]:
# Load the merged Greek + French vectors from the text (non-binary) word2vec-format file.
modeltsv = gensim.models.KeyedVectors.load_word2vec_format('./dumped/bilingual_glove_vecs.txt', binary=False)

In [57]:
# Qualitative check of the shared space: neighbours of 'Ulysse' can now come from either language.
modeltsv.most_similar('Ulysse')

[('Odysseus', 0.874396562576294),
 ('ἀτάρ', 0.8216365575790405),
 ('revenir', 0.7921326756477356),
 ('alors', 0.7792285680770874),
 ('héros', 0.7757202982902527),
 ('père', 0.7735368609428406),
 ('πόθι', 0.77182537317276),
 ('ἐπεί', 0.766440212726593),
 ('temps', 0.7658858299255371),
 ('rentrer', 0.7638633847236633)]

<h2>Writing metadata for TensorFlow Projector</h2>

In [48]:
# Export the bilingual model as two parallel TSV files: one tab-separated vector
# per line in tensorflow_glove.tsv, and the matching word on the same line number
# in tensorflowmeta_glove.tsv.
try:
    vocabulary = modeltsv.index_to_key  # gensim >= 4.0
except AttributeError:
    vocabulary = modeltsv.index2word    # gensim < 4.0 (attribute removed in 4.0)

# UTF-8 is set explicitly because the vocabulary contains Greek and accented French words.
with open("./dumped/tensorflow_glove.tsv", 'w', encoding='utf-8') as tensors, \
        open("./dumped/tensorflowmeta_glove.tsv", 'w', encoding='utf-8') as metadata:
    for word in vocabulary:
        metadata.write(word + '\n')
        tensors.write('\t'.join(map(str, modeltsv[word])) + '\n')
# Both files are closed by the `with` statement; no explicit close() is needed.