<h1><center>VecMap multilingual embeddings using W2V vectorizer</center></h1>

Implementation of VecMap multilingual embeddings using Word2Vec (W2V) vectors. References here.

In [2]:
import os
import gensim
from gensim.models import Word2Vec
import glob
import string
import unidecode
import csv
from lxml import etree as ET
import lxml.html
import subprocess

## Préparation des listes de phrases

In [3]:
# Root directory of the Greek XML files (tagged the Perseus way); searched recursively for *.xml
author_gk = "./xml/homer_gk"

In [4]:
# Root directory of the French XML files (home-made tagging); searched recursively for *.xml
author_fr = "./xml/fr_translators"

In [5]:
# Build the Greek corpus: one list of lemmas per <sentence>, over every XML file under author_gk.
texts_gk=""  # not used by the visible cells; kept in case another cell reads it
lemmatized_sentences_gk = []
# iglob yields files in filesystem order, which differs between machines;
# sorting fixes the sentence order so the corpus is reproducible.
files = sorted(glob.iglob(author_gk+'/**/*.xml', recursive=True))
for filename in files:
    p = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
    tree_gk = ET.parse(filename, p)
    sentences_gk = tree_gk.findall(".//sentence")
    for sentence in sentences_gk:
        # lxml XPath returns "smart strings" that reference their parent element and so
        # keep the whole parsed tree alive; str() detaches them so each tree can be freed.
        # NOTE(review): punctuation lemmas are kept here, whereas the French cell filters them.
        words_per_sentence = [str(word) for word in sentence.xpath(".//word/@lemma")]
        lemmatized_sentences_gk.append(words_per_sentence)

In [7]:
# Build the French corpus (one list of lemmas per <sentence>) and record, for every lemma,
# the set of authors whose files contain it.
texts_fr=""  # not used by the visible cells; kept in case another cell reads it
list_of_vocs={}  # lemma -> set of author labels
lemmatized_sentences_fr = []
# iglob yields files in filesystem order, which differs between machines;
# sorting fixes the sentence order so the corpus is reproducible.
files = sorted(glob.iglob(author_fr+'/**/*.xml', recursive=True))
for filename in files:
    p = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
    tree_fr = ET.parse(filename, p)
    # Author label = path relative to author_fr minus its last 11 characters
    # (presumably a fixed-length file-name suffix — TODO confirm against the corpus layout).
    author = filename[len(author_fr)+1:len(filename)-11]
    sentences_fr = tree_fr.findall(".//sentence")
    for sentence in sentences_fr:
        words_per_sentence = []
        for word in sentence.xpath(".//word/@lemma"):
            # lxml "smart strings" keep the parsed tree alive; store plain str instead.
            word = str(word)
            # `!=` rather than `is not`: identity comparison with a literal never matches
            # an XPath result, so the single-space lemma was not actually being dropped.
            if word not in string.punctuation and word != " ":
                words_per_sentence.append(word)
                list_of_vocs.setdefault(word, set()).add(author)
        lemmatized_sentences_fr.append(words_per_sentence)

<h2>Checking sentences</h2>

In [13]:
# Sanity check: first Greek sentence, then the number of Greek sentences
print(lemmatized_sentences_gk[0], len(lemmatized_sentences_gk), sep="\n")

['ἀνήρ', 'ἐγώ', 'ἐνέπω', ',', 'Μοῦσα', ',', 'πολύτροπος', ',', 'ὅς', 'μάλα', 'πολύς', 'πλάζω', ',', 'ἐπεί', 'Τροία', 'ἱερός', 'πτολίεθρον', 'πέρθω', '·']
15138


In [14]:
# Sanity check: first French sentence, then the number of French sentences
print(lemmatized_sentences_fr[0], len(lemmatized_sentences_fr), sep="\n")

['ce', 'être', 'ainsi', 'que', 'en', 'ces', 'lieu', 'prier', 'le', 'noble', 'et', 'patient', 'Ulysse', 'cependant', 'le', 'jeune', 'fille', 'sur', 'le', 'chariot', 'que', 'traîner', 'de', 'fort', 'mule', 'arriver', 'à', 'le', 'ville']
120154


<h2>Training W2V models</h2>

In [15]:
# Number of embedding components; passed as `size` to both Word2Vec models
nb_comp = 300

In [16]:
# Train the Greek Word2Vec model on the lemmatized Greek sentences.
model_gk = Word2Vec(
    sentences=lemmatized_sentences_gk,
    size=nb_comp,
    min_count=10,
    max_vocab_size=10000,
    negative=5,
    iter=50,
)

In [17]:
# Create the output directory first so this cell also works on a fresh checkout.
os.makedirs("./dumped", exist_ok=True)
# Dump the Greek vectors in word2vec text format (the input file for the VecMap step).
model_gk.wv.save_word2vec_format("./dumped/w2v_model_gk.emb", fvocab=None, binary=False)

In [18]:
# Train the French Word2Vec model with the same hyper-parameters as the Greek one.
model_fr = Word2Vec(
    sentences=lemmatized_sentences_fr,
    size=nb_comp,
    min_count=10,
    max_vocab_size=10000,
    negative=5,
    iter=50,
)

In [19]:
# Dump the French vectors in word2vec text format (the input file for the VecMap step).
model_fr.wv.save_word2vec_format(
    "./dumped/w2v_model_fr.emb",
    fvocab=None,
    binary=False,
)

<h2>Checking models</h2>

In [22]:
# Ten nearest neighbours of "Odysseus" in the monolingual Greek space
model_gk.wv.most_similar('Ὀδυσσεύς', topn=10)

[('ὑφορβός', 0.4769396185874939),
 ('Εὔμαιος', 0.34372854232788086),
 ('ἀοιδός', 0.33389297127723694),
 ('Ἀχιλλεύς', 0.3271576166152954),
 ('Ἀγήνωρ', 0.307390034198761),
 ('Λακεδαίμων', 0.28493624925613403),
 ('γυνή', 0.28086012601852417),
 ('ξενίζω', 0.2763843238353729),
 ('Ὀρέστης', 0.2703791856765747),
 ('Μενοιτιάδης', 0.2694649398326874)]

In [23]:
# Ten nearest neighbours of "Ulysse" in the monolingual French space
model_fr.wv.most_similar('Ulysse', topn=10)

[('Odysseus', 0.6022398471832275),
 ('Télémaque', 0.4212080240249634),
 ('héros', 0.4194243550300598),
 ('Pirée', 0.3815237879753113),
 ('Pénélope', 0.3777717351913452),
 ('Théoclymène', 0.36129456758499146),
 ('Oreste', 0.34781932830810547),
 ('Alcinoos', 0.3445066213607788),
 ('Alcinoüs', 0.3425518870353699),
 ('Philoctète', 0.3377434015274048)]

<h2>Mapping with VecMap</h2>

In [31]:
def mapping(source, target, dictionary='./dicts/train.dict', cuda=True):
    """Run VecMap's ``map_embeddings.py`` in semi-supervised mode on two embedding files.

    The mapped embeddings are written to ``./dumped/src_vecmapped_w2v.emb`` and
    ``./dumped/trg_vecmapped_w2v.emb``.

    Args:
        source: path of the source-language embeddings (word2vec text format).
        target: path of the target-language embeddings (word2vec text format).
        dictionary: path of the seed dictionary passed to ``--semi_supervised``.
        cuda: pass ``--cuda`` to VecMap (the original hard-coded behaviour);
            set to False to run without it.

    Raises:
        subprocess.CalledProcessError: if the VecMap script exits with a non-zero
            status. Previously the exit code was ignored, so later cells could
            silently read stale or missing output files.
    """
    command = ['python3', './vecmap/map_embeddings.py']
    if cuda:
        command.append('--cuda')
    command += [
        '--semi_supervised', dictionary,
        source, target,
        './dumped/src_vecmapped_w2v.emb', './dumped/trg_vecmapped_w2v.emb',
    ]
    subprocess.run(command, check=True)

In [32]:
# Map Greek (source) and French (target) embeddings into a shared space with VecMap
mapping(
    source='./dumped/w2v_model_gk.emb',
    target='./dumped/w2v_model_fr.emb',
)

<h2>Making KeyedVectors compatible models from VecMap models</h2>

In [37]:
# Concatenate the two VecMap outputs into one word2vec-format text file.
gk_vec_file='./dumped/src_vecmapped_w2v.emb'
fr_vec_file="./dumped/trg_vecmapped_w2v.emb"

# The accumulator must not be called `glob`: that would shadow the imported module and
# break `glob.iglob(...)` when the parsing cells are re-run.
mapped_lines = []
vector_sizes = set()
for vec_file in (gk_vec_file, fr_vec_file):
    # Explicit UTF-8: the vocabulary contains Greek and accented French words.
    with open(vec_file, 'r', encoding='utf-8') as f_vec:
        # First line of a word2vec text file is the header "<vector count> <dimension>".
        header = f_vec.readline().split()
        vector_sizes.add(int(header[1]))
        mapped_lines.extend(
            line if line.endswith("\n") else line + "\n"
            for line in f_vec
            if line.strip()
        )

if len(vector_sizes) != 1:
    raise ValueError("mapped embeddings have different dimensions: %s" % sorted(vector_sizes))

with open('./dumped/bilingual_w2v_vecs.txt', 'w', encoding='utf-8') as f_bil:
    # The header is derived from what is actually written, so this cell no longer depends
    # on the in-memory models and cannot disagree with the file body.
    f_bil.write(str(len(mapped_lines)) + " " + str(vector_sizes.pop()) + "\n")
    f_bil.writelines(mapped_lines)

In [9]:
# Load the merged Greek + French vectors (word2vec text format) as one KeyedVectors object
modeltsv = gensim.models.KeyedVectors.load_word2vec_format(
    './dumped/bilingual_w2v_vecs.txt',
    binary=False,
)

In [10]:
# Ten nearest neighbours of "mort" in the shared space (Greek and French words together)
modeltsv.most_similar('mort', topn=10)

[('trépas', 0.7332578897476196),
 ('Parque', 0.6382513046264648),
 ('μάχη', 0.6073669195175171),
 ('Kèr', 0.5236033797264099),
 ('perte', 0.4916805922985077),
 ('malheur', 0.489699125289917),
 ('meurtre', 0.48224690556526184),
 ('destin', 0.4717833399772644),
 ('carnage', 0.4534017741680145),
 ('châtiment', 0.4489312171936035)]

<h2>Writing metadata for TensorFlow Projector</h2>

In [42]:
# Export two parallel TSV files for TensorFlow Projector: line i of the tensors file holds
# the tab-separated vector of the word written on line i of the metadata file.
# Explicit UTF-8 because the labels include Greek and accented French words; the `with`
# statement closes both files, so no manual close() is needed.
with open("./dumped/tensorflow_w2v.tsv", 'w', encoding='utf-8') as tensors, \
        open("./dumped/tensorflowmeta_w2v.tsv", 'w', encoding='utf-8') as metadata:
    for word in modeltsv.index2word:
        metadata.write(word+'\n')
        vector_row = '\t'.join(map(str, modeltsv[word]))
        tensors.write(vector_row + '\n')

<h2>If you want to know which author used what</h2>

In [12]:
# Same export as the plain Projector files, but each label is suffixed with the authors
# that use the word when fewer than 5 authors do (words without author data keep a bare label).
# Explicit UTF-8 because the labels include Greek and accented French words.
with open("./dumped/tensorflow_w2v_names.tsv", 'w', encoding='utf-8') as tensors, \
        open("./dumped/tensorflowmeta_w2v_names.tsv", 'w', encoding='utf-8') as metadata:
    for word in modeltsv.index2word:
        word_authors = list_of_vocs.get(word, ())
        # sorted(): set iteration order is arbitrary, so sort to get stable labels across runs.
        authors = "_".join(sorted(word_authors)) if len(word_authors) < 5 else ""
        # Any non-empty author string is appended (the former `len(authors)>1` test
        # dropped a one-character author label).
        if authors:
            metadata.write(word+"_"+authors+'\n')
        else:
            metadata.write(word+'\n')
        vector_row = '\t'.join(map(str, modeltsv[word]))
        tensors.write(vector_row + '\n')