# <b>Générer un espace sémantique multilingue à partir de corpus monolingues</b>

Notebook qui permet de générer des modèles d'espace sémantique bilingue à partir de corpus monolingues, avec word2vec.
<lb>Ce notebook a recours, entre autres, aux librairies <b>CLTK</b>, <b>gensim</b> et <b>word2vec</b>, et crée des sorties exploitables, entre autres, sur le projecteur d'embeddings de <a href : "https://projector.tensorflow.org/">tensorflow</a>

## Imports

In [13]:
import os
import gensim
from gensim.models import Word2Vec
import glob
import string
import unidecode
from collections import defaultdict
from cltk.corpus.utils.importer import CorpusImporter
import pandas as pd
import ipympl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import numpy as np

from lxml import etree as ET
import lxml.html

from cltk.corpus.greek.beta_to_unicode import Replacer

Le paramètre suivant sert à déterminer la longueur des vecteurs à entraîner

In [None]:
nb_comp=500

### Si vous n'avez pas encore importé les modèles de langue de CLTK
Sinon, passez à la cellule suivante

In [None]:
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')

In [None]:
from cltk.tokenize.sentence import TokenizeSentence
tokenizer = TokenizeSentence('greek')

## Embeddings du grec

In [None]:
r = Replacer()

In [None]:
author_gk="homer_gk"

## si le texte grec est déjà lemmatisé

Si le texte est tiré de treebanks comme ceux de Perseus, disponibles sur le github de Perseus et déjà lemmatisés, vous pouvez utiliser cette partie. Sinon, allez à la suivante, qui utilise CLTK.

In [None]:
if author_gk != "":
    texts_gk=""
    lemmatized_sentences_gk=list()
    files= glob.iglob(author_gk+'/**/*.xml', recursive=True)
    for filename in files :
        p = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
        tree_gk = ET.parse(filename,p)
        sentences_gk = tree_gk.findall(".//sentence")
        for sentence in sentences_gk:
            words_per_sentence=list()
            for word in sentence.xpath(".//word/@lemma"):
                words_per_sentence.append(word)
            lemmatized_sentences_gk.append(words_per_sentence)

In [None]:
print(lemmatized_sentences_gk[0])
print(len(lemmatized_sentences_gk))

## si le texte grec n'est pas lemmatisé

In [None]:
if author1 != "":
    texts1=""
    files= glob.iglob(author1+'/**/*gk.xml', recursive=True)
    for filename in files :
        p = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
        tree1 = ET.parse(filename,p)
        root1 = tree1.find(".//text")
        rawtext= lxml.html.tostring(root1, method="text", encoding="utf8")
        texts1+=rawtext.decode()

In [None]:
if 'texts1' in locals():
    sentences1 = tokenizer.tokenize_sentences(texts1)

In [None]:
if 'sentences1' in locals():
    print(len(sentences1))

In [None]:
voc1=set()

In [None]:
from cltk.tokenize.word import WordTokenizer
word_tokenizer = WordTokenizer('greek')

In [None]:
from cltk.stem.lemma import LemmaReplacer

lemmatized_sentences=list()
    
lemmatizer = LemmaReplacer('greek')

    
if 'sentences1' in locals():
    for idx,sentence in enumerate(sentences1):
        print(idx)
        lemmatized_tokens = [r.beta_code(lemma) for lemma in lemmatizer.lemmatize(sentence) if len(lemma)>2 and lemma not in ['.', ',', ':', ';','','·',', ',')','(','*','<','>','[',']','—','\'']]
        print(lemmatizer.lemmatize(sentence))
        voc1.update(lemmatized_tokens)
        lemmatized_sentences.append(lemmatized_tokens)

print(len(lemmatized_sentences))

In [None]:
print(lemmatized_sentences[0])

## Entraînement de Word2vec pour le grec

Entraînement du premier corpus monolingue, en grec ancien, à partir des phrases lemmatisées. Par défaut, les termes pris en compte sont ceux qui apparaissent au moins dix fois, pour éviter les coquilles. Le vocabulaire est limité à 10000 mots pour le temps de calcul. On définit la taille des vecteurs avec le paramètre "size", ici par défaut 500.

In [None]:
model_gk = Word2Vec(lemmatized_sentences_gk, min_count=10,max_vocab_size=10000, negative=5, iter=50, size=nb_comp)

In [None]:
print(model_gk)

In [None]:
words = list(model_gk.wv.vocab)

In [None]:
model_gk.wv.save_word2vec_format("./model_gk.emb", fvocab=None, binary=False)

## Embeddings du français

Les textes français sont lemmatisés et mis au format TEI souhaité avec ce taggeur personnel (écrit par Frédéric Glorieux, que j'ai adapté pour Python), disponible <a href="https://github.com/ANRChapitres/tagging">ici</a>. Tous les textes sont mis dans un seul et même dossier.

In [None]:
author_fr = "fr_translators"

In [None]:
list_of_vocs={}

In [None]:
if author_fr != "":
    texts_fr=""
    lemmatized_sentences_fr=list()
    files= glob.iglob(author_fr+'/**/*.xml', recursive=True)
    
    for filename in files :
        author=filename[len(author_fr)+1:len(filename)-11]
        p = ET.XMLParser(remove_blank_text=True, resolve_entities=False)
        tree_fr = ET.parse(filename,p)
        sentences_fr = tree_fr.findall(".//sentence")
        for sentence in sentences_fr:
            words_per_sentence = list()
            for word in sentence.xpath(".//word/@lemma"):
                if word not in string.punctuation and word is not " ":
                    words_per_sentence.append(word)
                    if word in list_of_vocs.keys():
                        list_of_vocs[word].add(author)
                    else :
                        list_of_vocs[word]=set()
                        list_of_vocs[word].add(author)
            lemmatized_sentences_fr.append(words_per_sentence)
print(len(lemmatized_sentences_fr))

In [None]:
model_fr = Word2Vec(lemmatized_sentences_fr, min_count=10,max_vocab_size=10000, negative=5, iter=50, size=nb_comp)

In [None]:
model_fr.wv.save_word2vec_format("./model_fr.emb", fvocab=None, binary=False)

## Projection des embeddings sur un espace commun

In [None]:
import subprocess

Pour la doc, voir ici : https://github.com/artetxem/vecmap
<br/>Attention, si sur gros ordinateur équipé d'une GPU, ne pas oublier le flag "--cuda"
<br/>Ce qui donnerait la commande subprocess.call(['python3', './vecmap/map_embeddings.py', '--unsupervised', '--cuda', source,target, 'src_mapped.emb', 'trg_mapped.emb'])
<br/>Le cas échéant, ne pas oublier d'importer CuPy pour la gestion par GPU
<br/><br/>"Two equivalent words in different languages should have a similar distribution, and we can use this fact to induce the initial set of wordpairings", "Our goal is to learn the <a href="https://en.wikipedia.org/wiki/Transformation_matrix">linear transformation matrices</a> WX and WZ so the mapped embeddings XWX and ZWZ are in the same cross-lingual space", voir le <a href="https://aclweb.org/anthology/P18-1073">papier d'Artetxe et altri</a>.
<br/><br/>Choisir une des deux méthodes suivantes, si vous disposez ou non d'un petit dictionnaire.

In [None]:
def semi_supervised_mapping(source,target) :
    subprocess.call(['python3', './vecmap/map_embeddings.py', '--unsupervised','--cuda', source,target, 'src_mapped.emb', 'trg_mapped.emb'])

In [None]:
def unsupervised_mapping(source,target) :
    subprocess.call(['python3', './vecmap/map_embeddings.py', '--semi_supervised','train.dict','--cuda', source,target, 'src_mapped.emb', 'trg_mapped.emb'])

In [None]:
unsupervised_mapping('model_fr.emb','model_gk.emb')

On fusionne les deux fichiers avec les vecteurs de chaque langue projetés dans le même espace.

In [None]:
gk_vec_file='src_mapped.emb'
fr_vec_file="trg_mapped.emb"
with open ('bilingual_vecs.txt','w+') as f_bil:
    f_bil.write(str(len(model_gk.wv.vocab)+len(model_fr.wv.vocab))+" "+str(nb_comp)+"\n")
    glob=list()
    with open(gk_vec_file, 'r') as f_gk:
        glob.extend(f_gk.readlines()[1:])
    with open(fr_vec_file,'r') as f_fr:
        glob.extend(f_fr.readlines()[1:])
    
    for line in glob:
        f_bil.write(line)
    
    f_bil.close()

## Chargement des vecteurs de l'espace multilingue et démonstration

In [4]:
modeltsv = gensim.models.KeyedVectors.load_word2vec_format('bilingual_vecs.txt', binary=False)

In [None]:
with open("./tensorflow.tsv", 'w+') as tensors:
    with open("./tensorflowmeta.tsv", 'w+') as metadata:
         for word in modeltsv.index2word:
                authors=""
                if word in list_of_vocs.keys():
                    if len(list_of_vocs[word])<5:
                        authors+="_".join(list_of_vocs[word])
                if (len(authors)>1):
                    metadata.write(word+"_"+authors+'\n')
                else:
                    metadata.write(word+'\n')
                vector_row = '\t'.join(map(str, modeltsv[word]))
                tensors.write(vector_row + '\n')

In [5]:
modeltsv.most_similar("Ulysse")

[('Odysseus', 0.6975877285003662),
 ('Ὀδυσσεύς', 0.5966129899024963),
 ('Pirée', 0.5505570769309998),
 ('Tèlémakhos', 0.5240138173103333),
 ('Piraeos', 0.5220750570297241),
 ('ὑφορβός', 0.510199785232544),
 ('Autolykos', 0.4850497543811798),
 ('Télémaque', 0.4747697114944458),
 ('Autolycos', 0.4697454869747162),
 ('Orsiloque', 0.4671053886413574)]

In [6]:
modeltsv.most_similar("aimer")

[('chérir', 0.6728097796440125),
 ('τίω', 0.5826131105422974),
 ('honorer', 0.5664807558059692),
 ('protéger', 0.46612560749053955),
 ('haïr', 0.4527318477630615),
 ('τίνω', 0.44333603978157043),
 ('valoir', 0.4342774748802185),
 ('plaire', 0.42875194549560547),
 ('τιμάω', 0.4277810752391815),
 ('traiter', 0.4219663143157959)]

In [7]:
modeltsv.most_similar("assassiner")

[('tuer', 0.5438829660415649),
 ('κατακτείνω', 0.5224876999855042),
 ('outrager', 0.5185527205467224),
 ('commis', 0.5084099173545837),
 ('tramer', 0.46658986806869507),
 ('égorger', 0.462814062833786),
 ('punir', 0.4557381272315979),
 ('mépriser', 0.44803065061569214),
 ('conspirer', 0.44266751408576965),
 ('déshonorer', 0.4417601525783539)]

In [8]:
modeltsv.most_similar("vin")

[('οἶνος', 0.6359249949455261),
 ('breuvage', 0.6237227916717529),
 ('boisson', 0.5851397514343262),
 ('nectar', 0.5671843886375427),
 ('liqueur', 0.5542675852775574),
 ('lait', 0.5295891165733337),
 ('κρέας', 0.5283176898956299),
 ('μέθυ', 0.5217383503913879),
 ('ἐρυθρός', 0.5033228397369385),
 ('μελιηδής', 0.5014011859893799)]

In [9]:
modeltsv.most_similar("guerrier")

[('λαός', 0.5341816544532776),
 ('compagnon', 0.49779969453811646),
 ('soldat', 0.4497343897819519),
 ('Argiens', 0.39662763476371765),
 ('στρατός', 0.3915303945541382),
 ('Grecs', 0.3909304440021515),
 ('nôtres', 0.3854723274707794),
 ('Danaens', 0.3830326795578003),
 ('gens', 0.3824494779109955),
 ('capitaine', 0.3746872842311859)]

In [10]:
modeltsv.most_similar("nourriture")

[('σῖτος', 0.6523345708847046),
 ('pain', 0.5389895439147949),
 ('πίνω', 0.5167507529258728),
 ('aliment', 0.5109034180641174),
 ('boisson', 0.4662516117095947),
 ('provision', 0.4558088779449463),
 ('manger', 0.4495968520641327),
 ('κρέας', 0.44095897674560547),
 ('breuvage', 0.44063881039619446),
 ('φαγεῖν', 0.43504542112350464)]

In [11]:
modeltsv.most_similar("et")

[('·', 0.48861220479011536),
 ('dont', 0.45370054244995117),
 ('où', 0.3881946802139282),
 ('qui', 0.3793215751647949),
 ('δέ', 0.3743005394935608),
 ('pour', 0.36974287033081055),
 ('de', 0.3554162383079529),
 ('quand', 0.34876349568367004),
 ('lorsque', 0.3483029305934906),
 ('puis', 0.3376055359840393)]

In [12]:
modeltsv.most_similar("afin")

[('ὄφρα', 0.54485023021698),
 ('tandis', 0.4267917275428772),
 ('avant', 0.42552343010902405),
 ('alors', 0.41726064682006836),
 ('pour', 0.35444748401641846),
 ('parce', 0.34463194012641907),
 ('peur', 0.3236440122127533),
 ('sitôt', 0.3111719787120819),
 ('désireux', 0.31033483147621155),
 ('demain', 0.30295059084892273)]

In [15]:
modeltsv.most_similar("mort")

[('trépas', 0.5939536094665527),
 ('Parque', 0.5737640857696533),
 ('Kèr', 0.48828446865081787),
 ('νεῖκος', 0.46946752071380615),
 ('perte', 0.4430833160877228),
 ('meurtre', 0.4051223397254944),
 ('Parques', 0.3897401988506317),
 ('forfait', 0.3799719512462616),
 ('destin', 0.3790188729763031),
 ('Kère', 0.3777950704097748)]

In [16]:
modeltsv.most_similar("bateau")

[('vaisseau', 0.8183284401893616),
 ('navire', 0.7919580936431885),
 ('nef', 0.7286497354507446),
 ('croiseur', 0.7107780575752258),
 ('ναῦς', 0.6743136644363403),
 ('barque', 0.5556160807609558),
 ('bord', 0.538252592086792),
 ('galère', 0.5344588756561279),
 ('flotte', 0.5338149070739746),
 ('κλισία', 0.5305114984512329)]

In [21]:
modeltsv.most_similar("θυγάτηρ")

[('fille', 0.6464950442314148),
 ('παράκοιτις', 0.6018003225326538),
 ('νύμφη', 0.5571032166481018),
 ('ἐκγίγνομαι', 0.5498508810997009),
 ('κόρη', 0.5260941982269287),
 ('Ναυσικάα', 0.4998238682746887),
 ('εὐρύοπα', 0.49351561069488525),
 ('ὀπυίω', 0.4886321425437927),
 ('γαμβρός', 0.4871732294559479),
 ('Λητώ', 0.4824705719947815)]

In [22]:
modeltsv.most_similar("ἐγώ")

[('σύ', 0.7144296765327454),
 ('moi', 0.6475143432617188),
 ('te', 0.5921909213066101),
 ('ἐμός', 0.5551868677139282),
 ('σός', 0.5544769167900085),
 ('ἐκεῖνος', 0.5518284440040588),
 ('vous', 0.5323278903961182),
 ('me', 0.5144422650337219),
 ('tu', 0.5044656991958618),
 ('δύστηνος', 0.502662718296051)]

In [24]:
modeltsv.most_similar("ἀθάνατος")

[('immortel', 0.5389121174812317),
 ('ἄνθρωπος', 0.5159936547279358),
 ('Οὐρανίωνες', 0.502825140953064),
 ('μάκαρ', 0.48085817694664),
 ('θεός', 0.4569125473499298),
 ('γόνος', 0.4510040581226349),
 ('αἰγίοχος', 0.4306454062461853),
 ('αἰδοῖος', 0.41837960481643677),
 ('τέρας', 0.4159465432167053),
 ('πάτηρ', 0.40604519844055176)]

In [25]:
modeltsv.most_similar("Ζεύς")

[('Jupiter', 0.6503325700759888),
 ('Zeus', 0.5428218245506287),
 ('ζεύς', 0.5213541984558105),
 ('Atlas', 0.4349021911621094),
 ('Κρόνος', 0.38605567812919617),
 ('τιμάω', 0.370660662651062),
 ('ὑπισχνέομαι', 0.3631303012371063),
 ('Ἥρα', 0.3576769530773163),
 ('Κρονίδης', 0.35384267568588257),
 ('θεός', 0.35296741127967834)]

## Projecteur local (mais mieux vaut utiliser celui de tensorflow) : partie en développement

In [29]:
gk_vec_file='src_mapped.emb'
fr_vec_file="trg_mapped.emb"
with open ('bilingual_vecs_local.txt','w+') as f_bil:
    glob=list()
    with open(gk_vec_file, 'r') as f_gk:
        glob.extend(f_gk.readlines()[1:])
    with open(fr_vec_file,'r') as f_fr:
        glob.extend(f_fr.readlines()[1:])
    
    for line in glob:
        f_bil.write(line)
    
    f_bil.close()

In [3]:
%matplotlib inline
plt.rcParams['figure.figsize'] = [30, 30]
%matplotlib ipympl

In [4]:
vectors = pd.read_csv('bilingual_vecs_local.txt', delimiter=' ', header=None).as_matrix()
words = vectors[:,0]
vectors = vectors[:,1:]
print(words.shape)
print(vectors.shape)
print(words[:3])
print(vectors[:3,:])

(6813,)
(6813, 500)
[',' 'δέ' '·']
[[-0.160082 0.011541 -0.146137 ... -0.005783399999999999 -0.0148075
  -0.00371258]
 [0.162606 -0.020852000000000002 -0.0186189 ... -0.00668697 -0.00495149
  0.0249705]
 [-0.036411599999999995 -0.10372200000000001 -0.0052026 ...
  0.020937599999999997 -0.0043487 0.00652271]]


In [7]:
def plot_words_3d(vectors, words, plot = True):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(vectors[:,0], vectors[:,1], vectors[:,2], marker='.')
    for i, word in enumerate(words):
        ax.text(x=vectors[i,0], y=vectors[i,1], z=vectors[i,2], s=word)
    if plot:
        plt.show()
    return plt 

In [8]:
model = TSNE(n_components=3, random_state=0, perplexity=15)
if True:
    X_tsne_3d = model.fit_transform(vectors)
    np.savetxt('tsne_3d.txt', X_tsne_3d);
else:
    print("WARNING: loading stale vectors from tsne_3d.txt")
    X_tsne_3d = np.loadtxt('tsne_3d.txt');
    
npts = 20
plot_words_3d(X_tsne_3d[0:npts,0:3], words[0:npts]);

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to  previous…

In [12]:
ev = None

# show interactive 3D plot like in Tensorboard
def plot_words_3d_interactive(vectors, words, plot = True, n_neighbors = 15):
    fig = plt.figure(figsize=(15,45))
    ax = fig.add_subplot(211, projection='3d')
    ax_zoom = fig.add_subplot(212, projection='3d')
    ax.scatter(vectors[:,0], vectors[:,1], vectors[:,2], marker='.', c='b', picker=3)
    if plot:
        plt.show()
    
    NN_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(vectors)
    last_indices = None
    
    def onclick_3d_interactive(event):
        _x, _y, _z = event.artist._offsets3d
        ind = event.ind[0]
        x = _x[ind]
        y = _y[ind]
        z = _z[ind]
        distances, indices = NN_model.kneighbors([[x, y, z]], n_neighbors=n_neighbors)
        distances = distances[0]
        indices = indices[0]
        if indices.size == 0:
            return

        avg_neighbor = np.mean(distances[1:6])
        keep = distances>2*avg_neighbor
        distance = distances[keep]
        indices = indices[keep]
        
        ax_zoom.clear()
        ax_zoom.scatter(vectors[indices,0], vectors[indices,1], vectors[indices,2], marker='.')
        for i in indices:
            jitter = 0
            ax_zoom.text(x=vectors[i,0], y=vectors[i,1], z=vectors[i,2]+jitter, s=words[i])

    cid = fig.canvas.mpl_connect('pick_event', onclick_3d_interactive)
    return plt

plt.close("all")
npts = 1000
plot_words_3d_interactive(X_tsne_3d[0:npts,0:3], words[0:npts]);

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to  previous…