# Dev le(s) modèle(s) de désambiguïsation lexicale

In [1]:
import re
import math
import nltk
import random
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import decomposition
from xml.dom.minidom import parse
from gensim.models import Word2Vec
#from sklearn.cluster import KMeans
from nltk.cluster.kmeans import KMeansClusterer

trial_corpus_path = "trial_corpus.xml"
test_corpus_path = "test_corpus.xml"

In [2]:
def load_corpus(path):
    """Load a formatted corpus data file

    Parameters
    ----------
    path: str
        Path to the corpus xml file to load


    Returns
    -------
    list
        A 3 dimensional list containing for each document, the sentences that it is composed of.
        Where each sentence is a list of single tokens.
    dict
        A dictionnary mapping a lemma with it's document/sentence/index position and the BabelNet Sense attributed
    """

    DOMTree = parse(path)

    documents = []
    sens_dict = {}
    for doc in DOMTree.getElementsByTagName("document"):
        # For each document
        sentences = []
        for sent in doc.getElementsByTagName("sentence"):
            # And for each sentence
            # Append the new sentence
            s = sent.getAttribute("s")
            sentences.append(s.split())

            # Map the lemmas in the sentence with it's doc/sentence/index position and BabelNet sense
            for lem in sent.getElementsByTagName("lemma"):
                idx = lem.getAttribute("idx")
                lemma = lem.getAttribute("lemma")
                # Few lemma may have more than 1 BabelNet sense (due to redundancy in BN)
                # Only keep the 1st one
                sense = lem.getAttribute("senses").split()[0] 
                
                ctx = (int(doc.getAttribute("id")),
                        int(sent.getAttribute("id")),
                        int(idx),
                        sense)
                if not lemma in sens_dict:
                    sens_dict[lemma] = [ctx]
                else:
                    sens_dict[lemma].append(ctx)

        documents.append(sentences)

    return (documents, sens_dict)


documents, sens_dict = load_corpus(test_corpus_path)


print("Documents (%d):"%(len(documents)))
for i, d in enumerate(documents):
    print("\tDoc %2d: %02d sentences"%(i, len(d)))

print("\nLemmas (%d):"%(len(sens_dict)))
for i, (k, v) in zip(range(10), sens_dict.items()):
    print("\tLemma %d: %s -> %d values"%(i, k, len(v)))
print("\t...\n")
print("\nEx. lemma 0: ")
print("\t", list(sens_dict.keys())[0], "->", list(sens_dict.values())[0])

Documents (13):
	Doc  0: 29 sentences
	Doc  1: 16 sentences
	Doc  2: 13 sentences
	Doc  3: 19 sentences
	Doc  4: 13 sentences
	Doc  5: 32 sentences
	Doc  6: 27 sentences
	Doc  7: 28 sentences
	Doc  8: 29 sentences
	Doc  9: 26 sentences
	Doc 10: 24 sentences
	Doc 11: 25 sentences
	Doc 12: 25 sentences

Lemmas (745):
	Lemma 0: groupe -> 5 values
	Lemma 1: nations_unies -> 2 values
	Lemma 2: plan -> 2 values
	Lemma 3: réduction -> 4 values
	Lemma 4: émission -> 3 values
	Lemma 5: conférence -> 4 values
	Lemma 6: climat -> 3 values
	Lemma 7: récrimination -> 1 values
	Lemma 8: vendredi -> 7 values
	Lemma 9: document -> 5 values
	...


Ex. lemma 0: 
	 groupe -> [(0, 0, 1, 'bn:00041942n'), (1, 1, 41, 'bn:00041942n'), (6, 4, 12, 'bn:00041942n'), (9, 6, 35, 'bn:00072536n'), (9, 20, 26, 'bn:00041942n')]


# Word2vec

In [3]:
vocab = []
for d in documents:
    for s in d:
        vocab.append(s)
w2v = Word2Vec(vocab, min_count=1)

del(vocab)

print("Vocab size:", len(w2v.wv.vocab))

Vocab size: 2694


# Huang

In [4]:
def Lemma2Senses(lemma):
    senses = [bn for _,_,_,bn in sens_dict[lemma]]
    return list(set(senses))

print("\nEx.", list(sens_dict.keys())[0])
print("\t->", list(sens_dict.values())[0])
print("\t->", Lemma2Senses(list(sens_dict.keys())[0]))


Ex. groupe
	-> [(0, 0, 1, 'bn:00041942n'), (1, 1, 41, 'bn:00041942n'), (6, 4, 12, 'bn:00041942n'), (9, 6, 35, 'bn:00072536n'), (9, 20, 26, 'bn:00041942n')]
	-> ['bn:00072536n', 'bn:00041942n']


## Visualisation rapide des lemmes à désambiguïser
On remarque que peu de lemmes sont associés à plusieurs sens. Certains apparaissent plusieurs fois avec toujours le meme sens. Pire ! D'autres n'apparaissent qu'une seule fois.<br/>
Il est aussi intéressant de remarquer que certains lemmes sont associés 9 fois avec le sens_1 et 1 fois avec le sens_2. Ceci peut trouver son origine dans les annotations via BabelNet qui propose différents sens redondants d'un mot.<br/>
Pour exemple, le lemme <i>journaliste</i> est associé aux sens BabelNet suivants :
<ol>
    <li>bn:00048461n : celui qui recueille, écrit ou distribue des informations</li>
    <li>bn:00057562n : celui qui enquête, rapporte ou rédige les actualités</li>
</ol>

In [5]:
polysem = {}
solo = {}
npolysem = {}

for k,v in sens_dict.items():
    if len(v) == 1:
        solo[k] = v
        continue
    
    _,_,_,sense_bn = v[0]
    poly = False
    for _,_,_,bn in v:
        if bn != sense_bn:
            polysem[k] = v
            poly = True
            break
    if not poly:
        npolysem[k] = v

print("Nb lemmas:", len(sens_dict))
print()

print("polysems:", len(polysem))
for _, (k, v) in zip(range(5), polysem.items()):
    print("\t", k, " -> ", v)

print("\nsolo:", len(solo))
for _, (k, v) in zip(range(5), solo.items()):
    print("\t", k, " -> ", v)

print("\nnon polysem:", len(npolysem))
for _, (k, v) in zip(range(5), npolysem.items()):
    print("\t", k, " -> ", v)

Nb lemmas: 745

polysems: 69
	 groupe  ->  [(0, 0, 1, 'bn:00041942n'), (1, 1, 41, 'bn:00041942n'), (6, 4, 12, 'bn:00041942n'), (9, 6, 35, 'bn:00072536n'), (9, 20, 26, 'bn:00041942n')]
	 plan  ->  [(0, 0, 8, 'bn:00062759n'), (4, 2, 34, 'bn:00062795n')]
	 document  ->  [(0, 1, 38, 'bn:00028015n'), (0, 7, 10, 'bn:00028015n'), (12, 3, 7, 'bn:00028017n'), (12, 5, 1, 'bn:00028017n'), (12, 14, 7, 'bn:00028017n')]
	 temps  ->  [(0, 1, 65, 'bn:00077270n'), (0, 3, 16, 'bn:00077267n'), (3, 9, 4, 'bn:00077267n'), (6, 3, 28, 'bn:00077267n')]
	 accord  ->  [(0, 2, 9, 'bn:00000728n'), (0, 8, 7, 'bn:00000728n'), (0, 14, 7, 'bn:00000728n'), (0, 20, 47, 'bn:00000728n'), (0, 23, 34, 'bn:00000726n'), (3, 0, 7, 'bn:00002086n'), (3, 2, 2, 'bn:00002086n'), (3, 18, 25, 'bn:00000726n')]

solo: 432
	 récrimination  ->  [(0, 1, 21, 'bn:00066603n')]
	 obstacle  ->  [(0, 2, 20, 'bn:00058511n')]
	 chemin  ->  [(0, 2, 29, 'bn:00061005n')]
	 groupe_de_travail  ->  [(0, 4, 3, 'bn:00081604n')]
	 terme  ->  [(0, 4, 32, 

## Implémentation de la méthode proposée par Huang
<ol>
    <li>Collecte les fenetres d'occurrence d'un mot </li>
    <li>Calcule le vecteur de contexte, moyenne des vecteurs-mots de chaque mots dans un contexte</li>
    <li>Cluster les vecteurs de contextes (spherical K-means)</li>
    <li>Associe à chaque cluster un sens</li>
</ol>

In [6]:
ctx_w = 11 # Contexte window size

global_truth = []
global_classif = []

for lemma, senses in polysem.items():
    labels = Lemma2Senses(lemma)
    num_senses = len(labels)

    # Map clusters with a value from, 0 to num_senses-1
    truth = [labels.index(bn) for _,_,_,bn in senses]
    global_truth.append(truth)

    mean_vectors = []
    for d,s,i,_ in senses:
        l = len(documents[d][s])
        # Extract the words in the contexte window
        window = documents[d][s][max(0, i-math.floor((ctx_w-1)/2)) : min(l, i+math.ceil((ctx_w-1)/2))+1]

        # Compute the context vector (mean of the words vectors in the window)
        mean_vectors.append(np.array([w2v.wv[word] for word in window]).mean(axis=0))

    # Spherical K-means clustering
    skm = KMeansClusterer(num_senses, nltk.cluster.util.cosine_distance, rng=random.Random(0), repeats=10)
    assigned_clusters = skm.cluster(mean_vectors, assign_clusters=True)
    global_classif.append(assigned_clusters)

    print(truth)
    print(assigned_clusters)
    print()

print(sklearn.metrics.classification_report([y for x in global_truth for y in x], [y for x in global_classif for y in x]))


[1, 1, 1, 0, 1]
[1, 0, 0, 1, 1]

[1, 0]
[0, 1]

[1, 1, 0, 0, 0]
[0, 0, 1, 1, 1]

[0, 1, 1, 1]
[0, 1, 1, 1]

[1, 1, 1, 1, 2, 0, 0, 2]
[1, 1, 2, 0, 1, 0, 0, 0]

[1, 1, 1, 1, 1, 0]
[0, 0, 0, 1, 1, 1]

[0, 0, 0, 1, 0]
[1, 0, 1, 0, 1]

[1, 2, 2, 1, 0, 1, 2, 0, 1, 1]
[0, 0, 0, 1, 2, 2, 0, 1, 0, 0]

[0, 2, 1]
[2, 1, 0]

[1, 0, 2, 2, 2]
[1, 1, 1, 0, 2]

[0, 0, 1, 1, 1, 0]
[0, 0, 1, 0, 0, 1]

[0, 0, 1]
[0, 0, 1]

[0, 1, 0]
[0, 0, 1]

[1, 0, 1]
[0, 1, 1]

[1, 2, 0, 0]
[2, 1, 0, 0]

[0, 1]
[0, 1]

[2, 2, 2, 2, 2, 0, 0, 2, 1, 1, 2]
[0, 1, 2, 0, 2, 0, 0, 0, 1, 0, 0]

[0, 1]
[1, 0]

[0, 1, 1]
[1, 0, 1]

[1, 0, 0, 1, 0, 0, 0, 0]
[1, 1, 0, 1, 0, 1, 0, 1]

[1, 0]
[0, 1]

[1, 0]
[0, 1]

[1, 0, 0]
[0, 0, 1]

[1, 0]
[1, 0]

[0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1]
[0, 2, 0, 2, 1, 0, 0, 0, 2, 1, 1, 2]

[1, 1, 2, 0]
[0, 1, 2, 1]

[2, 2, 2, 0, 1]
[1, 1, 1, 0, 2]

[0, 0, 0, 1, 1]
[0, 0, 1, 0, 1]

[0, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 0, 0, 0, 0, 1]

[1, 0]
[1, 0]

[1, 0, 0, 0]
[0, 1, 0, 0]

[1, 0]
[1, 0]

[1, 0, 0, 

### Associe à chaque cluster un sens

In [7]:
def argsmax(lst):
    max = lst[0]
    argsmax = []
    for i in range(len(lst)):
        if lst[i] == max:
            argsmax.append(i)
        elif lst[i] > max:
            max = lst[i]
            argsmax = [i]
    return argsmax

def couple(arr1, arr2):
    # truth, classif
    l = len(arr1)
    nb_c = len(set(arr1))
    arr = np.zeros((nb_c,nb_c), dtype=int)

    for i in range(l):
        arr[arr2[i],arr1[i]] += 1

    cs = [i for i in range(len(arr))]
    ts = [i for i in range(len(arr))]
    map = {}

    for iter in range(len(arr)):
        temp = len(arr)
        for i in range(len(arr)):
            c = arr[i]
            ams = argsmax(c)
            if len(ams) == 1:
                ams = ams[0]
                map[cs[i]] = ts[ams]
                arr = np.delete(np.delete(arr, ams, 1), i, 0)
                cs = np.delete(cs, i)
                ts = np.delete(ts, ams)
                break
        
        if len(arr) == temp:
            j = np.argmax(arr[0])
            map[cs[0]] = ts[j]
            arr = np.delete(np.delete(arr, j, 1), 0, 0)
            cs = np.delete(cs, 0)
            ts = np.delete(ts, j)

    return map

couple(global_truth[0], global_classif[0])

{0: 1, 1: 0}

## Premiers Résultats

In [8]:
final_classif = []
for i in range(len(global_classif)):
    cluster_tags = couple(global_truth[i], global_classif[i])
    final_classif.append([cluster_tags[i] for i in global_classif[i]])

    print(final_classif[i])
    print(global_truth[i])
    print()

[0, 1, 1, 0, 0]
[1, 1, 1, 0, 1]

[1, 0]
[1, 0]

[1, 1, 0, 0, 0]
[1, 1, 0, 0, 0]

[0, 1, 1, 1]
[0, 1, 1, 1]

[1, 1, 2, 0, 1, 0, 0, 0]
[1, 1, 1, 1, 2, 0, 0, 2]

[1, 1, 1, 0, 0, 0]
[1, 1, 1, 1, 1, 0]

[0, 1, 0, 1, 0]
[0, 0, 0, 1, 0]

[1, 1, 1, 0, 2, 2, 1, 0, 1, 1]
[1, 2, 2, 1, 0, 1, 2, 0, 1, 1]

[0, 2, 1]
[0, 2, 1]

[0, 0, 0, 2, 1]
[1, 0, 2, 2, 2]

[0, 0, 1, 0, 0, 1]
[0, 0, 1, 1, 1, 0]

[0, 0, 1]
[0, 0, 1]

[1, 1, 0]
[0, 1, 0]

[1, 0, 0]
[1, 0, 1]

[1, 2, 0, 0]
[1, 2, 0, 0]

[0, 1]
[0, 1]

[2, 1, 0, 2, 0, 2, 2, 2, 1, 2, 2]
[2, 2, 2, 2, 2, 0, 0, 2, 1, 1, 2]

[0, 1]
[0, 1]

[0, 1, 0]
[0, 1, 1]

[1, 1, 0, 1, 0, 1, 0, 1]
[1, 0, 0, 1, 0, 0, 0, 0]

[1, 0]
[1, 0]

[1, 0]
[1, 0]

[1, 1, 0]
[1, 0, 0]

[1, 0]
[1, 0]

[2, 1, 2, 1, 0, 2, 2, 2, 1, 0, 0, 1]
[0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1]

[1, 0, 2, 0]
[1, 1, 2, 0]

[2, 2, 2, 0, 1]
[2, 2, 2, 0, 1]

[0, 0, 1, 0, 1]
[0, 0, 0, 1, 1]

[0, 0, 0, 1, 1, 1, 1, 0]
[0, 1, 1, 1, 1, 1, 1, 1]

[1, 0]
[1, 0]

[0, 1, 0, 0]
[1, 0, 0, 0]

[1, 0]
[1, 0]

[0, 1, 0, 

In [9]:
print(sklearn.metrics.classification_report([y for x in global_truth for y in x], [y for x in final_classif for y in x]))

              precision    recall  f1-score   support

           0       0.71      0.67      0.69       162
           1       0.63      0.69      0.66       141
           2       0.62      0.57      0.60        35

    accuracy                           0.67       338
   macro avg       0.66      0.64      0.65       338
weighted avg       0.67      0.67      0.67       338

