# Parse les données d'apprentissage et de test de SemEval

In [1]:
import re
import csv
from xml.dom.minidom import parseString
from xml.dom.minidom import parse
import xml.etree.ElementTree as et

# Input files of the SemEval trial data set.
sens_path = "trial/data/multilingual-all-words.fr.senses"  # read by parse_senses_file
gold_truth_path = "trial/keys/keys-bn.fr"  # read by parse_gold_truth
corpus_path = "trial/data/multilingual-all-words.fr.xml"  # read by parse_corpus_file

# Destination of the converted trial corpus (written by parse_data).
output_corpus_path = "trial_corpus.xml"

## Fournit un parser pour le fichier de sens SemEval (fichiers au format *.senses)

In [2]:
def parse_senses_file(file):
    """Parse the BabelNet senses listed in a SemEval ``.senses`` file.

    Each non-blank line is split on single whitespace characters. The
    lemma is the part of the first field that precedes ``#``, the third
    field is the number of BabelNet senses, and the fields right after it
    are the BabelNet sense ids. Whatever follows the BabelNet senses on
    the line is not parsed.

    Parameters
    ----------
    file: file object
        An open .senses file (any iterable of text lines works)

    Returns
    -------
    dict
        A dictionary mapping each lemma to the list of its BabelNet
        senses. When the same lemma appears on several lines, the last
        line wins.

    Raises
    ------
    IndexError
        If a non-blank line has fewer fields than its sense count announces
    ValueError
        If the sense count of a line is not an integer
    """
    # bn_sens_dict[<lemma>] = [<senses>]
    bn_sens_dict = {}

    # Iterate lazily instead of loading the whole file with readlines().
    for line in file:
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no entry; indexing into its fields would raise.
        if not line.strip():
            continue

        # Split on every single whitespace character so that the field
        # positions are exactly those of the file, even with empty fields.
        fields = re.split(r"\s", line)

        lemma = fields[0].split("#")[0]

        # range() is empty for a count <= 0, so no explicit guard is needed.
        bn_num = int(fields[2])
        bn_sens_dict[lemma] = [fields[3 + i] for i in range(bn_num)]

    return bn_sens_dict

# Parse the trial senses file; the context manager closes the handle as
# soon as parsing is done.
with open(sens_path) as senses_file:
    senses_dict = parse_senses_file(senses_file)
keys = list(senses_dict.keys())
print("5 first entries in the BabelNet senses dictionnary")
# Slicing keeps the preview from failing when there are fewer than 5 entries.
for lemma_key in keys[:5]:
    print(" -", lemma_key, "(%d)"%(len(senses_dict[lemma_key])), senses_dict[lemma_key])

5 first entries in the BabelNet senses dictionnary
(' -', 'mardi', '(6)', ['bn:01718496n', 'bn:01502893n', 'bn:01433525n', 'bn:02861587n', 'bn:00078546n', 'bn:00807619n'])
(' -', 'dollar', '(10)', ['bn:00028114n', 'bn:02156595n', 'bn:01827683n', 'bn:00008375n', 'bn:00015129n', 'bn:02122502n', 'bn:00028118n', 'bn:00028116n', 'bn:02897671n', 'bn:01524928n'])
(' -', 'rang', '(10)', ['bn:00054281n', 'bn:01743576n', 'bn:02256034n', 'bn:00073933n', 'bn:02155551n', 'bn:00066152n', 'bn:00066151n', 'bn:01646319n', 'bn:00827444n', 'bn:02587718n'])
(' -', 'conseiller', '(6)', ['bn:00023123n', 'bn:00023132n', 'bn:00391567n', 'bn:00001598n', 'bn:02181897n', 'bn:00001604n'])
(' -', 'radio', '(9)', ['bn:00065899n', 'bn:00065901n', 'bn:02558854n', 'bn:01441457n', 'bn:00065900n', 'bn:01152589n', 'bn:02599193n', 'bn:01936146n', 'bn:02231112n'])


## Fournit un parser pour le fichier "gold truth" de SemEval (fichiers "key")

In [3]:
def parse_gold_truth(file):
    """Parse a SemEval gold truth (keys) file.

    Each non-blank line is split on whitespace. The second field is the
    SemEval id; the fields after it are its senses, up to (and excluding)
    the first ``!!`` marker when there is one.

    Parameters
    ----------
    file: file object
        An open file (any iterable of text lines works)

    Returns
    -------
    dict
        A dictionary mapping each SemEval id to the list of its senses
        (according to the gold truth, an id can be linked to several
        senses). When the same id appears on several lines, the last
        line wins.

    Raises
    ------
    IndexError
        If a non-blank line holds a single field (no SemEval id)
    """
    # BabelNet gold truth of the corpus
    bn_gt = {}

    for raw_line in file:
        fields = raw_line.split()
        # Blank lines carry no entry; reading fields[1] from them would raise.
        if not fields:
            continue

        instance_id = fields[1]
        senses = []
        for token in fields[2:]:
            # Everything from the "!!" marker onwards is not a sense.
            if token == "!!":
                break
            senses.append(token)
        bn_gt[instance_id] = senses
    return bn_gt

# Parse the trial gold truth; the context manager closes the handle as
# soon as parsing is done.
with open(gold_truth_path) as gold_truth_file:
    gt_dict = parse_gold_truth(gold_truth_file)
keys = list(gt_dict.keys())
print("5 first entries in the BabelNet gold truth dictionnary")
# Slicing keeps the preview from failing when there are fewer than 5 entries.
for semeval_id in keys[:5]:
    print(" -", semeval_id, "(%d)"%(len(gt_dict[semeval_id])), gt_dict[semeval_id])

5 first entries in the BabelNet gold truth dictionnary
(' -', 'd001.s004.t001', '(1)', ['bn:00006997n'])
(' -', 'd001.s004.t002', '(1)', ['bn:00007299n'])
(' -', 'd001.s004.t003', '(1)', ['bn:00023471n'])
(' -', 'd001.s004.t004', '(1)', ['bn:00053479n'])
(' -', 'd001.s004.t005', '(1)', ['bn:00007140n'])


## Fournit une méthode pour parser un corpus SemEval au format .xml

In [4]:
def parse_corpus_file(file):
    """Parse a SemEval corpus.

    Every ``text`` element of the corpus becomes one document. Inside a
    document, each ``sentence`` element is rebuilt from its direct ``wf``
    and ``instance`` children, in order; any other child node is ignored.

    Parameters
    ----------
    file
        Either an open .xml SemEval corpus file or its path

    Returns
    -------
    list
        One ``(sentences, polysems)`` tuple per document. ``sentences`` is
        a list of str, one per sentence, in which every token is followed
        by a space. ``polysems`` is a list parallel to ``sentences``: for
        each sentence, the list of ``(id, idx, lemma)`` tuples of its
        ``instance`` elements, where ``id`` is the SemEval id
        (dxxx.sxxx.txxx), ``idx`` the index of the token in the sentence
        (starting from 0) and ``lemma`` the lemma itself.

    Raises
    ------
    IndexError
        If a ``wf`` or ``instance`` element has no child node
    """
    corpus = parse(file).documentElement

    documents = []

    # Iterate through the documents (text markers in the SemEval corpora)
    for text_node in corpus.getElementsByTagName("text"):
        sentences = []
        polysems = []
        for sentence_node in text_node.getElementsByTagName("sentence"):
            tokens = []
            polysem = []
            for node in sentence_node.childNodes:
                if node.nodeName not in ("wf", "instance"):
                    continue
                if node.nodeName == "instance":
                    # The token index is the number of tokens seen so far.
                    polysem.append((
                        node.getAttribute("id"),
                        len(tokens),
                        node.getAttribute("lemma"),
                    ))
                tokens.append(node.childNodes[0].data)
            # Every token, including the last one, is followed by a space.
            sentences.append("".join(token + " " for token in tokens))
            polysems.append(polysem)

        documents.append((sentences, polysems))

    return documents
    

# Parse the trial corpus and report how many sentences each document holds.
d = parse_corpus_file(corpus_path)
print("Documents in the corpus:", len(d))
print()
for doc_index, document in enumerate(d):
    doc_sentences = document[0]
    print("Document %d: %d sentences" % (doc_index, len(doc_sentences)))

('Documents in the corpus:', 1)
()
Document 0: 36 sentences


## Fournit une méthode pour convertir un corpus SemEval vers un format plus pratique

In [5]:
# SemEval ids found in a corpus but missing from its gold truth file.
# Shared by every parse_data call and never reset, so it accumulates.
unknow = []

def parse_data(semeval_corpus_path, semeval_gt_path, output_path):
    """Convert the given SemEval data to a new xml file.

    The output has a ``corpus`` root holding one ``document`` element per
    corpus document, one ``sentence`` element per sentence (the text is in
    its ``s`` attribute) and, for each polysemous word whose SemEval id is
    in the gold truth, one ``lemma`` element carrying its senses. Ids that
    are missing from the gold truth are appended to the module-level
    ``unknow`` list. The ``id`` attribute of a ``lemma`` element is its
    rank among all polysemous words of the sentence, so skipped ids leave
    gaps in the numbering.

    All given input paths have to exist.

    Parameters
    ----------
    semeval_corpus_path: str
        Path to a SemEval .xml corpus file
    semeval_gt_path: str
        Path to the SemEval keys file related to the corpus (preferably a
        BabelNet file: keys-bn)
    output_path: str
        Path to the output xml file, will contain the newly formatted
        SemEval corpus
    """

    documents = parse_corpus_file(semeval_corpus_path)
    # Close the keys file once it is parsed instead of leaking the handle.
    with open(semeval_gt_path) as gt_file:
        gt_dict = parse_gold_truth(gt_file)

    root = et.Element("corpus")

    for doc_index, (sentences, polysems) in enumerate(documents):
        document = et.SubElement(root, "document", {"id": str(doc_index)})
        for sent_index, (text, sentence_polysems) in enumerate(zip(sentences, polysems)):
            sentence = et.SubElement(document, "sentence", {
                "id": str(sent_index), "s": text
            })
            for rank, (semeval_id, idx, lemma) in enumerate(sentence_polysems):
                if semeval_id not in gt_dict:
                    unknow.append(semeval_id)
                    continue
                et.SubElement(sentence, "lemma", {
                    "id": str(rank),
                    "idx": str(idx),
                    "lemma": lemma,
                    "senses": " ".join(gt_dict[semeval_id]),
                })

    # toprettyxml(encoding=...) returns bytes, hence the binary mode.
    with open(output_path, "wb") as f:
        f.write(parseString(et.tostring(root)).toprettyxml(encoding="UTF-8"))

## Parse les données de Dev de SemEval vers un nouveau fichier : trial_corpus.xml

In [6]:
parse_data(corpus_path, gold_truth_path, output_corpus_path)
print("unknow lemma:", len(unknow))

# Preview the first 10 lines of the generated file; the context manager
# closes the handle afterwards.
with open(output_corpus_path) as f:
    for i in range(10):
        print(f.readline())
print("(...)")

('unknow lemma:', 0)
<?xml version="1.0" encoding="UTF-8"?>

<corpus>

	<document id="0">

		<sentence id="0" s="Nous ne savons pas qui gagnera la guerre_contre_la_drogue en Amérique_Latine , mais nous savons qui est en train de la perdre - la presse . ">

			<lemma id="0" idx="7" lemma="guerre_contre_la_drogue" senses="bn:00028885n"/>

			<lemma id="1" idx="9" lemma="Amérique_Latine" senses="bn:00050165n"/>

			<lemma id="2" idx="23" lemma="presse" senses="bn:00064245n"/>

		</sentence>

		<sentence id="1" s="Au cours des six derniers mois , six journalistes ont été tués et 10 ont été enlevés par des trafiquants_de_drogue ou des guérilleros de gauche - souvent il s ’ agit des mêmes personnes - en Colombie . ">

			<lemma id="0" idx="5" lemma="mois" senses="bn:00014710n"/>

(...)


## Parse les données de Test de SemEval vers un nouveau fichier : test_corpus.xml

In [7]:
test_corpus_path = "test/data/multilingual-all-words.fr.xml"
test_gt_path = "test/keys/gold/babelnet/babelnet.fr.key"
output_path = "test_corpus.xml"

parse_data(test_corpus_path, test_gt_path, output_path)
# unknow is shared by all parse_data calls and never reset, so this count
# also includes any id reported by earlier calls.
print("unknow lemma:", len(unknow))

# Preview the first 10 lines of the generated file; the context manager
# closes the handle afterwards.
with open(output_path) as f:
    for i in range(10):
        print(f.readline())
print("(...)")

('unknow lemma:', 226)
<?xml version="1.0" encoding="UTF-8"?>

<corpus>

	<document id="0">

		<sentence id="0" s="Le groupe des Nations_Unies a des projets de plans pour la réduction des émissions ">

			<lemma id="0" idx="1" lemma="groupe" senses="bn:00041942n"/>

			<lemma id="1" idx="3" lemma="nations_unies" senses="bn:00078931n"/>

			<lemma id="3" idx="8" lemma="plan" senses="bn:00062759n"/>

			<lemma id="4" idx="11" lemma="réduction" senses="bn:00025780n"/>

			<lemma id="5" idx="13" lemma="émission" senses="bn:00030455n"/>

		</sentence>

(...)


# Analyse les données BabelNet
ATTENTION : les bibliothèques BabelNet nécessitent l'utilisation de Python 2

In [8]:
import urllib2
import urllib
import json
import gzip

from StringIO import StringIO

# BabelNet API key sent with every request by lemma2ids and id2glosses.
# Left empty here; a later cell assigns the actual key before the lookups run.
key=""

## Fournit une méthode pour convertir un lemme en un ensemble d'identifiants BabelNet

In [9]:
def lemma2ids(lemma):
    """Return the BabelNet synset ids of a French lemma.

    Queries the BabelNet ``getSynsetIds`` HTTP service with the
    module-level ``key`` and ``searchLang`` set to ``FR``. The request
    asks for a gzip-compressed answer, but the body is decoded whether or
    not the server actually compressed it.

    Parameters
    ----------
    lemma: str
        The lemma to look up

    Returns
    -------
    list
        The ``id`` field of every entry of the JSON response, as str

    Raises
    ------
    Whatever ``urllib2.urlopen`` raises on a failed request (the caller in
    this file handles ``urllib2.HTTPError``), and ``ValueError`` if the
    body is not valid JSON.
    """
    service_url = 'https://babelnet.io/v5/getSynsetIds'

    params = {
        'lemma' : lemma,
        'searchLang' : "FR",
        'key'  : key
    }

    url = service_url + '?' + urllib.urlencode(params)
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)
    try:
        body = response.read()
        # Only decompress when the server says it compressed the body;
        # an uncompressed body is used as is instead of being dropped.
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # Release the connection even if reading or decompressing fails.
        response.close()

    data = json.loads(body)
    return [str(res["id"]) for res in data]

#lemma2ids("apple")

# OUT :
#['bn:00289737n', 'bn:03739345n', 'bn:00955003n', 'bn:00512973n']

## Fournit une méthode pour récupérer les définitions associées à un identifiant BabelNet

In [10]:
def id2glosses(id):
    """Return the French glosses (definitions) of a BabelNet synset.

    Queries the BabelNet ``getSynset`` HTTP service with the module-level
    ``key`` and ``targetLang`` set to ``FR``. The request asks for a
    gzip-compressed answer, but the body is decoded whether or not the
    server actually compressed it.

    Parameters
    ----------
    id: str
        A BabelNet synset id (e.g. "bn:00015540n")

    Returns
    -------
    list
        The ``gloss`` field of every entry of the ``glosses`` list of the
        JSON response (``None`` for an entry without that field)

    Raises
    ------
    Whatever ``urllib2.urlopen`` raises on a failed request (the caller in
    this file handles ``urllib2.HTTPError``), ``ValueError`` if the body is
    not valid JSON and ``KeyError`` if the response has no ``glosses``.
    """
    service_url = 'https://babelnet.io/v5/getSynset'

    params = {
        'id' : id,
        'targetLang' : 'FR',
        'key'  : key
    }

    url = service_url + '?' + urllib.urlencode(params)
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)
    try:
        body = response.read()
        # Only decompress when the server says it compressed the body;
        # an uncompressed body is used as is instead of being dropped.
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # Release the connection even if reading or decompressing fails.
        response.close()

    data = json.loads(body)

    # Only the BabelGloss data of the synset is used.
    return [result.get('gloss') for result in data['glosses']]

#for g in id2glosses("bn:00015540n"):
#    print(g)

# OUT :
#Paris est la capitale de la France.
#La Rue de l'Abbé-de-l'Épée jouxte l'Institut des Jeunes sourds.
#Capitale de la France
#Capitale et plus grosse ville de France.
#Paris est une ville française, capitale de la France et le chef-lieu de la région d'Île-de-France.

In [11]:
# Collect every polysemous word to disambiguate, from the test corpus
# first and then from the trial corpus, as UTF-8 encoded lemmas.
words = set()

for source_path in (test_corpus_path, corpus_path):
    for _sentences, doc_polysems in parse_corpus_file(source_path):
        for sentence_polysems in doc_polysems:
            words.update(
                lemma_text.encode("utf-8")
                for _semeval_id, _token_idx, lemma_text in sentence_polysems
            )

## Produit un dictionnaire des définitions des mots polysémiques des corpus de dev et de test de SemEval.
Le dictionnaire est écrit dans le fichier : dict.dictionary

In [17]:
# BabelNet API key
key = "eda2e938-8518-40c9-b46f-4cdb0005e6cd"

# Update the dictionary file
with open("dict.dictionary", "a+") as file:
    # look for the lemmas allready in the dictionary file
    # no need to consume babelnet coin for a lemma we know
    reader = csv.reader(file, delimiter=";")
    alreadyin = [row[0] for row in reader]
    print "%d/%d lemmas in the dictionary"%(len(alreadyin)-1, len(words))
    
    writer = csv.writer(file, delimiter=";")
    
    if len(alreadyin) <= 1:
        writer.writerow(["lemma", "nb_senses", "BN_senses", "definitions"])
    
    for lemma in words:
        print "\r%3d/%d"%(len(alreadyin)-1, len(words)),

        if not lemma in alreadyin:

            ids_lemma = lemma2ids(lemma)
            
            defs_lemma = []
            for id in ids_lemma:
                try:
                    defs_lemma.append(id2glosses(id))
                except urllib2.HTTPError:
                    defs_lemma.append([])
            
            ids = []
            defs = []
            for i in range(len(defs_lemma)):
                # Iterate through the definitions
                # if the definition is empty : pass
                # else append the id and definition
                if defs_lemma[i]:
                    ids.append(ids_lemma[i])
                    defs.append(defs_lemma[i])
            # append the new lemma to the file
            file.write(lemma + ";" +
                str(len(ids)) + ";" +
                (",".join(ids)) + ";" +
                (",".join(["\"" + " ".join(d).encode("utf-8") + "\"" for d in defs])) + "\n")
            alreadyin.append(lemma)

976/959 lemmas in the dictionary
977/959
