# Parse les données d'apprentissage et de test

In [1]:
import re
from xml.dom.minidom import parseString
from xml.dom.minidom import parse
import xml.etree.ElementTree as et

sens_path = "trial/data/multilingual-all-words.fr.senses"
gold_truth_path = "trial/keys/keys-bn.fr"
corpus_path = "trial/data/multilingual-all-words.fr.xml"

output_corpus_file = "corpus.xml"

## Parser sur le fichier de sens

In [2]:
def parse_senses_file(file):
    """Parse the BabelNet senses contained in a SemEval .senses file.

    Every non-blank line is split on single whitespace characters. The lemma
    is the part of the first field located before ``#``, the third field is
    the number of BabelNet senses and the fields that follow it are the
    senses themselves. Blank lines are skipped.

    Parameters
    ----------
    file: file object
        An open .senses file (any iterable of text lines works)

    Returns
    -------
    dict
        A dictionary mapping a lemma to the list of its BabelNet senses.
        When a lemma appears on several lines, the last line wins.

    Raises
    ------
    IndexError
        If a non-blank line holds fewer fields than it announces
    ValueError
        If the BabelNet sense count is not an integer
    """
    # Split on every single whitespace character (not on runs of them) so
    # that the field positions used below stay stable.
    field_separator = re.compile(r"\s")

    # bn_sens_dict[<lemma>] = [<senses>]
    bn_sens_dict = {}

    for line in file:
        # A blank (or trailing empty) line carries no entry
        if not line.strip():
            continue

        fields = field_separator.split(line)

        # The first field looks like "<lemma>#<suffix>": keep the lemma only
        lemma = fields[0].split("#")[0]

        # The BabelNet senses directly follow their count
        bn_num = int(fields[2])
        bn_sens_dict[lemma] = [fields[3 + k] for k in range(bn_num)]

        # The fields located after the BabelNet senses (presumably WordNet,
        # then Wikipedia senses, each as a count followed by that many
        # entries) are intentionally not parsed.

    return bn_sens_dict

# Build the senses dictionary; the context manager closes the file once parsed
with open(sens_path) as senses_file:
    senses_dict = parse_senses_file(senses_file)
keys = list(senses_dict.keys())
print("5 first entries in the BabelNet senses dictionnary")
# Slicing (instead of indexing 0..4) also works with fewer than 5 entries
for lemma in keys[:5]:
    print(" -", lemma, "(%d)"%(len(senses_dict[lemma])), senses_dict[lemma])

5 first entries in the BabelNet senses dictionnary
 - trafiquant_de_drogue (2) ['bn:01761518n', 'bn:00028881n']
 - drogue (2) ['bn:00026546n', 'bn:00028872n']
 - semaine (4) ['bn:00080815n', 'bn:00043484n', 'bn:00080821n', 'bn:00080813n']
 - indignation (6) ['bn:00046491n', 'bn:00004087n', 'bn:00004086n', 'bn:01960121n', 'bn:00004085n', 'bn:01328234n']
 - employé (1) ['bn:00030618n']


## Parser sur le fichier "gold truth"

In [3]:
def parse_gold_truth(file):
    """Parse a SemEval gold truth (keys) file.

    Every non-blank line is split on whitespace. The second field is the
    SemEval id and the fields that follow it are its senses, up to (and
    excluding) the ``!!`` marker or, when the marker is absent, up to the
    end of the line. Blank lines are skipped.

    Parameters
    ----------
    file: file object
        An open keys file (any iterable of text lines works)

    Returns
    -------
    dict
        A dictionary mapping a SemEval id to its list of senses (according
        to the gold truth, an id can be linked to multiple senses)

    Raises
    ------
    IndexError
        If a non-blank line holds a single field (no SemEval id)
    """
    # BabelNet gold truth of the corpus
    bn_gt = {}

    for raw_line in file:
        tokens = raw_line.split()
        if not tokens:
            continue

        semeval_id = tokens[1]

        # Collect the senses; whatever follows "!!" is ignored
        senses = []
        for token in tokens[2:]:
            if token == "!!":
                break
            senses.append(token)

        bn_gt[semeval_id] = senses
    return bn_gt

# Build the gold truth dictionary; the context manager closes the file once parsed
with open(gold_truth_path) as gold_truth_file:
    gt_dict = parse_gold_truth(gold_truth_file)
keys = list(gt_dict.keys())
print("5 first entries in the BabelNet gold truth dictionnary")
# Slicing (instead of indexing 0..4) also works with fewer than 5 entries
for semeval_id in keys[:5]:
    print(" -", semeval_id, "(%d)"%(len(gt_dict[semeval_id])), gt_dict[semeval_id])

5 first entries in the BabelNet gold truth dictionnary
 - d001.s001.t001 (1) ['bn:00028885n']
 - d001.s001.t002 (1) ['bn:00050165n']
 - d001.s001.t003 (1) ['bn:00064245n']
 - d001.s002.t001 (1) ['bn:00014710n']
 - d001.s002.t002 (1) ['bn:00048461n']


## Parser sur le corpus

In [4]:
def parse_corpus_file(file):
    """Parse a SemEval corpus.

    For every ``sentence`` element, the text of its direct ``wf`` and
    ``instance`` children is collected in document order. Other child nodes
    are ignored.

    Parameters
    ----------
    file
        Either an open .xml SemEval corpus file or its path

    Returns
    -------
    list of str
        One string per sentence; every token is followed by a single space
        (so a non-empty sentence ends with a trailing space)
    list of list of tuple
        One list per sentence, holding for each ``instance`` element a tuple
        made of its SemEval id (dxxx.sxxx.txxx), the index of the token in
        the sentence (index starts from 0) and its lemma

    Raises
    ------
    IndexError
        If a ``wf`` or ``instance`` element has no child node
    """
    corpus = parse(file).documentElement

    sentences = []
    polysems = []
    for sentence_node in corpus.getElementsByTagName("sentence"):
        tokens = []
        instances = []
        for node in sentence_node.childNodes:
            if node.nodeName not in ("wf", "instance"):
                continue
            if node.nodeName == "instance":
                # len(tokens) is the position the token is about to take
                instances.append((
                    node.getAttribute("id"),
                    len(tokens),
                    node.getAttribute("lemma"),
                ))
            tokens.append(node.childNodes[0].data)

        # Every token carries its own trailing space
        sentences.append("".join(token + " " for token in tokens))
        polysems.append(instances)

    return (sentences, polysems)
    

sentences, polysems = parse_corpus_file(corpus_path)
# Preview at most the first 5 sentences (slicing copes with a shorter corpus)
for sentence, polysem in zip(sentences[:5], polysems[:5]):
    print("Sentence :", sentence)
    print("Polysemous words :", polysem)
    print()

Sentence : Nous ne savons pas qui gagnera la guerre_contre_la_drogue en Amérique_Latine , mais nous savons qui est en train de la perdre - la presse . 
Polysemous words : [('d001.s001.t001', 7, 'guerre_contre_la_drogue'), ('d001.s001.t002', 9, 'Amérique_Latine'), ('d001.s001.t003', 23, 'presse')]

Sentence : Au cours des six derniers mois , six journalistes ont été tués et 10 ont été enlevés par des trafiquants_de_drogue ou des guérilleros de gauche - souvent il s ’ agit des mêmes personnes - en Colombie . 
Polysemous words : [('d001.s002.t001', 5, 'mois'), ('d001.s002.t002', 8, 'journaliste'), ('d001.s002.t003', 19, 'trafiquant_de_drogue'), ('d001.s002.t004', 22, 'guérillero'), ('d001.s002.t005', 24, 'gauche'), ('d001.s002.t006', 33, 'personne'), ('d001.s002.t007', 36, 'Colombie')]

Sentence : Au cours des 12 dernières années , au moins 40 journalistes sont morts là-bas . 
Polysemous words : [('d001.s003.t001', 5, 'année'), ('d001.s003.t002', 10, 'journaliste')]

Sentence : Les attaqu

In [5]:
def parse_data(semeval_corpus_path, semeval_gt_path, output_path):
    """Convert the given SemEval data to a new xml file.

    The output has a ``corpus`` root holding one ``sentence`` element per
    corpus sentence (attributes ``id``: position of the sentence, ``s``: its
    text). Each sentence holds one ``lemma`` element per annotated word
    (attributes ``id``: position among the annotated words of the sentence,
    ``idx``: token index in the sentence, ``lemma``, and ``senses``: the gold
    truth senses joined by a space).

    Both input paths have to exist.

    Parameters
    ----------
    semeval_corpus_path: str
        Path to a SemEval .xml corpus file
    semeval_gt_path: str
        Path to a SemEval keys file (related to the given corpus,
        preferably a BabelNet file: keys-bn)
    output_path: str
        Path to the output xml file, which will contain the newly formatted
        SemEval corpus (written as pretty-printed UTF-8)

    Raises
    ------
    KeyError
        If an annotated word of the corpus has no entry in the keys file
    """
    sentences, polysems = parse_corpus_file(semeval_corpus_path)
    # Close the keys file as soon as it has been parsed
    with open(semeval_gt_path) as gt_file:
        gt_dict = parse_gold_truth(gt_file)

    root = et.Element("corpus")

    for sentence_no, (text, instances) in enumerate(zip(sentences, polysems)):
        sentence = et.SubElement(root, "sentence", {
            "id": str(sentence_no),
            "s": text,
        })
        for lemma_no, (semeval_id, idx, lemma) in enumerate(instances):
            et.SubElement(sentence, "lemma", {
                "id": str(lemma_no),
                "idx": str(idx),
                "lemma": lemma,
                "senses": " ".join(gt_dict[semeval_id]),
            })

    # toprettyxml(encoding=...) returns bytes, hence the binary mode
    with open(output_path, "wb") as f:
        f.write(parseString(et.tostring(root)).toprettyxml(encoding="UTF-8"))

In [6]:
parse_data(corpus_path, gold_truth_path, output_corpus_file)

# Preview the first 10 lines of the generated file. It is written as UTF-8 by
# parse_data, so it is read back with the same encoding rather than the
# locale default; the context manager closes the handle afterwards.
with open(output_corpus_file, encoding="utf-8") as f:
    for _ in range(10):
        print(f.readline())
print("(...)")

<?xml version="1.0" encoding="UTF-8"?>

<corpus>

	<sentence id="0" s="Nous ne savons pas qui gagnera la guerre_contre_la_drogue en Amérique_Latine , mais nous savons qui est en train de la perdre - la presse . ">

		<lemma id="0" idx="7" lemma="guerre_contre_la_drogue" senses="bn:00028885n"/>

		<lemma id="1" idx="9" lemma="Amérique_Latine" senses="bn:00050165n"/>

		<lemma id="2" idx="23" lemma="presse" senses="bn:00064245n"/>

	</sentence>

	<sentence id="1" s="Au cours des six derniers mois , six journalistes ont été tués et 10 ont été enlevés par des trafiquants_de_drogue ou des guérilleros de gauche - souvent il s ’ agit des mêmes personnes - en Colombie . ">

		<lemma id="0" idx="5" lemma="mois" senses="bn:00014710n"/>

		<lemma id="1" idx="8" lemma="journaliste" senses="bn:00048461n"/>

(...)
