# Parsing des données d'apprentissage et de test

In [2]:
import re
from xml.dom.minidom import parseString
from xml.dom.minidom import parse
import xml.etree.ElementTree as et

# Trial-set inputs: the senses file, the BabelNet gold keys and the XML corpus
sens_path = "trial/data/multilingual-all-words.fr.senses"
gold_truth_path = "trial/keys/keys-bn.fr"
corpus_path = "trial/data/multilingual-all-words.fr.xml"

# Destination of the reformatted trial corpus (passed to parse_data as output path)
output_corpus_path = "trial_corpus.xml"

## Parser sur le fichier de sens

In [3]:
def parse_senses_file(file):
    """Parse the BabelNet senses contained in a .senses file given by SemEval.

    Every non-blank line is split on single whitespace characters. The lemma
    is the part of the first field that precedes ``#``, the third field is
    the number of BabelNet senses and the fields right after it are the
    senses themselves.

    Parameters
    ----------
    file: file object
        An open .senses file (iterated line by line)

    Returns
    -------
    dict
        A dictionary mapping a lemma to the list of its BabelNet senses.
        When the same lemma appears on several lines, the last line wins.

    Raises
    ------
    ValueError
        If the sense-count field of a line is not an integer
    IndexError
        If a non-blank line has fewer fields than expected
    """
    # bn_sens_dict[<lemma>] = [<senses>]
    bn_sens_dict = {}

    for line in file:
        # Blank lines (e.g. a trailing empty line) carry no entry; without
        # this guard they would fail on the field lookups below.
        if not line.strip():
            continue

        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # Splitting on a single whitespace character keeps empty fields, so
        # field positions are the same as with the historical pattern.
        fields = re.split(r"\s", line)

        # First field looks like "<lemma>#<suffix>"; only the lemma is kept
        lemma = fields[0].split("#")[0]

        # Third field announces how many BabelNet senses follow it
        bn_num = int(fields[2])
        bn_senses = [fields[3 + i] for i in range(bn_num)]

        # NOTE(review): the fields after the BabelNet senses are ignored.
        # Presumably they hold a WordNet block and then a Wikipedia block,
        # each a count followed by that many ids -- confirm against the
        # SemEval data description before parsing them.
        bn_sens_dict[lemma] = bn_senses

    return bn_sens_dict

# Build the lemma -> BabelNet senses dictionary; the context manager closes
# the senses file once it has been parsed.
with open(sens_path) as senses_file:
    senses_dict = parse_senses_file(senses_file)
keys = list(senses_dict.keys())
print("5 first entries in the BabelNet senses dictionnary")
# Slicing (rather than indexing 0..4) also works with fewer than 5 entries
for key in keys[:5]:
    print(" -", key, "(%d)" % len(senses_dict[key]), senses_dict[key])

5 first entries in the BabelNet senses dictionnary
 - trafiquant_de_drogue (2) ['bn:01761518n', 'bn:00028881n']
 - drogue (2) ['bn:00026546n', 'bn:00028872n']
 - semaine (4) ['bn:00080815n', 'bn:00043484n', 'bn:00080821n', 'bn:00080813n']
 - indignation (6) ['bn:00046491n', 'bn:00004087n', 'bn:00004086n', 'bn:01960121n', 'bn:00004085n', 'bn:01328234n']
 - employé (1) ['bn:00030618n']


## Parser sur le fichier "gold truth"

In [4]:
def parse_gold_truth(file):
    """Parse a SemEval gold truth (keys) file.

    Every non-blank line is split on whitespace. The second field is the
    SemEval id; the fields after it, up to (and excluding) the first ``!!``
    field if there is one, are the senses attached to that id.

    Parameters
    ----------
    file: file object
        An open keys file (iterated line by line)

    Returns
    -------
    dict
        A dictionary mapping each SemEval id to its list of senses
        (according to the gold truth, an id can be linked to several
        senses). When an id appears on several lines, the last line wins.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two fields
    """
    # BabelNet gold truth of the corpus
    bn_gt = {}

    for raw_line in file:
        fields = raw_line.split()
        # Blank lines carry no entry; without this guard they would fail on
        # the id lookup below.
        if not fields:
            continue

        semeval_id = fields[1]
        senses = fields[2:]
        # Everything from the first "!!" field onwards is not a sense
        if "!!" in senses:
            senses = senses[:senses.index("!!")]
        bn_gt[semeval_id] = senses
    return bn_gt

# Build the SemEval id -> senses dictionary; the context manager closes the
# keys file once it has been parsed.
with open(gold_truth_path) as gold_truth_file:
    gt_dict = parse_gold_truth(gold_truth_file)
keys = list(gt_dict.keys())
print("5 first entries in the BabelNet gold truth dictionnary")
# Slicing (rather than indexing 0..4) also works with fewer than 5 entries
for key in keys[:5]:
    print(" -", key, "(%d)" % len(gt_dict[key]), gt_dict[key])

5 first entries in the BabelNet gold truth dictionnary
 - d001.s001.t001 (1) ['bn:00028885n']
 - d001.s001.t002 (1) ['bn:00050165n']
 - d001.s001.t003 (1) ['bn:00064245n']
 - d001.s002.t001 (1) ['bn:00014710n']
 - d001.s002.t002 (1) ['bn:00048461n']


## Parser sur le corpus

In [9]:
def parse_corpus_file(file):
    """Parse a SemEval corpus into per-document sentences and instances.

    Parameters
    ----------
    file
        Either an open .xml SemEval corpus file or its path

    Returns
    -------
    list
        One ``(sentences, polysems)`` tuple per ``text`` element of the
        corpus. ``sentences`` is a list of str, one per ``sentence``
        element, in which every token is followed by a single space.
        ``polysems`` is a parallel list: for each sentence, a list of
        ``(id, idx, lemma)`` tuples, one per ``instance`` element, holding
        its ``id`` attribute (dxxx.sxxx.txxx), the index of the token in the
        sentence (starting from 0) and its ``lemma`` attribute.

    Raises
    ------
    IndexError
        If a ``wf`` or ``instance`` element has no child node
    """
    corpus = parse(file).documentElement

    documents = []

    # Iterate through the documents (text markers in the SemEval corpora)
    for text_node in corpus.getElementsByTagName("text"):
        sentences = []
        polysems = []
        for sentence_node in text_node.getElementsByTagName("sentence"):
            tokens = []
            instances = []
            for node in sentence_node.childNodes:
                # Only direct "wf" and "instance" children are tokens; other
                # nodes (such as whitespace text nodes) are skipped.
                if node.nodeName not in ("wf", "instance"):
                    continue
                if node.nodeName == "instance":
                    # len(tokens) is the index the token is about to take
                    instances.append((
                        node.getAttribute("id"),
                        len(tokens),
                        node.getAttribute("lemma"),
                    ))
                tokens.append(node.childNodes[0].data)
            # Each token is followed by one space, including the last one
            sentences.append("".join(token + " " for token in tokens))
            polysems.append(instances)

        documents.append((sentences, polysems))

    return documents
    

# Parse the trial corpus and report how many sentences each document holds
d = parse_corpus_file(corpus_path)
print("Documents in the corpus:", len(d))
print()
for doc_idx, (doc_sentences, _) in enumerate(d):
    print("Document {:d}: {:d} sentences".format(doc_idx, len(doc_sentences)))

Documents in the corpus: 1

Document 0: 36 sentences


In [6]:
# SemEval ids found in a corpus but absent from its gold truth. Filled by
# parse_data and never reset, so it accumulates across calls.
unknow = []

def parse_data(semeval_corpus_path, semeval_gt_path, output_path):
    """Convert the given SemEval data to a new xml file.

    The output has a ``corpus`` root with one ``document`` element per
    parsed document, one ``sentence`` element per sentence (its text is in
    the ``s`` attribute) and, inside each sentence, one ``lemma`` element
    per instance whose SemEval id is in the gold truth. Ids missing from
    the gold truth are appended to the module-level ``unknow`` list instead.

    All given input paths have to exist.

    Parameters
    ----------
    semeval_corpus_path: str
        Path to a SemEval .xml corpus file
    semeval_gt_path: str
        Path to a SemEval keys file (related to the corpus above and
        preferably a BabelNet file: keys-bn)
    output_path: str
        Path to the output xml file, will contain the newly formatted
        SemEval corpus
    """
    documents = parse_corpus_file(semeval_corpus_path)
    # Context manager so the keys file is closed as soon as it is parsed
    with open(semeval_gt_path) as gt_file:
        gt_dict = parse_gold_truth(gt_file)

    root = et.Element("corpus")

    for doc_idx, (sentences, polysems) in enumerate(documents):
        document = et.SubElement(root, "document", {"id": str(doc_idx)})
        for sent_idx, (text, instances) in enumerate(zip(sentences, polysems)):
            sentence = et.SubElement(document, "sentence", {
                "id": str(sent_idx), "s": text
            })
            for pos, (semeval_id, idx, lemma) in enumerate(instances):
                if semeval_id in gt_dict:
                    # "id" is the rank of the instance in its sentence, so
                    # ids have gaps where an instance has no gold senses.
                    et.SubElement(sentence, "lemma", {
                        "id": str(pos),
                        "idx": str(idx),
                        "lemma": lemma,
                        "senses": " ".join(gt_dict[semeval_id])
                    })
                else:
                    unknow.append(semeval_id)

    # toprettyxml(encoding=...) returns bytes, hence the binary mode
    with open(output_path, "wb") as f:
        f.write(parseString(et.tostring(root)).toprettyxml(encoding="UTF-8"))

## Parsing des données de dev

In [7]:
parse_data(corpus_path, gold_truth_path, output_corpus_path)
print("unknow lemma:", len(unknow))

# Preview the first lines of the generated file. parse_data writes it as
# UTF-8, so it is read back with that encoding, and the handle is closed
# when the block exits.
with open(output_corpus_path, encoding="utf-8") as f:
    for _ in range(10):
        print(f.readline())
print("(...)")

unknow lemma: 0
<?xml version="1.0" encoding="UTF-8"?>

<corpus>

	<document id="0">

		<sentence id="0" s="Nous ne savons pas qui gagnera la guerre_contre_la_drogue en Amérique_Latine , mais nous savons qui est en train de la perdre - la presse . ">

			<lemma id="0" idx="7" lemma="guerre_contre_la_drogue" senses="bn:00028885n"/>

			<lemma id="1" idx="9" lemma="Amérique_Latine" senses="bn:00050165n"/>

			<lemma id="2" idx="23" lemma="presse" senses="bn:00064245n"/>

		</sentence>

		<sentence id="1" s="Au cours des six derniers mois , six journalistes ont été tués et 10 ont été enlevés par des trafiquants_de_drogue ou des guérilleros de gauche - souvent il s ’ agit des mêmes personnes - en Colombie . ">

			<lemma id="0" idx="5" lemma="mois" senses="bn:00014710n"/>

(...)


## Parsing des données de test

In [8]:
# Test-set inputs (corpus and BabelNet gold keys) and the output destination
test_corpus_path = "test/data/multilingual-all-words.fr.xml"
test_gt_path = "test/keys/gold/babelnet/babelnet.fr.key"
output_path = "test_corpus.xml"

parse_data(test_corpus_path, test_gt_path, output_path)
# `unknow` is not reset between parse_data calls, so this count also includes
# any ids collected by earlier calls.
print("unknow lemma:", len(unknow))

# Preview the first lines of the generated file. parse_data writes it as
# UTF-8, so it is read back with that encoding, and the handle is closed
# when the block exits.
with open(output_path, encoding="utf-8") as f:
    for _ in range(10):
        print(f.readline())
print("(...)")

unknow lemma: 226
<?xml version="1.0" encoding="UTF-8"?>

<corpus>

	<document id="0">

		<sentence id="0" s="Le groupe des Nations_Unies a des projets de plans pour la réduction des émissions ">

			<lemma id="0" idx="1" lemma="groupe" senses="bn:00041942n"/>

			<lemma id="1" idx="3" lemma="nations_unies" senses="bn:00078931n"/>

			<lemma id="3" idx="8" lemma="plan" senses="bn:00062759n"/>

			<lemma id="4" idx="11" lemma="réduction" senses="bn:00025780n"/>

			<lemma id="5" idx="13" lemma="émission" senses="bn:00030455n"/>

		</sentence>

(...)
