In [1]:
from tqdm import tqdm, tnrange, tqdm_notebook
import json
import pandas as pd
import os

import utils
import parse

In [2]:
from nltk.corpus import wordnet as wn
import copy

In [3]:
# Locations of the JSON vocabulary files read by the loading cell below.
# Sense vocabulary (WordNet variant, per the file name).
fine_senses_vocab_path = '../resources/semcor.vocab.WordNet.json'
# Input vocabulary.
input_vocab_path = '../resources/semcor.input.vocab.json'
# Left-out vocabulary; loaded below as `antivocab`.
input_antivocab_path = '../resources/semcor.leftout.vocab.json'

# Disabled alternative: read the BabelNet vocabulary into a list called `labels`.
# labels = '../resources/semcor.vocab.BabelNet.json'
# labels = list(utils.json_vocab_reader(labels))

In [4]:
# Load the vocabularies from the JSON files configured above.
# NOTE(review): json_vocab_reader is called once with a single path (single result) and once with
# two paths (two results unpacked); presumably it returns one vocabulary per path - confirm in utils.
senses = utils.json_vocab_reader(fine_senses_vocab_path)
inputs, antivocab = utils.json_vocab_reader(input_vocab_path, input_antivocab_path)

In [None]:
# Removed a stray, incomplete statement (`rev`): the name is never defined, so this cell raised NameError on Run All.

In [5]:
def vocab_merge(vocab1, vocab2):
    """
    Builds the union of two vocabularies without modifying either input.

    Every key of vocab1 keeps its index. Keys that appear only in vocab2 are added in vocab2's
    iteration order and receive the size of the merged vocabulary at the moment of insertion;
    the indices stored in vocab2 are ignored.

    :param vocab1: First vocabulary, as Dict str -> int; its indices are preserved
    :param vocab2: Second vocabulary, as Dict str -> int; only its keys are used
    :return: New merged vocabulary, as Dict str -> int
    """

    # A shallow copy is enough: only new keys are inserted, existing values are never mutated.
    merged = copy.copy(vocab1)

    for key in vocab2:
        if key not in merged:
            # NOTE: taking the current size as the next index assumes vocab1's indices are dense (0..len-1).
            merged[key] = len(merged)

    return merged

In [25]:
# Output vocabulary: every sense keeps its index, input tokens not already present are appended after them.
output_vocab = vocab_merge(senses, inputs)
# Inverse lookup, index -> token.
reverse_output_vocab = {index: token for token, index in output_vocab.items()}

In [7]:
def replacement_routine(l, entry):
    """
    Encodes a token as an index of the global output_vocab.

    Tokens listed in the global antivocab map to the "<REPLACEMENT>" index, unless the entry is an
    instance and the token itself is in output_vocab, in which case the token's own index is used.
    Tokens outside antivocab map to their own index when known and to the "<UNK>" index otherwise.

    :param l: Token (lemma or candidate) to encode
    :param entry: Object exposing an `instance` attribute
    :return: Index taken from output_vocab
    """
    word_id = output_vocab["<REPLACEMENT>"] if l in antivocab else None

    # A replaced token is final unless the entry is an instance.
    if not (entry.instance or word_id is None):
        return word_id

    if l in output_vocab:
        return output_vocab[l]

    # Unknown token: keep the replacement index if one was assigned, otherwise fall back to <UNK>.
    return output_vocab["<UNK>"] if word_id is None else word_id

In [8]:
def wn_id_from_synset(synset):
    """
    Formats a synset as a WordNet ID of the form wn:<offset><pos>.

    The offset is left-padded with zeros to a width of at least 8 characters.

    :param synset: Object exposing offset() and pos(), e.g. an NLTK WordNet synset
    :return: WordNet ID as a string, e.g. "wn:00001740n"
    """

    padded_offset = str(synset.offset()).rjust(8, "0")  # left-pad with zeros up to 8 characters
    return "wn:" + padded_offset + str(synset.pos())

In [9]:
def u_candidate_synsets(lemma, pos):
    """
    Looks up the candidate WordNet synsets for a lemma/POS pair.

    :param lemma: Lemma to look up
    :param pos: POS tag associated to the lemma
    :return: List holding a single placeholder ("<PUNCT>", "<NUM>" or "<SYM>") for punctuation, numeral and
             symbol tags; otherwise the WordNet IDs of the matching synsets, or [lemma] when WordNet has no match
    """

    # open classes only
    wn_pos_by_tag = {"ADJ": wn.ADJ, "ADV": wn.ADV, "NOUN": wn.NOUN, "VERB": wn.VERB}
    # tags answered with a placeholder token, without querying WordNet
    placeholders = {".": "<PUNCT>", "PUNCT": "<PUNCT>", "NUM": "<NUM>", "SYM": "<SYM>"}

    if pos in placeholders:
        return [placeholders[pos]]

    # Open-class tags restrict the lookup to that POS; any other tag searches every POS.
    if pos in wn_pos_by_tag:
        found = wn.synsets(lemma, pos=wn_pos_by_tag[pos])
    else:
        found = wn.synsets(lemma)

    if not found:
        return [lemma]
    return [wn_id_from_synset(synset) for synset in found]

In [10]:
from collections import namedtuple
# Gold key file of the SemCor training corpus; handed to parse.GoldParser further down.
gold_file = "../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt"

In [11]:
# Lightweight record with the fields the notebook reads from an entry: id_, lemma, pos and instance.
# The typename matches the bound name so that reprs read XMLEntry(...) and instances can be pickled.
XMLEntry = namedtuple("XMLEntry", ["id_", "lemma", "pos", "instance"])

In [12]:
# Parsers for the SemCor training XML and its gold key file.
Training_xml = parse.TrainingParser('../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.data.xml')
Gold = parse.GoldParser(gold_file)

# Flat accumulators shared by all sentences (entries are not grouped per sentence).
sentence_input, labels_input, candidate_synsets = [], [], []
c = 0  # number of sentences consumed so far

for c, sentence in enumerate(Training_xml.parse(), start=1):
    for xmlentry in sentence:
        id_, lemma, pos = xmlentry.id_, xmlentry.lemma, xmlentry.pos

        # Encoded form of the token as it appears in the input sequence.
        sent_word = replacement_routine(l=lemma, entry=xmlentry)
        sentence_input.append(sent_word)

        if id_ is not None:
            # Entry with an ID: fetch its gold label and the WordNet candidates for the lemma.
            # NOTE(review): Gold.parse() is invoked anew for every such entry; this only walks through the
            # gold file if the parser keeps its position between calls - confirm in parse.GoldParser.
            labels = next(Gold.parse())
            if labels is not None:
                assert labels.id_ == id_, "ID mismatch"
                sense = labels.senses[0]  # only the first gold sense is used
                #sense = utils.sensekeyToSynsetConverter
                sense = output_vocab[sense if sense in output_vocab else "<UNK>"]
                labels_input.append(sense)
            # NOTE(review): when labels is None nothing is appended to labels_input, so it ends up shorter
            # than sentence_input - confirm this is intended.
            candidates = [
                replacement_routine(cand, XMLEntry(id_=None, lemma=cand, pos="X", instance=True))
                for cand in u_candidate_synsets(lemma, pos)
            ]
        else:
            # no instance word, just give the encoded token itself as label and only candidate
            labels_input.append(sent_word)
            candidates = [sent_word]
        candidate_synsets.append(candidates)

    # Stop after the first sentence.
    if c == 1:
        break


In [13]:
# Show the encoded tokens collected by the loop above (flat list of output_vocab indices).
sentence_input

[0, 26005, 4, 0, 25921, 4, 0, 27449, 0, 26821, 0, 0, 26822, 0, 26208, 26041, 0]