In [1]:
from tqdm import tqdm, tnrange, tqdm_notebook
import json
import pandas as pd
import os
from collections import namedtuple

import utils
import parsers
from nltk.corpus import wordnet as wn
import copy

In [2]:
# Paths of the vocabulary JSON files handed to utils.json_vocab_reader in the
# following cell; all are relative to the notebook's working directory.
input_vocab_path = '../resources/semcor.input.vocab.json'
input_antivocab_path = '../resources/semcor.leftout.vocab.json'
fine_senses_vocab_path = '../resources/semcor.vocab.WordNet.json'

In [3]:
# A single path yields one vocabulary; passing the input vocabulary together
# with the left-out vocabulary yields a pair that is unpacked here.
senses = utils.json_vocab_reader(fine_senses_vocab_path)
inputs, antivocab = utils.json_vocab_reader(
    input_vocab_path,
    input_antivocab_path,
)

In [4]:
# Combined vocabulary covering both the senses and the input words.
output_vocab = utils.vocab_merge(senses, inputs)
# Inverse lookup: every value of output_vocab mapped back to its key.
reverse_output_vocab = {value: key for key, value in output_vocab.items()}

In [23]:
def prepare_sentence_batch(batch_size, training_file_path, antivocab, output_vocab, gold_file_path = None):
    """
    Lazily group prepared sentences into batches.

    Sentences are read from ``parsers.TrainingParser(training_file_path)`` and
    converted one at a time with ``prepare_sentence``.

    Args:
        batch_size: number of sentences per yielded batch (must be >= 1).
        training_file_path: path handed to ``parsers.TrainingParser``.
        antivocab: forwarded unchanged to ``prepare_sentence``.
        output_vocab: forwarded unchanged to ``prepare_sentence``.
        gold_file_path: optional path handed to ``parsers.GoldParser``. When
            given (training mode) every batch also carries a "labels" list;
            when omitted (evaluation mode) that key is absent.

    Yields:
        dict with the keys "sentences" and "candidates" (plus "labels" in
        training mode), each a list with one item per sentence. Every yielded
        batch is a fresh dict holding exactly ``batch_size`` sentences, except
        the last one, which holds whatever is left over.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    training_mode = bool(gold_file_path)

    def new_batch():
        # A new dict per batch, so batches already handed to the consumer are
        # never mutated by later sentences.
        fresh = {"sentences" : [], "candidates" : []}
        if training_mode:
            fresh["labels"] = []
        return fresh

    training_data_flow = parsers.TrainingParser(training_file_path)

    # The gold iterator is requested once so it keeps advancing from sentence
    # to sentence. NOTE(review): asking for it per sentence presumably restarts
    # the gold stream each time - confirm against parsers.GoldParser.
    labels = parsers.GoldParser(gold_file_path).parse() if training_mode else None

    batch = new_batch()
    for sentence in training_data_flow.parse():
        # labels is None in evaluation mode, which is prepare_sentence's default.
        output = prepare_sentence(sentence, antivocab, output_vocab, labels)

        batch['sentences'].append(output['sentence'])
        batch['candidates'].append(output['candidates'])
        if training_mode:
            batch['labels'].append(output['labels'])

        if len(batch['sentences']) == batch_size:
            yield batch
            batch = new_batch()

    # Do not drop the sentences left over after the last full batch.
    if batch['sentences']:
        yield batch
            
        
def prepare_sentence(sentence, antivocab, output_vocab, labels=None):
    """
    Convert one parsed sentence into parallel per-token lists.

    Args:
        sentence: iterable of 4-item entries unpacked as (id_, lemma, pos, _);
            the fourth item is not used directly here, but the whole entry is
            passed on to ``utils.replacement_routine``.
        antivocab: forwarded unchanged to ``utils.replacement_routine``.
        output_vocab: mapping used to look up gold senses (its "<UNK>" entry is
            the fallback for senses it does not contain); also forwarded to
            ``utils.replacement_routine``.
        labels: optional iterator of gold records exposing ``.id_`` and
            ``.senses``. One record is consumed for every entry whose ``id_``
            is not None.

    Returns:
        dict with three lists:
          "sentence"   - the ``utils.replacement_routine`` result per entry;
          "candidates" - per entry, a list of candidates: the entry's own
                         output word when ``id_`` is None, otherwise the
                         ``utils.candidate_synsets(lemma, pos)`` results, each
                         passed through ``utils.replacement_routine``;
          "labels"     - the output word for entries without an id and, when
                         ``labels`` is given, the looked-up first gold sense
                         for entries with one. Without ``labels`` only the
                         id-less entries contribute, so this list is then
                         shorter than "sentence".

    Raises:
        ValueError: if ``labels`` runs out before every entry with an id has
            received a gold record.
        AssertionError: if the next gold record's id differs from the entry's.
    """
    records = namedtuple("Training", "id_ lemma pos instance")
    exhausted = object()  # sentinel: distinguishes "no more labels" from any real record

    output = {"sentence" : [], "labels" : [], "candidates": []}
    for entry in sentence:
        id_, lemma, pos, _ = entry

        output_word = utils.replacement_routine(lemma, entry, antivocab, output_vocab)
        output['sentence'].append(output_word)

        if id_ is None:
            # No id: the word is its own label and its only candidate.
            output['labels'].append(output_word)
            output['candidates'].append([output_word])
            continue

        if labels is not None:
            # next() with a sentinel instead of labels.__next__(): a bare
            # StopIteration escaping into a calling generator would be turned
            # into an opaque RuntimeError.
            current_label = next(labels, exhausted)
            if current_label is exhausted:
                raise ValueError("No gold label left for instance %r" % (id_,))
            assert current_label.id_ == id_, "ID mismatch"

            # Only the first listed sense is used as the gold label.
            sense = current_label.senses[0]
            sense = output_vocab[sense] if sense in output_vocab else output_vocab["<UNK>"]
            output['labels'].append(sense)

        # Each candidate is wrapped in a placeholder record so it can go
        # through the same replacement routine as the sentence words.
        candidates = [
            utils.replacement_routine(c, records(id_=None, lemma=c, pos="X", instance=True), antivocab, output_vocab)
            for c in utils.candidate_synsets(lemma, pos)
        ]
        output['candidates'].append(candidates)
    return output

# `batch` is a generator of batches, not a single batch: prepare_sentence_batch
# yields, so no file is parsed until the generator is first advanced.
batch = prepare_sentence_batch(
    batch_size=64,
    training_file_path='../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.data.xml',
    antivocab=antivocab,
    output_vocab=output_vocab,
    gold_file_path='../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt',
)

In [24]:
# Drain the generator once to check that every batch can be built; tqdm only
# reports progress (the stray ")" in the original line made this cell a
# SyntaxError).
for s in tqdm(batch):
    pass


0it [00:00, ?it/s][A
1it [00:01,  1.08s/it][A

KeyboardInterrupt: 

In [None]:
# A batch is a dict of parallel lists; iterating the dict itself would only
# yield its string keys, so walk the lists together instead.
# NOTE(review): assumes `batch` was created with a gold file ("labels" present)
# and has not already been exhausted by an earlier cell.
current_batch = next(batch)
for sentence, candidates, labels in zip(current_batch["sentences"],
                                        current_batch["candidates"],
                                        current_batch["labels"]):
    print(sentence)

In [None]:
# Advance the generator by one batch and display it as the cell output.
next(batch)