In [21]:
import models
import utils
import keras as K
from sklearn.utils import shuffle

In [27]:
# --- Data locations (relative to the notebook's working directory) ---
_WSD_FRAMEWORK_DIR = '../resources/WSD_Evaluation_Framework'
_SEMCOR_DIR = _WSD_FRAMEWORK_DIR + '/Training_Corpora/SemCor'
_SEMEVAL2013_DIR = _WSD_FRAMEWORK_DIR + '/Evaluation_Datasets/semeval2013'

# Training corpus and its gold keys.
training_file_path = _SEMCOR_DIR + '/semcor.data.xml'
gold_file_path = _SEMCOR_DIR + '/semcor.gold.key.txt'

# Development corpus and its gold keys.
training_file_path_dev = _SEMEVAL2013_DIR + '/semeval2013.data.xml'
gold_file_path_dev = _SEMEVAL2013_DIR + '/semeval2013.gold.key.txt'

# JSON vocabulary files.
fine_senses_vocab_path = '../resources/semcor.vocab.WordNet.json'
input_vocab_path = '../resources/semcor.input.vocab.json'
input_antivocab_path = '../resources/semcor.leftout.vocab.json'

# --- Hyper-parameters / switches ---
embedding_size, batch_size = 32, 64
LEARNING_RATE, N_EPOCHS = 0.01, 10
PADDING_SIZE = 50
print_model = False

In [3]:
# training_file_path = '../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.data.xml'
# gold_file_path = '../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt'

In [28]:
# Read the vocabularies from their JSON files.
senses = utils.json_vocab_reader(fine_senses_vocab_path)
inputs, antivocab = utils.json_vocab_reader(input_vocab_path, input_antivocab_path)

# Merge both vocabularies and keep the inverse (value -> key) mapping for decoding.
output_vocab = utils.vocab_merge(senses, inputs)
reverse_output_vocab = {value: key for key, value in output_vocab.items()}

# Clear the Keras backend session before instantiating the model.
K.backend.clear_session()
model = models.Basic(test=2)

In [29]:
from tqdm import tqdm

In [30]:
import parsers

In [7]:
# validation_generator = model.prepare_sentence_batch(batch_size = 64,
#                                                      training_file_path = training_file_path_dev,
#                                                      gold_file_path = gold_file_path_dev,
#                                                      antivocab = antivocab,
#                                                      output_vocab = output_vocab,
#                                                      PADDING_SIZE = PADDING_SIZE)


# for i, j in validation_generator:
#     print(i.shape)

In [8]:
# Parser over the training corpus file; its parse() method is iterated in the next cell.
training_data_flow = parsers.TrainingParser(training_file_path)

In [9]:
# Count the items yielded by the parser; tqdm reports progress.
# `sentence` (the last item) and `batch_count` stay bound after the loop.
sentences = 0
for batch_count, sentence in enumerate(tqdm(training_data_flow.parse()), start=1):
    sentences = batch_count
sentences, batch_count

37176it [00:05, 6721.90it/s]


(37176, 37176)

In [10]:
# Scratch version of the per-sentence encoding, run on the last parsed `sentence`.
output = {"sentence": [], "labels": [], "candidates": []}
for entry in sentence:
    # Each entry unpacks into four fields; only the lemma is used here.
    id_, lemma, pos, _ = entry
    output_word = utils.replacement_routine(lemma, entry, antivocab, output_vocab)
    output["sentence"].append(output_word)


In [11]:
# Inspect the dict assembled in the preceding cell.
output

{'candidates': [], 'labels': [], 'sentence': [4, 0, 0]}

In [12]:
import re
import os

from tqdm import tqdm, tnrange, tqdm_notebook
import json
import pandas as pd
import os
from collections import namedtuple

import utils
import parsers
from nltk.corpus import wordnet as wn
import copy
import numpy as np

import tensorflow.keras as K

In [13]:
# NOTE(review): this cell rebinds training_file_path / gold_file_path, which the
# configuration cell near the top of the notebook points at the SemCor corpus.
# The execution counts show it was run (In [13]) before that configuration cell
# was re-run (In [27]), and the recorded sentence count of 37176 matches the
# SemCor run. On a top-to-bottom "Restart & Run All" the cells below would use
# semeval2007 instead -- confirm which corpus is intended.
training_file_path = '../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.data.xml'
gold_file_path = '../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt'

In [31]:
class Basic(object):
    """
    Word Sense Disambiguation performed via basic sequence tagging.

    Instances hold the configuration needed to stream a corpus file and turn
    it into padded batches. Batches are produced by calling ``__getitem__()``
    explicitly; it is a generator and takes no index.

    param batch_size: number of sentences per yielded batch
    param training_file_path: path handed to ``parsers.TrainingParser``
    param antivocab: passed through to ``utils.replacement_routine``
    param output_vocab: mapping used to encode words and senses; must contain "<UNK>"
    param PADDING_SIZE: length every sequence is padded / truncated to
    param gold_file_path: path handed to ``parsers.GoldParser``; when given,
        batches also carry labels (training mode)
    """
    def __init__(self, batch_size, training_file_path, antivocab, output_vocab, PADDING_SIZE = 50, gold_file_path = None):

        self.batch_size = batch_size
        self.training_file_path = training_file_path
        self.antivocab = antivocab
        self.output_vocab = output_vocab
        self.PADDING_SIZE = PADDING_SIZE
        self.gold_file_path = gold_file_path
        # Number of sentences consumed by the most recent __getitem__() pass.
        self.length = 0

    def __len__(self):
        """
        Returns whatever ``TrainingParser.count()`` reports for the training file.

        NOTE(review): this appears to be a sentence count, not a batch count
        (the notebook shows it equal to the number of parsed sentences) -- confirm
        before using it as ``steps_per_epoch``.
        """
        return parsers.TrainingParser(self.training_file_path).count()

    def __getitem__(self):
        """
        Batch processing generator.

        Yields ``(sentences, labels)`` tuples in training mode (``gold_file_path``
        set), where ``labels`` has an extra trailing axis of size 1, and the bare
        ``sentences`` array otherwise. Every batch holds ``batch_size`` sentences
        except possibly the last one, which holds the remainder. Nothing is
        yielded for an empty corpus, and no empty trailing batch is yielded when
        the corpus size is a multiple of ``batch_size``.

        return: generator object
        """
        training_mode = bool(self.gold_file_path)

        training_data_flow = parsers.TrainingParser(self.training_file_path)
        if training_mode:
            self.gold_data_flow = parsers.GoldParser(self.gold_file_path)

        # Restart the counter so repeated passes do not accumulate.
        self.length = 0
        batch = self._empty_batch(training_mode)

        for sentence in training_data_flow.parse():
            self.length += 1

            if training_mode:
                # NOTE(review): parse() is requested once per sentence; this assumes
                # GoldParser resumes where the previous call stopped -- confirm.
                labels = self.gold_data_flow.parse()
            else:
                labels = None

            # Always use the instance's vocabularies (never module-level globals).
            output = self.prepare_sentence(sentence, self.antivocab, self.output_vocab, labels)

            batch['sentences'].append(output['sentence'])
            batch['candidates'].append(output['candidates'])
            if training_mode:
                batch['labels'].append(output['labels'])

            if len(batch['sentences']) == int(self.batch_size):
                yield self._finalize_batch(batch, training_mode)
                batch = self._empty_batch(training_mode)

        # Flush the remainder only when there is one.
        if batch['sentences']:
            yield self._finalize_batch(batch, training_mode)

    @staticmethod
    def _empty_batch(training_mode):
        """Returns a fresh batch accumulator; it has a "labels" slot only in training mode."""
        batch = {"sentences" : [], "candidates" : []}
        if training_mode:
            batch["labels"] = []
        return batch

    def _finalize_batch(self, batch, training_mode):
        """Pads every field of ``batch`` and returns the value to yield for it."""
        for key in batch:
            batch[key] = self.apply_padding(batch, key, maxlen = self.PADDING_SIZE, value = 1)

        if training_mode:
            return batch['sentences'], np.expand_dims(batch['labels'], axis=-1)
        return batch['sentences']

    @staticmethod
    def apply_padding(output, key, maxlen=50, value=1):
        """
        Applies padding to output sequences.

        For ``key == 'candidates'`` each sequence (a list of candidate lists) is
        cut to its last ``maxlen`` entries and right-padded with ``[value]``, the
        same truncating='pre' / padding='post' policy that is passed to
        ``pad_sequences`` for every other key, so candidate positions stay aligned
        with the padded sentences. The input lists are not modified.

        param output: dict
        param key: key of dict
        param maxlen: length to pad
        param value: pad with this value
        return padded list of lists (candidates) or the result of pad_sequences
        """
        x = output[key]
        if key == 'candidates':
            padded = []
            for candidates in x:
                # Keep the tail, as truncating='pre' does for the other keys.
                kept = list(candidates[max(len(candidates) - maxlen, 0):])
                missing = maxlen - len(kept)
                # Fresh [value] lists so the padding entries never alias each other.
                padded.append(kept + [[value] for _ in range(missing)])
            return padded

        return K.preprocessing.sequence.pad_sequences(x, truncating='pre', padding='post', maxlen=maxlen, value = value )

    @staticmethod
    def prepare_sentence(sentence, antivocab, output_vocab, labels=None):
        """
        Prepares an output sentence consisting of the sentence itself along with labels and candidates.

        param sentence: iterable of entries that unpack into (id_, lemma, pos, _)
        param antivocab: passed through to utils.replacement_routine
        param output_vocab: mapping used to encode senses; must contain "<UNK>"
        param labels: iterator of gold records (with ``id_`` and ``senses``), or None

        return output: dict with keys: sentence, labels, candidates all list type objects.
            When ``labels`` is None, entries that carry an id contribute nothing to
            ``output['labels']``.
        """
        records = namedtuple("Training", "id_ lemma pos instance")

        output = {"sentence" : [], "labels" : [], "candidates": []}
        for entry in sentence:

            id_, lemma, pos, _ = entry

            output_word = utils.replacement_routine(lemma, entry, antivocab, output_vocab)
            output['sentence'].append(output_word)

            if id_ is None:
                # Entry without an id: the encoded word is its own label and only candidate.
                output['labels'].append(output_word)
                candidates = [output_word]

            else:
                if labels is not None:
                    current_label = next(labels)
                    assert current_label.id_ == id_, "ID mismatch"

                    # Only the first listed sense is used as the target.
                    sense = current_label.senses[0]
                    sense = output_vocab[sense] if sense in output_vocab else output_vocab["<UNK>"]
                    output['labels'].append(sense)
                candidates = utils.candidate_synsets(lemma, pos)
                candidates = [utils.replacement_routine(c, records(id_=None, lemma=c, pos="X", instance=True), antivocab, output_vocab) for c in candidates]

            output['candidates'].append(candidates)
        return output


In [32]:
# Build the batch generator from the notebook's configuration. Keyword arguments
# keep the call robust to parameter order, and PADDING_SIZE replaces the literal 50
# so the padding length is controlled from the configuration cell only.
x = Basic(batch_size=batch_size,
          training_file_path=training_file_path,
          antivocab=antivocab,
          output_vocab=output_vocab,
          PADDING_SIZE=PADDING_SIZE,
          gold_file_path=gold_file_path)

In [33]:
# Size reported by the generator object (delegates to Basic.__len__).
len(x)

37176

In [34]:
# Walk the batch generator and print the shape of each input batch.
# Each item unpacks into two values because `x` was built with a gold file.
# `i` and `j` stay bound to the last batch and are reused by later cells.
for i, j  in x.__getitem__():
    print(i.shape)

(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)
(64, 50)


KeyboardInterrupt: 

In [None]:
# Decode the second-to-last row of the last batch `i` back through reverse_output_vocab.
[reverse_output_vocab[q] for q in i[-2,:]]

In [None]:
training_data_flow = parsers.TrainingParser(training_file_path)

# Print the running sentence count at every batch boundary.
# (The unfinished trailing `if` was removed: it made the whole cell a SyntaxError.)
for batch_count, sentence in enumerate(training_data_flow.parse(), start=1):
    if batch_count % int(batch_size) == 0:
        print(batch_count)

In [None]:
# Inspect the enumerate counter left bound by an earlier loop.
batch_count

In [None]:
# Scratch arithmetic (presumably two batches of 64 -- leftover from debugging).
64*2

In [None]:
from sklearn.utils import shuffle

In [None]:
# Shuffle the last batch's inputs `i` together with its labels `j`,
# after np.squeeze has removed the size-1 axes from `j`.
shuffle(i, np.squeeze(j))

In [None]:
class test(object):
    """
    Scratch class checking that ``len()`` can report state set by a later method call.

    param t: value squared by ``sex()``
    """
    def __init__(self, t):
        self.t = t
        # Start at 0 rather than None: len() raises TypeError when __len__
        # returns a non-integer, which made len() unusable before sex() ran.
        self.length = 0

    def __len__(self):
        """Returns the length recorded so far (0 until ``sex()`` has been called)."""
        return self.length

    def sex(self):
        """Records a length of 2 as a side effect and returns ``t`` squared."""
        self.length = 2
        return self.t**2
    
    

In [None]:
# Instantiate the scratch class with t=2.
t = test(2)

In [None]:
# Sets t.length to 2 as a side effect and returns t.t ** 2.
t.sex()

In [None]:
from lxml import etree