In [17]:
from collections import Counter
import warnings

In [63]:
def is_number(word):
    """Tell whether ``word`` consists of decimal digits once '.' and ',' are removed.

    Returns False when nothing is left after removing the separators
    (e.g. '' or '.'), and for signed or exponent forms such as '-5' or '1e5',
    because those contain characters that str.isdecimal() rejects.
    """
    digits = word
    for separator in ('.', ','):
        digits = digits.replace(separator, '')
    return digits.isdecimal()

In [76]:
class VocabularyBuilder():
    """Counts words and turns the counts into a word -> index mapping.

    Indices 0, 1 and 2 are reserved for '<PAD>', '<NUMBER>' and '<RARE>'.
    Words are counted case-insensitively through add(); build() then gives
    the ``max_size`` most frequent words consecutive indices starting at 3.
    """

    # Reserved tokens in index order; they always occupy indices 0..2.
    _SPECIAL_TOKENS = ('<PAD>', '<NUMBER>', '<RARE>')

    def __init__(self, max_size = 512):
        """Create an empty builder.

        Args:
            max_size: maximum number of counted words placed in the
                vocabulary, in addition to the three reserved tokens.
        """
        self._words_counter = Counter()
        self.max_size = max_size
        self._vocabulary = {
            token: index for index, token in enumerate(self._SPECIAL_TOKENS)
        }
        self.built = False

    def add(self, word):
        """Count one occurrence of ``word`` (lower-cased).

        Strings accepted by is_number() are skipped and never counted.
        """
        if not is_number(word):
            self._words_counter.update([word.lower()])

    def build(self):
        """(Re)build the vocabulary from the current counts and return it.

        The mapping is rebuilt from the reserved tokens on every call, so
        calling build() repeatedly always yields unique, contiguous indices.
        The same dict object is updated in place and returned.
        """
        vocabulary = self._vocabulary
        # Start over from the reserved tokens: re-assigning indices on top of a
        # previous build would give every known word the same len() value.
        vocabulary.clear()
        vocabulary.update(
            (token, index) for index, token in enumerate(self._SPECIAL_TOKENS)
        )
        for word, _count in self._words_counter.most_common(self.max_size):
            # setdefault keeps a reserved token's index if that exact string
            # somehow ended up in the counter.
            vocabulary.setdefault(word, len(vocabulary))
        self.built = True
        return vocabulary

    def get_vocab(self):
        """Return the vocabulary, warning if build() has not been called yet.

        Before build() the returned mapping holds only the reserved tokens.
        """
        if not self.built:
            # stacklevel=2 attributes the warning to the caller of get_vocab().
            warnings.warn(
                "The vocabulary is not built. Use VocabularyBuilder.build(). Returning default vocabulary.",
                Warning,
                stacklevel=2,
            )
        return self._vocabulary
            

In [77]:
# Fresh builder using the default max_size of 512.
vocab_b = VocabularyBuilder()

In [78]:
# '12.3' passes is_number(), so add() skips it and nothing is counted.
vocab_b.add('12.3')

In [79]:
# No word has been counted, so the result holds only the three reserved tokens.
vocab_b.build()

{'<PAD>': 0, '<NUMBER>': 1, '<RARE>': 2}

In [80]:
# build() has already run, so the vocabulary is returned without a warning.
vocab_b.get_vocab()

{'<PAD>': 0, '<NUMBER>': 1, '<RARE>': 2}

In [1]:
from utils import vocabulary

In [2]:
# Same experiment against the packaged utils.vocabulary module.
# NOTE(review): that module's source is not shown in this section.
vb = vocabulary.VocabularyBuilder()

In [3]:
# One ordinary word and one comma-separated numeric string.
# NOTE(review): the recorded outputs below contain only 'honda', which suggests
# utils.vocabulary also skips numeric strings — confirm against its source.
vb.add("honda")
vb.add("1,23")

In [4]:
# Called before build(): the recorded output shows only the reserved tokens.
# NOTE(review): presumably this also emits the "not built" warning — confirm in utils.vocabulary.
vb.get_vocab()



{'<PAD>': 0, '<NUMBER>': 1, '<RARE>': 2}

In [5]:
# NOTE(review): the recorded output reports size 4 but maps 'honda' to index 4,
# leaving index 3 unused — verify the index assignment in utils.vocabulary.
vb.build()

Vocabulary of size 4 built!


{'<PAD>': 0, '<NUMBER>': 1, '<RARE>': 2, 'honda': 4}

In [6]:
# After build(): the recorded output matches the mapping returned by build() above.
vb.get_vocab()

{'<PAD>': 0, '<NUMBER>': 1, '<RARE>': 2, 'honda': 4}