# Vocabulary

In [14]:
import numpy as np
import nltk
from nltk import FreqDist
import pickle

In [25]:
# Special tokens and their fixed integer ids. Vocabulary.build() places each
# token at the index given by its *_CODE, so the codes must be exactly 0..3.
UNK, UNK_CODE = '<UNK>', 0
BOS, BOS_CODE = '<BOS>', 1
EOS, EOS_CODE = '<EOS>', 2
PAD, PAD_CODE = '<PAD>', 3

# NOTE(review): IGNORE and AS_DOT are not referenced in this section;
# presumably characters to drop / to treat as a full stop -- confirm against
# the code that uses them.
# The backslash is escaped explicitly: "\|" is an invalid escape sequence that
# newer Python versions warn about; the string value is unchanged.
IGNORE = "()[]{}:<>~@#$%^/\\|_+*…–«»"
AS_DOT = ";"


class Vocabulary:
    """Two-way mapping between words and integer token ids.

    ``_tokens_to_words`` is a list indexed by token id and
    ``_words_to_tokens`` is the inverse dict.  Both are ``None`` until
    :meth:`build` or :meth:`restore` has been called.
    """

    def __init__(self):
        self._tokens_to_words = None
        self._words_to_tokens = None

    @staticmethod
    def _update(fdist, path):
        """Add the word counts of the text file at *path* to *fdist*.

        The text is lower-cased, non-breaking spaces become plain spaces and
        byte-order marks are removed before it is split into sentences and
        then words with the nltk tokenizers.

        Args:
            fdist: counter-like mapping (word -> count), updated in place.
            path: path of a UTF-8 encoded text file.

        Returns:
            The same *fdist* object.
        """
        # Decode explicitly as UTF-8 instead of the platform default: the
        # '\ufeff' clean-up below only makes sense for UTF-8 input.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
        text = text.replace('\xa0', ' ').replace('\ufeff', '').lower()

        for sentence in nltk.tokenize.sent_tokenize(text):
            for word in nltk.tokenize.word_tokenize(sentence):
                fdist[word] += 1
        return fdist

    def _set_words(self, frequent_words):
        """Install the vocabulary: special tokens first, then *frequent_words*.

        The special tokens are ordered by their ``*_CODE`` so that, for
        example, ``_tokens_to_words[BOS_CODE] == BOS`` always holds.

        Args:
            frequent_words: iterable of words, most frequent first.
        """
        specials = sorted(
            ((UNK_CODE, UNK), (BOS_CODE, BOS), (EOS_CODE, EOS), (PAD_CODE, PAD))
        )
        words = [token for _, token in specials] + list(frequent_words)
        self._tokens_to_words = words
        self._words_to_tokens = {word: code for code, word in enumerate(words)}

    def build(self, paths, max_size=30000):
        """Build the vocabulary from one or more text files.

        Args:
            paths: a single path or an iterable of paths.
            max_size: number of most frequent corpus words to keep; the four
                special tokens are added on top of this.
        """
        if isinstance(paths, str):
            paths = [paths]

        # collect word counts over all files
        fdist = FreqDist()
        for p in paths:
            fdist = self._update(fdist, p)

        # keep only the most frequent words
        self._set_words(w for w, _ in fdist.most_common(max_size))

    def save(self, path):
        """Pickle both mappings to the file at *path*."""
        with open(path, "wb") as f:
            pickle.dump([self._tokens_to_words, self._words_to_tokens], f)

    def restore(self, path):
        """Load mappings previously written by :meth:`save`.

        SECURITY: unpickling can execute arbitrary code -- only restore files
        that come from a trusted source.
        """
        with open(path, "rb") as f:
            self._tokens_to_words, self._words_to_tokens = pickle.load(f)


## Build

In [32]:
# Build a vocabulary from the sample corpus, keeping the 10000 most frequent words.
text_path = "data/anna.txt"

voc = Vocabulary()
voc.build(paths=text_path, max_size=10000)

## Saving and restoring

In [46]:
# Round-trip the vocabulary through a file and check that nothing is lost.
voc_path = "_vocabulary.data"
voc2 = Vocabulary()

voc.save(voc_path)
voc2.restore(voc_path)

assert voc._tokens_to_words == voc2._tokens_to_words
assert voc._words_to_tokens == voc2._words_to_tokens

In [45]:
# Dict comparison sanity check: `==` compares contents regardless of
# insertion order, while `is` is true only for the very same object.
a = {1: 2, 2: 3}
b = {2: 3, 1: 2}  # same items as `a`, inserted in a different order
c = a             # second name for the same dict object

# all three compare equal by value ...
assert a == b and a == c and b == c

# ... and `c` is the same object as `a`
assert a is c
