# Vocabulary

In [1]:
import numpy as np
import nltk
from nltk import FreqDist

In [9]:
# Special vocabulary tokens, each paired with the integer code reserved for it.
UNK, UNK_CODE = '<UNK>', 0
BOS, BOS_CODE = '<BOS>', 1
EOS, EOS_CODE = '<EOS>', 2
PAD, PAD_CODE = '<PAD>', 3

# NOTE(review): IGNORE and AS_DOT are not referenced anywhere in the visible
# code; presumably they are meant for token filtering/normalisation - confirm.
# The backslash is escaped explicitly: a bare "\|" is an invalid escape
# sequence and triggers a warning on recent Python versions. The resulting
# string value is unchanged (backslash followed by pipe).
IGNORE = "()[]{}:<>~@#$%^/\\|_+*…–«»"
AS_DOT = ";"

class Vocabulary:
    """Word-level vocabulary made of the most frequent tokens of text files.

    After ``build`` has run, ``_tokens_to_words`` is a list mapping an
    integer code to its word and ``_words_to_tokens`` is the inverse dict.
    The special tokens are placed so that their list position equals
    ``UNK_CODE``, ``BOS_CODE``, ``EOS_CODE`` and ``PAD_CODE`` respectively.
    """

    def __init__(self):
        # Both lookup tables stay None until build() populates them.
        self._tokens_to_words = None
        self._words_to_tokens = None

    @staticmethod
    def _update(fdist, path):
        """Add the token counts of the file at *path* to *fdist*.

        The text is lower-cased, non-breaking spaces are replaced by plain
        spaces and byte-order marks are removed before it is split into
        sentences and then into word tokens with NLTK.

        Args:
            fdist: Counter-like mapping of token -> count, updated in place.
            path: Path of the text file to read.

        Returns:
            The same *fdist* object, for convenient chaining.
        """
        # Decode explicitly as UTF-8 instead of the platform default, so the
        # same corpus tokenizes identically on every machine.
        # NOTE(review): assumes the corpus files are UTF-8 (the BOM stripping
        # below suggests so) - confirm.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        text = text.replace('\xa0', ' ').replace('\ufeff', '').lower()

        for sentence in nltk.tokenize.sent_tokenize(text):
            for word in nltk.tokenize.word_tokenize(sentence):
                fdist[word] += 1
        return fdist

    def build(self, paths, max_size=30000):
        """Build the vocabulary from one or more text files.

        Args:
            paths: A single file path or an iterable of file paths.
            max_size: Maximum number of corpus words to keep (the most
                frequent ones). The four special tokens are added on top
                of this limit.
        """
        if isinstance(paths, str):
            paths = [paths]

        # Collect token frequencies over all files.
        fdist = FreqDist()
        for p in paths:
            fdist = self._update(fdist, p)

        # Lay the special tokens out in the order of their reserved codes so
        # that words[X_CODE] == X. The previous hard-coded order swapped
        # <BOS> and <EOS> relative to BOS_CODE / EOS_CODE.
        specials = sorted([
            (UNK_CODE, UNK),
            (BOS_CODE, BOS),
            (EOS_CODE, EOS),
            (PAD_CODE, PAD),
        ])
        words = [token for _, token in specials]

        # Append the most frequent corpus words after the special tokens.
        words += [w for w, _ in fdist.most_common(max_size)]

        self._tokens_to_words = words
        self._words_to_tokens = {word: code for code, word in enumerate(words)}

    def save(self, path):
        """Persist the vocabulary to *path* (not implemented yet)."""
        raise NotImplementedError

    def load(self, path):
        """Restore the vocabulary from *path* (not implemented yet)."""
        raise NotImplementedError


## Build

In [10]:
# Build a small vocabulary (100 most frequent corpus words plus the special
# tokens) from a single text file and show both lookup tables.
path = "data/anna.txt"

voc = Vocabulary()
voc.build(path, max_size=100)

# One table per line: index -> word list first, then word -> index dict.
print(voc._tokens_to_words, voc._words_to_tokens, sep="\n")



[(',', 39367), ('.', 16451), ('и', 12904), ('–', 10927), ('не', 6534), ('что', 5763), ('в', 5718), ('он', 5544), ('на', 3594), ('она', 3430), ('с', 3327), ('я', 3190), ('как', 2656), ('его', 2574), ('но', 2564), ('?', 2335), ('это', 2220), ('к', 1983), ('ее', 1801), ('все', 1667), ('было', 1654), ('!', 1545), ('так', 1411), ('сказал', 1411), ('а', 1385), ('то', 1384), ('же', 1323), ('ему', 1250), ('о', 1241), ('за', 1139), ('левин', 1134), (';', 1108), ('только', 1016), ('ты', 991), ('у', 913), ('был', 899), ('по', 832), ('когда', 829), ('для', 827), ('сказала', 827), ('бы', 820), ('от', 813), ('да', 806), ('теперь', 805), ('«', 775), ('»', 774), ('вы', 754), ('из', 735), ('была', 728), (':', 706), ('еще', 699), ('ей', 688), ('мне', 675), ('кити', 658), ('они', 644), ('него', 622), ('уже', 600), ('нет', 588), ('очень', 570), ('быть', 560), ('чтобы', 528), ('меня', 524), ('вронский', 508), ('себя', 501), ('этого', 499), ('себе', 499), ('были', 499), ('ни', 496), ('анна', 496), ('если', 