In [12]:
import os
import sys
sys.path.insert(0, '..')


In [13]:
from preprocessing.save_cove_weights import save_cove_weights
from preprocessing.create_train_data import DataParser
from preprocessing.download_data import download_data
from preprocessing.embedding_util import split_vocab_and_embedding
from preprocessing.s3_util import maybe_upload_data_files_to_s3
from flags import get_options_from_flags

In [14]:
import numpy as np
import operator
import os
import preprocessing.constants as constants
import preprocessing.chars as chars

In [15]:
def _get_line_count(filename):
    num_lines = 0
    with open(filename, "r", encoding="utf-8") as f:
        for _ in f:
            num_lines += 1
    return num_lines

In [17]:
input_file = os.path.join("../downloads", constants.VECTOR_FILE)
embedding_output_file = os.path.join("../data", constants.EMBEDDING_FILE)
vocab_output_file = os.path.join("../data", constants.VOCAB_FILE)
vocab_chars_output_file = os.path.join("../data", constants.VOCAB_CHARS_FILE)
if all([os.path.exists(f) for f in 
    [embedding_output_file, vocab_output_file, vocab_chars_output_file]]):
    print("Word embedding and vocab files already exist")
    #return
print("Creating NumPy word embedding file and vocab files")
num_lines = _get_line_count(input_file)
print("Vocab size: %d" % num_lines)


Word embedding and vocab files already exist
Creating NumPy word embedding file and vocab files
Vocab size: 2196017


In [18]:
# Include 4 entries for bos/eos/unk/pad (they will all be left as 0 vectors).
embedding = np.zeros((num_lines + 4, constants.WORD_VEC_DIM), dtype=np.float32)
vocab_o_file = open(vocab_output_file, "w", encoding="utf-8")


In [19]:
embedding

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [22]:
embedding.shape

(2196021, 300)

In [20]:
# Get IDs for the total vocab, not just the words. This includes
# the bos/eos/unk/pad.
vocab_chars = np.zeros((num_lines + 4, constants.MAX_WORD_LEN), dtype=np.uint8)


In [21]:
vocab_chars

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [23]:
vocab_chars.shape

(2196021, 25)

In [24]:
input_file

'../downloads/glove.840B.300d.txt'

In [25]:
i_file = open(input_file, "r", encoding="utf-8")
i = 0
char_counts = {}
vocab_list = []
for line in i_file:
    idx = line.index(" ")
    word = line[:idx]
    vocab_list.append(word)
    for c in word:
        if c in char_counts:
            char_counts[c] += 1
        else:
            char_counts[c] = 1
    vocab_o_file.write(word + "\n")
    embedding[i] = np.fromstring(line[idx + 1:], dtype=np.float32, sep=' ')
    i += 1
    if i % 10000 == 0 or i == num_lines:
        print("Processed %d of %d (%f percent done)" % (i, num_lines, 100 * float(i) / float(num_lines)), end="\r")


Processed 2196017 of 2196017 (100.000000 percent done)

In [26]:
sorted_chars = sorted(char_counts.items(), key=operator.itemgetter(1),
    reverse=True)


In [27]:
sorted_chars

[('e', 1354358),
 ('a', 1093525),
 ('i', 952983),
 ('o', 890981),
 ('r', 850106),
 ('n', 832787),
 ('t', 767491),
 ('s', 745565),
 ('l', 626051),
 ('0', 476241),
 ('c', 435938),
 ('1', 410373),
 ('d', 396763),
 ('u', 389591),
 ('m', 349989),
 ('2', 342964),
 ('h', 339529),
 ('-', 330745),
 ('p', 281784),
 ('g', 279148),
 ('3', 213468),
 ('S', 211261),
 ('y', 210565),
 (':', 199751),
 ('b', 199191),
 ('5', 191100),
 ('A', 189688),
 ('4', 184435),
 ('.', 174535),
 ('k', 172986),
 ('C', 157929),
 ('9', 151780),
 ('T', 148214),
 ('f', 146117),
 ('E', 139104),
 ('8', 138954),
 ('M', 138861),
 ('6', 138578),
 ('7', 135860),
 ('w', 135529),
 ('R', 129252),
 ('v', 125159),
 ('P', 124767),
 ('I', 117385),
 ('/', 112908),
 ('B', 110429),
 ('D', 108928),
 ('N', 107473),
 ('L', 107203),
 ('O', 98874),
 ('H', 80269),
 ('G', 78283),
 ('F', 75475),
 ('W', 59483),
 ('z', 56062),
 ('K', 54839),
 ('x', 52897),
 ('U', 50737),
 (',', 47315),
 ('V', 44560),
 ('j', 30326),
 ('J', 30001),
 ('Y', 27180),
 ('_

In [29]:
len(sorted_chars)

1525

In [30]:
frequent_chars = dict((x[0], i) for i, x in enumerate(
    sorted_chars[:chars.MAX_CHARS]))

In [31]:
frequent_chars

{'!': 69,
 '"': 110,
 '#': 68,
 '$': 92,
 '%': 217,
 '&': 83,
 "'": 73,
 '(': 89,
 ')': 87,
 '*': 71,
 '+': 75,
 ',': 58,
 '-': 17,
 '.': 28,
 '/': 44,
 '0': 9,
 '1': 11,
 '2': 15,
 '3': 20,
 '4': 27,
 '5': 25,
 '6': 37,
 '7': 38,
 '8': 35,
 '9': 31,
 ':': 23,
 ';': 123,
 '<': 93,
 '=': 100,
 '>': 94,
 '?': 70,
 '@': 74,
 'A': 26,
 'B': 45,
 'C': 30,
 'D': 46,
 'E': 34,
 'F': 52,
 'G': 51,
 'H': 50,
 'I': 43,
 'J': 61,
 'K': 55,
 'L': 48,
 'M': 36,
 'N': 47,
 'O': 49,
 'P': 42,
 'Q': 67,
 'R': 40,
 'S': 21,
 'T': 32,
 'U': 57,
 'V': 59,
 'W': 53,
 'X': 64,
 'Y': 62,
 'Z': 66,
 '[': 109,
 '\\': 125,
 ']': 116,
 '^': 193,
 '_': 63,
 '`': 132,
 'a': 1,
 'b': 24,
 'c': 10,
 'd': 12,
 'e': 0,
 'f': 33,
 'g': 19,
 'h': 16,
 'i': 2,
 'j': 60,
 'k': 29,
 'l': 8,
 'm': 14,
 'n': 5,
 'o': 3,
 'p': 18,
 'q': 65,
 'r': 4,
 's': 7,
 't': 6,
 'u': 13,
 'v': 41,
 'w': 39,
 'x': 56,
 'y': 22,
 'z': 54,
 '{': 245,
 '|': 231,
 '~': 198,
 '\x92': 247,
 '\xa0': 101,
 '©': 240,
 'ª': 173,
 '²': 225,
 '³': 

In [32]:
len(frequent_chars)

256

In [33]:
chars.CHAR_BOW_ID

258

In [35]:
chars.CHAR_PAD_ID

260

In [37]:
chars.CHAR_UNK_ID

261

In [38]:
chars.CHAR_EOW_ID

259

In [34]:
constants.MAX_WORD_LEN

25

In [39]:
vocab_list

[',',
 '.',
 'the',
 'and',
 'to',
 'of',
 'a',
 'in',
 '"',
 ':',
 'is',
 'for',
 'I',
 ')',
 '(',
 'that',
 '-',
 'on',
 'you',
 'with',
 "'s",
 'it',
 'The',
 'are',
 'by',
 'at',
 'be',
 'this',
 'as',
 'from',
 'was',
 'have',
 'or',
 '...',
 'your',
 'not',
 '!',
 '?',
 'will',
 'an',
 "n't",
 'can',
 'but',
 'all',
 'my',
 'has',
 '|',
 'do',
 'we',
 'they',
 'more',
 'one',
 'about',
 'he',
 ';',
 "'",
 'out',
 '$',
 'their',
 'so',
 'his',
 'up',
 'It',
 '&',
 'like',
 '/',
 '1',
 'which',
 'if',
 'would',
 'our',
 '[',
 ']',
 'me',
 'who',
 'just',
 'This',
 'time',
 'what',
 'A',
 '2',
 'had',
 'when',
 'there',
 'been',
 'some',
 'get',
 'were',
 'other',
 'also',
 'In',
 'her',
 'them',
 'You',
 'new',
 'We',
 'no',
 'any',
 '>',
 'people',
 'than',
 'into',
 'only',
 '3',
 'how',
 'its',
 'first',
 'said',
 'i',
 'If',
 'over',
 'make',
 'good',
 'know',
 'very',
 '%',
 'am',
 'now',
 'see',
 'may',
 'she',
 'could',
 'most',
 'then',
 "'m",
 'use',
 'these',
 'did',
 'An

In [40]:
len(vocab_list)

2196017

In [41]:
print("Creating word character data")
for z in range(len(vocab_list)):
    word = vocab_list[z]
    vocab_chars[z, 0] = chars.CHAR_BOW_ID
    for zz in range(constants.MAX_WORD_LEN - 1):
        insert_index = zz + 1
        if zz >= len(word):
            vocab_chars[z, insert_index] = chars.CHAR_PAD_ID
        elif word[zz] not in frequent_chars:
            vocab_chars[z, insert_index] = chars.CHAR_UNK_ID
        else:
            vocab_chars[z, insert_index] = frequent_chars[word[zz]]
    vocab_chars[z, min(1 + len(word), constants.MAX_WORD_LEN - 1)] = \
        chars.CHAR_EOW_ID

Creating word character data


In [42]:
# The order of the following must match that of vocab.py
vocab_chars[num_lines, :] = chars.CHAR_BOS_ID
vocab_chars[num_lines + 1, :] = chars.CHAR_EOS_ID
vocab_chars[num_lines + 2, :] = chars.CHAR_PAD_ID
vocab_chars[num_lines + 3, :] = chars.CHAR_UNK_ID

In [44]:
vocab_chars

array([[ 2, 58,  3, ...,  4,  4,  4],
       [ 2, 28,  3, ...,  4,  4,  4],
       [ 2,  6, 16, ...,  4,  4,  4],
       ...,
       [ 1,  1,  1, ...,  1,  1,  1],
       [ 4,  4,  4, ...,  4,  4,  4],
       [ 5,  5,  5, ...,  5,  5,  5]], dtype=uint8)

In [45]:
vocab_chars.size

54900525

In [46]:
np.save(vocab_chars_output_file, vocab_chars)
np.save(embedding_output_file, embedding)
vocab_o_file.close()
i_file.close()
print("")
print("Finished creating vocabulary and embedding file")


Finished creating vocabulary and embedding file


In [47]:
vocab_chars_output_file

'../data/vocab.chars.npy'

In [48]:
embedding_output_file

'../data/glove.embedding.npy'