# Step 0: Imports and Constants

In [45]:
###########################
# Imports for the project #
###########################
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import h5py
import nltk
import re
import pickle

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [56]:
#############
# Constants #
#############

# Destination of the word-sequence array (written with h5py at the end of the notebook).
word_sequence_dest = "word_sequence.hdf5"
# Destinations of the pickled word -> index and index -> word dictionaries.
word_mapping_dest = "word_map.pkl"
idx_mapping_dest = "idx_map.pkl"
# Regex alternation passed to re.split: a token boundary is a space, tab, LF, CRLF or colon.
delims = ' |\t|\n|\r\n|:'
prune_freq = 1 # words seen fewer than prune_freq times become "<unknown>"; with 1 nothing is pruned.

# Step 1: Acquire dataset #

We already have the .txt files of the Bible in our own file directory, so all we need to do is read them. (Note: the cell below currently reads `overfit.txt`.)

**Currently we are only reading in the English version.**

**TODO: We don't have a "proper" tokenizer right now. We are just splitting on spaces, tabs, newlines and colons, so punctuation stays attached to the words.**

In [60]:
# Load the whole corpus into memory as one string.
with open("overfit.txt") as corpus_file:
    corpus_text = corpus_file.read()

# Split on the delimiter regex; adjacent delimiters produce empty strings,
# which are discarded so that only real tokens remain.
bible = [token for token in re.compile(delims).split(corpus_text) if token]

In [64]:
# Count how many times each token occurs in the corpus.
bible_map_word_to_freq = {}
for word in bible:
    bible_map_word_to_freq[word] = bible_map_word_to_freq.get(word, 0) + 1

# Replace every token seen fewer than prune_freq times with the "<unknown>"
# placeholder; all other tokens are kept unchanged and in their original order.
bible = [
    word if bible_map_word_to_freq[word] >= prune_freq else "<unknown>"
    for word in bible
]

# Preview the first 100 tokens. The parenthesized single-argument form prints
# the same text on Python 2 and is valid syntax on Python 3.
print(bible[:100])

['First', 'Citizen', 'Before', 'we', 'proceed', 'any', 'further,', 'hear', 'me', 'speak.', 'All', 'Speak,', 'speak.', 'First', 'Citizen', 'You', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish?', 'All', 'Resolved.', 'resolved.', 'First', 'Citizen', 'First,', 'you', 'know', 'Caius', 'Marcius', 'is', 'chief', 'enemy', 'to', 'the', 'people.', 'All', 'We', "know't,", 'we', "know't.", 'First', 'Citizen', 'Let', 'us', 'kill', 'him,', 'and', "we'll", 'have', 'corn', 'at', 'our', 'own', 'price.', "Is't", 'a', 'verdict?', 'All', 'No', 'more', 'talking', "on't;", 'let', 'it', 'be', 'done', 'away,', 'away!', 'Second', 'Citizen', 'One', 'word,', 'good', 'citizens.', 'First', 'Citizen', 'We', 'are', 'accounted', 'poor', 'citizens,', 'the', 'patricians', 'good.', 'What', 'authority', 'surfeits', 'on', 'would', 'relieve', 'us', 'if', 'they', 'would']


In [65]:
# Turn the token list into a 1-D NumPy array with one entry per token.
bible_seq = np.array(bible)

# The vocabulary is the set of distinct tokens. Indices are assigned over the
# *sorted* vocabulary so the mapping is reproducible: raw set iteration order
# can change between interpreter runs when string hashing is randomized.
bible_set = set(bible)
bible_vocab = sorted(bible_set)
bible_map_word_to_idx = {word: idx for idx, word in enumerate(bible_vocab)}
# Exact inverse of the mapping above (index -> token).
bible_map_idx_to_word = {idx: word for idx, word in enumerate(bible_vocab)}

In [71]:
# Sanity check: both counts should be equal (one index per distinct token) and
# the largest index should be exactly one less than the vocabulary size.
print(len(bible_map_idx_to_word))
print(len(bible_set))
# Largest index in the word -> index map; 0 when the map is empty.
max_val = max(bible_map_word_to_idx.values()) if bible_map_word_to_idx else 0
print(max_val)

1850
1850
1849


In [63]:
# The token sequence is a NumPy array, so it is stored as an HDF5 dataset.
# NOTE(review): on Python 3 np.array(list_of_str) has a Unicode dtype, which
# h5py may refuse to store directly -- confirm before running under Python 3.
with h5py.File(word_sequence_dest, 'w') as f:
    f.create_dataset('bible_seq', data=bible_seq)

# The vocabulary mappings are plain dicts, so they are pickled. Pickle writes a
# byte stream, so the files are opened in binary mode ('wb'); text mode fails
# on Python 3.
with open(word_mapping_dest, 'wb') as f:
    pickle.dump(bible_map_word_to_idx, f)
with open(idx_mapping_dest, 'wb') as f:
    pickle.dump(bible_map_idx_to_word, f)