# Step 0: Imports and Constants #

In [20]:
###########################
# Imports for the project #
###########################
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import h5py
from nltk import word_tokenize
import re
import pickle
import codecs
from unidecode import unidecode

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [24]:
#############
# Constants #
#############

# Output path of the HDF5 file that receives the token sequence array.
word_sequence_dest = "word_sequence.hdf5"
# Output paths of the pickled word -> index and index -> word dictionaries.
word_mapping_dest = "word_map.pkl"
idx_mapping_dest = "idx_map.pkl"
# Regex alternation of delimiters (space, tab, newline, CRLF, colon).
# NOTE(review): no active code in the visible cells uses this -- confirm before removing.
delims = ' |\t|\n|\r\n|:'
# Pruning threshold: a token occurring fewer than prune_freq times in the whole
# text is replaced by "<unknown>", i.e. a word must appear at least 3 times to be kept.
prune_freq = 3

# Step 1: Acquire dataset #

We already have the Bible `.txt` files in our local directory, so all we need to do is read them in.

**Currently we are only reading in the English version.**

**Note: the text is tokenized with NLTK's `word_tokenize` rather than by simply splitting on spaces. TODO: confirm this tokenizer is adequate for our purposes.**

In [25]:
# Read the whole corpus as Unicode text; the file is only held open for the read.
with codecs.open('small_bible.txt', encoding='utf-8') as f:
    raw_text = f.read()

# Transliterate to plain ASCII, then tokenize with NLTK.
# (An earlier approach split the text with re.split(delims, ...) instead.)
f_str = unidecode(raw_text)
bible = word_tokenize(f_str)

In [26]:
# Count how many times each token occurs in the corpus.
bible_map_word_to_freq = {}
for word in bible:
    bible_map_word_to_freq[word] = bible_map_word_to_freq.get(word, 0) + 1

# Keep tokens seen at least prune_freq times; replace rarer ones with the
# "<unknown>" placeholder so the vocabulary stays small.
bible = [word if bible_map_word_to_freq[word] >= prune_freq else "<unknown>" for word in bible]

# Peek at a slice of the pruned token stream.
print(bible[1000:1500])

['because', 'that', 'in', 'it', 'he', 'had', 'rested', 'from', 'all', 'his', 'work', 'which', 'God', 'created', 'and', 'made', '.', 'These', 'are', 'the', 'generations', 'of', 'the', 'heavens', 'and', 'of', 'the', 'earth', 'when', 'they', 'were', 'created', ',', 'in', 'the', 'day', 'that', 'the', 'LORD', 'God', 'made', 'the', 'earth', 'and', 'the', 'heavens', ',', 'And', 'every', 'plant', 'of', 'the', 'field', 'before', 'it', 'was', 'in', 'the', 'earth', ',', 'and', 'every', 'herb', 'of', 'the', 'field', 'before', 'it', 'grew', ':', 'for', 'the', 'LORD', 'God', 'had', 'not', 'caused', 'it', 'to', 'rain', 'upon', 'the', 'earth', ',', 'and', 'there', 'was', 'not', 'a', 'man', 'to', 'till', 'the', 'ground', '.', 'But', 'there', 'went', 'up', 'a', '<unknown>', 'from', 'the', 'earth', ',', 'and', 'watered', 'the', 'whole', 'face', 'of', 'the', 'ground', '.', 'And', 'the', 'LORD', 'God', 'formed', 'man', 'of', 'the', 'dust', 'of', 'the', 'ground', ',', 'and', 'breathed', 'into', 'his', 'nost

In [27]:
# Token sequence as a 1-D numpy array with one entry per token; its length
# depends on the corpus that was loaded.
bible_seq = np.array(bible)

# Vocabulary: the distinct tokens left after pruning.
bible_set = set(bible)

# Give every vocabulary entry an integer id and build both lookup directions.
# Both tables enumerate the same unmodified set, so they are exact inverses.
bible_map_word_to_idx = dict((word, i) for i, word in enumerate(bible_set))
bible_map_idx_to_word = dict(enumerate(bible_set))

In [28]:
# Sanity check: both sizes should match, and the largest assigned index
# should be one less than the vocabulary size.
print(len(bible_map_idx_to_word))
print(len(bible_set))
max_val = 0
for idx in bible_map_word_to_idx.values():
    if idx > max_val:
        max_val = idx
print(max_val)

4094
4094
4093


In [29]:
# This is a numpy array, so we use h5py.
with h5py.File(word_sequence_dest, 'w') as f:
    f.create_dataset('bible_seq', data=bible_seq)
# This is a python dict, so we use pickle.
with open(word_mapping_dest, 'w') as f:
    pickle.dump(bible_map_word_to_idx, f)
with open(idx_mapping_dest, 'w') as f:
    pickle.dump(bible_map_idx_to_word, f)