In [12]:
# https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html

In [13]:

from nltk.corpus import gutenberg
norm_bible = gutenberg.sents('bible-kjv.txt')
print(norm_bible)

[['[', 'The', 'King', 'James', 'Bible', ']'], ['The', 'Old', 'Testament', 'of', 'the', 'King', 'James', 'Bible'], ...]


In [9]:
import numpy as np
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

len(word2id)

id2word = {v:k for k, v in word2id.items()}

wids = [[word2id[w.lower()] for w in doc] for doc in norm_bible]

vocab_size = len(word2id)

embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 12767
Vocabulary Sample: [(',', 1), ('the', 2), ('and', 3), (':', 4), ('of', 5), ('.', 6), ('to', 7), ('that', 8), ('in', 9), ('he', 10)]


In [10]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [11]:
# Test this out for some samples
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['[', 'the', 'james', 'bible'] -> Target (Y): king
Context (X): ['the', 'king', 'bible', ']'] -> Target (Y): james
Context (X): ['the', 'old', 'of', 'the'] -> Target (Y): testament
Context (X): ['old', 'testament', 'the', 'king'] -> Target (Y): of
Context (X): ['testament', 'of', 'king', 'james'] -> Target (Y): the
Context (X): ['of', 'the', 'james', 'bible'] -> Target (Y): king
Context (X): ['the', 'first', 'of', 'moses'] -> Target (Y): book
Context (X): ['first', 'book', 'moses', ':'] -> Target (Y): of
Context (X): ['book', 'of', ':', 'called'] -> Target (Y): moses
Context (X): ['of', 'moses', 'called', 'genesis'] -> Target (Y): :
Context (X): ['1', ':', 'in', 'the'] -> Target (Y): 1
