In [3]:
from nltk.corpus import gutenberg
from string import punctuation
import nltk
import numpy as np
import re

In [4]:
# The full corpus can be loaded from NLTK with the line below; this notebook
# works on a small hand-copied sample instead.
# bible = gutenberg.sents('bible-kjv.txt')
# Single-character tokens to discard: every ASCII punctuation mark plus every
# decimal digit. The digit run previously stopped at '7', so the tokens '8'
# and '9' were not caught by the `word not in remove_terms` filter.
remove_terms = punctuation + '0123456789'

# Ten pre-tokenised sentences from the opening of the King James Bible.
# Each sentence is written as one whitespace-delimited string (punctuation and
# verse numbers are separate tokens) and split into its token list.
bible = [sentence.split() for sentence in (
    '[ The King James Bible ]',
    'The Old Testament of the King James Bible',
    'The First Book of Moses : Called Genesis',
    '1 : 1 In the beginning God created the heaven and the earth .',
    '1 : 2 And the earth was without form , and void ; and darkness was upon the face of the deep .',
    'And the Spirit of God moved upon the face of the waters .',
    '1 : 3 And God said , Let there be light : and there was light .',
    '1 : 4 And God saw the light , that it was good : and God divided the light from the darkness .',
    '1 : 5 And God called the light Day , and the darkness he called Night .',
    'And the evening and the morning were the first day .',
)]



In [5]:
# Module-level helpers read by normalize_document.
# Tokenizer instance; normalize_document calls its .tokenize() method.
wpt = nltk.WordPunctTokenizer()
# English stopword list from the NLTK corpus
# (presumably requires the 'stopwords' corpus to be downloaded beforehand).
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalise one document into a lower-cased, stopword-free string.

    The text is stripped of every character that is not an ASCII letter or
    whitespace, lower-cased, trimmed, tokenized with the module-level ``wpt``
    tokenizer, filtered against the module-level ``stop_words`` list, and
    re-joined with single spaces.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; empty if none survive.
    """
    # Flags must be passed by keyword. The fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A (== 258) positionally silently
    # capped the number of removed characters at 258 and applied no flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# Wrap the per-document normaliser with np.vectorize so it can be called on a
# whole sequence of documents in one call (used on the list of sentences later).
normalize_corpus = np.vectorize(normalize_document)

# Bare expression: displays the vectorized object's repr as the cell output.
normalize_corpus

<numpy.vectorize at 0x2a75d63fb48>

In [6]:
# Number of sentences in the sample corpus.
len(bible)

10

In [7]:
# Stage 1: per sentence, drop tokens found in remove_terms, lower-case the
# rest, and glue them back into a single space-separated string.
norm_bible = [
    ' '.join(token.lower() for token in sentence if token not in remove_terms)
    for sentence in bible
]
# Stage 2: run the corpus normaliser, discard empty results, and keep only
# sentences that still contain more than two tokens.
norm_bible = [
    cleaned
    for cleaned in normalize_corpus(norm_bible)
    if cleaned and len(cleaned.split()) > 2
]

norm_bible

['king james bible',
 'old testament king james bible',
 'first book moses called genesis',
 'beginning god created heaven earth',
 'earth without form void darkness upon face deep',
 'spirit god moved upon face waters',
 'god said let light light',
 'god saw light good god divided light darkness',
 'god called light day darkness called night',
 'evening morning first day']

In [8]:
# Report how many sentences the sample corpus holds.
print('Total lines:', len(bible))
# Left disabled: index 10 is out of range for this 10-sentence sample
# (valid indices are 0-9), so these lines would raise IndexError if enabled.
#print('\nSample line:', bible[10])
#print('\nProcessed line:', norm_bible[10])

Total lines: 10


In [22]:
from keras.preprocessing import text

# Learn the vocabulary from the normalised sentences.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

# Forward (word -> id) and reverse (id -> word) lookup tables.
word2id = tokenizer.word_index
id2word = dict((idx, word) for word, idx in word2id.items())

# Embedding width, and a vocabulary size one larger than the number of known
# words (the ids in the output start at 1, so this bounds every id from above).
embed_size = 100
vocab_size = 1 + len(word2id)

# Encode every sentence as the list of its word ids.
wids = [
    [word2id[token] for token in text.text_to_word_sequence(sentence)]
    for sentence in norm_bible
]
print('Vocabulary Size:', vocab_size)

wids

Vocabulary Size: 36


[[5, 6, 7],
 [13, 14, 5, 6, 7],
 [8, 15, 16, 3, 17],
 [18, 1, 19, 20, 9],
 [9, 21, 22, 23, 4, 10, 11, 24],
 [25, 1, 26, 10, 11, 27],
 [1, 28, 29, 2, 2],
 [1, 30, 2, 31, 1, 32, 2, 4],
 [1, 3, 2, 12, 4, 3, 33],
 [34, 35, 8, 12]]

In [13]:
from keras.preprocessing.sequence import skipgrams

# Build one (pairs, labels) tuple per encoded sentence.
skip_grams = [
    skipgrams(sentence_ids, vocabulary_size=vocab_size, window_size=10)
    for sentence_ids in wids
]

# Preview the first ten pairs generated for the first sentence, printing each
# word next to its id followed by the pair's label.
pairs, labels = skip_grams[0]
template = "({:s} ({:d}), {:s} ({:d})) -> {:d}"
for idx in range(10):
    (target, context), label = pairs[idx], labels[idx]
    print(template.format(id2word[target], target,
                          id2word[context], context,
                          label))

(james (6), darkness (4)) -> 0
(king (5), james (6)) -> 0
(bible (7), testament (14)) -> 0
(king (5), bible (7)) -> 1
(james (6), bible (7)) -> 1
(bible (7), face (11)) -> 0
(king (5), spirit (25)) -> 0
(james (6), king (5)) -> 1
(bible (7), king (5)) -> 1
(bible (7), james (6)) -> 1


In [23]:
# Display every (pairs, labels) tuple, one per sentence.
# NOTE(review): this dumps the whole structure; consider slicing (e.g. skip_grams[:1]) to keep the output short.
skip_grams

[([[6, 4],
   [5, 6],
   [7, 14],
   [5, 7],
   [6, 7],
   [7, 11],
   [5, 25],
   [6, 5],
   [7, 5],
   [7, 6],
   [6, 17],
   [5, 6]],
  [0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]),
 ([[5, 6],
   [7, 13],
   [5, 13],
   [13, 5],
   [7, 6],
   [6, 13],
   [7, 15],
   [14, 6],
   [14, 5],
   [6, 32],
   [13, 10],
   [6, 12],
   [14, 7],
   [13, 7],
   [14, 15],
   [13, 6],
   [7, 26],
   [7, 14],
   [6, 5],
   [6, 11],
   [13, 14],
   [5, 7],
   [7, 11],
   [7, 28],
   [14, 23],
   [7, 5],
   [5, 5],
   [14, 6],
   [13, 22],
   [13, 34],
   [6, 14],
   [14, 13],
   [5, 14],
   [14, 15],
   [13, 14],
   [6, 23],
   [5, 23],
   [6, 7],
   [5, 24],
   [5, 6]],
  [0,
   1,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   0,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   1,
   1,
   1,
   0,
   0,
   0,
   0,
   1,
   0,
   1]),
 ([[15, 16],
   [16, 7],
   [17, 16],
   [16, 10],
   [16, 3],
   [3, 14],
   [3, 8],
   [3, 16],


In [21]:
# Labels unpacked from skip_grams[0], i.e. for the first sentence's pairs
# (presumably 1 = words co-occur within the window, 0 = randomly drawn negative sample — confirm against the Keras skipgrams docs).
labels

[0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]