In [96]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import keras

pd.options.display.max_colwidth = 200
%matplotlib inline

Using TensorFlow backend.


In [97]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vhuang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
wpt = nltk.WordPunctTokenizer()

In [99]:
stop_words = nltk.corpus.stopwords.words('english')

In [100]:
stop_words[10:20]

["you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [101]:
def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [102]:
#below is testing example

In [103]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]

In [104]:
corpus = np.array(corpus)

In [105]:
corpus

array(['The sky is blue and beautiful.',
       'Love this blue and beautiful sky!',
       'The quick brown fox jumps over the lazy dog.',
       "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
       'I love green eggs, ham, sausages and bacon!',
       'The brown fox is quick and the blue dog is lazy!',
       'The sky is very blue and the sky is very beautiful today',
       'The dog is lazy but the brown fox is quick!'], dtype='<U66')

In [106]:
corpus[4]

'I love green eggs, ham, sausages and bacon!'

In [107]:
doc = re.sub(r'[^a-zA-Z\s]', '', corpus[4], re.I|re.A)
doc

'I love green eggs ham sausages and bacon'

In [108]:
doc = doc.lower()
doc

'i love green eggs ham sausages and bacon'

In [109]:
doc = doc.strip()
doc

'i love green eggs ham sausages and bacon'

In [110]:
tokens = wpt.tokenize(doc)
tokens

['i', 'love', 'green', 'eggs', 'ham', 'sausages', 'and', 'bacon']

In [111]:
filtered_tokens = [token for token in tokens if token not in stop_words]
filtered_tokens

['love', 'green', 'eggs', 'ham', 'sausages', 'bacon']

In [112]:
doc = ' '.join(filtered_tokens)
doc

'love green eggs ham sausages bacon'

In [113]:
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U51')

# Word Embeddings

In [114]:
from nltk.corpus import gutenberg
from string import punctuation

In [115]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [116]:
gutenberg

<PlaintextCorpusReader in 'C:\\Users\\vhuang\\AppData\\Roaming\\nltk_data\\corpora\\gutenberg'>

In [117]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\vhuang\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [118]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vhuang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [119]:
bible = gutenberg.sents('bible-kjv.txt')

In [120]:
remove_terms = punctuation + '0123456789'

In [121]:
bible[:3]

[['[', 'The', 'King', 'James', 'Bible', ']'],
 ['The', 'Old', 'Testament', 'of', 'the', 'King', 'James', 'Bible'],
 ['The', 'First', 'Book', 'of', 'Moses', ':', 'Called', 'Genesis']]

In [122]:
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]

In [123]:
norm_bible[:3]

[['the', 'king', 'james', 'bible'],
 ['the', 'old', 'testament', 'of', 'the', 'king', 'james', 'bible'],
 ['the', 'first', 'book', 'of', 'moses', 'called', 'genesis']]

In [124]:
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible[:5]

['the king james bible',
 'the old testament of the king james bible',
 'the first book of moses called genesis',
 'in the beginning god created the heaven and the earth',
 'and the earth was without form and void and darkness was upon the face of the deep']

In [125]:
norm_bible = filter(None, normalize_corpus(norm_bible))

In [126]:
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

## Implementing word2vec using CBOW

In [128]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

In [129]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

In [132]:
word2id = tokenizer.word_index
word2id

{'shall': 1,
 'unto': 2,
 'lord': 3,
 'thou': 4,
 'thy': 5,
 'god': 6,
 'ye': 7,
 'said': 8,
 'thee': 9,
 'upon': 10,
 'man': 11,
 'israel': 12,
 'king': 13,
 'son': 14,
 'hath': 15,
 'people': 16,
 'came': 17,
 'house': 18,
 'come': 19,
 'one': 20,
 'children': 21,
 'also': 22,
 'day': 23,
 'land': 24,
 'men': 25,
 'shalt': 26,
 'let': 27,
 'go': 28,
 'hand': 29,
 'saying': 30,
 'us': 31,
 'made': 32,
 'even': 33,
 'went': 34,
 'behold': 35,
 'saith': 36,
 'every': 37,
 'therefore': 38,
 'things': 39,
 'father': 40,
 'sons': 41,
 'hast': 42,
 'david': 43,
 'make': 44,
 'say': 45,
 'may': 46,
 'earth': 47,
 'jesus': 48,
 'great': 49,
 'name': 50,
 'thine': 51,
 'away': 52,
 'put': 53,
 'among': 54,
 'thereof': 55,
 'forth': 56,
 'give': 57,
 'neither': 58,
 'take': 59,
 'city': 60,
 'days': 61,
 'brought': 62,
 'moses': 63,
 'two': 64,
 'heart': 65,
 'pass': 66,
 'judah': 67,
 'jerusalem': 68,
 'according': 69,
 'know': 70,
 'took': 71,
 'thus': 72,
 'offering': 73,
 'bring': 74,
 'goo

In [133]:
word2id['PAD'] = 0

In [134]:
id2word = {v:k for k, v in word2id.items()}

In [136]:
id2word[0]

'PAD'

In [137]:
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

In [145]:
wids[:5]

[[13, 1154, 5766],
 [154, 2450, 13, 1154, 5766],
 [132, 310, 63, 86, 8480],
 [582, 6, 1180, 94, 47],
 [47, 136, 1883, 1884, 396, 10, 144, 860]]

In [140]:
doc

'grace lord jesus christ'

In [139]:
text.text_to_word_sequence(doc)

['grace', 'lord', 'jesus', 'christ']

In [141]:
vocab_size = len(word2id)

In [142]:
embed_size = 100
window_size = 2

In [147]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <= i < sentence_length
                                 and i != index])
            label_word.append(word)
            
            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [149]:
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size,
                                        vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['old', 'testament', 'james', 'bible'] -> Target (Y): king
Context (X): ['first', 'book', 'called', 'genesis'] -> Target (Y): moses
Context (X): ['beginning', 'god', 'heaven', 'earth'] -> Target (Y): created
Context (X): ['earth', 'without', 'void', 'darkness'] -> Target (Y): form
Context (X): ['without', 'form', 'darkness', 'upon'] -> Target (Y): void
Context (X): ['form', 'void', 'upon', 'face'] -> Target (Y): darkness
Context (X): ['void', 'darkness', 'face', 'deep'] -> Target (Y): upon
Context (X): ['spirit', 'god', 'upon', 'face'] -> Target (Y): moved
Context (X): ['god', 'moved', 'face', 'waters'] -> Target (Y): upon
Context (X): ['god', 'said', 'light', 'light'] -> Target (Y): let
Context (X): ['god', 'saw', 'good', 'god'] -> Target (Y): light


In [150]:
# Build CBOW Deep Network Model

In [167]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

In [168]:
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))

In [170]:
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))

In [171]:
cbow.add(Dense(vocab_size, activation='softmax'))

In [173]:
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [174]:
print(cbow.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1242500   
_________________________________________________________________
lambda_2 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 12425)             1254925   
Total params: 2,497,425
Trainable params: 2,497,425
Non-trainable params: 0
_________________________________________________________________
None


In [185]:
for epoch in range(1, 6):
    loss = 0
    i = 0
    for x, y  in generate_context_word_pairs(wids, window_size, vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Processed 100000 (context, word) pairs
Processed 200000 (context, word) pairs
Processed 300000 (context, word) pairs


KeyboardInterrupt: 

In [182]:
cbow.get_weights()[0].shape

(12425, 100)

In [183]:
cbow.get_weights()[1].shape

(100, 12425)

In [184]:
cbow.get_weights()[2].shape

(12425,)

## Implement word2vec using Skip-Gram

In [187]:
from keras.preprocessing import text

In [188]:
tokenizer = text.Tokenizer()

In [189]:
tokenizer.fit_on_texts(norm_bible)

In [190]:
word2id = tokenizer.word_index

In [191]:
norm_bible[:5]

['king james bible',
 'old testament king james bible',
 'first book moses called genesis',
 'beginning god created heaven earth',
 'earth without form void darkness upon face deep']

In [193]:
id2word = {v:k for k, v in word2id.items()}

In [194]:
vocab_size = len(word2id) + 1
embed_size = 100

In [195]:
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

In [196]:
from keras.preprocessing.sequence import skipgrams

In [197]:
#generate skip-grams

In [198]:
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]

In [199]:
pairs, labels = skip_grams[0][0], skip_grams[0][1]

In [202]:
type(skip_grams)

list

In [204]:
#build  the skip-grams model architecture

In [209]:
from tensorflow.keras.layers import Concatenate

In [211]:
from tensorflow.keras.layers import Dense, Reshape

In [213]:
from tensorflow.keras.layers import Embedding

In [215]:
from tensorflow.keras.models import Sequential

In [228]:
from tensorflow.keras.layers import Input

In [241]:
word_model = Sequential()
word_model.add(Input(shape=(embed_size, )))
word_model.add(Embedding(vocab_size, embed_size, 
                         embeddings_initializer='glorot_uniform', 
                         input_length=1))

In [242]:
word_model.add(Reshape((embed_size, )))

In [243]:
context_model = Sequential()
context_model.add(Input(shape=(embed_size, )))
context_model.add(Embedding(vocab_size, embed_size, 
                            embeddings_initializer='glorot_uniform',
                            input_length=1))

In [244]:
context_model.add(Reshape((embed_size, )))

In [245]:
model = Sequential()

In [246]:
model.add(Concatenate([word_model, context_model]))

In [247]:
model.add(Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid'))

In [248]:
model.compile(loss='mse', optimizer='rmsprop')

In [249]:
print(model.summary())

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

In [None]:
word_model = Input(shape=(embed_size, ))

In [258]:
for epoch in range(1, 6):
    loss = 0
    for i, elem in enumerate(skip_grams[:2]):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')

        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)