## Problem Statement 7
### Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 2) using the below steps:
    a. Data preparation
    b. Generate training data
    c. Train model
    d. Output

In [1]:
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras import backend as K

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [2]:
data="""But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?"""

dl_data=data.split()

## a. Data preparation

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id = tokenizer.word_index
words2id['PAD']=0

id2words = {v:k for k,v in words2id.items()}

wids = [[words2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(words2id)
embed_size=100
window_size=2

In [4]:
print("Vocabulary size: ", vocab_size)
print("Vocabulary items: ", list(words2id.items())[:10])

Vocabulary size:  102
Vocabulary items:  [('to', 1), ('of', 2), ('pleasure', 3), ('pain', 4), ('a', 5), ('the', 6), ('who', 7), ('but', 8), ('and', 9), ('or', 10)]


## b. Generating training data

In [5]:
def pairwise(corpus, window_size, vocab_size):
    context_length=window_size*2

    for words in corpus:
        sentence_length=len(words)
        for index, word in enumerate(words):
            context_words=[]
            label_word=[]
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <= i < sentence_length
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield(x,y)

## c. Training the model

In [6]:
cbow = Sequential()

cbow.add(Embedding(input_dim = vocab_size, output_dim = embed_size, input_length = window_size * 2))
cbow.add(Lambda(lambda x:K.mean(x, axis = 1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))

cbow.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [7]:
print(cbow.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            10200     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 102)               10302     
                                                                 
Total params: 20,502
Trainable params: 20,502
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
for epoch in range(1,6):
    loss=0

    for x,y in pairwise(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        loss += cbow.train_on_batch(x, y)
    print("Epoch: {} Loss: {}".format(epoch,loss))
    print()

Epoch: 1 Loss: 773.6744208335876

Epoch: 2 Loss: 759.7337009906769

Epoch: 3 Loss: 748.727814912796

Epoch: 4 Loss: 744.9754633903503

Epoch: 5 Loss: 743.4458160400391



In [9]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2words.values())[1:]).head()

(101, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,0.035179,0.023368,0.049963,-0.007478,0.034212,0.038297,-0.031115,0.043417,-0.047161,0.028689,...,0.005756,-0.018679,0.007764,0.004578,-0.025996,-0.007956,0.019899,-0.040133,-0.049973,0.043166
pleasure,0.045747,-0.016991,0.041693,-0.012932,-0.03986,0.009047,0.002665,0.024652,-0.046257,0.027263,...,-0.000842,-0.013116,-0.013517,-0.007207,0.007037,-0.043289,0.003623,0.038937,-0.049211,-0.009029
pain,0.027725,0.047766,0.037364,-0.010055,0.011514,0.038374,0.03894,0.022389,0.00794,0.022344,...,0.023969,-0.038194,0.031198,0.011376,0.037452,-0.028125,0.040851,-0.049558,-0.048536,0.002859
a,0.039197,0.014887,0.005141,0.007287,0.018211,-0.008881,-0.047905,-0.02202,-0.027918,-0.022115,...,-0.008482,-0.017775,0.037556,-0.022071,-0.021873,0.034996,0.019938,-0.031782,0.013948,0.006009
the,-0.015644,0.001658,0.005288,0.001571,-0.025929,0.015047,-0.016131,-0.000493,-0.033741,0.038236,...,-0.047691,-0.026041,-0.040466,-0.01118,-0.049383,-0.03151,0.04634,-0.028934,0.009354,0.011843


## d. Output

In [10]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

(101, 101)


In [12]:
inwords = input()

similar_words={ search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
              for search_term in {inwords}}
similar_words

{'teachings': ['actual', 'that', 'in', 'PAD', 'a', 'complete']}