## Problem Statement 6

### Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 1) using the below steps: 
    a. Data preparation 
    b. Generate training data 
    c. Train model
    d. Output

### Import necessary packages

In [1]:
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.src.utils import np_utils

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [3]:
data = """Deep learning (also known as deep structured learning) is part of a 
broader family of machine learning methods based on artificial neural networks 
with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks, 
deep reinforcement learning, recurrent neural networks, convolutional neural networks and 
Transformers have been applied to fields including computer vision, speech recognition, 
natural language processing, machine translation, bioinformatics, drug design, 
medical image analysis, climate science, material inspection and board game programs, 
where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

### a. Data preparation

In [55]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id=tokenizer.word_index
words2id['PAD']=0

id2words={v:k for k,v in words2id.items()}

wids=[[words2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(words2id)
embed_size=100
window_size=2

print("Vocabulary size: ", vocab_size)
print("Vocabulary Sample: ", list(words2id.items())[:10])

Vocabulary size:  75
Vocabulary Sample:  [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


### b. Generate training data

In [61]:
def generate_context_word_pair(corpus, window_size, vocab_size):
    context_length=window_size*2

    for words in corpus:
        sentence_length = len(words)

        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <=i <sentence_length
                                 and i!= index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen = context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x,y)



### c. Train Model

In [62]:
cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x:K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))


cbow.compile(loss="categorical_crossentropy", optimizer="rmsprop")
print(cbow.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 100)            7500      
                                                                 
 lambda_2 (Lambda)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 75)                7575      
                                                                 
Total params: 15075 (58.89 KB)
Trainable params: 15075 (58.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [58]:
for epoch in range(1,6):
    loss=0
    i=0

    for x,y in generate_context_word_pair(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:',epoch, '\tLoss:', loss)
    print()



Epoch: 1 	Loss: 433.2113757133484

Epoch: 2 	Loss: 428.88651967048645

Epoch: 3 	Loss: 425.65926337242126

Epoch: 4 	Loss: 422.6268413066864

Epoch: 5 	Loss: 420.26561641693115



In [59]:
weights=cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2words.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,-0.03525,-0.017182,-0.014591,-0.059133,-0.030444,-0.018133,0.014737,0.007353,0.020941,0.032662,...,0.030231,-0.032152,0.05953,0.027135,0.015601,-0.017077,0.065609,0.059204,-0.058002,0.020034
networks,-0.039052,-0.028336,0.053455,0.032577,0.0308,0.02704,0.004421,-0.050127,0.035925,-0.041192,...,-0.015013,-0.013109,0.027947,-0.050477,0.009925,-0.004569,-0.000745,-0.002495,-0.006544,0.022819
neural,-0.015448,0.004266,0.021796,-0.012649,0.030122,0.028571,-0.046028,-0.035076,0.00829,-0.031895,...,0.009841,-0.027353,0.006563,-0.013955,-0.049002,-0.041623,0.00623,-0.019244,-0.027475,-0.025156
and,-0.044523,-0.034967,-0.03463,0.003425,0.038434,-0.026311,-0.013805,-0.014041,0.041481,0.042669,...,-0.011139,0.000411,-0.002457,-0.003976,0.047401,0.041385,-0.047281,-0.039385,0.003109,0.014924
as,-0.03492,-0.001912,0.022261,0.002947,0.027355,0.033859,0.039777,0.031998,-0.047989,-0.015527,...,0.039581,0.030206,0.016534,-0.005382,0.045003,-0.018945,0.035311,0.039242,-0.048324,0.003846


### d. Output

In [75]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

inwords = input()

similar_words = {search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
                   for search_term in {inwords}}

similar_words

(74, 74)


 deep


{'deep': ['learning',
  'recurrent',
  'networks',
  'natural',
  'based',
  'transformers']}