## Assignment 5
    1. Name: Omkar Pawar
    2. Batch: R-9
    3. Roll No.: 43160
    
### Problem Statement :
    Implement the Continuous Bag of Words (CBOW) Model

### Importing Libraries

In [3]:
pip install np_utils

Collecting np_utilsNote: you may need to restart the kernel to use updated packages.

  Downloading np_utils-0.6.0.tar.gz (61 kB)
     ---------------------------------------- 0.0/62.0 kB ? eta -:--:--
     ------------ ------------------------- 20.5/62.0 kB 682.7 kB/s eta 0:00:01
     -------------------------------------- 62.0/62.0 kB 834.6 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: np_utils
  Building wheel for np_utils (setup.py): started
  Building wheel for np_utils (setup.py): finished with status 'done'
  Created wheel for np_utils: filename=np_utils-0.6.0-py3-none-any.whl size=56449 sha256=3487c222825b9a030b9fa9068d9aabee85b261addbc909c72b561f2aacd06641
  Stored in directory: c:\users\pawar\appdata\local\pip\cache\wheels\19\0d\33\eaa4dcda5799bcbb51733c0744970d10edb4b9add4f41beb43
Successfully built np_utils
Installing collected packages: np_utils
Successfully ins

In [3]:
from keras.preprocessing import text
from keras.src.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

#### Random sentence as data

In [4]:
data = """Deep learning (also known as deep structured learning) is part of a 
broader family of machine learning methods based on artificial neural networks 
with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks, 
deep reinforcement learning, recurrent neural networks, convolutional neural networks and 
Transformers have been applied to fields including computer vision, speech recognition, 
natural language processing, machine translation, bioinformatics, drug design, 
medical image analysis, climate science, material inspection and board game programs, 
where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

### a. Data preparation
#### Tokenization

In [5]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


#### b. Generate training data
#### Generating (context word, target/label word) pairs

In [6]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <= i < sentence_length
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])

        if i == 10:
            break
        i += 1

### c. Train Model
#### Model Building

In [7]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            7500      
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 75)                7575      
                                                                 
Total params: 15075 (58.89 KB)
Trainable params: 15075 (58.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [8]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 433.3096146583557

Epoch: 2 	Loss: 428.84291982650757

Epoch: 3 	Loss: 425.5224049091339

Epoch: 4 	Loss: 422.4686472415924

Epoch: 5 	Loss: 420.1378016471863



In [9]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,-0.013229,0.016762,-0.020026,-0.020566,0.031135,0.006816,0.035224,-0.057148,0.034406,0.007027,...,-0.05961,0.060229,0.050503,-0.043293,0.030989,0.049634,0.041702,0.015633,-0.023802,-0.050265
networks,-0.051126,-0.012177,0.013474,-0.030006,-0.000113,-0.039106,0.001721,0.033965,-0.01462,0.041402,...,0.029056,0.024922,-0.004864,-0.058565,0.002117,-0.003913,-0.062047,0.038528,-0.026459,-0.002042
neural,0.04815,0.013395,-0.032853,0.020569,-0.011879,-0.026698,0.034642,-0.041267,-0.000735,-0.001523,...,0.041538,-0.043489,-0.00586,0.0238,0.02304,0.032298,-0.048479,-0.042032,0.030212,0.023323
and,-0.038757,-0.049871,-0.044562,0.026347,-0.02224,-0.048286,0.031216,-0.03912,-0.025615,0.007222,...,0.004701,-0.014461,0.044197,0.044381,0.012224,0.043667,0.028213,0.005074,0.033509,0.040212
as,0.016282,0.036789,0.03074,0.049739,0.023463,0.000273,0.040475,-0.025917,-0.026471,0.005818,...,-0.004882,-0.00588,0.02798,-0.029601,0.019891,0.037088,-0.001953,0.028963,0.035577,0.04018


### d. Output

In [11]:
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                   for search_term in ['deep']}

similar_words

(74, 74)


{'deep': ['transformers', 'they', 'game', 'of', 'such']}