Implement the Continuous Bag of Words (CBOW) model for the given text (textual document 1) using the steps below:
a. Data preparation
b. Generate training data
c. Train model
d. Output

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding,Lambda,Dense
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

**Data Preparation**

In [None]:
# Training corpus: a short passage about deep learning, kept inline as one string.
# It is split into sentences on '.' in the next cell.
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance"""

In [None]:
# Split the corpus into lower-cased sentences.
# Each piece is stripped and empty pieces are dropped, so a trailing period or
# repeated periods in `data` cannot produce blank "sentences". Building the list
# in a single assignment also keeps the cell idempotent on re-run.
sentences = [piece.strip().lower() for piece in data.split('.') if piece.strip()]

sentences

['deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning',
 ' learning can be supervised, semi-supervised or unsupervised',
 ' deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance']

In [None]:
# Build the vocabulary from the sentences, then encode every sentence as a list
# of integer word ids (one inner list per sentence).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)  # must run before texts_to_sequences so the word index exists
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[2,
  1,
  12,
  13,
  6,
  2,
  14,
  1,
  15,
  16,
  7,
  17,
  18,
  19,
  7,
  8,
  1,
  20,
  21,
  22,
  23,
  4,
  3,
  24,
  25,
  1],
 [1, 26, 27, 9, 28, 9, 29, 30],
 [2,
  1,
  31,
  32,
  6,
  2,
  4,
  3,
  2,
  33,
  3,
  2,
  34,
  1,
  35,
  4,
  3,
  36,
  4,
  3,
  5,
  37,
  10,
  38,
  39,
  11,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  8,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  5,
  60,
  61,
  62,
  63,
  64,
  10,
  65,
  66,
  67,
  11,
  5,
  68,
  69,
  70,
  71,
  72,
  73,
  74]]

In [None]:
# Lookup tables taken from the fitted tokenizer: word -> id and id -> word.
word2idx, idx2word = tokenizer.word_index, tokenizer.index_word

# Show both mappings, separated by blank lines.
print(word2idx, "\n", idx2word, sep="\n")

{'learning': 1, 'deep': 2, 'networks': 3, 'neural': 4, 'and': 5, 'as': 6, 'of': 7, 'machine': 8, 'supervised': 9, 'have': 10, 'to': 11, 'also': 12, 'known': 13, 'structured': 14, 'is': 15, 'part': 16, 'a': 17, 'broader': 18, 'family': 19, 'methods': 20, 'based': 21, 'on': 22, 'artificial': 23, 'with': 24, 'representation': 25, 'can': 26, 'be': 27, 'semi': 28, 'or': 29, 'unsupervised': 30, 'architectures': 31, 'such': 32, 'belief': 33, 'reinforcement': 34, 'recurrent': 35, 'convolutional': 36, 'transformers': 37, 'been': 38, 'applied': 39, 'fields': 40, 'including': 41, 'computer': 42, 'vision': 43, 'speech': 44, 'recognition': 45, 'natural': 46, 'language': 47, 'processing': 48, 'translation': 49, 'bioinformatics': 50, 'drug': 51, 'design': 52, 'medical': 53, 'image': 54, 'analysis': 55, 'climate': 56, 'science': 57, 'material': 58, 'inspection': 59, 'board': 60, 'game': 61, 'programs': 62, 'where': 63, 'they': 64, 'produced': 65, 'results': 66, 'comparable': 67, 'in': 68, 'some': 69, 

**Generating training data**

In [None]:
# Word ids in word2idx start at 1, so one extra slot is reserved for the unused index 0.
vocab_size = len(word2idx) + 1
embed_size = 100
# Number of words taken on EACH side of the target word.
context_size = 2

contexts = []
targets = []

for sequence in sequences:
  # Only positions with a full window on both sides produce a training pair.
  for i in range(context_size, len(sequence) - context_size):
    targets.append(sequence[i])
    # The window is derived from context_size (not hard-coded offsets), so the
    # context width always stays 2*context_size when the setting is changed.
    contexts.append(sequence[i - context_size:i] + sequence[i + 1:i + context_size + 1])

print(contexts,"\n")
print(targets)

[[2, 1, 13, 6], [1, 12, 6, 2], [12, 13, 2, 14], [13, 6, 14, 1], [6, 2, 1, 15], [2, 14, 15, 16], [14, 1, 16, 7], [1, 15, 7, 17], [15, 16, 17, 18], [16, 7, 18, 19], [7, 17, 19, 7], [17, 18, 7, 8], [18, 19, 8, 1], [19, 7, 1, 20], [7, 8, 20, 21], [8, 1, 21, 22], [1, 20, 22, 23], [20, 21, 23, 4], [21, 22, 4, 3], [22, 23, 3, 24], [23, 4, 24, 25], [4, 3, 25, 1], [1, 26, 9, 28], [26, 27, 28, 9], [27, 9, 9, 29], [9, 28, 29, 30], [2, 1, 32, 6], [1, 31, 6, 2], [31, 32, 2, 4], [32, 6, 4, 3], [6, 2, 3, 2], [2, 4, 2, 33], [4, 3, 33, 3], [3, 2, 3, 2], [2, 33, 2, 34], [33, 3, 34, 1], [3, 2, 1, 35], [2, 34, 35, 4], [34, 1, 4, 3], [1, 35, 3, 36], [35, 4, 36, 4], [4, 3, 4, 3], [3, 36, 3, 5], [36, 4, 5, 37], [4, 3, 37, 10], [3, 5, 10, 38], [5, 37, 38, 39], [37, 10, 39, 11], [10, 38, 11, 40], [38, 39, 40, 41], [39, 11, 41, 42], [11, 40, 42, 43], [40, 41, 43, 44], [41, 42, 44, 45], [42, 43, 45, 46], [43, 44, 46, 47], [44, 45, 47, 48], [45, 46, 48, 8], [46, 47, 8, 49], [47, 48, 49, 50], [48, 8, 50, 51], [8, 

In [None]:
# Sanity check: decode the first five (context -> target) pairs back into words.
for i in range(5):
  target_word = idx2word[targets[i]]
  context_words = [idx2word[word_id] for word_id in contexts[i]]
  print(context_words,"->",target_word)

['deep', 'learning', 'known', 'as'] -> also
['learning', 'also', 'as', 'deep'] -> known
['also', 'known', 'deep', 'structured'] -> as
['known', 'as', 'structured', 'learning'] -> deep
['as', 'deep', 'learning', 'is'] -> structured


In [None]:
# Convert the Python lists into arrays: x holds the context ids, y the target ids.
x, y = np.array(contexts), np.array(targets)

# Hold out 20% of the pairs for validation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


**Training Model**

In [None]:
# CBOW network: embed every context word, average the embeddings into a single
# vector, then classify that vector over the whole vocabulary.
model = Sequential([
    # One trainable embed_size-dimensional vector per vocabulary index.
    # `input_length` is intentionally omitted: it is deprecated in recent Keras
    # releases, and the averaging layer below works for any number of context words.
    Embedding(input_dim=vocab_size, output_dim=embed_size),
    # Average over the context-word axis -> one vector per example.
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    # Probability distribution over vocabulary indices.
    Dense(vocab_size, activation='softmax'),
])



In [None]:
# Targets are integer word ids rather than one-hot vectors, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs, evaluating on the held-out pairs after each epoch;
# the returned History object keeps the per-epoch metrics.
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    validation_data=(x_test, y_test),
)

Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.1008 - loss: 4.0463 - val_accuracy: 0.0000e+00 - val_loss: 4.5506
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1346 - loss: 3.9695 - val_accuracy: 0.0000e+00 - val_loss: 4.6035
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.1151 - loss: 3.9187 - val_accuracy: 0.0000e+00 - val_loss: 4.6690
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1346 - loss: 3.8502 - val_accuracy: 0.0000e+00 - val_loss: 4.7500
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1268 - loss: 3.7874 - val_accuracy: 0.0000e+00 - val_loss: 4.8483
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1730 - loss: 3.6974 - val_accuracy: 0.0000e+00 - val_loss: 4.9664
Epoch 7/20
[1m3/3[0m

**Prediction**

In [None]:
def predict_target_word(context_words):
  """Predict the most likely target word for the given context words.

  Args:
    context_words: Non-empty iterable of words from the training vocabulary.
      Matching is case-insensitive because the corpus was lower-cased
      before the vocabulary was built.

  Returns:
    The vocabulary word with the highest predicted probability.

  Raises:
    ValueError: If `context_words` is empty.
    KeyError: If any word is not in the vocabulary.
  """
  context_words = [word.lower() for word in context_words]
  if not context_words:
    # Averaging zero embeddings would yield NaNs and a meaningless prediction.
    raise ValueError("context_words must contain at least one word")

  unknown_words = [word for word in context_words if word not in word2idx]
  if unknown_words:
    raise KeyError(f"words not in vocabulary: {unknown_words}")

  context_sequence = np.array([word2idx[word] for word in context_words])

  prediction_prob = model.predict(context_sequence.reshape(1,-1))

  # Index 0 is not assigned to any word (word ids start at 1), so it is skipped
  # when picking the best class; otherwise idx2word[0] would raise KeyError.
  predicted_word_index = int(np.argmax(prediction_prob[0][1:])) + 1
  predicted_word = idx2word[predicted_word_index]

  return predicted_word

In [None]:
# Query with a full window of 2*context_size words, matching the shape of the
# training examples (in the corpus these four words surround the word 'also').
target_word = predict_target_word(['deep', 'learning', 'known', 'as'])
target_word

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'networks'

In [None]:
# The trained weight matrix of the first layer (the Embedding layer) holds one
# row per vocabulary index.
embedding_layer = model.layers[0]
word_embeddings = embedding_layer.get_weights()[0]

# Row 0 belongs to the unused index 0, so it is skipped. Each remaining row is
# labelled by looking its id up explicitly rather than relying on dict ordering.
df = pd.DataFrame(
    word_embeddings[1:],
    index=[idx2word[i] for i in range(1, len(word_embeddings))],
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
learning,-0.080244,0.023685,-0.049131,0.051408,-0.048281,-0.11154,0.01715,0.13907,0.100558,-0.093907,...,-0.039399,0.033559,-0.130371,0.0535,-0.025011,0.040557,-0.073035,-0.03828,-0.085537,-0.077802
deep,-0.128014,0.124043,-0.028864,-0.098549,-0.154651,-0.127853,-0.072508,0.042119,0.090774,-0.042864,...,0.09159,0.076861,0.082205,0.089769,-0.131354,0.053096,0.001475,-0.136682,-0.113537,0.050998
networks,-0.198471,0.137536,-0.119992,0.14485,-0.089286,-0.171421,-0.107217,0.054368,0.109344,-0.167025,...,-0.024955,0.090296,0.017261,0.061083,-0.078614,0.010457,0.080655,-0.174555,-0.113716,0.133805
neural,-0.10119,0.041711,-0.013254,-0.121814,-0.112283,-0.102772,-0.154237,0.028148,0.060274,0.097394,...,-0.023819,0.03098,0.05353,0.113209,-0.151861,0.028212,0.026922,-0.077721,-0.126031,0.12929
and,-0.017712,0.023926,-0.007297,0.011597,0.006323,-0.023147,-0.052281,-0.158657,0.032322,0.113524,...,0.018009,-0.021413,0.052097,-0.005501,0.013487,0.027314,0.081147,-0.025999,0.102464,0.001669


In [None]:
def get_most_similar_words(word,word_embeddings,top_n=5):
  """Print the `top_n` vocabulary words most similar to `word`.

  Similarity is the cosine similarity between embedding rows.

  Args:
    word: Query word; must be a key of `word2idx`.
    word_embeddings: 2-D array whose row i is the embedding of vocabulary index i.
    top_n: Maximum number of neighbours to print.

  Raises:
    KeyError: If `word` is not in the vocabulary.
  """
  word_idx = word2idx[word]
  word_vec = word_embeddings[word_idx].reshape(1,-1)
  similarity = cosine_similarity(word_vec, word_embeddings)

  # Rank every embedding row from most to least similar to the query.
  sorted_indices = similarity.argsort()[0][::-1]

  # Keep real vocabulary ids only: drop the query word by identity (not by
  # position, which breaks on ties) and any row without a word, such as the
  # unused index 0, whose lookup in idx2word would raise KeyError.
  similar_indices = [
      i for i in sorted_indices if i != word_idx and i in idx2word
  ][:top_n]

  # Print the most similar words
  for i in similar_indices:
    print(f"Word: {idx2word[i]}, Similarity: {similarity[0][i]:.4f}")

In [None]:
# List the five nearest neighbours of 'deep' in the learned embedding space.
get_most_similar_words(word='deep', word_embeddings=word_embeddings, top_n=5)

Word: belief, Similarity: 0.8125
Word: neural, Similarity: 0.7959
Word: convolutional, Similarity: 0.7600
Word: with, Similarity: 0.7142
Word: artificial, Similarity: 0.7051
