In [None]:
import tensorflow as tf
import numpy as np
import random
import os
import regex as re

class Word2Vec:
  """Turn a plain-text corpus into one-hot / multi-hot training arrays.

  The file is split into sentences on '.', reduced to alphabetic words longer
  than one character, lower-cased and flattened into ``self.vocab`` (a token
  list in corpus order, duplicates kept). Only the first ``max_tokens`` tokens
  are retained; ``len(self.vocab)`` is also the width of every vector this
  class produces.

  Args:
    input_file_path: Path of the text file to read.
    stop_words: Optional collection of words to drop. A word is dropped when
      either its original spelling or its lower-cased form is in the collection.
    max_tokens: Number of leading corpus tokens to keep (default 500). Pass
      ``None` ` to keep every token.

  Raises:
    FileNotFoundError: If ``input_file_path`` does not exist.
  """

  def __init__(self,input_file_path,stop_words = None,max_tokens = 500):
    self.input_file_path = input_file_path
    self.stop_words = stop_words
    self.max_tokens = max_tokens
    self.word_count = None      # number of tokens kept in self.vocab
    self.count = 0              # next free index for a previously unseen word
    self.word_to_index = {}     # word -> index assigned at first appearance
    self.index_to_word = {}     # inverse of word_to_index
    self.vocab = []             # corpus tokens in order (duplicates kept)

    self.data = self._read_file()
    self._Prepare_data_utils(self.data)
    if max_tokens is not None:
      self.vocab = self.vocab[:max_tokens]
    self.word_count = len(self.vocab)

  def process(self,window_size):
    """Return ``(target_vectors, context_vectors, labels)`` for ``window_size``.

    Thin wrapper around ``_generate_training_data`` with negative sampling on.
    """
    return self._generate_training_data(window_size)

  def _is_stop_word(self,word):
    """True when ``word`` (as written or lower-cased) is a configured stop word."""
    if self.stop_words is None:
      return False
    return word in self.stop_words or word.lower() in self.stop_words

  def _read_file(self,remove_stop_words = False):
    """Read ``self.input_file_path`` and return one cleaned string per sentence.

    Each returned string is the sentence's kept words, every word preceded by a
    single space. ``remove_stop_words`` is unused and retained only for
    backward compatibility; filtering is driven by ``self.stop_words``.

    Raises:
      FileNotFoundError: If the path does not exist.
    """
    if not os.path.exists(self.input_file_path):
      raise FileNotFoundError("File Path Does Not Exist")

    # Only ASCII letters survive the regex below, so undecodable bytes can be
    # replaced safely instead of aborting the read.
    with open(self.input_file_path,encoding = "utf-8",errors = "replace") as f:
      file_contents = f.read()

    data = []
    for sent in file_contents.split('.'):
      kept = [word for word in re.findall("[A-Za-z]+", sent)
              if len(word) > 1 and not self._is_stop_word(word)]
      data.append(''.join(' ' + word for word in kept))
    return data

  def _Prepare_data_utils(self,data):
    """Fill ``vocab``, ``word_to_index`` and ``index_to_word`` from ``data``.

    Words are lower-cased; a word receives the next free index the first time
    it is seen.
    """
    for sent in data:
      for word in sent.split():
        word = word.lower()
        self.vocab.append(word)
        if word not in self.word_to_index:
          self.word_to_index[word] = self.count
          self.index_to_word[self.count] = word
          self.count += 1
    self.word_count = len(self.vocab)

  def _one_hot_encode(self,target_word,context_words):
    """Encode a target word (one-hot) and its context words (multi-hot).

    Returns:
      ``(target_vector, context_vector)``, both float arrays of length
      ``len(self.vocab)``.

    Raises:
      KeyError: If ``target_word`` has no index.
    """
    size = len(self.vocab)
    target_vector = np.zeros(size)
    context_vector = np.zeros(size)

    target_index = self.word_to_index.get(target_word)
    if target_index is None:
      # Indexing a NumPy array with None would silently set *every* entry.
      raise KeyError(target_word)
    target_vector[target_index] = 1

    for word in context_words:
      context_index = self.word_to_index.get(word)
      # Skip words without an index, or whose index lies outside the vector.
      if context_index is not None and context_index < size:
        context_vector[context_index] = 1
    return target_vector,context_vector

  def _generate_training_data(self,window_size,gen_negative_data = True):
    """Build the training arrays.

    For every token a positive example (label 1) pairs the token with the
    tokens up to ``window_size`` positions on either side, clipped to the
    corpus bounds. When ``gen_negative_data`` is true, one negative example
    (label 0) per token is emitted first, pairing the token with
    ``window_size * 2`` randomly drawn tokens (fewer if the corpus is shorter).

    Returns:
      ``(target_vectors, context_vectors, labels)`` as NumPy arrays; negative
      examples (if any) come before the positive ones.
    """
    target_vectors, context_vectors, labels = [],[],[]
    n_tokens = len(self.vocab)

    def _emit(target,context_words,label):
      target_vector,context_vector = self._one_hot_encode(target,context_words)
      labels.append([label])
      target_vectors.append(target_vector)
      context_vectors.append(context_vector)

    if gen_negative_data:
      # random.sample raises if asked for more items than are available.
      n_random = min(window_size*2,n_tokens)
      for word in self.vocab:
        _emit(word,random.sample(self.vocab,n_random),0)

    for index,word in enumerate(self.vocab):
      # Slicing with clamped bounds includes the first and last token and
      # never reads out of range or wraps to negative indices.
      left = self.vocab[max(0,index-window_size):index]
      right = self.vocab[index+1:min(n_tokens,index+1+window_size)]
      _emit(word,left + right,1)

    return np.array(target_vectors), np.array(context_vectors), np.array(labels)



In [None]:
# Download the corpus and build the vectorizer on top of it.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)
vectorizer = Word2Vec(path_to_file)
Autotune = tf.data.AUTOTUNE

# The argument to process() is the context window size.
target_vectors,context_vectors,labels = vectorizer.process(2)

# Each dataset element is (context_vector, (target_vector, label)): the context
# vector is the model input, the other two are the training targets.
data = tf.data.Dataset.from_tensor_slices((context_vectors,(target_vectors,labels)))
data = data.cache()
data = data.shuffle(5000)
data = data.batch(500)
data = data.prefetch(Autotune)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
def Word2VecCBOW_Model(Vocab_size,Hidden_dim):
  """Assemble the two-headed Keras functional model used by this notebook.

  A shared trunk (Embedding -> Dense(128, relu)) feeds two branches, each
  Dense(64, relu) -> Dropout(0.1) -> linear Dense:
    * "Target_Out" with ``Vocab_size`` units,
    * "Label_Out" with a single unit.
  Neither output layer has an activation. The shapes of three intermediate
  tensors are printed while the graph is built.

  NOTE(review): the input is declared as ``(Vocab_size,)`` and the notebook
  feeds it 0/1 context vectors, while ``Embedding`` looks its inputs up as
  integer indices. The printed shapes carry an extra ``Vocab_size`` axis
  through to both outputs. Confirm this is the intended architecture.

  Args:
    Vocab_size: Width of the input vector and of the "Target_Out" layer.
    Hidden_dim: Output dimension of the embedding layer.

  Returns:
    A ``tf.keras.models.Model`` with outputs ``[Target_Out, Label_Out]``.
  """
  layers = tf.keras.layers

  # Shared trunk
  net_input = layers.Input((Vocab_size,),name = "input_layer")
  embedded = layers.Embedding(Vocab_size,Hidden_dim,name = "Embedding_Layer_1")(net_input)
  print(embedded.shape)
  shared = layers.Dense(128,activation = "relu",name = "Common_hidden")(embedded)
  print(shared.shape)

  # Branch predicting the target word
  target_branch = layers.Dense(64,activation = "relu",name = "Target_hidden")(shared)
  target_branch = layers.Dropout(0.1,name = "Regularization_1")(target_branch)
  target_logits = layers.Dense(Vocab_size,name = "Target_Out")(target_branch)

  # Branch predicting the positive / negative label
  label_branch = layers.Dense(64,activation = "relu",name = "Label_hidden")(shared)
  label_branch = layers.Dropout(0.1,name = "Regularization_2")(label_branch)
  label_logits = layers.Dense(1,name = "Label_Out")(label_branch)
  print(label_logits.shape)

  return tf.keras.models.Model(inputs = net_input,outputs = [target_logits,label_logits])

In [None]:
class Word2VecModel(tf.keras.models.Model):
  """Keras model wrapper that trains ``my_model`` with two separate losses.

  ``my_model`` must return a pair ``(Target, Label)``. Each batch is expected
  as ``(x, (y_target, y_label))``. Both losses are averaged to scalars and
  summed into ``total_loss``.
  """

  def __init__(self,my_model,**kwargs):
    super().__init__(**kwargs)
    self.model = my_model

  def compile(self,optimizer,Target_loss,Label_loss,**kwargs):
    """Store the optimizer and the two loss callables.

    Args:
      optimizer: Optimizer used to update ``self.model``'s trainable variables.
      Target_loss: Callable ``(y_true, y_pred)`` for the target output.
      Label_loss: Callable ``(y_true, y_pred)`` for the label output.
      **kwargs: Forwarded to ``tf.keras.Model.compile``.
    """
    # Hand the optimizer to Keras directly instead of letting it create a
    # default one that is immediately overwritten.
    super().compile(optimizer = optimizer,**kwargs)
    self.Target_loss = Target_loss
    self.Label_loss = Label_loss

  def _compute_losses(self,x,y,training):
    """Run the wrapped model on ``x`` and return the dict of scalar losses."""
    Target,Label = self.model(x,training = training)
    # NOTE(review): both outputs are indexed with [0] before the loss, so only
    # the first slice along their leading axis is compared with the whole
    # batch of labels. Kept as in the original because changing it also
    # requires changing the wrapped model's output shapes. Confirm the
    # intended shapes.
    batch_targetloss = tf.reduce_mean(
        self.Target_loss(tf.cast(y[0],tf.float32),Target[0]))
    batch_labelloss = tf.reduce_mean(
        self.Label_loss(tf.cast(y[1],tf.float32),Label[0]))
    # Reduce before summing: the raw per-element losses have different ranks
    # and would otherwise broadcast against each other when added.
    total_loss = batch_targetloss + batch_labelloss
    return {"total_loss" : total_loss,"Target_loss":batch_targetloss,"Label_loss":batch_labelloss}

  def train_step(self,batch,**kwargs):
    """One optimisation step; returns the loss dict shown by ``fit``."""
    x,y = batch
    with tf.GradientTape() as tape:
      losses = self._compute_losses(x,y,training = True)
    # Gradients are computed outside the tape context so the backward pass
    # itself is not recorded.
    gradients = tape.gradient(losses["total_loss"],self.model.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients,self.model.trainable_variables))
    return losses

  def test_step(self,batch,**kwargs):
    """One evaluation step: same losses as training, with Dropout disabled."""
    x,y = batch
    return self._compute_losses(x,y,training = False)

  def call(self,inp,**kwargs):
    """Forward ``inp`` (and any keyword arguments) to the wrapped model."""
    return self.model(inp,**kwargs)

In [None]:
# Build the functional network first, then wrap it in the custom training
# class. `model` refers to the wrapper from here on.
cbow_network = Word2VecCBOW_Model(Vocab_size = 500,Hidden_dim = 120)
model = Word2VecModel(cbow_network)

(None, 500, 120)
(None, 500, 128)
(None, 500, 1)


In [None]:
def target_loss(y_true,y_pred):
  """Softmax cross-entropy between ``y_true`` and the unnormalised ``y_pred``.

  Returns whatever ``tf.nn.softmax_cross_entropy_with_logits`` produces; no
  reduction is applied here.
  """
  return tf.nn.softmax_cross_entropy_with_logits(labels = y_true,logits = y_pred)
def label_loss(y_true,y_pred):
  """Sigmoid cross-entropy between ``y_true`` and the unnormalised ``y_pred``.

  Returns whatever ``tf.nn.sigmoid_cross_entropy_with_logits`` produces; no
  reduction is applied here.
  """
  return tf.nn.sigmoid_cross_entropy_with_logits(labels = y_true,logits = y_pred)

In [None]:
# One loss callable per model output. Any remaining keyword arguments are
# forwarded to the base Keras compile().
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
    Target_loss = target_loss,
    Label_loss = label_loss,
    metrics = ["accuracy"],
)

In [None]:
# Train on the batched dataset for 100 epochs. Each epoch logs the values
# returned by the model's train_step.
model.fit(data,epochs = 100)

Epoch 1/100
(None, 500)
(None, 500) (None, 1)
(None, 500)
(None, 500) (None, 1)
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch

<keras.src.callbacks.History at 0x7a503b4d3fa0>

In [None]:
# Look up the embedding layer by name on the wrapped network, then read its
# weights. get_weights() returns a list of arrays.
embedding_layer = model.model.get_layer("Embedding_Layer_1")
embeddings = embedding_layer.get_weights()

In [None]:
# Shape of the first (here: only inspected) weight array of the embedding layer.
embeddings[0].shape

(500, 120)

In [None]:
# Show the first embedding row together with its word.
# The row is labelled via index_to_word, the same index space the vectorizer
# uses for vector positions. vectorizer.vocab[i] is the i-th corpus token,
# which is a different mapping once a word repeats.
# NOTE(review): whether row i of this layer really represents word i depends
# on how the model consumes its input. Confirm before interpreting the rows.
first_index = 0
print(f"{vectorizer.index_to_word[first_index]}-->{embeddings[0][first_index]}")

first-->[ 0.03838825 -0.05927904  0.07840646  0.04125115 -0.08230995  0.07507766
  0.09762581  0.045667   -0.09867255  0.0624503   0.07419965 -0.09086895
  0.05858266 -0.10045303  0.09924387  0.03329634  0.08240353 -0.07684124
 -0.05686071 -0.04822601  0.07820345  0.05972499  0.01935342  0.02556623
  0.09234457 -0.01956867  0.04068892 -0.06160046 -0.07195235  0.03452663
 -0.09757455  0.09166995  0.04848677 -0.08157872 -0.04222412  0.08523004
 -0.04002318  0.08769921  0.07150321 -0.05587499 -0.09701591 -0.09103511
  0.05098839  0.04793097  0.06309173  0.00383381  0.05290581 -0.08663704
 -0.0557266  -0.04753168  0.05469995  0.03938609 -0.07055417  0.03451018
  0.08698063  0.06349916  0.05222734 -0.06417726  0.08591545 -0.03372167
 -0.04212867 -0.07595556 -0.06894468 -0.04489912 -0.06312299  0.05461016
  0.07891735 -0.06174817 -0.04056623 -0.03440571 -0.0170647  -0.04761111
  0.03354085  0.09936614  0.08427736  0.07575633  0.04813368  0.04192816
  0.05055092 -0.04907617 -0.07640123 -0.062