In [3]:
import tensorflow as tf
import numpy as np
import os
import regex as re
import random

In [4]:
class Word2Vec:

  def __init__(self,input_file_path,stop_words = None):
    self.input_file_path = input_file_path
    self.word_count = None
    self.count = 0
    #self.vocab_size = None
    self.stop_words = stop_words
    self.word_to_index = {}
    self.index_to_word = {}
    self.vocab = []

    #self._target_words = []
    #self._context_vectors = []
    #self._target_to_context_data = {}

    self.data = self._read_file(self.input_file_path)
    self._Prepare_data_utils(self.data)
    self.vocab = self.vocab[:1000]
    self.word_count = len(self.vocab)

  def process(self,window_size):
    #data = self._read_file(self.input_file_path)
    #self._Prepare_data_utils(data)
    return self._generate_training_data(window_size)
  def _read_file(self,remove_stop_words = False):
    file_contents = []
    if os.path.exists(self.input_file_path):

      with open(self.input_file_path) as f:
          file_contents = f.read()
      data = []
      for sent in file_contents.split('.'):
          sent = re.findall("[A-Za-z]+", sent)
          new_sent = ''
          for words in sent:

              if self.stop_words is not None:
                  if len(words) > 1 and words not in self.stop_words:
                      new_sent = new_sent + ' ' + words
                  continue
              if len(words) > 1 :
                    new_sent = new_sent + ' ' + words
          data.append(new_sent)
      return data
    else:
      raise Exception("File Path Does Not Exist")

  def _Prepare_data_utils(self,data):
    for sent in data:
        for word in sent.split():
            word = word.lower()
            self.vocab.append(word)
            if word not in self.word_to_index:
                self.word_to_index[word] = self.count
                self.index_to_word[self.count] = word
                self.count  += 1
    self.word_count = len(self.vocab)
  def _one_hot_encode(self,target_word,context_words):
    target_vector = np.zeros(len(self.vocab))
    context_vector = np.zeros(len(self.vocab))
    target_index = self.word_to_index.get(target_word)
    for word in context_words:
      context_index = self.word_to_index.get(word)
      context_vector[context_index] = 1
    target_vector[target_index] = 1
    return target_vector,context_vector
  def _generate_training_data(self,window_size,gen_negative_data = True):
    target_vectors, context_vectors, labels = [],[],[]
    if gen_negative_data:
      for index,word in enumerate(self.vocab):
        target = word
        context_words = random.sample(self.vocab,window_size*2)
        target_vector,context_vector = self._one_hot_encode(target,context_words)
        labels.append(0)
        target_vectors.append(target_vector)
        context_vectors.append(context_vector)


    for index,word in enumerate(self.vocab):
      target = word
      context_words = []
      if index == 0:
        context_words = [self.vocab[idx] for idx in range(index+1,index+1+window_size)]
      elif index == self.word_count - 1:
        context_words = [self.vocab[idx] for idx in range(index-1,index-1-window_size,-1)]
      else:
        #right side
        for idx in range(index+1,index+1+window_size):
          if idx < len(self.vocab)-1:
            #print(index)
            context_words.append(self.vocab[idx])
            continue
          break

        #left side
        for idx in range(index-1,index-1-window_size,-1):
          if idx > 0:
            context_words.append(self.vocab[idx])
            continue
          break
      target_vector,context_vector = self._one_hot_encode(target,context_words)
      labels.append(1)
      target_vectors.append(target_vector)
      context_vectors.append(context_vector)

    return np.array(target_vectors), np.array(context_vectors), np.array(labels)


In [5]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
vectorizer = Word2Vec(path_to_file)
Autotune = tf.data.AUTOTUNE
target_vectors,context_vectors,labels = vectorizer.process(2)
data = tf.data.Dataset.from_tensor_slices(((target_vectors,context_vectors),labels))
data = data.cache().shuffle(5000).batch(1000).prefetch(Autotune)

In [6]:
class Word2VecModel(tf.keras.Model):

  def __init__(self,vocab_size,emb_dim):
    super(Word2VecModel,self).__init__()
    self.target_embedding = tf.keras.layers.Embedding(vocab_size,emb_dim,name = "embedding_1")

    self.context_embedding = tf.keras.layers.Embedding(vocab_size,
                                       emb_dim,
                                       name = "embedding_2")
    self.flatten = tf.keras.layers.Flatten()
    self.dense = tf.keras.layers.Dense(1,activation = "sigmoid")

  def call(self,x):
    target,context = x
    word_em1 = self.target_embedding(target)
    word_em2 = self.context_embedding(context)
    dots = tf.math.add(word_em1,word_em2)
    dots = self.flatten(dots)
    dots = self.dense(dots)
    return dots

In [7]:
my_model = Word2VecModel(1000,120)
my_model.compile(optimizer='adam',
                 loss=tf.keras.losses.BinaryCrossentropy(from_logits = False),
                 metrics=['accuracy'])
my_model.fit(data,epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7c12f8754550>

In [8]:
weights = my_model.get_layer('embedding_1').get_weights()[0]

In [9]:
for i in range(3):
  print(f"{vectorizer.vocab[i]} --> {weights[i]}")

first --> [-0.0170556  -0.01243787  0.00800631 -0.0252011   0.00590843 -0.00553115
 -0.01863282 -0.01254865  0.0119739  -0.04019622  0.00842682  0.01008526
 -0.01437738 -0.01867424  0.03809348  0.01732139  0.02572223 -0.0023969
 -0.02887852  0.01013532  0.03384162  0.02036899 -0.00526825 -0.01088508
 -0.00495841 -0.0413136  -0.01216512  0.01178444  0.01038048  0.03569179
  0.02515993  0.02015463 -0.01342836 -0.02670871  0.00222888 -0.01042104
  0.0108773   0.00628067  0.01688491  0.02201271 -0.02625884  0.03790294
 -0.00548539 -0.01699776 -0.00224262 -0.00725609 -0.02122648 -0.00460905
 -0.02242819  0.00616083 -0.01072491  0.01072041 -0.02837901 -0.01030625
 -0.04010846  0.01760278 -0.0307328   0.00995832  0.0252569   0.01456528
  0.0024913  -0.03257979  0.03858005  0.0247371   0.02406243  0.00557162
  0.04629162  0.00309694  0.03102067  0.0171019   0.00737533 -0.01972257
 -0.00252412 -0.03328662  0.01405191  0.00577006 -0.02121025  0.00300525
 -0.01443823 -0.02820752  0.01292439 -0.02