This notebook details the process of generating Word2Vec or FastText word embeddings. Here we build a Word2Vec model that learns one embedding per word, although the same approach can be applied to subword models too.

To do this for subwords, break each word down into character trigrams (or smaller n-grams), use a convolutional encoder to encode each word from its pieces, and then apply the same training process (as in FastText).

What this model will not do:
- It will not generate contextual word encodings, such as those from ELMo, ULMFiT, or BERT.
- To obtain those, use a language model (LM) that processes all of the text before each word (or the text after it too, as in ULMFiT) and pretrain the model to predict text.
- For transfer learning, concatenate the encodings from the LM to the regular embeddings (contextual + Word2Vec).


# Import Dependencies and Load in the Dataset (IMDB; the SST download is left commented out)

In [1]:
%%capture
!pip install kaggle
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import nltk
import tqdm.notebook as tqdm
import random
import math
import copy
import collections
from sklearn.decomposition import PCA
nltk.download('punkt')
nltk.download("stopwords")
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%%capture
!mkdir /root/.kaggle/
!cp -f ./kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
%%capture
#!kaggle datasets download -d atulanandjha/stanford-sentiment-treebank-v2-sst2
#!unzip ./stanford-sentiment-treebank-v2-sst2.zip

In [3]:
%%capture
# Load in the Dataset, IMDB for more data
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip imdb-dataset-of-50k-movie-reviews.zip
!rm -f imdb-dataset-of-50k-movie-reviews.zip

In [3]:
def read(file_path):
  '''
  Reads a text file and returns each line with its first space-delimited field removed.

  file_path: path of the text file to read
  returns: list with one string per line, holding the text that follows the first
           space, without the line terminator ('' when the line has no space)
  '''
  vals = []
  with open(file_path, 'r') as file:
    for line in tqdm.tqdm(file):
      # Strip only the newline: the last line of a file may not end with one, and
      # cutting the final character unconditionally would truncate real text.
      line = line.rstrip('\n')
      # partition splits on the first space only; the remainder is '' if there is none
      _, _, remainder = line.partition(' ')
      vals.append(remainder)
  return vals

In [4]:
#dataset = read(file_path)[1:-1] #SST Dataset

In [5]:
# Load the IMDB reviews table; expects "IMDB Dataset.csv" in the working directory
reviews_pd = pd.read_csv("./IMDB Dataset.csv")

In [6]:
# Plain Python list holding the text of every row's 'review' column
reviews = list(reviews_pd['review'])

In [7]:
def process_corpus(corpus):
  '''
  Tokenizes every text of the corpus with NLTK, keeps only the purely
  alphanumeric tokens and lowercases them. No stemming is applied.

  corpus: iterable of raw text strings
  returns: list of token lists, one per input text
  '''
  def clean(text):
    # Punctuation and mixed tokens (e.g. "n't", "<br") fail isalnum and are dropped
    return [token.lower() for token in nltk.word_tokenize(text) if token.isalnum()]
  return [clean(text) for text in tqdm.tqdm(corpus)]

In [8]:
# Tokenize, filter and lowercase every review (one token list per review)
processed_dataset = process_corpus(reviews)

HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




# Word2Vec Model

In [9]:
class Word2Vec(keras.Model):
  '''
  Skip-gram Word2Vec trained with negative sampling.

  Holds two embedding tables over the corpus vocabulary (one for center words,
  one for context words) and the unigram probability of every word, which is
  the distribution negative samples are drawn from.
  '''
  def __init__(self, emb_dim, corpus):
    '''
    emb_dim: size of every embedding vector
    corpus: list of tokenized sentences (each a list of word strings)
    '''
    super().__init__()
    self.unique_words, self.probabilities = self._compute_unique_words(corpus)
    self.idx_2_word = dict(enumerate(self.unique_words))
    self.word_2_idx = {word: idx for idx, word in enumerate(self.unique_words)}
    vocab_size = len(self.unique_words)
    self.Center_Embeddings = keras.layers.Embedding(vocab_size, output_dim = emb_dim)
    self.Context_Embeddings = keras.layers.Embedding(vocab_size, output_dim = emb_dim)
    # float64 on purpose: float16 has ~3 significant digits and underflows below
    # ~6e-8, which distorts (or zeroes) the probability of rare words.
    self.probability_tensor = np.array([self.probabilities[word] for word in self.unique_words], dtype = np.float64)
  def _compute_unique_words(self, corpus):
    '''
    Given a Corpus, returns (unique_words, probabilities):
    unique_words: list of the distinct words, in order of first appearance
    probabilities: dict mapping each word to its relative frequency in the corpus
    '''
    counter = collections.Counter(word for article in corpus for word in article)
    total_words = sum(counter.values())
    probabilities = {word: count / total_words for word, count in counter.items()}
    # First-appearance order is reproducible for a given corpus. A set's order
    # depends on the per-process string hash seed, which would make the
    # word -> index mapping (and hence saved weights) differ between sessions.
    return list(counter), probabilities
  def _extract_pos(self, sentence, sentence_idx, window_size):
    '''
    Extracts the positive (context) samples around position sentence_idx:
    up to window_size tokens to the left and exactly window_size tokens to the
    right. The center token itself is not part of the result.
    If there isn't enough to the right, there isn't enough to batch the input, and thus it returns None
    '''
    if sentence_idx + window_size >= len(sentence):
      return None
    left = sentence[max(0, sentence_idx - window_size): sentence_idx]
    right = sentence[sentence_idx + 1: sentence_idx + window_size + 1]
    return list(left) + list(right)
  def _prepare_pos(self, sentences, sentence_idx, window_size):
    '''
    Prepares a positive set, pruning out the None values(to get one clean batch)
    Returns None when no sentence yields a window at this position.
    '''
    windows = [self._extract_pos(sentence, sentence_idx, window_size) for sentence in sentences]
    windows = [window for window in windows if window is not None]
    return windows if windows else None
  def _sample_negatives(self, excluded, num_neg_samp):
    '''
    Draws distinct negative word indices according to the unigram probabilities,
    never returning an index contained in `excluded`.

    excluded: set of word indices that must not be sampled
    num_neg_samp: number of negatives wanted
    returns: int array with min(num_neg_samp, number of eligible words) indices,
             or None when nothing can be sampled
    '''
    weights = self.probability_tensor.astype(np.float64) # astype returns a copy
    if excluded:
      weights[np.fromiter(excluded, dtype = np.int64, count = len(excluded))] = 0.0
    eligible = int(np.count_nonzero(weights))
    count = min(num_neg_samp, eligible)
    if count <= 0:
      return None
    # Sampling without replacement from the masked distribution replaces the
    # rejection loop, which could spin forever on a small vocabulary.
    return np.random.choice(len(weights), size = count, replace = False, p = weights / weights.sum())
  def _training_batch(self, batch, neg_samps, center_tokens):
    '''
    batch: (B, NumContext) context word indices, NumContext <= 2WindowSize
    neg_samps: (NumNeg)
    center_tokens: (B)
    Returns the scalar mean of the positive + negative binary cross-entropy.
    '''
    B, _ = batch.shape
    batched_neg_samps = tf.repeat(tf.expand_dims(neg_samps, 0), B, axis = 0) # (B, NumNeg)
    # Lookup embeddings for positive and negative samples
    center_embeddings = self.Center_Embeddings(np.reshape(center_tokens, (B, 1))) # (B, 1, emb_dim)
    pos_embeddings = self.Context_Embeddings(batch) # (B, NumContext, emb_dim)
    neg_embeddings = self.Context_Embeddings(batched_neg_samps) # (B, NumNeg, emb_dim)
    # BatchMM
    pos_similarity = tf.matmul(center_embeddings, tf.transpose(pos_embeddings, perm = (0, 2, 1))) # (B, 1, NumContext)
    neg_similarity = tf.matmul(center_embeddings, tf.transpose(neg_embeddings, perm = (0, 2, 1))) # (B, 1, NumNeg)

    # Squeeze only the singleton middle axis so a batch of one keeps its batch dimension
    pos_scores = tf.squeeze(pos_similarity, axis = 1) # (B, NumContext)
    neg_scores = tf.squeeze(neg_similarity, axis = 1) # (B, NumNeg)
    pos_loss = tf.losses.binary_crossentropy(tf.ones_like(pos_scores), pos_scores, from_logits = True)
    neg_loss = tf.losses.binary_crossentropy(tf.zeros_like(neg_scores), neg_scores, from_logits = True)
    return tf.reduce_mean(pos_loss + neg_loss)
  def batched_word2vec(self, sentences, window_size = 2, num_neg_samp = 5):
    '''
    Computes the summed skip-gram loss of a batch of sentences.

    sentences: list of sentences, each a list of words present in the vocabulary
    window_size: number of context tokens taken on each side of the center word
    num_neg_samp: number of negative samples shared by the whole batch at each position
    returns: tensor of shape (1,) with the loss summed over all sentence positions
    '''
    # Tokenize All Sentences
    tokenized_sentences = [[self.word_2_idx[word] for word in sentence] for sentence in sentences]
    longest_sent = max((len(sentence) for sentence in tokenized_sentences), default = 0)
    # Iterate over all indices
    total_loss = tf.zeros((1))
    for idx in range(longest_sent):
      tokenized = self._prepare_pos(tokenized_sentences, idx, window_size)
      if tokenized is None:
        # No sentence is long enough at this position, so none will be at later ones
        break
      # Same length condition as _extract_pos, so centers line up with the windows
      center_tokens = np.array([sentence[idx] for sentence in tokenized_sentences if idx + window_size < len(sentence)])
      # Negatives must not be any word that is a center or context word at this step.
      # (Testing an int against the list of windows never matches, hence the flat set.)
      excluded = {token for window in tokenized for token in window}
      excluded.update(center_tokens.tolist())
      neg_samps = self._sample_negatives(excluded, num_neg_samp)
      if neg_samps is None:
        continue
      total_loss = total_loss + self._training_batch(np.array(tokenized), neg_samps, center_tokens)
    return total_loss
  def call(self, idx):
    '''
    Returns the center embedding of the word with index idx, squeezed to a NumPy vector.
    '''
    array = np.zeros((1, 1))
    array[:, :] = idx
    return np.squeeze(self.Center_Embeddings(array))

In [10]:
# Build the vocabulary from the processed reviews and create the model with 300-dimensional embeddings
word2vec = Word2Vec(300, processed_dataset)

In [11]:
def training_fn(corpus, NUM_EPOCHS, batch_size, display_every = 24):
  '''
  Trains the global word2vec model with Adam (learning rate 1e-3).

  corpus: list of tokenized sentences
  NUM_EPOCHS: number of reporting rounds; each round runs display_every batches,
              not a full pass over the corpus
  batch_size: number of sentences per batch
  display_every: batches per round; the mean loss of the round is printed after it

  The batch pointer carries over between rounds and wraps back to batch 0 once
  it reaches len(corpus) // batch_size.
  '''
  optim = tf.optimizers.Adam(learning_rate = 1e-3)
  num_batches = len(corpus) // batch_size
  batch_no = 0
  for EPOCH in range(NUM_EPOCHS):
    running_loss = 0
    for _ in tqdm.tqdm(range(display_every)):
      # Advance first, then wrap, so the very first batch used is batch 1
      batch_no = batch_no + 1 if batch_no + 1 < num_batches else 0
      start = batch_no * batch_size
      batch = corpus[start: start + batch_size]
      with tf.GradientTape() as tape:
        loss = word2vec.batched_word2vec(batch)
      weights = word2vec.trainable_weights
      grads = tape.gradient(loss, weights)
      optim.apply_gradients(zip(grads, weights))
      running_loss += loss.numpy().item()
    print(f'EPOCH: {EPOCH}, total_loss: {running_loss / display_every}')

In [12]:
def unique_words(num_words, corpus):
  '''
  corpus: sentences (lists of word strings)
  num_words: how many words to return
  Computes the N most frequent words, not including stopwords, most frequent
  first; words with equal counts keep their order of first appearance.
  Returns fewer words when the corpus has fewer, and [] when num_words <= 0.
  '''
  # A set gives O(1) membership tests; the NLTK list would be scanned for every token
  stopwords = set(nltk.corpus.stopwords.words('english'))
  counter = collections.Counter(
    word for article in corpus for word in article if word not in stopwords
  )
  return [word for word, _ in counter.most_common(num_words)]
def visualize_embeddings(num_words, corpus):
  '''
  Plots a 2-D PCA projection of the center embeddings of the num_words most
  frequent non-stopword words of corpus, labelling every point with its word.
  '''
  top_n_words = unique_words(num_words, corpus)
  vectors = [word2vec(word2vec.word_2_idx[word]) for word in top_n_words]
  embeddings = np.stack(vectors, axis = 0) # (N, emb_dim)
  # Reduce the embeddings to two dimensions for plotting
  reduced = PCA(n_components = 2).fit_transform(embeddings) # (N, 2)
  for (x, y), word in zip(reduced, top_n_words):
    # One scatter call per point, so each word gets its own colour from the cycle
    plt.scatter(x, y)
    plt.text(x, y, word)
  plt.show()
def visualize_words(words):
  '''
  Plots a 2-D PCA projection of the center embeddings of the given words,
  labelling every point with its word.

  words: sequence of words that exist in the model vocabulary
  '''
  vectors = [word2vec(word2vec.word_2_idx[word]) for word in words]
  embeddings = np.stack(vectors, axis = 0) # (N, emb_dim)
  # Reduce the embeddings to two dimensions for plotting
  reduced = PCA(n_components = 2).fit_transform(embeddings) # (N, 2)
  for position, (x, y) in enumerate(reduced):
    plt.scatter(x, y)
    plt.text(x, y, words[position])
  plt.show()
def distances(center_vector, matrix):
  '''
  Computes the squared Euclidean distance (no square root is taken) between
  center_vector and every row of matrix.

  center_vector: (D)
  matrix: (N, D)
  returns: (N) squared distances
  '''
  squared_diff = (matrix - center_vector) ** 2
  return np.sum(squared_diff, axis = -1) # (N)
def closest(word, k, farthest = False):
  '''
  Ranks every vocabulary entry by its distance to the embedding of `word`.

  word: query word (must be in the vocabulary)
  k: number of results
  farthest: when True, rank from farthest to nearest instead
  returns: the first k (word index, distance) pairs of the ranking; the query
           word itself takes part in the ranking
  '''
  query = word2vec(word2vec.word_2_idx[word])
  table = word2vec.Center_Embeddings.weights[0]
  ranked = sorted(enumerate(distances(query, table)), key = lambda pair: pair[1], reverse = farthest)
  return ranked[:k]
def closest_embeddings(center_embeddings, k, farthest = False):
  '''
  Ranks every vocabulary entry by its distance to an arbitrary embedding vector.

  center_embeddings: query vector with the same dimension as the embeddings
  k: number of results
  farthest: when True, rank from farthest to nearest instead
  returns: the first k (word index, distance) pairs of the ranking
  '''
  table = word2vec.Center_Embeddings.weights[0]
  scored = enumerate(distances(center_embeddings, table))
  ranked = sorted(scored, key = lambda pair: pair[1], reverse = farthest)
  return ranked[:k]

In [None]:
# Run training on the first GPU: 150 reporting rounds of 24 batches each
# (training_fn's default display_every), with 512 reviews per batch
with tf.device("GPU:0"):
  training_fn(processed_dataset, 150, 512)

In [14]:
# Save the trained weights with the "./model/model" prefix.
# NOTE(review): presumably the word <-> index dictionaries are not part of the weights — confirm before reloading in a new session
word2vec.save_weights("./model/model")

In [None]:
!zip -r ./model.zip ./model

In [None]:
# Plot the 100 most frequent non-stopword words of the corpus in 2-D PCA space
visualize_embeddings(100, processed_dataset)

In [16]:
def analogy(first, second, third):
  '''
  Word-vector arithmetic: returns the 5 (word index, distance) pairs whose
  center embeddings lie closest to emb(first) - emb(second) + emb(third).
  All three words must be in the vocabulary.
  '''
  first_vec, second_vec, third_vec = (
    word2vec(word2vec.word_2_idx[word]) for word in (first, second, third)
  )
  target = first_vec - second_vec + third_vec
  return closest_embeddings(target, 5)

In [17]:
def read(indices):
  '''
  Prints the word behind every (word index, distance) pair, one word per line.
  Note: this reuses the name of the file-reading helper defined earlier in the notebook.
  '''
  for word_idx, _distance in indices:
    print(word2vec.idx_2_word[word_idx])

In [None]:
# Print the 5 words nearest to emb('england') - emb('english') + emb('france')
read(analogy('england', 'english', 'france'))