# Word2Vec implementation

# 1. Implementation

### Data collection

In [2]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import tensorflow.keras.layers as L
from multiset import Multiset
from tqdm import tqdm
import random

from sklearn.decomposition import PCA
import plotly.graph_objects as go

In [5]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open.
with open("corpus_100k", "r") as corpus_file:
  data = corpus_file.read()

def lower(txt):
  """Return a lowercase copy of ``txt``."""
  lowered = txt.lower()
  return lowered

# Lowercased, whitespace-separated tokens of the corpus.
data_split = list(map(lower, data.split()))

data_len = len(data_split)

words_counts = Multiset(data_split) # *word*: *count*
uniq_words = Multiset(set(data_split)) # *word*: 1

# Subtracting 100 from every multiplicity drops each word that occurs 100
# times or fewer; the surviving words keep a multiplicity of (count - 100).
words_counts = words_counts - uniq_words * 100

# Fix ONE explicit word order and derive the sampling weights from that same
# order, so prob_lst[j] always belongs to vocab_words[j]. (Taking the words
# from set(...) and the counts from .values() yields two unrelated orders.)
# Sorting additionally makes the order reproducible between runs.
vocab_words = sorted(words_counts.distinct_elements())
vocab_len = len(vocab_words)

# Remaining multiplicity of every vocabulary word, aligned with vocab_words.
counts_lst = [words_counts[word] for word in vocab_words]

# Sampling weight of every vocabulary word, aligned with vocab_words.
prob_lst = list(np.array(counts_lst) / sum(counts_lst))

print(f"From {data_len} words we got {vocab_len} to vocabulary.")

From 25293193 words we got 17535 to vocabulary.


In [6]:
# One-hot encoder over the vocabulary; sparse_output=False makes transform()
# return dense arrays.
vocab_column = np.asarray(vocab_words).reshape(-1, 1)  # one word per row, single feature
ohe = OneHotEncoder(sparse_output=False)

ohe.fit(vocab_column)

In [None]:
# precollect some pairs for speed

# Build a set once: `word in ohe.categories_[0]` scans the whole category
# array on every lookup, while a set lookup is a single hash probe.
known_words = set(ohe.categories_[0])

# Positions i (every 10th token) where both data_split[i] and
# data_split[i + 1] are vocabulary words.
indices = []

# Stop at data_len - 1 so that data_split[i + 1] always exists.
for i in tqdm(range(0, data_len - 1, 10)):
  if data_split[i] in known_words and data_split[i + 1] in known_words:
    indices.append(i)

In [None]:
def get_sample(num_negatives=10):
  """Draw one training example: a word, the word after it, and negative words.

  Args:
    num_negatives: How many negative words to draw. Defaults to 10.

  Returns:
    A tuple ``(x, y, k)`` of one-hot encoded arrays produced by ``ohe``:
    ``x`` encodes the word at a randomly chosen precollected position,
    ``y`` encodes the word that immediately follows it, and ``k`` holds one
    row per negative word, drawn with replacement from ``vocab_words``
    using ``prob_lst`` as weights.
  """
  i = random.choice(indices)
  center_word, next_word = data_split[i], data_split[i + 1]

  # Negatives are drawn independently of the chosen pair, so they may
  # occasionally coincide with center_word or next_word.
  negative_words = random.choices(vocab_words, weights=prob_lst, k=num_negatives)

  # reshape(-1, 1): the encoder expects one sample per row, one feature.
  x = ohe.transform(np.array(center_word).reshape(-1, 1))
  y = ohe.transform(np.array(next_word).reshape(-1, 1))
  k = ohe.transform(np.array(negative_words).reshape(-1, 1))

  return x, y, k

### Training

In [None]:
@tf.function
def neg_sampling_loss(u_o, v_c, u_k):
  """Negative-sampling loss: -log s(u_o v_c^T) - mean(log s(-u_k v_c^T)).

  ``s`` is the logistic sigmoid. In this file the function is called with
  ``u_o`` = network output for the sampled word, ``v_c`` = one-hot encoding
  of the word that follows it, and ``u_k`` = network outputs for the negative
  words, so each matrix product picks out the score the network assigns to
  the following word.

  Args:
    u_o: Scores of the positive example.
    v_c: Vector the scores are projected onto (right-multiplied, transposed).
    u_k: Scores of the negative examples, one row per negative.

  Returns:
    The positive term minus the mean of the negative terms; it has the shape
    of ``u_o @ transpose(v_c)``.
  """
  # log_sigmoid is the numerically stable form of log(sigmoid(x)): the naive
  # composition underflows to log(0) = -inf for large negative x.
  positive_term = tf.math.log_sigmoid(u_o @ tf.transpose(v_c))
  negative_term = tf.reduce_mean(tf.math.log_sigmoid(-(u_k @ tf.transpose(v_c))))
  return -positive_term - negative_term

# Network: one-hot word -> 200-unit bias-free dense layer ("encoder")
# -> bias-free dense layer with one output per vocabulary word ("decoder").
input_encoder = L.Input(shape=(vocab_len,), name="input")
encoder_layer = L.Dense(units=200, use_bias=False, name="encoder")(input_encoder)
decoder_layer = L.Dense(units=vocab_len, use_bias=False, name="decoder")(encoder_layer)

network = tf.keras.Model(inputs=input_encoder, outputs=decoder_layer)

# Optimizer: Adam with its default hyperparameters.
optimizer = tf.keras.optimizers.Adam()

# Number of training steps (one sampled example per step).
steps = 50_000

# Number of steps whose losses are averaged into one reported value.
report_every = 500

# The model's variable list does not change during training, so fetch it once.
train_vars = network.trainable_variables

global_loss = 0  # running sum of the losses in the current reporting window
for i in tqdm(range(steps)):
  x, y, k = get_sample()

  x, y, k = (tf.constant(arr, dtype=tf.float32) for arr in (x, y, k))

  # Record the forward pass so the loss can be differentiated.
  with tf.GradientTape() as tape:
    u_o = network(x)  # scores for the sampled word
    u_k = network(k)  # scores for the negative words

    loss = neg_sampling_loss(u_o, y, u_k)

  # Calculate gradients
  grad = tape.gradient(loss, train_vars)

  # Apply gradients
  optimizer.apply_gradients(zip(grad, train_vars))

  global_loss += loss[0][0]

  # Report only after a full window has been accumulated. Testing
  # `i % report_every == 0` would fire at i == 0 and divide a single step's
  # loss by report_every, and would never report the last window.
  if (i + 1) % report_every == 0:
    tf.print("LOSS: ", global_loss / report_every)
    global_loss = 0

### Saving

In [None]:
# Persist the trained weights under the same path that the visualization
# section passes to network.load_weights().
# NOTE(review): whether a ".ckpt" path is accepted depends on the installed
# Keras version — confirm against the environment in use.
network.save_weights("word2vec/word2vec.ckpt")

# 2. Visualization

In [9]:
# Rebuild the architecture used for training (same layer names and sizes) so
# the saved weights can be loaded into it.
input_encoder = L.Input(shape=(vocab_len,), name="input")
encoder_layer = L.Dense(units=200, use_bias=False, name="encoder")(input_encoder)
decoder_layer = L.Dense(units=vocab_len, use_bias=False, name="decoder")(encoder_layer)

network = tf.keras.Model(inputs=input_encoder, outputs=decoder_layer)

# Restore the weights written by the training section.
network.load_weights("word2vec/word2vec.ckpt")

# First weight of the model = kernel of the "encoder" layer: one row per
# one-hot input column, 200 columns. Convert it to a real NumPy array.
embeddings_np = network.weights[0].numpy()

# Project the embeddings onto their first two principal components.
transformed = PCA(n_components=2).fit_transform(embeddings_np)

# Plot only the first rows to keep the figure readable.
n_points = 100
x_1, x_2 = transformed[:n_points, 0], transformed[:n_points, 1]

# Row j of the kernel belongs to one-hot column j, i.e. to
# ohe.categories_[0][j]. Labels must therefore come from the encoder's
# category order, not from the order of vocab_words.
labels = [str(word) for word in ohe.categories_[0][:n_points]]

fig = go.Figure()

fig.add_trace(go.Scatter(x=x_1, y=x_2, mode="markers+text", text=labels))