### CBOW = Continuous Bag of Words
#### Our objective: obtain vector embeddings for 20,000 common English words by training a model to learn each word's meaning from its context.
#### AKA: subpar version of Word2Vec

In [None]:
import gensim.downloader as api
import tensorflow

# Fetch the "text8" corpus through gensim's downloader API. The result is
# consumed below by the Keras Tokenizer (fit_on_texts / texts_to_sequences).
dataset = api.load("text8")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Vocabulary cap. The same value is reused as the Embedding input size and as
# the width of the softmax output when the model is built.
vocab_size = 20000
# NOTE(review): per the Keras docs, num_words=N keeps only the N-1 most
# frequent words (indices 1..N-1) when converting texts — confirm.
tokenizer = Tokenizer(num_words=vocab_size)
# Build the word -> index mapping from the corpus.
tokenizer.fit_on_texts(dataset)
# Convert every document of the corpus into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(dataset)

# Notebook display: show the word -> index mapping.
tokenizer.word_index

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Dense, Input, Embedding, Lambda
from tensorflow.keras.models import Model

In [None]:
import random
# Seed the Python, NumPy and TensorFlow random number generators with a fixed
# value so that shuffling, window sampling and weight initialisation are more
# reproducible between runs.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

In [None]:
# Number of context word indices fed to the model per training example.
context_size = 10
# Length of each learned word vector.
embedding_dim = 50

# Input: one row of `context_size` word indices per example.
i = Input(shape=(context_size,))
# Look up an `embedding_dim`-sized vector for every context word.
x = Embedding(vocab_size, embedding_dim)(i)
# Average over axis 1 (the context-word axis): the "bag of words" step that
# collapses the context into a single vector per example.
x = Lambda(lambda t: tf.reduce_mean(t, axis=1))(x)
# Softmax over the whole vocabulary: predicted distribution of the target word.
x = Dense(vocab_size, activation='softmax')(x)

model = Model(i, x)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Sparse categorical cross-entropy: the targets produced by the data generator
# are integer word indices rather than one-hot vectors.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Half of the context window (integer division): the number of context words
# taken from the left of the target word.
half_context_size = context_size // 2

def data_generator(sequences, batch_size=128, window_size=None):
  """Endlessly yield (context, target) training batches for the CBOW model.

  On every pass the usable sequences are shuffled and split into batches of
  up to `batch_size`. Each sequence contributes one example per pass: a run
  of `window_size + 1` consecutive tokens is picked at a random position, the
  token after the first `window_size // 2` tokens becomes the target, and the
  remaining tokens on either side become the context.

  Args:
    sequences: list of token-index sequences. The list itself is not
      modified. Sequences with fewer than `window_size + 1` tokens are
      skipped because no complete window fits in them.
    batch_size: maximum number of examples per yielded batch; the last batch
      of a pass may be smaller.
    window_size: number of context tokens per example. Defaults to the
      module-level `context_size`.

  Yields:
    Tuple `(X, Y)` of freshly allocated float arrays. `X` has shape
    `(n, window_size)` and holds the context tokens, `Y` has shape `(n,)`
    and holds the target tokens.

  Raises:
    ValueError: when iteration starts, if no sequence is long enough to hold
      a single window.
  """
  if window_size is None:
    window_size = context_size  # module-level setting shared with the model
  half = window_size // 2
  span = window_size + 1  # context tokens plus the target in the middle

  # Work on a private list so the caller's `sequences` keeps its order.
  usable = [seq for seq in sequences if len(seq) >= span]
  if not usable:
    raise ValueError(
        'no sequence has at least {} tokens, cannot build a window'.format(span))

  while True:
    random.shuffle(usable)

    # one epoch will be one pass through the data
    for start in range(0, len(usable), batch_size):
      batch_sequences = usable[start:start + batch_size]

      # Allocate per batch: a consumer that still holds the previous batch
      # must not see it overwritten by this one.
      X_batch = np.zeros((len(batch_sequences), window_size))
      Y_batch = np.zeros(len(batch_sequences))

      for row, seq in enumerate(batch_sequences):
        # randint's upper bound is exclusive, so `+ 1` makes the last valid
        # start position (len(seq) - span) reachable.
        j = np.random.randint(0, len(seq) - span + 1)
        X_batch[row, :half] = seq[j:j + half]
        X_batch[row, half:] = seq[j + half + 1:j + span]
        Y_batch[row] = seq[j + half]

      yield X_batch, Y_batch

In [None]:
batch_size = 128
# The generator never terminates, so steps_per_epoch tells Keras where an
# epoch ends: ceil(len(sequences) / batch_size) batches.
# NOTE(review): the generator draws only one window per sequence per pass,
# which is presumably why the epoch count is so large — confirm.
r = model.fit(
  data_generator(sequences, batch_size),
  epochs=10000,
  steps_per_epoch=int(np.ceil(len(sequences) / batch_size))
)

In [None]:
# Plot the loss values stored in the History object returned by model.fit.
plt.plot(r.history['loss'], label='loss')
plt.legend();

In [None]:
# Plot the accuracy values stored in the History object returned by model.fit.
plt.plot(r.history['accuracy'], label='acc')
plt.legend();

In [None]:
# First weight array of model.layers[1]; presumably that is the Embedding
# layer (layers[0] being the input layer), giving one row per word index —
# TODO confirm against model.summary().
embeddings = model.layers[1].get_weights()[0]
# Notebook display of the matrix.
embeddings

In [None]:
from sklearn.neighbors import NearestNeighbors
# Nearest-neighbour index over the embedding rows: queries return the 5
# closest rows, searched with a ball tree.
neighbors = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
neighbors.fit(embeddings)

In [None]:
def print_neighbors(query):
  """Print the words whose embeddings lie closest to that of `query`.

  The word is looked up in `tokenizer.word_index`, its row is taken from
  `embeddings`, and the word for every index returned by
  `neighbors.kneighbors` is printed. The query's own row is part of the
  fitted data, so the query word is expected among the results.

  Args:
    query: the word to look up.

  Raises:
    KeyError: if `query` is unknown to the tokenizer, or if its index has no
      row in the embedding matrix.
  """
  query_idx = tokenizer.word_index[query]
  if query_idx >= len(embeddings):
    # word_index can hold indices past the last embedding row. Slicing there
    # would silently produce an empty query and an unrelated sklearn error.
    raise KeyError('{!r} has no embedding (index {} is outside the {} rows)'.format(
        query, query_idx, len(embeddings)))

  query_vec = embeddings[query_idx:query_idx + 1]  # keep 2-D for kneighbors
  _, indices = neighbors.kneighbors(query_vec)
  for idx in indices[0]:
    # Row 0 has no word attached (the Keras Tokenizer never assigns index 0),
    # so skip any row without a word instead of raising KeyError.
    word = tokenizer.index_word.get(idx)
    if word is not None:
      print(word)

In [None]:
# Example query: words nearest to 'uncle' in the learned embedding space.
print_neighbors('uncle')

In [None]:
# Example query: words nearest to 'paris' in the learned embedding space.
print_neighbors('paris')

In [None]:
def get_embedding(word):
  """Return the embedding of `word` as a 2-D array with a single row.

  The row is sliced (not indexed) from `embeddings` so the result can be
  passed straight to `neighbors.kneighbors`.

  Args:
    word: the word to look up in `tokenizer.word_index`.

  Returns:
    Array of shape `(1, embeddings.shape[1])`.

  Raises:
    KeyError: if `word` is unknown to the tokenizer, or if its index has no
      row in the embedding matrix.
  """
  idx = tokenizer.word_index[word]
  if idx >= len(embeddings):
    # word_index can hold indices past the last embedding row. Slicing there
    # would silently return an empty array.
    raise KeyError('{!r} has no embedding (index {} is outside the {} rows)'.format(
        word, idx, len(embeddings)))
  return embeddings[idx:idx + 1]

# Word-analogy check. If the embedding space encodes the country/nationality
# relation consistently, then
#   australia - australian ~= england - english
# so england - english + australian should land near 'australia'.
england = get_embedding('england')
english = get_embedding('english')
australian = get_embedding('australian')

query = england - english + australian

distances, indices = neighbors.kneighbors(query)
for idx in indices[0]:
  # Row 0 has no word attached (the Keras Tokenizer never assigns index 0),
  # so skip any row without a word instead of raising KeyError.
  word = tokenizer.index_word.get(idx)
  if word is not None:
    print(word)