# Imports

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import tensorflow.keras.layers as L
from multiset import Multiset # Use *pip install multiset*
from tqdm import tqdm
import random

from sklearn.decomposition import PCA
import plotly.graph_objects as go

# Making data

In [None]:
data = open("corpus_100k", "r").read()

def lower(txt):
  return txt.lower()

data_split = list(map(lower, data.split()))

data_len = len(data_split)

words_counts = Multiset(data_split) # *word*: *count*
uniq_words = Multiset(set(data_split)) # *word*: 1

words_counts = words_counts - uniq_words * 10000 # words used at least 100 times

counts_lst = words_counts.values()

prob_lst = list(np.array(list(counts_lst)) / sum(counts_lst))

vocab_words = list(set(words_counts))
vocab_len = len(vocab_words)

print(f"From {data_len} words we got {vocab_len} to vocabulary.")

In [None]:
ohe = OneHotEncoder(sparse_output=False)

ohe.fit(np.array(vocab_words).reshape(-1, 1))

In [None]:
# precollect some pairs for speed

indices = []

for i in tqdm(range(0, data_len, 2)):
  if data_split[i] in ohe.categories_[0] and data_split[i + 1] in ohe.categories_[0]:
    indices.append(i)

In [None]:
def get_sample():
  i = random.choice(indices)

  x = ohe.transform(np.array(data_split[i]).reshape(-1, 1))
  y = ohe.transform(np.array(data_split[i + 1]).reshape(-1, 1))

  k = random.choices(vocab_words, k=10, weights=prob_lst) # Use random.choices() with weights
  k = ohe.transform(np.array(k).reshape(-1, 1))

  return x, y, k

In [32]:
@tf.function
def neg_sampling_loss(u_o, v_c, u_k):
  return -tf.math.log(tf.math.sigmoid(u_o @ tf.transpose(v_c))) - tf.reduce_mean(tf.math.log(tf.math.sigmoid(- u_k @ tf.transpose(v_c))))

# Network
input_encoder = L.Input((vocab_len,), name="input")
encoder_layer = L.Dense(200, name="encoder", use_bias=False)(input_encoder)
decoder_layer = L.Dense(vocab_len, name="decoder", use_bias=False)(encoder_layer)

network = tf.keras.Model(input_encoder, decoder_layer)

# Optimizer
optimizer = tf.keras.optimizers.Adam()

# Steps
steps = 10000

global_loss = 0
for i in tqdm(range(steps)):
  x, y, k = get_sample()

  x, y, k = tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(k, dtype=tf.float32)

  with tf.GradientTape() as tape:
      u_o = network(x)
      u_k = network(k)

      loss = neg_sampling_loss(u_o, y, u_k)

  # Collect trainable variables
  train_vars = network.trainable_variables

  # Calculate gradients
  grad = tape.gradient(loss, train_vars)

  # Apply gradients
  optimizer.apply_gradients(zip(grad, train_vars))

  global_loss += loss[0][0]
  if i % 500 == 0:
    tf.print("LOSS: ", global_loss / 500)
    global_loss = 0

  0%|          | 0/10000 [00:00<?, ?it/s]

LOSS:  0.0137328766


  5%|▌         | 500/10000 [00:03<01:07, 140.03it/s]

LOSS:  6.69351196


 10%|▉         | 996/10000 [00:06<00:58, 153.46it/s]

LOSS:  5.82085371


 15%|█▍        | 1491/10000 [00:10<00:53, 160.29it/s]

LOSS:  4.58118868


 20%|█▉        | 1993/10000 [00:13<00:50, 159.61it/s]

LOSS:  4.16720533


 25%|██▍       | 2497/10000 [00:16<00:47, 158.79it/s]

LOSS:  3.50161409


 30%|██▉       | 2999/10000 [00:19<00:45, 155.54it/s]

LOSS:  3.20104194


 35%|███▍      | 3499/10000 [00:23<00:52, 122.70it/s]

LOSS:  2.99384737


 40%|███▉      | 3993/10000 [00:26<00:37, 159.80it/s]

LOSS:  2.76117158


 45%|████▍     | 4486/10000 [00:29<00:34, 160.22it/s]

LOSS:  2.44905186


 50%|████▉     | 4994/10000 [00:32<00:31, 161.06it/s]

LOSS:  2.60686064


 55%|█████▍    | 5498/10000 [00:36<00:28, 156.96it/s]

LOSS:  2.26132607


 60%|██████    | 6000/10000 [00:39<00:25, 157.28it/s]

LOSS:  2.47705


 65%|██████▍   | 6499/10000 [00:42<00:21, 159.57it/s]

LOSS:  2.59084654


 70%|██████▉   | 6993/10000 [00:45<00:18, 158.69it/s]

LOSS:  2.39083028


 75%|███████▍  | 7498/10000 [00:48<00:15, 163.06it/s]

LOSS:  2.40112829


 80%|███████▉  | 7994/10000 [00:52<00:12, 154.70it/s]

LOSS:  1.87852848


 85%|████████▌ | 8500/10000 [00:55<00:09, 159.60it/s]

LOSS:  2.034


 90%|████████▉ | 8986/10000 [00:58<00:06, 163.37it/s]

LOSS:  1.98264


 95%|█████████▍| 9484/10000 [01:01<00:03, 161.16it/s]

LOSS:  2.21353889


100%|██████████| 10000/10000 [01:04<00:00, 154.13it/s]


In [None]:
network.save_weights("network/network.ckpt")

In [33]:
embeddings_np = network.weights[0]
transformed = PCA(n_components=2).fit_transform(embeddings_np)

x_1, x_2 = transformed[:, 0], transformed[:, 1]


fig = go.Figure()

fig.add_trace(go.Scatter(x=x_1, y=x_2, mode="markers+text", text=list(vocab_words)))