# Pre-trained embeddings for Text

In [None]:
import gzip
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
glove_path = '../data/embeddings/glove.6B.50d.txt.gz'

In [None]:
# Peek at the first record of the GloVe file. The file is opened in binary
# mode, so the raw line is bytes and has to be decoded before display.
with gzip.open(glove_path, 'r') as fin:
    first_record = fin.readline()
line = first_record.decode('utf-8')

In [None]:
line

In [None]:
def parse_line(line):
    """Split one GloVe record into its word and its embedding vector.

    Args:
        line: A single record, either as UTF-8 encoded ``bytes`` (what
            iterating a binary-mode file yields) or as an already decoded
            ``str``. The first whitespace-separated field is the word and
            the remaining fields are the vector components.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D ``float32``
        NumPy array built from the remaining fields.

    Raises:
        IndexError: If the line is empty or contains only whitespace.
        ValueError: If a component cannot be converted to a float.
    """
    if isinstance(line, (bytes, bytearray)):
        line = line.decode('utf-8')

    # split() with no separator already ignores leading/trailing whitespace,
    # including the trailing newline.
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    return word, vector

In [None]:
# Lookup tables filled while reading the GloVe file:
#   embeddings: word -> vector
#   word_index: word -> position of the word in the file
embeddings, word_index = {}, {}

In [None]:
# Reverse lookup (position -> word), appended to in file order.
word_inverted_index = list()

In [None]:
# Start from empty containers so that re-running this cell does not append a
# second copy of the vocabulary to word_inverted_index (the dicts would be
# overwritten, but the list would grow and stop matching word_index).
embeddings.clear()
word_index.clear()
word_inverted_index.clear()

# Read the GloVe file record by record; idx is the 0-based position of the
# record in the file and is used as the word's index.
with gzip.open(glove_path, 'r') as fin:
    for idx, line in enumerate(fin):
        word, vector = parse_line(line)  # parse a line

        embeddings[word] = vector  # word -> vector
        word_index[word] = idx  # word -> position
        word_inverted_index.append(word)  # position -> word

In [None]:
word_index['good']

In [None]:
# Reverse lookup: position -> word. 219 is presumably the index displayed for
# 'good' by the previous cell -- confirm against that output.
word_inverted_index[219]

In [None]:
embeddings['good']

In [None]:
embedding_size = len(embeddings['good'])
embedding_size

In [None]:
plt.plot(embeddings['good']);

In [None]:
# Two stacked panels, each overlaying the raw vectors of three related words.
# Both panels use the same y-range so they can be compared by eye.
panels = [
    (211, ('two', 'three', 'four'), "A few numbers"),
    (212, ('cat', 'dog', 'rabbit'), "A few animals"),
]
for position, panel_words, panel_title in panels:
    plt.subplot(position)
    for panel_word in panel_words:
        plt.plot(embeddings[panel_word])
    plt.title(panel_title)
    plt.ylim(-2, 5)

plt.tight_layout()

In [None]:
vocabulary_size = len(embeddings)
vocabulary_size

## Loading pre-trained embeddings in Keras

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

In [None]:
# Weight matrix with one row per word and one column per embedding dimension.
# float32 matches the dtype of the parsed GloVe vectors, so the buffer is half
# the size of NumPy's default float64 and no precision is lost when filling it.
embedding_weights = np.zeros((vocabulary_size, embedding_size),
                             dtype='float32')

In [None]:
# Copy each word's vector into the row given by its index.
for token, row in word_index.items():
    embedding_weights[row] = embeddings[token]

In [None]:
# Embedding layer sized to the GloVe vocabulary. trainable=False keeps the
# weights fixed; mask_zero=False means index 0 is not reserved for padding.
emb_layer = Embedding(vocabulary_size,
                      embedding_size,
                      trainable=False,
                      mask_zero=False)

In [None]:
# Show which word occupies index 0. The embedding layer is created with
# mask_zero=False, so this index is used as a regular word, not as padding.
word_inverted_index[0]

In [None]:
# Minimal model consisting only of the embedding layer.
model = Sequential([emb_layer])

In [None]:
# The model's only layer is the Embedding layer, so the weight list holds a
# single array: the pre-trained matrix.
# NOTE(review): set_weights needs the layer's variables to exist already;
# confirm the model is built at this point on the Keras version in use.
model.set_weights([embedding_weights])

In [None]:
embeddings['cat'] 

In [None]:
cat_index = word_index['cat']

In [None]:
cat_index

In [None]:
model.predict([[cat_index]])

## Gensim

In [1]:
import gensim

In [2]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [3]:
glove_path = '../data/embeddings/glove.6B.50d.txt.gz'
glove_w2v_path = '../data/embeddings/glove.6B.50d.txt.vec'

In [4]:
# Convert the GloVe text file into word2vec text format at glove_w2v_path.
# The tuple shown below is the call's return value -- presumably
# (number of vectors, vector size); confirm against the gensim docs.
# NOTE(review): newer gensim releases deprecate this script in favour of
# KeyedVectors.load_word2vec_format(..., no_header=True) -- verify for the
# installed version.
glove2word2vec(glove_path, glove_w2v_path)

(400000, 50)

In [5]:
from gensim.models import KeyedVectors

In [6]:
glove_model = KeyedVectors.load_word2vec_format(
    glove_w2v_path, binary=False)

In [7]:
glove_model.most_similar(positive=['good'], topn=5)

[('better', 0.9284390807151794),
 ('really', 0.9220625162124634),
 ('always', 0.9165270328521729),
 ('sure', 0.9033513069152832),
 ('something', 0.9014205932617188)]

In [8]:
glove_model.most_similar(positive=['two'], topn=5)

[('three', 0.9885902404785156),
 ('four', 0.9817472696304321),
 ('five', 0.9644663333892822),
 ('six', 0.964131236076355),
 ('seven', 0.9512959718704224)]

In [9]:
glove_model.most_similar(positive=['king', 'woman'],
                         negative=['man'], topn=3)

[('queen', 0.8523603677749634),
 ('throne', 0.7664333581924438),
 ('prince', 0.7592144012451172)]

## Visualization

In [None]:
import os

In [None]:
model_dir = '/tmp/tensorboard/'

In [None]:
n_viz = 4000

In [None]:
# Frozen embedding layer restricted to n_viz rows for the projector.
emb_layer_viz = Embedding(input_dim=n_viz,
                          output_dim=embedding_size,
                          trainable=False,
                          mask_zero=False)

In [None]:
model = Sequential([emb_layer_viz])

In [None]:
# Load only the first n_viz rows of the pre-trained matrix into the
# visualization model.
model.set_weights([embedding_weights[:n_viz]])

In [None]:
word_embeddings = emb_layer_viz.weights[0]

In [None]:
word_embeddings

In [None]:
import tensorflow as tf

In [None]:
# Track the model under the key "model" (the projector config's tensor name
# starts with that key) and write a checkpoint under model_dir.
checkpoint_prefix = os.path.join(model_dir, 'model.ckpt')
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.save(checkpoint_prefix)

In [None]:
os.listdir(model_dir)

In [None]:
fname = os.path.join(model_dir, 'metadata.tsv')

# One word per line, in index order, so that line i labels row i of the
# embedding matrix loaded into the visualization model.
with open(fname, 'w', encoding="utf-8") as fout:
    fout.writelines(word_inverted_index[row] + '\n' for row in range(n_viz))

In [None]:
config_string = """
embeddings {
  tensor_name: "model/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE"
  metadata_path: "metadata.tsv"
}
"""

In [None]:
# Write the projector configuration next to the checkpoint and metadata.
fname = os.path.join(model_dir, 'projector_config.pbtxt')

with open(fname, mode='w', encoding="utf-8") as config_file:
    config_file.write(config_string)

Now start TensorBoard (`tensorboard --logdir /tmp/tensorboard/`), open it in your browser, and explore the embeddings in the Projector tab.