## Import modules

In [None]:
import gensim
import logging
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
from tensorboard.plugins import projector

# Show which library versions this notebook is running against
gensim_version = gensim.__version__
tensorflow_version = tf.__version__
print("Gensim version: \t%s" % gensim_version)
print("TensorFlow version: \t%s" % tensorflow_version)

## Config/Load data

In [None]:
# Emit INFO-level log records so gensim's log messages are displayed
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')

# Where the raw text files live
TEXT_DIR = 'data/.../...'

# Where the model and checkpoints get written
MODEL_DIR = ".../"

# Dimensionality of the word vectors
EMBEDDING_SIZE = 300

## Preprocessing

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of once per token inside clean_doc.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def clean_doc(doc: str) -> str:
    """Normalize a raw document into a space-separated string of tokens.

    The text is lowercased, runs of ASCII digits are deleted, the result is
    split on whitespace, ASCII punctuation is stripped from each token and
    tokens left with fewer than two characters are dropped.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # Lowercase, then remove numbers
    doc = re.sub(r"[0-9]+", "", doc.lower())
    # Split into tokens and strip punctuation from each one
    tokens = (word.translate(_PUNCTUATION_TABLE) for word in doc.split())
    # Tokens with fewer than two characters are ignored
    return ' '.join(word for word in tokens if len(word) > 1)

def read_files(path):
    """Read, clean and tokenize every regular file directly inside ``path``.

    Each file is decoded as UTF-8, normalized with ``clean_doc`` and split
    into tokens with ``gensim.utils.simple_preprocess``.

    Args:
        path: Directory containing the raw text files.

    Returns:
        A list with one token list per file, in sorted filename order. The
        list is empty when ``path`` is not a directory.
    """
    documents = []

    # Nothing to read if the directory does not exist
    if not os.path.isdir(path):
        return documents

    # os.listdir returns names in arbitrary order; sort them so the corpus
    # order is reproducible between runs
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Skip sub-directories and other non-files, which open() cannot read
        if not os.path.isfile(filepath):
            continue
        with open(filepath, encoding='utf-8') as f:
            doc = clean_doc(f.read())
        documents.append(gensim.utils.simple_preprocess(doc))
    return documents

# Build the tokenized corpus from the raw files
docs = read_files(TEXT_DIR)

# Report how many documents were loaded
num_docs = len(docs)
print('Number of documents: %i' % num_docs)

## Training the model

In [None]:
# Train Word2Vec on the tokenized corpus; min_count=0 is meant to keep
# every token in the vocabulary, however rare
model = gensim.models.Word2Vec(
    sentences=docs,
    size=EMBEDDING_SIZE,
    min_count=0,
)

In [None]:
# Save the model
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process
os.makedirs(MODEL_DIR, exist_ok=True)
model.save(os.path.join(MODEL_DIR, 'word2vec'))

In [None]:
# Checkpoints and metadata

# Trained embedding matrix and the vocabulary words in index order
weights = model.wv.vectors
index_words = model.wv.index2word

vocab_size = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

# One word per line, used by the projector as labels for the vectors.
# Written explicitly as UTF-8 (the corpus is read as UTF-8) and with a single
# write() call: writelines() on a string would iterate it character by character.
with open(os.path.join(MODEL_DIR, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    f.write("\n".join(index_words))

# Projector config pointing the 'embeddings' tensor at the metadata file
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'embeddings'
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(MODEL_DIR, config)

# The variable name must match embedding.tensor_name above
tensor_embeddings = tf.Variable(weights, name='embeddings')

# Save the embedding variable as a checkpoint inside MODEL_DIR
checkpoint = tf.compat.v1.train.Saver([tensor_embeddings])
checkpoint_path = checkpoint.save(sess=None, global_step=None, save_path=os.path.join(MODEL_DIR, "model.ckpt"))

In [None]:
# Sanity check: the ten words whose vectors are most similar to "cat"
model.wv.most_similar(
    positive=["cat"],
    topn=10,
)

## t-SNE Visualization

In [None]:
# Inspired by code here: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne
def tsne_plot(model):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Every word in ``model.wv.vocab`` is drawn as its own scatter point on a
    16x16 inch figure and annotated with the word itself.

    Args:
        model: Trained model exposing its vocabulary and vectors through
            ``model.wv``.
    """
    # NOTE(review): ``TSNE`` is not imported anywhere in the visible part of
    # this file; presumably it is ``sklearn.manifold.TSNE`` -- confirm and
    # import it before calling this function.
    labels = list(model.wv.vocab)
    # Look vectors up through ``model.wv`` like the rest of the file does;
    # indexing the model object directly is deprecated in gensim.
    tokens = [model.wv[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init="pca", n_iter=2500, random_state=32)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, each annotated with its label
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')