# [SemEval 2023 Task 1](https://raganato.github.io/vwsd/)

[By Abdullah Alshadadi (Srking501)](https://github.com/Srking501)

In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import cv2
import keras
import random
import os

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [3]:
# to access google drive folder
# from google.colab import drive
# drive.mount('/content/drive') # When you run this you'll be prompted for a token - follow the link to generate this.
# #        "/content/drive/MyDrive/...path/to/dir
# path = "/content/drive/MyDrive/semeval-2023/task-1/"
# path = "/content/drive/MyDrive/data/"

# Comment this out if you are using Google Colab, otherwise just change to local directory where the data is located
path = "./data/"

In [4]:
os.getcwd()

'/Users/srking501/Desktop/Projects/Data Science Hackathon/semeval-2023-task1'

❗❗❗Retrieve the data from [SemEval-2023 Task-1 page](https://raganato.github.io/vwsd/), specifically the ["[TRAIN+TRIAL]" data](https://drive.google.com/file/d/1byX4wpe1UjyCVyYrT04sW17NnycKAK7N/view), and put it into `data/` directory❗❗❗

In [5]:
from zipfile import ZipFile

try:
    if os.path.isdir(path + "semeval-2023-task-1-V-WSD-train-v1") == False:
        with ZipFile(path + "semeval-2023-task-1-V-WSD-train-v1.zip", "r") as zip:
            print("Unzipping data...")
            zip.extractall("data/")
            print("Finished.")
    else:
        print("[Data already unzipped]")
except FileNotFoundError:
    print(f"The file semeval-2023-task-1-V-WSD-train-v1.zip is not found in path:\n"
          f"{path}\n")
    print(f"The contents of {path}:\n"
          f"{os.listdir(path)}")

[Data already unzipped]


To check if we are in the right directory

In [6]:
os.listdir(path)

['.DS_Store',
 'captions_train_clean_complete.csv',
 'README.md',
 'semeval-2023-task-1-V-WSD-train-v1',
 'wordnet(individual tokens).csv']

Using `train_v1/` data

In [7]:
path_to_data = path + "semeval-2023-task-1-V-WSD-train-v1/"
data_labels = pd.read_csv(path_to_data + "train_v1/train.data.v1.txt", 
                sep = "\t", 
                names = ["target_word", "full_phrase"] + 
                ["image_%d"%index for index in range(10)])

In [8]:
data_labels.head()

Unnamed: 0,target_word,full_phrase,image_0,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg
3,bangalores,bangalores torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg
4,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg


In [9]:
gold_labels = pd.read_csv(path_to_data + "train_v1/train.gold.v1.txt", 
                    names = ["gold_images"])

In [10]:
gold_labels.head()

Unnamed: 0,gold_images
0,image.0.jpg
1,image.20.jpg
2,image.35.jpg
3,image.55.jpg
4,image.75.jpg


Image Caption Data and WordNet senses

In [11]:
caption_data_labels = pd.read_csv(path + "captions_train_clean_complete.csv")
wordnet_senses_labels = pd.read_csv(path + "wordnet(individual tokens).csv")

In [12]:
caption_data_labels.head()

Unnamed: 0,File Name,Generated Caption
0,image.0.jpg,a black and white bird is standing in the air
1,image.1.jpg,a man with a red hat is standing in front of a...
2,image.10.jpg,a man is standing in a large white and white a...
3,image.100.jpg,a man and a woman are playing with a toy
4,image.1000.jpg,a person is riding a dirt path in a wooded area


In [13]:
wordnet_senses_labels.head()

Unnamed: 0,Complex_word,senses
0,moorhen swamphen,black gallinule that inhabits ponds and lakes ...
1,serinus genus,Old World finches; e.g. canaries and serins or...
2,pegmatite igneous,produced under conditions involving intense he...
3,bangalore torpedo,an industrial city in south central India (wes...
4,bonxie skua,gull-like jaeger of northern seas or gull-like...


## Creating the main `dataset`

In [14]:
dataset = data_labels.join(gold_labels)

In [15]:
dataset.head()

Unnamed: 0,target_word,full_phrase,image_0,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9,gold_images
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,image.0.jpg
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,image.20.jpg
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,image.35.jpg
3,bangalores,bangalores torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,image.55.jpg
4,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,image.75.jpg


The only important dataframes are `caption_data_labels_renamed` and the newly merged `wordnet_senses_labels_renamed` into the main `dataset` dataframe

In [16]:
# renaming columns to match the `data_labels` dataframe and 
# to fit one standard format 
# (following the python naming convention - 
# https://peps.python.org/pep-0008/#naming-conventions)
#
caption_data_labels_renamed = caption_data_labels.rename(columns={
    "File Name": "images",
    "Generated Caption": "generated_caption"
    })
wordnet_senses_labels_renamed = wordnet_senses_labels.rename(columns={
    "Complex_word": "full_phrase"
})

In [17]:
caption_data_labels_renamed.head()

Unnamed: 0,images,generated_caption
0,image.0.jpg,a black and white bird is standing in the air
1,image.1.jpg,a man with a red hat is standing in front of a...
2,image.10.jpg,a man is standing in a large white and white a...
3,image.100.jpg,a man and a woman are playing with a toy
4,image.1000.jpg,a person is riding a dirt path in a wooded area


In [18]:
wordnet_senses_labels_renamed.head()

Unnamed: 0,full_phrase,senses
0,moorhen swamphen,black gallinule that inhabits ponds and lakes ...
1,serinus genus,Old World finches; e.g. canaries and serins or...
2,pegmatite igneous,produced under conditions involving intense he...
3,bangalore torpedo,an industrial city in south central India (wes...
4,bonxie skua,gull-like jaeger of northern seas or gull-like...


In [19]:
dataset = dataset.merge(wordnet_senses_labels_renamed, how="inner", on="full_phrase")

In [20]:
dataset.head()

Unnamed: 0,target_word,full_phrase,image_0,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9,gold_images,senses
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,image.0.jpg,black gallinule that inhabits ponds and lakes ...
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,image.20.jpg,Old World finches; e.g. canaries and serins or...
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,image.35.jpg,produced under conditions involving intense he...
3,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,image.75.jpg,gull-like jaeger of northern seas or gull-like...
4,ixia,ixia genus,image.90.jpg,image.3.jpg,image.91.jpg,image.4.jpg,image.92.jpg,image.1.jpg,image.2.jpg,image.94.jpg,image.93.jpg,image.5.jpg,image.90.jpg,a monocotyledonous genus of the family Iridace...


Remove any missing data from the generated `senses`

In [21]:
for missing_data in np.where(dataset.isnull()):
    dataset = dataset.drop(missing_data)

In [22]:
# Check if the missing data are dropped (numpy array should appear empty)
np.where(dataset.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [23]:
dataset.head()

Unnamed: 0,target_word,full_phrase,image_0,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9,gold_images,senses
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,image.0.jpg,black gallinule that inhabits ponds and lakes ...
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,image.20.jpg,Old World finches; e.g. canaries and serins or...
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,image.35.jpg,produced under conditions involving intense he...
3,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,image.75.jpg,gull-like jaeger of northern seas or gull-like...
4,ixia,ixia genus,image.90.jpg,image.3.jpg,image.91.jpg,image.4.jpg,image.92.jpg,image.1.jpg,image.2.jpg,image.94.jpg,image.93.jpg,image.5.jpg,image.90.jpg,a monocotyledonous genus of the family Iridace...


# Word2Vec model

Reference (https://www.tensorflow.org/tutorials/text/word2vec)

In [24]:
import io
import re
import string
import tqdm

from tensorflow.keras import layers

In [25]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [26]:
# Define the vocabulary size and the number of words in a sequence.
VOCAB_SIZE = 4096
SEQUENCE_LENGTH = 10

In [27]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [28]:
len(dataset) # Size of the dataset

11722

Reads the `full_phrase` and `senses` then combine them into `result_sense.txt` corpus

In [29]:
import glob
# +------------------------------------------------------------+
# | had to do this hacky way to make jupyter run the for-loop  |
# | probably without repeating the first index of the `dataset`|
# +------------------------------------------------------------+
list_index = []
for index in range(len(dataset)):
    if index % 1 == 0:
        list_index.append(index)

# adds the final index that is not divisible by the mod number
if len(dataset) not in list_index:
    list_index.append(len(dataset)) 


if os.path.isfile(path + "result_senses.txt") == False:
    # i being the first index, y being the index after it
    print("Creating the senses text files...")
    for i, y in zip(tqdm.tqdm(list_index), list_index[1:]):
        file = open(path + "senses%d.txt"%y, "w")
        dataset_senses = ""
        for full_phrase, senses in zip(dataset.full_phrase[i:y], dataset.senses[i:y]):
            dataset_senses +=  full_phrase + " " + senses + "\n"
            file.write(dataset_senses)
        file.close()
        del full_phrase, senses  # clears the memory to avoid memory leak
    
    # Reference (https://stackoverflow.com/a/17749339)
    print("Combining senses text files into one...")
    senses_files = sorted(glob.glob(path + "senses*.txt"))
    with open(path + "result_senses.txt", "wb") as outfile:
        for f in tqdm.tqdm(senses_files):
            with open(f, "rb") as infile:
                outfile.write(infile.read())
            os.remove(f)
        
else:
    print("[result_senses.txt has already been created]")

Creating the senses text files...


100%|█████████▉| 11722/11723 [00:01<00:00, 8669.78it/s]


Combining senses text files into one...


100%|██████████| 11722/11722 [00:00<00:00, 18361.15it/s]


In [30]:
text_ds = tf.data.TextLineDataset(path + "result_senses.txt").filter(lambda x: tf.cast(tf.strings.length(x), bool))

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-03-18 17:14:49.926070: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-18 17:14:49.926216: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Vectorize sentences from the corpus

In [31]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH)

Call `TextVectorization.adapt` on the text dataset to create vocabulary.

In [32]:
vectorize_layer.adapt(text_ds.batch(1024))

2023-03-18 17:14:50.007099: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-18 17:14:50.050361: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Once the state of the layer has been adapted to represent the text corpus, the vocabulary can be accessed with `TextVectorization.get_vocabulary`. This function returns a list of all vocabulary tokens sorted (descending) by their frequency.

In [33]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'or', 'a', 'of', 'the', 'and', 'in', 'to', 'an', 'that', 'for', 'by', 'with', 'something', 'as', 'one', 'is', 'group', 'genus']


The `vectorize_layer` can now be used to generate vectors for each element in the `text_ds` (a `tf.data.Dataset`). Apply `Dataset.batch`, `Dataset.prefetch`, `Dataset.map`, and `Dataset.unbatch`.

In [34]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

Obtain sequences from the dataset

In [35]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

11722


Inspect a few examples from `sequences`:

In [36]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[   1    1  332    1   10    1    1    6 3609    2] => ['[UNK]', '[UNK]', 'black', '[UNK]', 'that', '[UNK]', '[UNK]', 'and', 'lakes', 'or']
[  1   1   3 102  25  17   3 231   4   1] => ['[UNK]', '[UNK]', 'a', 'criminal', 'who', 'is', 'a', 'member', 'of', '[UNK]']
[  1  46   3  45  54  57  56   2 115  45] => ['[UNK]', 'family', 'a', 'social', 'unit', 'living', 'together', 'or', 'primary', 'social']
[  1 733   1 733   1   1   7 220   2   1] => ['[UNK]', 'horse', '[UNK]', 'horse', '[UNK]', '[UNK]', 'in', 'size', 'or', '[UNK]']
[   1    1    5  597    1   63    4    5 1993  161] => ['[UNK]', '[UNK]', 'the', 'fleshy', '[UNK]', 'part', 'of', 'the', 'external', 'human']


Generate training examples from sequences

In [37]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=VOCAB_SIZE,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 11722/11722 [00:45<00:00, 259.52it/s]




targets.shape: (41461,)
contexts.shape: (41461, 5)
labels.shape: (41461, 5)


## Training of Word2Vec

In [38]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
final_dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
final_dataset = final_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(final_dataset)

<BatchDataset shapes: (((1024,), (1024, 5)), (1024, 5)), types: ((tf.int64, tf.int64), tf.int64)>


In [39]:
final_dataset = final_dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(final_dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5)), (1024, 5)), types: ((tf.int64, tf.int64), tf.int64)>


In [40]:
num_ns = 4  # number of negative samples

Subclassed word2vec model

In [41]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

Define loss function and compile model

In [42]:
embedding_dim = 128
word2vec = Word2Vec(VOCAB_SIZE, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [43]:
log_dir='logs/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

Train the model on the `final_dataset` for some number of epochs:

In [44]:
word2vec.fit(final_dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20


2023-03-18 17:15:48.763380: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x176ee1400>

## Embedding lookup and analysis

Reference (https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin)

In [45]:
from tensorboard.plugins import projector

if (os.path.isfile(log_dir + "vectors.tsv") and os.path.isfile(log_dir + "metadata.tsv")) == False:
  weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
  vocab = vectorize_layer.get_vocabulary()
  out_v = io.open(log_dir + 'vectors.tsv', 'w', encoding='utf-8')
  out_m = io.open(log_dir + 'metadata.tsv', 'w', encoding='utf-8')

  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
  out_v.close()
  out_m.close()

else:
   print("[vectors.csv and metadata.csv already exists]")

# Save the weights we want to analyze as a variable. Note that the first
# value represents any unknown word, which is not in the metadata, here
# we will remove this value.
weights = tf.Variable(word2vec.layers[0].get_weights()[0][1:])
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))


# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)


In [47]:
# Now run tensorboard against on log data we just saved.
%tensorboard --logdir logs