<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/NLP%20with%20Tensorflow/Creating%20word%20embeddings%20using%20Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

from tensorboard.plugins import projector

# Create the "logs" directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs("logs", exist_ok=True)

### Loading Dataset

In [2]:
# Load the spreadsheet into a DataFrame.
# NOTE(review): absolute Colab path — the file has to exist at /content/testing_vocab.xlsx;
# consider a configurable data directory so the notebook also runs outside Colab.
dataset = pd.read_excel("/content/testing_vocab.xlsx")

In [3]:
# Binary label: 0 when "DO Type" is exactly "Deliverable", 1 for every other value
# (missing values included). A vectorized comparison is used instead of a per-row
# Python lambda; the explicit int64 cast keeps the label dtype stable.
dataset["target"] = dataset["DO Type"].ne("Deliverable").astype("int64")

## Text preprocessing

In [4]:
def custom_standardization(input_data):
  """Normalize raw text: lowercase it and remove ASCII punctuation.

  Args:
    input_data: String tensor (or value accepted by `tf.strings.lower`).

  Returns:
    A string tensor with every character from `string.punctuation` removed
    from the lowercased input.
  """
  # Character class matching any single punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  stripped = tf.strings.regex_replace(lowered, punctuation_pattern, '')
  return stripped


# Upper bound on the vocabulary size; also used as the Embedding layer's input dimension.
vocab_size = 600
# Every example is padded or truncated to this many token ids.
sequence_length = 100


# Text -> integer token ids. Strings are first normalized by custom_standardization,
# then split into tokens and mapped to ids of the vocabulary learned by adapt().
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [5]:
# Pull the raw description strings (features) and the binary labels out of the frame.
X = dataset["Relevant Description"].to_numpy()
y = dataset["target"].to_numpy()

# Build the vocabulary from the text column only; the labels play no part in adapt().
vectorize_layer.adapt(X)

### Model Building

In [6]:
# Width of each learned word vector.
embedding_dim = 16

# Pipeline: raw string -> token ids -> embeddings -> mean over the sequence -> dense head.
model = Sequential()
model.add(vectorize_layer)
model.add(Embedding(vocab_size, embedding_dim, name="embedding"))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
# Single unit without an activation: the model outputs a raw logit.
model.add(Dense(1))

## Compile and train the model

In [7]:
# Keras callback that writes training logs to the "logs" directory for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

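Compile the model with the `Adam` optimizer and `BinaryCrossentropy` loss, then train it. The loss is created with `from_logits=True` because the final `Dense(1)` layer has no activation and therefore outputs raw logits.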

In [8]:
# The final Dense(1) layer has no activation, so the model emits raw logits:
#  - the loss is told so via from_logits=True;
#  - accuracy is thresholded at 0.0, since a logit of 0 corresponds to a probability
#    of 0.5. The generic 'accuracy' metric would apply a 0.5 threshold to the logits
#    and misreport predictions whose logit lies between 0 and 0.5.
# The metric keeps the name 'accuracy' so history/TensorBoard keys are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

In [9]:
# Train for 15 epochs on the raw strings (vectorization happens inside the model),
# passing the TensorBoard callback so training is logged.
# NOTE(review): no validation data is passed, so the reported accuracy is training accuracy only.
model.fit(X, y, epochs=15,callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f540f383ed0>

In [10]:
# Vocabulary learned by adapt(); position i is the token whose id is i.
vocab = vectorize_layer.get_vocabulary()

# The Embedding layer always has vocab_size rows, but the learned vocabulary can be
# shorter when the corpus has fewer distinct tokens than max_tokens. Keep only the
# rows that have a token so the exported tensor and the per-token labels written
# for the projector have the same number of rows.
embedding_matrix = model.get_layer('embedding').get_weights()[0]
weights = tf.Variable(embedding_matrix[:len(vocab)])

In [11]:
from pathlib import Path

# Directory the embedding projector reads from.
log_dir = Path("logs/projector")
log_dir.mkdir(parents=True, exist_ok=True)

# Save the embedding matrix as a checkpoint. The path is passed as a plain string
# so it works regardless of whether the checkpoint API accepts path-like objects.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.fspath(log_dir / "embedding.ckpt"))

# One label per line, in vocabulary order. UTF-8 is set explicitly so non-ASCII
# tokens are written the same way on every platform.
with open(log_dir / "metadata.tsv", "w", encoding="utf-8") as f:
  for word in vocab:
    # The padding token is the empty string; a blank metadata line may be skipped
    # by the projector and shift every following label, so write a placeholder.
    label = word if word.strip() else "<PAD>"
    f.write("{}\n".format(label))

config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Path is relative to log_dir, where the metadata file was just written.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir /content/logs/projector