# TF Hub Embedding and Lookup

In [1]:
import time
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# import os
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''

In [3]:
# Host name handed to the Verta client below.
HOST = "dev.verta.ai"

# Names used to select the project and the experiment on the client.
PROJECT_NAME = "TFHub Train Example"
EXPERIMENT_NAME = "Embed for Training"

from verta import Client
from verta.utils import ModelAPI  # NOTE(review): not referenced in the visible cells -- confirm it is still needed

# Create the client for HOST, then set the project and the experiment by name.
# The order matters: the project is set before the experiment.
# NOTE(review): presumably authenticates via the VERTA_EMAIL / VERTA_DEV_KEY
# environment variables shown (commented out) in the previous cell -- confirm.
client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)

In [4]:
import pandas as pd

# Directory and CSV file that hold the review data.
# NOTE(review): absolute, user-specific path -- must be adjusted on any other machine.
DATA_DIR = '/Users/convoliution/Downloads/'
DATA_FILE = DATA_DIR + 'imdb_master.csv'

# Read the CSV with the 'latin' encoding and keep only the 'review' column,
# converted to a plain Python list.
reviews = pd.read_csv(DATA_FILE, encoding='latin')['review'].values.tolist()

# Show how many reviews were loaded.
print(len(reviews))

In [5]:
import os

class EmbeddingOnlyModel:
    '''
    Sentence-embedding model built on a TF Hub encoder module.

    This implementation assumes that the ANN index is served by a different
    service, so it only turns text into embeddings.
    '''

    def __init__(self):
        '''
        Build the encoder graph, open a session on it and run the initializers.
        '''
        # Declared embedding length; not read anywhere else inside this class.
        self.EMBEDDING_LENGTH = 512
        # Set before the hub module is created.
        # NOTE(review): presumably makes TF Hub cache the download here -- confirm.
        os.environ["TFHUB_CACHE_DIR"] = "tf_cache_dir"

        # Everything the model needs lives in its own graph.
        encoder_graph = tf.Graph()
        with encoder_graph.as_default():
            # Batch of raw strings of any length.
            self.text_input = tf.placeholder(dtype=tf.string, shape=[None])
            self.encoder = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
            self.my_result = self.encoder(self.text_input)
            initializers = [tf.global_variables_initializer(), tf.tables_initializer()]
            init_op = tf.group(initializers)
        # No further ops may be added once the graph is finalized.
        encoder_graph.finalize()
        self.graph = encoder_graph

        # One long-lived session, initialized once, reused by every predict call.
        self.session = tf.Session(graph=self.graph)
        self.session.run(init_op)

    def predict(self, sentences):
        '''
        Feed `sentences` to the encoder and return whatever the session yields
        for the encoder output.
        '''
        feed = {self.text_input: sentences}
        return self.session.run(self.my_result, feed_dict=feed)

In [6]:
# Smoke test: instantiate the embedding-only model and embed two identical
# sentences; as the last expression of the cell, the result is displayed.
embedding_only_model = EmbeddingOnlyModel()
embedding_only_model.predict(["I love this movie", "I love this movie"])

In [7]:
import os
import annoy
class EmbeddingAndLookupModel():
    '''
    Embeds sentences and looks up their nearest neighbours in an Annoy index.

    This class assumes that the ANN index is available within the model class:
    the file "reviews.ann" is loaded when the model is constructed.
    '''
    def __init__(self):
        '''
        Build the encoder graph, start an initialized session and load the index.
        '''
        # Dimension handed to the Annoy index below; it has to match the size
        # of the vectors the encoder produces.
        self.EMBEDDING_LENGTH = 512
        os.environ["TFHUB_CACHE_DIR"] = "tf_cache_dir"

        # define the graph
        g = tf.Graph()
        with g.as_default():
            self.text_input = tf.placeholder(dtype=tf.string, shape=[None])
            self.encoder = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
            self.my_result = self.encoder(self.text_input)
            init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
        g.finalize()
        self.graph = g

        # Create session and initialize.
        self.session = tf.Session(graph=self.graph)
        self.session.run(init_op)

        # load the index
        self.index = annoy.AnnoyIndex(self.EMBEDDING_LENGTH, "angular")
        self.index.load("reviews.ann") # available locally or as artifact/custom module

    def compute_single_embedding(self, sentence):
        '''
        Run the encoder on a one-element batch containing `sentence` and return
        the session output (the embedding itself is element 0 of that output).
        '''
        return self.session.run(self.my_result, feed_dict={self.text_input: [sentence]})

    def predict(self, sentences):
        '''
        Look up the 10 nearest index entries for every sentence.

        Returns a list with one single-key dict per input sentence, in input
        order: ``{sentence: [item ids returned by the index]}``. An empty
        input yields an empty list.
        '''
        # The result list is built once for the whole call; it used to be
        # re-created on every iteration, which dropped all but the last
        # sentence and raised UnboundLocalError on empty input.
        return [
            {
                sentence: self.index.get_nns_by_vector(
                    self.compute_single_embedding(sentence)[0], 10
                )
            }
            for sentence in sentences
        ]

In [8]:
# Smoke test: instantiate the embed-and-lookup model and query it with two
# identical sentences; as the last expression of the cell, the result is displayed.
# NOTE(review): the constructor loads "reviews.ann", which is only written by
# BuildEmbeddingLookupIndex.compute_index in a later cell -- the file must
# already exist on disk when this cell runs.
predict2 = EmbeddingAndLookupModel()
predict2.predict(["i love this movie", "i love this movie"])

In [9]:
class BuildEmbeddingLookupIndex():
    '''
    This train step creates embeddings from text and puts them in an index

    TODO: upload the index as an artifact for EmbeddingAndLookupModel
    '''
    def __init__(self):
        '''
        Build the encoder graph and start an initialized session.
        '''
        # Dimension of the vectors stored in the Annoy index; it has to match
        # the size of the vectors the encoder produces.
        self.EMBEDDING_LENGTH = 512
        import os
        os.environ["TFHUB_CACHE_DIR"] = "tf_cache_dir"

        # define the graph
        g = tf.Graph()
        with g.as_default():
            self.text_input = tf.placeholder(dtype=tf.string, shape=[None])
            self.encoder = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
            self.my_result = self.encoder(self.text_input)
            init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
        g.finalize()
        self.graph = g

        # Create session and initialize.
        self.session = tf.Session(graph=self.graph)
        self.session.run(init_op)

    def compute_index(self, sentences, n_trees=10, index_path='reviews.ann'):
        '''
        Embed every sentence, add it to a new Annoy index and save the index.

        sentences:  iterable of strings; the position of a sentence in the
                    iteration becomes its item id in the index.
        n_trees:    number of trees passed to the index build step
                    (defaults to the previously hard-coded 10).
        index_path: file the index is saved to (defaults to the previously
                    hard-coded 'reviews.ann').

        Returns None; the result is the file written to `index_path`.
        '''
        from annoy import AnnoyIndex
        index = AnnoyIndex(self.EMBEDDING_LENGTH, 'angular')  # Length of item vector that will be indexed
        for item_id, sentence in enumerate(sentences):
            # calculate embedding with TF; element 0 is the vector of the
            # single sentence in the batch
            embedding = self.compute_single_embedding(sentence)
            index.add_item(item_id, embedding[0])

        index.build(n_trees)
        index.save(index_path)

    def compute_single_embedding(self, sentence):
        '''
        Run the encoder on a one-element batch containing `sentence` and return
        the session output (the embedding itself is element 0 of that output).
        '''
        return self.session.run(self.my_result, feed_dict={self.text_input: [sentence]})

In [10]:
# Instantiate the index builder, print the shape of one embedding as a sanity
# check, then embed the first 1000 reviews and write them to the index file.
build_index = BuildEmbeddingLookupIndex()
print(build_index.compute_single_embedding("I love this movie").shape)
build_index.compute_index(reviews[:1000])