In [None]:
# a very short example of computing similariy with Universal Encoder wz 
# original: https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb 

In [None]:
# Copyright 2018 The TensorFlow Hub Authors. All Rights Reserved.


# Universal Sentence Encoder


<table align="left"><td>
  <a target="_blank"  href="https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab
  </a>
</td><td>
  <a target="_blank"  href="https://github.com/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb">
    <img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
</td></table>


This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence level embeddings as easy as it has historically been to lookup the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


# Getting Started

This section sets up the environment for access to the Universal Sentence Encoder on TF Hub and provides examples of applying the encoder to words, sentences, and paragraphs.

In [None]:
%%capture
!pip3 install seaborn

More detailed information about installing Tensorflow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [None]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
#module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [None]:
# Import the Universal Sentence Encoder's TF Hub module
#embed = hub.load(module_url)

# Compute a representation for each message, showing various lengths supported.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

message_embeddings = embed(messages)

for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
  print("Message: {}".format(messages[i]))
  print("Embedding size: {}".format(len(message_embedding)))
  message_embedding_snippet = ", ".join(
      (str(x) for x in message_embedding[:3]))
  print("Embedding: [{}, ...]\n".format(message_embedding_snippet))


Message: Elephant
Embedding size: 512
Embedding: [0.008344451896846294, 0.00048082732246257365, 0.06595247983932495, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.05080863833427429, -0.016524292528629303, 0.015737809240818024, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [-0.028332697227597237, -0.055862199515104294, -0.012941516004502773, ...]



## TRYING RQE DATA ##

try editing the next cell to see the format of the data

   

<pair pid="8567" type="originalQ-shortRandQ" value="false">
<chq>   What is the latest information about the treatment of adolescent migraines?</chq>
<faq>   Boys' father has Marfan's Syndrome. Do these boys need further evaluation?</faq></pair>

<pair pid="8568" type="originalQ-shortQ" value="true">
<chq>   What are the procedures for single ventricle heart in a baby? What kind of heart surgery?</chq>
<faq>   What are the procedures for single ventricle?</faq></pair>

<pair pid="142" type="originalQ-shortQ" value="true">
<chq>   This patient has a history of irregular heart beat, bigeminy and trigeminy.  
He's having a few brief runs occasionally now.  On Quinidine in the past.  Should I treat these?</chq>
<faq>   Patient with bigeminy and trigeminy.  Should I treat these?</faq></pair>}


In [None]:
# Compute a representation for each message, showing various lengths supported.

sentCHQ = "What are the procedures for single ventricle heart in a baby? What kind of heart surgery?"
sentFAQ = "What are the procedures for single ventricle?"
paragraph12 = (
    "This patient has a history of irregular heart beat, bigeminy and trigeminy."
    "He's having a few brief runs occasionally now."  
    "On Quinidine in the past.  Should I treat these?")

messages = [sentFAQ, sentCHQ, paragraph12]

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

message_embeddings = embed(messages)



In [None]:
message_embeddings[0]

<tf.Tensor: shape=(512,), dtype=float32, numpy=
array([-7.11722448e-02, -3.83576304e-02,  5.26492065e-03,  5.01837805e-02,
        4.98656370e-02, -9.01518092e-02,  9.77779552e-03, -4.33578081e-02,
       -3.61515619e-02,  3.25920135e-02,  9.17477682e-02, -4.81946423e-04,
        2.02154703e-02,  4.61017042e-02,  2.21734475e-02, -1.92170846e-03,
       -9.30491686e-02,  3.25900316e-02, -1.31623829e-02,  5.99438250e-02,
        1.27306106e-02, -7.57595990e-03,  5.19785397e-02, -4.11439724e-02,
       -7.78693557e-02, -3.98984812e-02, -6.02020621e-02,  6.99339285e-02,
        8.09468329e-02,  1.69326961e-02,  4.08169217e-02,  8.27570781e-02,
       -2.42679082e-02,  5.99825718e-02,  1.75679475e-02, -1.12264394e-03,
        1.23985028e-02,  4.84383963e-02, -4.07073349e-02,  2.91648395e-02,
       -1.20374851e-03, -3.07601523e-02,  4.09933217e-02,  5.34487143e-02,
        5.02854178e-04, -7.40119442e-03, -1.66680403e-02, -6.82638213e-02,
       -4.84709535e-03, -6.82081133e-02, -6.41399547

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(np.array([[1,1]]),np.array([[1,1]]))


array([[1.]])

In [None]:
eCHQ= message_embeddings[0]
eFAQ= message_embeddings[1]
ePar =message_embeddings[2]

print(cosine_similarity([eCHQ],[eFAQ]))
print(cosine_similarity([eCHQ],[ePar]))
print(cosine_similarity([eFAQ],[ePar]))

[[0.7353872]]
[[0.20984122]]
[[0.27447593]]


# Word arithmetic

In [None]:
messages=['king','queen','man','woman']
message_embeddings = embed(messages)

In [None]:
me=message_embeddings
q=me[0]-me[2]+me[3]
cosine_similarity([q],[me[1]])

array([[0.53663784]], dtype=float32)

In [None]:
messages=['paris','france','rome','italy']
message_embeddings = embed(messages)

In [None]:
paris,france,rome,italy=message_embeddings
ro=paris-france+italy 
cosine_similarity([ro],[rome])

array([[0.47470134]], dtype=float32)

In [None]:
cosine_similarity([paris],[france])

array([[0.44418812]], dtype=float32)

In [None]:
cosine_similarity([paris],[italy])

array([[0.33479273]], dtype=float32)

In [None]:
kidney,arithm =embed(['kidney','arithmetic'])
cosine_similarity([paris,paris],[kidney,arithm])

array([[0.25483054, 0.14489774],
       [0.25483054, 0.14489774]], dtype=float32)