In [None]:
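# Install the required libraries with pip (uncomment the lines below to run them)
# NOTE(review): torch==0.1.2 looks like an outdated pin for sentence-transformers - verify the version before use
# !pip install torch==0.1.2
# !pip install sentence-transformers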

#### Tutorial : https://towardsdatascience.com/quick-semantic-search-using-siamese-bert-networks-1052e7b4df1

In [1]:
import scipy
import scipy.spatial.distance
from sentence_transformers import SentenceTransformer

In [2]:
# Name of the sentence-embedding model to load; kept in one place so it is easy to swap.
MODEL_NAME = 'bert-base-nli-mean-tokens'

# Shared model instance used by every encode() call in this notebook.
model = SentenceTransformer(MODEL_NAME)

In [11]:
# The corpus: a list of documents, one sentence (or short phrase) per entry.
# List order matters: the search cell looks sentences up again by their index.

sentences = ['Absence of sanity', 
             'Lack of saneness',
             'A man is eating food.',
             'A man is eating a piece of bread.',
             'The girl is carrying a baby.',
             'A man is riding a horse.',
             'A woman is playing violin.',
             'Two men pushed carts through the woods.',
             'A man is riding a white horse on an enclosed ground.',
             'A monkey is playing drums.',
             'A cheetah is running behind its prey.',
             'ponraj getting tired',
             'I am going to buy food']

# Encode the whole corpus: one embedding per sentence. With this model each
# embedding is a 1-D vector of 768 values (see the length printed below).
sentence_embeddings = model.encode(sentences)

In [12]:
print('Sentences: ', len(sentence_embeddings))
print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sentences:  13
Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [ 2.95402884e-01  2.91811198e-01  2.16480112e+00  2.20419720e-01
 -1.30865481e-02  1.01950383e+00  1.51298213e+00  2.34132290e-01
  2.73058027e-01  1.35122925e-01 -1.11313355e+00 -1.25884727e-01
  1.45378441e-01  9.77708459e-01  1.39352262e+00  4.57705110e-01
 -5.82131505e-01 -7.24941134e-01 -3.61734420e-01 -2.27515191e-01
  1.66627970e-02  2.04862028e-01  6.55132890e-01 -1.29376388e+00
 -7.26099551e-01 -1.91136092e-01 -3.07211161e-01 -1.30278659e+00
 -1.42963886e+00  5.67477290e-03  3.54811668e-01  4.83713001e-01
  6.65388107e-01  5.33848703e-01  6.40496612e-01  5.90408683e-01
  7.83848539e-02 -1.07759178e+00 -1.24676622e-01 -3.98406029e-01
  7.36314416e-01  5.28293252e-01  5.63290715e-01  4.14546102e-01
  4.49179113e-01 -9.58785266e-02  1.45424581e+00 -2.69144505e-01
 -2.44059831e-01 -1.10387039e+00 -2.00923800e-01 -2.17427453e-03
  1.83387983e+00  1.06518483e+00 -5.1

 -4.14106458e-01 -5.26974380e-01 -5.91103256e-01 -2.92363435e-01]


## Semantic Search

In [13]:
#@title Semantic Search Form

# Code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
#
# Encodes the query with the same model as the corpus, ranks every corpus
# sentence by cosine distance to the query, and prints the closest matches.

query = 'shall we shop' #@param {type: 'string'}

queries = [query]
query_embeddings = model.encode(queries)

# How many of the closest corpus sentences to show for each query.
number_top_matches = 3 #@param {type: "number"}

print("Semantic Search Results")

# Distinct loop names so the form variable `query` is not rebound by the loop.
for query_text, query_embedding in zip(queries, query_embeddings):
    # cdist of one query row against the corpus gives a single-row matrix;
    # row 0 holds the cosine distance from this query to each corpus sentence.
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # Pair each distance with its corpus index and rank ascending
    # (smaller cosine distance = more similar).
    results = sorted(enumerate(distances), key=lambda pair: pair[1])
    top_results = results[:number_top_matches]

    print("\n\n======================\n\n")
    print("Query:", query_text)
    # Report the number of matches actually shown instead of a hard-coded count.
    print("\nTop %d most similar sentences in corpus:" % len(top_results))

    for idx, distance in top_results:
        # Cosine similarity is 1 minus the cosine distance.
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1 - distance))

Semantic Search Results




Query: shall we shop

Top 5 most similar sentences in corpus:
I am going to buy food (Cosine Score: 0.7054)
ponraj getting tired (Cosine Score: 0.3276)
A man is eating food. (Cosine Score: 0.2690)


In [72]:
# Pairwise check: cosine similarity between the embeddings of two short phrases.
# Each phrase is encoded in its own call, in this order.
w1, w2 = (model.encode([phrase]) for phrase in ('deep learning', 'machine learning'))

# cdist yields cosine distance; subtract from 1 to report similarity instead.
distances = scipy.spatial.distance.cdist(w1, w2, metric='cosine')
print(1 - distances)

[[0.57613355]]
