# SiameseBert Semantic Sentence Similarity

In [2]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding checkpoints are published for two task families:
#   * Natural Language Inference (NLI):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   * Semantic Textual Similarity (STS):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md
# The checkpoint id below is the one this notebook loads (judging by its name,
# a BERT-base model trained on NLI with mean-token pooling -- confirm in the
# lists linked above).
MODEL_NAME = 'bert-base-nli-mean-tokens'

model = SentenceTransformer(MODEL_NAME)

In [5]:
# Alternative single-word corpus. It is NOT encoded in this cell -- only
# `sentence` below is passed to the model. ('strom' corrected to 'storm'.)
sentences = ['weather', 'clouds', 'time', 'sun', 'rain', 'dust', 'storm', 'winds']

# The corpus that gets embedded: one short sentence or phrase per list entry.
# The search cell indexes into this list by position, so the order of entries
# must stay aligned with the rows of `sentence_embeddings`.
sentence = [
    'Absence of sanity',
    'Lack of saneness',
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
    'Get me updates on stock market',
    'Get me updates on rupee market',
    "get me details of",
    "1. Abstractive Model",
    "2. Extractive Model",
    "linux",
    "Launch",
    "Build",
]

# Each corpus entry is encoded as one 1-D vector. With the model loaded above
# the vector has 768 components (see the length printed below), not 78.
sentence_embeddings = model.encode(sentence)

print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [ 2.95402914e-01  2.91811258e-01  2.16480088e+00  2.20419854e-01
 -1.30862715e-02  1.01950312e+00  1.51298177e+00  2.34132320e-01
  2.73057789e-01  1.35122865e-01 -1.11313367e+00 -1.25884742e-01
  1.45378441e-01  9.77708459e-01  1.39352286e+00  4.57705081e-01
 -5.82131386e-01 -7.24940956e-01 -3.61734450e-01 -2.27515012e-01
  1.66631341e-02  2.04862073e-01  6.55133009e-01 -1.29376388e+00
 -7.26099610e-01 -1.91135988e-01 -3.07211488e-01 -1.30278563e+00
 -1.42963910e+00  5.67491632e-03  3.54811460e-01  4.83712852e-01
  6.65387809e-01  5.33848584e-01  6.40496850e-01  5.90408623e-01
  7.83847570e-02 -1.07759190e+00 -1.24676540e-01 -3.98406357e-01
  7.36314416e-01  5.28293252e-01  5.63290656e-01  4.14545923e-01
  4.49179560e-01 -9.58785191e-02  1.45424604e+00 -2.69144595e-01
 -2.44059727e-01 -1.10387051e+00 -2.00923488e-01 -2.17434089e-03
  1.83387971e+00  1.06518435e+00 -5.11946142e-01 -1.

In [8]:
#@title Semantic Search Form

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
# Import the subpackage explicitly: on older SciPy releases a bare
# `import scipy` does not load `scipy.spatial`, so the attribute lookup below
# would only work if some other library had already imported it.
import scipy.spatial.distance

query = "rocket"  #@param {type: "string"}

queries = [query]
query_embeddings = model.encode(queries)

# How many of the closest corpus entries to print for each query.
number_top_matches = 5 #@param {type: "number"}

# Never announce more matches than the corpus can supply.
top_k = min(number_top_matches, len(sentence))

print("Semantic Search Results")

# `query_text` (not `query`) so the loop does not rebind the form parameter.
for query_text, query_embedding in zip(queries, query_embeddings):
    # cdist returns a (1, len(corpus)) matrix of cosine *distances*
    # (1 - cosine similarity); row 0 holds the distances for this query.
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # Pair every corpus index with its distance; smallest distance = best match.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query_text)
    print("\nTop %d most similar sentences in corpus:" % top_k)

    for idx, distance in results[:top_k]:
        # Convert the distance back to a similarity score for display.
        print(sentence[idx].strip(), "(Cosine Score: %.4f)" % (1 - distance))

Semantic Search Results




Query: rocket

Top 5 most similar sentences in corpus:
Launch (Cosine Score: 0.7751)
Build (Cosine Score: 0.5190)
linux (Cosine Score: 0.5184)
1. Abstractive Model (Cosine Score: 0.4717)
get me details of (Cosine Score: 0.4658)
