In [2]:
# Sentence Transformer - Based on Colab example
# SiameseBERT SemanticSearch
# 
# Nov 10, 2022
# Sila

In [3]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 43.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 34.8 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 56.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 21.5 MB/s 
Building wheels for collected 

In [4]:
from sentence_transformers import SentenceTransformer
# Load the BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [5]:
# A corpus is a list with documents split by sentences.

sentences = ['Absence of sanity', 
             'Lack of saneness',
             'A man is eating food.',
             'A man is eating a piece of bread.',
             'The girl is carrying a baby.',
             'A man is riding a horse.',
             'A woman is playing violin.',
             'Two men pushed carts through the woods.',
             'A man is riding a white horse on an enclosed ground.',
             'A monkey is playing drums.',
             'A cheetah is running behind its prey.']

# Each sentence is encoded as a 1-D vector with 78 columns
sentence_embeddings = model.encode(sentences)

print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [ 2.95402914e-01  2.91811526e-01  2.16480112e+00  2.20420077e-01
 -1.30865183e-02  1.01950336e+00  1.51298177e+00  2.34132767e-01
  2.73057520e-01  1.35123193e-01 -1.11313379e+00 -1.25884145e-01
  1.45378441e-01  9.77708519e-01  1.39352250e+00  4.57705319e-01
 -5.82132161e-01 -7.24941134e-01 -3.61733824e-01 -2.27515101e-01
  1.66625828e-02  2.04862028e-01  6.55132830e-01 -1.29376388e+00
 -7.26099193e-01 -1.91135913e-01 -3.07210982e-01 -1.30278587e+00
 -1.42963934e+00  5.67488605e-03  3.54811519e-01  4.83712375e-01
  6.65388584e-01  5.33848643e-01  6.40497208e-01  5.90408742e-01
  7.83847421e-02 -1.07759190e+00 -1.24676540e-01 -3.98406357e-01
  7.36314535e-01  5.28293490e-01  5.63291252e-01  4.14546102e-01
  4.49179262e-01 -9.58784819e-02  1.45424581e+00 -2.69144118e-01
 -2.44059563e-01 -1.10387075e+00 -2.00924203e-01 -2.17411388e-03
  1.83387983e+00  1.06518424e+00 -5.11945605e-01 -1.

In [6]:
import scipy
#@title Sematic Search Form

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py

query = 'Nobody has sane thoughts' #@param {type: 'string'}

queries = [query]
query_embeddings = model.encode(queries)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
number_top_matches = 5 #@param {type: "number"}

print("Semantic Search Results")

for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[0:number_top_matches]:
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1-distance))

Semantic Search Results




Query: Nobody has sane thoughts

Top 5 most similar sentences in corpus:
Lack of saneness (Cosine Score: 0.8958)
Absence of sanity (Cosine Score: 0.8744)
A man is riding a horse. (Cosine Score: 0.1705)
A monkey is playing drums. (Cosine Score: 0.1687)
The girl is carrying a baby. (Cosine Score: 0.1521)


In [7]:
# A corpus is a list with documents split by sentences.
# Lets try this model in danish

sentences = ['Mangel på fornuft', 
             'Det er meget mangfoldigt',
             'En mand spiser frugt.',
             'En man er spist.',
             'Pigen har et barn.',
             'En mand ridder på en hest.',
             'En kvinde spiller på violin.',
             'To mænd skubber en trillebør.',
             'En man ridder på hest udover markerne',
             'En ab spille rpå trommer',
             'En gorilla er bange for en mand.']

# Each sentence is encoded as a 1-D vector with 78 columns
sentence_embeddings = model.encode(sentences)

print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [-2.93264747e-01  4.57956046e-02  1.42177558e+00 -1.06673680e-01
  5.79703897e-02  4.70868766e-01 -4.30052318e-02  3.10041904e-01
 -8.33642334e-02 -1.08622983e-01 -6.85756505e-01  4.59749520e-01
  1.78348064e-01  5.50402164e-01  1.97939694e-01  2.48682350e-01
 -5.34830511e-01 -2.60909408e-01  2.36423701e-01 -1.00282598e+00
  2.36105755e-01  2.50197053e-01  1.99443668e-01 -2.44849771e-01
  2.92146653e-01 -4.31401938e-01  1.12081945e-01 -2.20468283e+00
 -7.82790303e-01  1.82220116e-01 -1.57539234e-01 -1.35629788e-01
  2.86828250e-01 -3.71458948e-01 -4.95609701e-01  2.80675590e-02
 -1.86542124e-01  5.58054745e-01  1.93539143e-01 -3.34817410e-01
  1.42787409e+00 -2.70668060e-01  2.18868256e-01 -7.11818114e-02
 -7.15680540e-01 -4.80934978e-01  9.60307658e-01  1.04374997e-01
 -6.64378524e-01 -1.59022856e+00 -4.66777027e-01 -8.98470879e-01
  8.18359017e-01  4.00809467e-01 -1.12979993e-01 -3.

In [10]:
# In danish this is not as good as the model is not trained on
# danish texts. 

In [11]:
query = 'Der er mangel på fornuft i verden' #@param {type: 'string'}

queries = [query]
query_embeddings = model.encode(queries)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
number_top_matches = 5 #@param {type: "number"}

print("Semantic Search Results")

for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[0:number_top_matches]:
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1-distance))

Semantic Search Results




Query: Der er mangel på fornuft i verden

Top 5 most similar sentences in corpus:
Mangel på fornuft (Cosine Score: 0.8974)
En man ridder på hest udover markerne (Cosine Score: 0.8726)
Det er meget mangfoldigt (Cosine Score: 0.8715)
En mand ridder på en hest. (Cosine Score: 0.8714)
En mand spiser frugt. (Cosine Score: 0.8691)


In [13]:
query = 'Han spiser frugt' #@param {type: 'string'}

queries = [query]
query_embeddings = model.encode(queries)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
number_top_matches = 5 #@param {type: "number"}

print("Semantic Search Results")

for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[0:number_top_matches]:
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1-distance))

Semantic Search Results




Query: Han spiser frugt

Top 5 most similar sentences in corpus:
En mand spiser frugt. (Cosine Score: 0.9236)
Det er meget mangfoldigt (Cosine Score: 0.8800)
En mand ridder på en hest. (Cosine Score: 0.8488)
En man er spist. (Cosine Score: 0.8332)
En man ridder på hest udover markerne (Cosine Score: 0.8288)
