# Siamese-BERT network for semantic searching

### Import all the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
from datasets import load_dataset

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

from random import sample, seed, shuffle
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse


In [2]:
PERSON = 'Niket Girdhar'

google_html = BeautifulSoup(requests.get(f'https://www.google.com/search?q={PERSON}').text).get_text()[:1024] # not a good way to search on google

nlp = pipeline('question-answering', 
               model='deepset/roberta-base-squad2', 
               tokenizer='deepset/roberta-base-squad2', 
               max_length=10)

print(google_html)
nlp(f'Who is {PERSON}?', google_html)

Device set to use mps:0


Google SearchPlease click here if you are not redirected within a few seconds.If you're having trouble accessing Google Search, please click here, or send feedback.




{'score': 8.867191780836947e-08,
 'start': 154,
 'end': 164,
 'answer': '\xa0feedback.'}

This might give some answer but that answer is based on some given context (html data extracted here)

Task in hand is to find the context out of a massive corpora of text

In [3]:
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode() # our dataset: textbook about insects

documents = list(filter(lambda x: len(x) > 100, text.split('\r\n\r\n'))) # only keeps documents that have atleast 100 characters

documents = np.array(documents)

print(f'There are {len(documents)} documents/paragraphs')

There are 70 documents/paragraphs


In [4]:
documents[0]

np.str_('\ufeffThe Project Gutenberg eBook of The History of Insects\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this ebook or online\r\nat www.gutenberg.org. If you are not located in the United States,\r\nyou will have to check the laws of the country where you are located\r\nbefore using this eBook.')

In [5]:
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4') # model that is pre-trained on an asymmetric semantic search task
bi_encoder.max_seq_length = 256     # Truncate long documents to 256 tokens

bi_encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [6]:
document_embeddings = bi_encoder.encode(documents, convert_to_tensor=True, show_progress_bar=True) # document being encoded using .encode function

document_embeddings.shape

Batches: 100%|██████████| 3/3 [00:00<00:00,  4.17it/s]


torch.Size([70, 768])

In [7]:
QUESTION = 'What kind of butterflies are there?' # a natural language query that we will be using

In [8]:
question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True) # encoding the query using the bi-encoder

hits = util.semantic_search(question_embedding, document_embeddings, top_k=3)[0] # number of documents to retrieve with the bi-encoder

hits

[{'corpus_id': 30, 'score': 0.5829142332077026},
 {'corpus_id': 29, 'score': 0.36111608147621155},
 {'corpus_id': 11, 'score': 0.33377891778945923}]

In [9]:
print(f'Question: {QUESTION}\n')

for i, hit in enumerate(hits):
    
    print(f'Document {i + 1} Cos_Sim {hit["score"]:.3f}:\n\n{documents[hit["corpus_id"]]}')
    print('\n')

Question: What kind of butterflies are there?

Document 1 Cos_Sim 0.583:


Of butterflies there are many kinds. How wonderful the various changes
of this class of insects! The butterflies lay their eggs: from these
hatch out worms or caterpillars, which change their skins several times,
and, finally, become aureliae, chrysales, or silkworms, out of which
come the beautiful butterflies.


Document 2 Cos_Sim 0.361:


Of these flies, which are called by many Spindles, there are various
species. They all have two very large eyes, covering the whole surface
of the head. They fly very swiftly, and prey upon the wing, clearing the
air of innumerable little flies. The great ones live about water, but
the smaller are common among hedges, and about gardens.


Document 3 Cos_Sim 0.334:

There are two classes of crickets: viz. the field cricket, and the house
cricket; the latter inhabits warm places, the holes of the hearth, &c.
from whence we hear its notes, which are agreeable: it is said, that


In [10]:
nlp(QUESTION, str(documents[hits[0]['corpus_id']]))

{'score': 0.03519178181886673, 'start': 27, 'end': 37, 'answer': 'many kinds'}

The above system is called an open book Q/A system

### Further Fine-Tuning the bi-encoder

In [11]:
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split='train') # loading the dataset from HuggingFace directly

# we will use the context and question fields of the dataset alone to fine-tune the bi-encoder system

good_training_data = []
bad_training_data = []

last_example = None
for example in training_qa:
    if last_example and example['context'] != last_example['context']:
        bad_training_data.append((example['question'], last_example['context'], 0.0))  #  give 0 similarity score for training i.e. neutral and not -1 as in that case the model would start to work against the context
    
    good_training_data.append((example['question'], example['context'], 1.0)) # question, context, label is 1 if should be matched together
    last_example = example

In [12]:
len(good_training_data), len(bad_training_data)

(30000, 2647)

In [13]:
good_training_data[-1]

('What letter designates what Ektachrome is designed for?',
 'Some high-speed black-and-white films, such as Ilford Delta 3200 and Kodak T-MAX P3200, are marketed with film speeds in excess of their true ISO speed as determined using the ISO testing method. For example, the Ilford product is actually an ISO 1000 film, according to its data sheet. The manufacturers do not indicate that the 3200 number is an ISO rating on their packaging. Kodak and Fuji also marketed E6 films designed for pushing (hence the "P" prefix), such as Ektachrome P800/1600 and Fujichrome P1600, both with a base speed of ISO 400.',
 1.0)

In [14]:
bad_training_data[-1]

('What film beside Ektachrome and Fujichorme is designed for pushing?',
 'The Weston Cadet (model 852 introduced in 1949), Direct Reading (model 853 introduced 1954) and Master III (models 737 and S141.3 introduced in 1956) were the first in their line of exposure meters to switch and utilize the meanwhile established ASA scale instead. Other models used the original Weston scale up until ca. 1955. The company continued to publish Weston film ratings after 1955, but while their recommended values often differed slightly from the ASA film speeds found on film boxes, these newer Weston values were based on the ASA system and had to be converted for use with older Weston meters by subtracting 1/3 exposure stop as per Weston\'s recommendation. Vice versa, "old" Weston film speed ratings could be converted into "new" Westons and the ASA scale by adding the same amount, that is, a film rating of 100 Weston (up to 1955) corresponded with 125 ASA (as per ASA PH2.5-1954 and before). This conver

In [15]:
seed(42)  # seed our upcoming sample

sampled_training_data = sample(good_training_data, 500) + sample(bad_training_data, 500)

shuffle(sampled_training_data)  # shuffle our data around

training_index = int(.8 * len(sampled_training_data))  # Get an 80/20 train/test split

In [16]:
train_examples = [InputExample(texts=t[:2], label=t[2]) for t in sampled_training_data[:training_index]] # Define the training examples

train_examples[0].__dict__

{'guid': '',
 'texts': ('What changed after the eigth century?',
  'There is disagreement about the origin of the term, but general consensus that "cardinalis" from the word cardo (meaning \'pivot\' or \'hinge\') was first used in late antiquity to designate a bishop or priest who was incorporated into a church for which he had not originally been ordained. In Rome the first persons to be called cardinals were the deacons of the seven regions of the city at the beginning of the 6th century, when the word began to mean “principal,” “eminent,” or "superior." The name was also given to the senior priest in each of the "title" churches (the parish churches) of Rome and to the bishops of the seven sees surrounding the city. By the 8th century the Roman cardinals constituted a privileged class among the Roman clergy. They took part in the administration of the church of Rome and in the papal liturgy. By decree of a synod of 769, only a cardinal was eligible to become pope. In 1059, during th

In [17]:
train_dataloader = DataLoader(
    train_examples, shuffle=True, batch_size=32,
    collate_fn=bi_encoder.smart_batching_collate
    )  # A data loader is the object that specifically shuffles/grabs batches of data from a Dataset

train_loss = losses.CosineSimilarityLoss(bi_encoder)

In [18]:
(question_batch, context_batch), labels = next(iter(train_dataloader))  # get a sample batch of data

question_batch['input_ids'].shape, context_batch['input_ids'].shape, labels.shape

(torch.Size([32, 30]), torch.Size([32, 256]), torch.Size([32]))

In [19]:
# Evaluation data, sentences1 and sentences2 are lists of questions and context respectively and scores are 0 or 1
sentences1, sentences2, scores = zip(*sampled_training_data[training_index:])

# evaluator will evaluate embedding closeness
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

In [20]:
bi_encoder.evaluate(evaluator) # a higher initial value is better

{'pearson_cosine': np.float64(0.5021223579927213),
 'spearman_cosine': np.float64(0.5044913287672261)}

In [21]:
# Fine-tune the model using the fit method
bi_encoder.fit(
    train_objectives=[(train_dataloader, train_loss)], 
    output_path='ir/results',
    epochs=2, 
    evaluator=evaluator
)

[34m[1mwandb[0m: Currently logged in as: [33mniketgirdhar2004[0m ([33mniketgirdhar2004-vit-chennai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
25,No log,No log,0.502262,0.504838
50,No log,No log,0.502689,0.505011




In [22]:
bi_encoder.evaluate(evaluator) # not a huge jump in performance with 2 epochs

{'pearson_cosine': np.float64(0.5026890378632383),
 'spearman_cosine': np.float64(0.5050109764878448)}

In [23]:
finetuned_bi_encoder = SentenceTransformer('ir/results') # loading fine-tuned IR model

In [24]:
document_embeddings = finetuned_bi_encoder.encode(documents, convert_to_tensor=True, show_progress_bar=True)

question_embedding = finetuned_bi_encoder.encode(QUESTION, convert_to_tensor=True)

# Get document hits
hits = util.semantic_search(question_embedding, document_embeddings, top_k=3)[0]

print(f'Question: {QUESTION}\n')

for i, hit in enumerate(hits):
    
    print(f'Document {i + 1} Cos_Sim {hit["score"]:.3f}:\n\n{documents[hit["corpus_id"]]}')
    print('\n')

Batches: 100%|██████████| 3/3 [00:00<00:00,  4.96it/s]


Question: What kind of butterflies are there?

Document 1 Cos_Sim 0.584:


Of butterflies there are many kinds. How wonderful the various changes
of this class of insects! The butterflies lay their eggs: from these
hatch out worms or caterpillars, which change their skins several times,
and, finally, become aureliae, chrysales, or silkworms, out of which
come the beautiful butterflies.


Document 2 Cos_Sim 0.363:


Of these flies, which are called by many Spindles, there are various
species. They all have two very large eyes, covering the whole surface
of the head. They fly very swiftly, and prey upon the wing, clearing the
air of innumerable little flies. The great ones live about water, but
the smaller are common among hedges, and about gardens.


Document 3 Cos_Sim 0.334:

There are two classes of crickets: viz. the field cricket, and the house
cricket; the latter inhabits warm places, the holes of the hearth, &c.
from whence we hear its notes, which are agreeable: it is said, that


Very minute change as fine--tuning has less data and less no. of epochs

In [25]:
def gutenberg_to_documents(gutenberg_url, bi_encoder):
    text = urlopen(gutenberg_url).read().decode()
    documents = np.array(list(filter(lambda x: len(x) > 100, text.split('\r\n\r\n'))))
    print(f'There are {len(documents)} documents/paragraphs')
    return documents, bi_encoder.encode(documents)

def retrieve_relevant_documents(bi_encoder, query, documents, document_embeddings, hits=3):
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

    hits = util.semantic_search(query_embedding, document_embeddings, top_k=hits)[0]

    for i, hit in enumerate(hits):
        print(f'Document {i + 1} Cos_Sim {hit["score"]:.3f}:\n\n{documents[hit["corpus_id"]]}')
        print('\n')
    print(f"Answer from Top Document: {nlp(query, str(documents[hits[0]['corpus_id']]))}")

In [26]:
banks_to_bassoon_documents, banks_to_bassoon_embeddings = gutenberg_to_documents(
    'https://www.gutenberg.org/cache/epub/27480/pg27480.txt', finetuned_bi_encoder
)

There are 1396 documents/paragraphs


In [27]:
retrieve_relevant_documents(finetuned_bi_encoder,
    'What is a banshee?', banks_to_bassoon_documents, banks_to_bassoon_embeddings, 2
)

Document 1 Cos_Sim 0.754:

BANSHEE (Irish _bean sidhe_; Gaelic _ban sith_, "woman of the fairies"), a
supernatural being in Irish and general Celtic folklore, whose mournful
screaming, or "keening," at night is held to foretell the death of some
member of the household visited. In Ireland legends of the banshee belong
more particularly to certain families in whose records periodic visits from
the spirit are chronicled. A like ghostly informer figures in Brittany
folklore. The Irish banshee is held to be the distinction only of families
of pure Milesian descent. The Welsh have the banshee under the name _gwrach
y Rhibyn_ (witch of Rhibyn). Sir Walter Scott mentions a belief in the
banshee as existing in the highlands of Scotland (_Demonology and
Witchcraft_, p. 351). A Welsh death-portent often confused with the gwrach
y Rhibyn and banshee is the _cyhyraeth_, the groaning spirit.


Document 2 Cos_Sim 0.324:

BANNU, a town and district of British India, in the Derajat division of the
Nor



Answer from Top Document: {'score': 0.044728297740221024, 'start': 76, 'end': 94, 'answer': 'supernatural being'}


In [28]:
retrieve_relevant_documents(finetuned_bi_encoder,
    'When was the Imperial Bank of Germany founded?', banks_to_bassoon_documents, banks_to_bassoon_embeddings, 2
)

Document 1 Cos_Sim 0.797:

[3] The date 1876 is taken as being that when the Imperial Bank of Germany
came into full operation.


Document 2 Cos_Sim 0.573:

Similar banks had been established in Middelburg, (March 28th, 1616), in
Hamburg (1619) and in Rotterdam (February 9th, 1635). Of these the Bank of
Hamburg carried on much the largest business and survived the longest. It
was not till the 15th of February 1873 that its existence was closed by the
act of the German parliament which decreed that Germany should possess a
gold standard, and thus removed those conditions of the local medium of
exchange--silver coins of very different intrinsic values--whose
circulation had provided an ample field for the operations of the bank. The
business of the Bank of Hamburg had been conducted in absolute accordance
with the regulations under which it was founded.


Answer from Top Document: {'score': 0.18934372067451477, 'start': 13, 'end': 17, 'answer': '1876'}
