<a href="https://colab.research.google.com/github/SandeshRangreji/NeuralSearchEngine/blob/main/Retrieval_and_Rerank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.8 MB/s 
[?25hCollecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 19.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 51.1 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K  

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

In [None]:
# Runtime sanity check: this only prints a warning when CUDA is unavailable;
# it does not stop the notebook.
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

In [None]:
# The bi-encoder embeds passages and queries into one vector space; it is the model behind the semantic search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder retrieves top_k (32) candidates; the cross-encoder then re-scores each
# (query, passage) pair so the candidate list can be re-ranked for better quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Simple Wikipedia dataset (about 170k articles), stored as gzipped JSON lines

wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'

# Download only when the file is not already on disk, so re-running this cell is cheap
if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

  0%|          | 0.00/50.2M [00:00<?, ?B/s]

In [None]:
# Read the gzipped JSON-lines dump and keep one passage per article:
# the first entry of each record's 'paragraphs' list (the article summary).
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as wiki_file:
    passages = [
        json.loads(raw_line.strip())['paragraphs'][0]
        for raw_line in wiki_file
    ]

print("Passages:", len(passages))

Passages: 169597


In [None]:
# Encode all articles in vector space, once, up front. search() later compares each
# query embedding against corpus_embeddings, so this cell must run before any search.
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/5300 [00:00<?, ?it/s]

In [None]:
# BM25 for lexical search

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# Text preprocessing for the BM25 index and for BM25 queries
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased terms for BM25.

    The text is lower-cased and split on whitespace, and every piece has its
    leading and trailing ``string.punctuation`` characters stripped. Pieces
    that end up empty, or that appear in ``_stop_words.ENGLISH_STOP_WORDS``,
    are dropped.

    Returns:
        The surviving terms as a list, in their original order.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]

# Tokenize every passage; tqdm reports progress over the corpus
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]

# Build the BM25 (Okapi) index used for keyword search
bm25 = BM25Okapi(tokenized_corpus)


  0%|          | 0/169597 [00:00<?, ?it/s]

In [None]:
# Runs one query through lexical (BM25), bi-encoder and cross-encoder re-ranked retrieval
def search(query):
    """Search the Wikipedia passages for ``query`` with three retrieval stages.

    Stages:

    1. BM25 lexical search over the ``bm25`` index.
    2. Semantic search: the query is embedded with ``bi_encoder`` and compared
       against ``corpus_embeddings``; ``top_k`` candidates are retrieved.
    3. Re-ranking: every semantic candidate is scored against the query with
       ``cross_encoder``.

    The three best hits of each stage are printed with their scores.

    Args:
        query: Question or keyword string to search for.

    Returns:
        A tuple of exactly nine passage strings (newlines replaced by spaces):
        three BM25 hits, three bi-encoder hits, then three cross-encoder hits.
        When a stage produces fewer than three hits, its remaining slots are
        filled with empty strings so every position keeps its meaning.
    """
    hits_per_stage = 3      # passages printed and returned for each stage
    bm25_candidates = 5     # BM25 hits extracted before the best ones are reported

    print("Input question:", query)
    ans = []

    def report(stage_hits, score_key):
        """Print the best hits of one stage and append their passages to ``ans``."""
        shown = stage_hits[:hits_per_stage]
        for hit in shown:
            passage = passages[hit['corpus_id']].replace("\n", " ")
            print("\t{:.3f}\t{}".format(hit[score_key], passage))
            ans.append(passage)
        # Pad so each stage always occupies the same three return slots.
        ans.extend([""] * (hits_per_stage - len(shown)))

    ##### BM25 search (lexical search) #####
    # Lexical match score of the query against every passage
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # argpartition rejects a kth larger than the array, so clamp to the corpus size
    n_candidates = min(bm25_candidates, len(bm25_scores))
    if n_candidates > 0:
        top_n = np.argpartition(bm25_scores, -n_candidates)[-n_candidates:]
    else:
        top_n = []
    # argpartition does not order the selected entries, so sort them by score
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    print("Top-3 lexical search (BM25) hits")
    report(bm25_hits, 'score')

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the device of the corpus embeddings instead of forcing CUDA, so the
    # function also works on CPU-only runtimes.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    # Retrieve the top_k most similar passages in vector space
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach each cross-encoder score to its hit
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of top-3 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    report(hits, 'score')

    # Output of top-3 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    report(hits, 'cross-score')

    return tuple(ans)


In [None]:
# Demo: a fully phrased natural-language question
search(query = "What is the capital of the United States?")

Input question: What is the capital of the United States?
Top-3 lexical search (BM25) hits
	13.316	Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states. The federal government (including the United States military) also uses capital punishment.
	11.434	Ohio is one of the 50 states in the United States. Its capital is Columbus. Columbus also is the largest city in Ohio.
	11.179	Nevada is one of the United States' states. Its capital is Carson City. Other big cities are Las Vegas and Reno.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.622	Cities in the United States:
	0.597	The United States Capitol is the building where the United States Congress meets. It is the center of the legislative branch of the U.S. federal government. It is in Washington, D.C., on top of Capitol Hill at the east end of the National Mall.
	0.596	In the United States:

-

('Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states. The federal government (including the United States military) also uses capital punishment.',
 'Ohio is one of the 50 states in the United States. Its capital is Columbus. Columbus also is the largest city in Ohio.',
 "Nevada is one of the United States' states. Its capital is Carson City. Other big cities are Las Vegas and Reno.",
 'Cities in the United States:',
 'The United States Capitol is the building where the United States Congress meets. It is the center of the legislative branch of the U.S. federal government. It is in Washington, D.C., on top of Capitol Hill at the east end of the National Mall.',
 'In the United States:',
 'Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district. The Pr

In [None]:
# Demo: an opinion-style question ("best") rather than a lookup of a single fact
search(query = "What is the best orchestra in the world?")

Input question: What is the best orchestra in the world?
Top-3 lexical search (BM25) hits
	15.328	The BBC Symphony Orchestra is the main orchestra of the British Broadcasting Corporation. It is one of the best orchestras in Britain.
	15.320	The NHK Symphony Orchestra is a Japanese orchestra based in Tokyo, Japan. In Japanese it is written: NHK交響楽団, pronounced: Enueichikei Kōkyō Gakudan. When the orchestra was started in 1926 it was called "New Symphony Orchestra". It was the first large professional orchestra in Japan. Later, it changed its name to "Japan Symphony Orchestra". In 1951 it started to get money from the Japanese radio station NHK (Nippon Hōsō Kyōkai), so it changed its name again to the name it has now. It is thought of as the best orchestra in Japan. They have played in many parts of the world, including at the BBC Proms in London.
	14.079	The Bamberger Symphoniker (Bamberg Symphony Orchestra) is a world-famous orchestra from the city of Bamberg, Germany. It was formed in

('The BBC Symphony Orchestra is the main orchestra of the British Broadcasting Corporation. It is one of the best orchestras in Britain.',
 'The NHK Symphony Orchestra is a Japanese orchestra based in Tokyo, Japan. In Japanese it is written: NHK交響楽団, pronounced: Enueichikei Kōkyō Gakudan. When the orchestra was started in 1926 it was called "New Symphony Orchestra". It was the first large professional orchestra in Japan. Later, it changed its name to "Japan Symphony Orchestra". In 1951 it started to get money from the Japanese radio station NHK (Nippon Hōsō Kyōkai), so it changed its name again to the name it has now. It is thought of as the best orchestra in Japan. They have played in many parts of the world, including at the BBC Proms in London.',
 'The Bamberger Symphoniker (Bamberg Symphony Orchestra) is a world-famous orchestra from the city of Bamberg, Germany. It was formed in 1946. Most of the musicians who formed the orchestra were Germans who had been forced to leave Czechosl

In [None]:
# Demo: bare keywords instead of a grammatical question
search(query = "Number countries Europe")

Input question: Number countries Europe
Top-3 lexical search (BM25) hits
	13.795	Amy MacDonald is a Scottish singer and songwriter. She became famous in 2007 with her first album "This Is The Life" and her first single "Poison Prince". She has become even more successful in Europe since her single "This Is The Life" charted at number 1 in many European countries.
	13.758	The Croatian language is spoken mainly throughout the countries of Croatia and Bosnia and Herzegovina and in the surrounding countries of Europe.
	13.019	Organization for Security and Co-operation in Europe (OSCE) is an international organization for peace and human rights. Presently, it has 57 countries as its members. Most of the member countries of the OSCE are from Europe, the Caucasus, Central Asia and North America.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.538	The Council of Europe (, ) is an international organization of 47 member states in the European region. One of its first successes wa

('Amy MacDonald is a Scottish singer and songwriter. She became famous in 2007 with her first album "This Is The Life" and her first single "Poison Prince". She has become even more successful in Europe since her single "This Is The Life" charted at number 1 in many European countries.',
 'The Croatian language is spoken mainly throughout the countries of Croatia and Bosnia and Herzegovina and in the surrounding countries of Europe.',
 'Organization for Security and Co-operation in Europe (OSCE) is an international organization for peace and human rights. Presently, it has 57 countries as its members. Most of the member countries of the OSCE are from Europe, the Caucasus, Central Asia and North America.',
 'The Council of Europe (, ) is an international organization of 47 member states in the European region. One of its first successes was the European Convention on Human Rights in 1950, which serves as the basis for the European Court of Human Rights.',
 'England is a country in Europ

In [None]:
# Demo: a "when" question asking for a date
search(query = "When did the cold war end?")

Input question: When did the cold war end?
Top-3 lexical search (BM25) hits
	17.374	The Cold War was the tense relationship between the United States (and its allies), and the Soviet Union (the USSR and its allies) between the end of World War II and the fall of the Soviet Union. It is called the "Cold" War because the US and the USSR never actually fought each other directly. Instead, they opposed each other in conflicts known as proxy wars, where each country chose a side to support.
	17.291	The Reagan Doctrine was a document by the United States under the Reagan Administration. It was about being against the global influence of the Soviet Union during the final years of the Cold War. The doctrine lasted for less than a decade, it was the most important document of United States foreign policy from the early 1980s until the end of the Cold War in 1991.
	15.420	Cold Norton is a village and civil parish in Maldon District, Essex, England. In 2001 there were 1103 people living in Cold N

('The Cold War was the tense relationship between the United States (and its allies), and the Soviet Union (the USSR and its allies) between the end of World War II and the fall of the Soviet Union. It is called the "Cold" War because the US and the USSR never actually fought each other directly. Instead, they opposed each other in conflicts known as proxy wars, where each country chose a side to support.',
 'The Reagan Doctrine was a document by the United States under the Reagan Administration. It was about being against the global influence of the Soviet Union during the final years of the Cold War. The doctrine lasted for less than a decade, it was the most important document of United States foreign policy from the early 1980s until the end of the Cold War in 1991.',
 'Cold Norton is a village and civil parish in Maldon District, Essex, England. In 2001 there were 1103 people living in Cold Norton. Cold Norton is at the south-east end of the Danbury Ridge.',
 'The Cold War was the

In [None]:
# Demo: a "how long" question asking for a duration
search(query = "How long do cats live?")

Input question: How long do cats live?
Top-3 lexical search (BM25) hits
	22.997	Reliable information on the lifespans of house cats is hard to find. However, research has been done to get an estimate (an educated guess) on how long cats usually live. Cats usually live for 13 to 20 years. Sometimes cats can live for 22 to 30 years but there are claims of cats dying at ages of more than 30 years old.
	16.974	The sabertoothed cats or sabretooth cats are some of the best known and most popular extinct animals. They are among the most impressive carnivores that ever have lived. These cats had long canines and jaws which opened wider than modern cats. This suggests a different style of killing from modern felines.
	16.490	The Cyprus cat is a breed of cat. These cats are thought to have first come from ancient Egypt or Palestine. They were brought to the island of Cyprus by St. Helen. These are now common domestic cats that live in homes or outside. Many of these cats still live all over Cypr

('Reliable information on the lifespans of house cats is hard to find. However, research has been done to get an estimate (an educated guess) on how long cats usually live. Cats usually live for 13 to 20 years. Sometimes cats can live for 22 to 30 years but there are claims of cats dying at ages of more than 30 years old.',
 'The sabertoothed cats or sabretooth cats are some of the best known and most popular extinct animals. They are among the most impressive carnivores that ever have lived. These cats had long canines and jaws which opened wider than modern cats. This suggests a different style of killing from modern felines.',
 'The Cyprus cat is a breed of cat. These cats are thought to have first come from ancient Egypt or Palestine. They were brought to the island of Cyprus by St. Helen. These are now common domestic cats that live in homes or outside. Many of these cats still live all over Cyprus. But, a large number are now feral. This means they are not tame and they run wild.

In [None]:
# Demo: a "how many" question asking for a population figure
search(query = "How many people live in Toronto?")

Input question: How many people live in Toronto?
Top-3 lexical search (BM25) hits
	15.978	Markham, Ontario is a city in Regional Municipality of York, in the Greater Toronto Area of Southern Ontario, Canada. There are twice as many people there as in 1990. 261,573 people live in Markham. It is the 4th largest town in the Greater Toronto Area, after Toronto, Mississauga, and Brampton.
	11.299	The Toronto Zoo is a zoo in Toronto, Ontario, Canada. With , the Toronto Zoo is the largest zoo in Canada.
	10.679	Denzil Minnan-Wong (; born ) is a Canadian politician. He is a Toronto city councillor. He is the person that represents Ward 16, an area of Toronto. He is a chairperson of the Employee and Labour Relations Committee in Toronto's municipal government and is also the deputy mayor of Toronto. He is also part of the board of the Toronto Transit Commission and the Toronto Hydro.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.604	Vaughan is a city in Ontario, Canada, 335,000

('Markham, Ontario is a city in Regional Municipality of York, in the Greater Toronto Area of Southern Ontario, Canada. There are twice as many people there as in 1990. 261,573 people live in Markham. It is the 4th largest town in the Greater Toronto Area, after Toronto, Mississauga, and Brampton.',
 'The Toronto Zoo is a zoo in Toronto, Ontario, Canada. With , the Toronto Zoo is the largest zoo in Canada.',
 "Denzil Minnan-Wong (; born ) is a Canadian politician. He is a Toronto city councillor. He is the person that represents Ward 16, an area of Toronto. He is a chairperson of the Employee and Labour Relations Committee in Toronto's municipal government and is also the deputy mayor of Toronto. He is also part of the board of the Toronto Transit Commission and the Toronto Hydro.",
 'Vaughan is a city in Ontario, Canada, 335,000 people live there .',
 'Toronto is a city in Ohio in the United States.',
 'Toronto is a city in Kansas, United States.',
 'It has about 110,000 people living

In [None]:
# Demo: keyword query containing the abbreviation "US"
search(query = "Oldest US president")

Input question: Oldest US president
Top-3 lexical search (BM25) hits
	11.010	Glafcos Ioannou Clerides (; 24 April 1919 – 15 November 2013) was a Greek-Cypriot politician. He was the fourth President of Cyprus. He was the oldest living former President of the Republic of Cyprus.
	9.237	José Celso de Mello Filho (Tatuí, November 1, 1945), is a Brazilian jurist. He is the oldest member of the Supreme Federal Court of Brazil. He was nominated by President José Sarney in 1989.
	8.872	USS "Constitution" is a wooden, three-masted heavy frigate of the United States Navy. Named by President George Washington after the Constitution of the United States of America, she is the world's oldest commissioned naval vessel afloat.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.645	William Henry Harrison (February 9, 1773 – April 4, 1841) was the 9th President of the United States. His nickname was "Old Tippecanoe " and he was a well-respected war veteran. Harrison served the shortest ter

('Glafcos Ioannou Clerides (; 24 April 1919 – 15 November 2013) was a Greek-Cypriot politician. He was the fourth President of Cyprus. He was the oldest living former President of the Republic of Cyprus.',
 'José Celso de Mello Filho (Tatuí, November 1, 1945), is a Brazilian jurist. He is the oldest member of the Supreme Federal Court of Brazil. He was nominated by President José Sarney in 1989.',
 'USS "Constitution" is a wooden, three-masted heavy frigate of the United States Navy. Named by President George Washington after the Constitution of the United States of America, she is the world\'s oldest commissioned naval vessel afloat.',
 'William Henry Harrison (February 9, 1773 – April 4, 1841) was the 9th President of the United States. His nickname was "Old Tippecanoe " and he was a well-respected war veteran. Harrison served the shortest term of any United States President. His term lasted for exactly one month.',
 'Richard Arvin Overton (May 11, 1906 – December 27, 2018) was an Am

In [None]:
# Demo: three-keyword query with no function words
search(query = "Coldest place earth")

Input question: Coldest place earth
Top-3 lexical search (BM25) hits
	24.891	East Antarctica, also called Greater Antarctica, is the largest part (two-thirds) of the Antarctic continent. It is on the Indian Ocean side of the Transantarctic Mountains. It is the coldest, windiest, and driest part of Earth. East Antarctica holds the record as the coldest place on earth.
	12.650	Earth Day is a day that is supposed to inspire more awareness and appreciation for the Earth's natural environment. It takes place each year on April 22. It now takes place in more than 193 countries around the world. During Earth Day, the world encourages everyone to turn off all unwanted lights.
	12.172	Heinrich events occurred during the coldest point of "Bond Cycles" in which many icebergs were discharged into the North Atlantic and melted.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.633	East Antarctica, also called Greater Antarctica, is the largest part (two-thirds) of the Antarctic contine

('East Antarctica, also called Greater Antarctica, is the largest part (two-thirds) of the Antarctic continent. It is on the Indian Ocean side of the Transantarctic Mountains. It is the coldest, windiest, and driest part of Earth. East Antarctica holds the record as the coldest place on earth.',
 "Earth Day is a day that is supposed to inspire more awareness and appreciation for the Earth's natural environment. It takes place each year on April 22. It now takes place in more than 193 countries around the world. During Earth Day, the world encourages everyone to turn off all unwanted lights.",
 'Heinrich events occurred during the coldest point of "Bond Cycles" in which many icebergs were discharged into the North Atlantic and melted.',
 'East Antarctica, also called Greater Antarctica, is the largest part (two-thirds) of the Antarctic continent. It is on the Indian Ocean side of the Transantarctic Mountains. It is the coldest, windiest, and driest part of Earth. East Antarctica holds t

In [None]:
# Demo: keyword query combining a person's name with the attribute wanted
search(query = "Elon Musk year birth")

Input question: Elon Musk year birth
Top-3 lexical search (BM25) hits
	23.364	Tesla, Inc. is a company based in Palo Alto, California which makes electric cars. It was started in 2003 by Martin Eberhard, Dylan Stott, and Elon Musk (who also co-founded PayPal and SpaceX and is the CEO of SpaceX). Eberhard no longer works there. Today, Elon Musk is the Chief Executive Officer (CEO). It started selling its first car, the Roadster in 2008.
	19.943	The Boring Company is a tunnel boring company founded by Elon Musk, who earlier started SpaceX. It aims to reduce traffic congestion in urban areas. It is involved in the building of the Hyperloop in Los Angeles.
	18.392	Elon Reeve Musk (born June 28, 1971) is a businessman and philanthropist. He was born in South Africa. He moved to Canada and later became an American citizen. Musk is the current CEO & Chief Product Architect of Tesla Motors, a company that makes electric vehicles. He is also the CEO of Solar City, a company that makes solar pan

('Tesla, Inc. is a company based in Palo Alto, California which makes electric cars. It was started in 2003 by Martin Eberhard, Dylan Stott, and Elon Musk (who also co-founded PayPal and SpaceX and is the CEO of SpaceX). Eberhard no longer works there. Today, Elon Musk is the Chief Executive Officer (CEO). It started selling its first car, the Roadster in 2008.',
 'The Boring Company is a tunnel boring company founded by Elon Musk, who earlier started SpaceX. It aims to reduce traffic congestion in urban areas. It is involved in the building of the Hyperloop in Los Angeles.',
 'Elon Reeve Musk (born June 28, 1971) is a businessman and philanthropist. He was born in South Africa. He moved to Canada and later became an American citizen. Musk is the current CEO & Chief Product Architect of Tesla Motors, a company that makes electric vehicles. He is also the CEO of Solar City, a company that makes solar panels, and the CEO & CTO of SpaceX, an aerospace company. In August 2020, Bloomberg ra

In [None]:
# Demo: keyword query naming a city and a landmark (lower-case "eiffel")
search(query = "Paris eiffel tower")

Input question: Paris eiffel tower
Top-3 lexical search (BM25) hits
	27.300	The Eiffel Tower (French: La Tour Eiffel, ], IPA pronunciation: "EYE-full" English; "eh-FEHL" French) is a landmark in Paris. It was built between 1887 and 1889 for the Exposition Universelle (World Fair). The Tower was the Exposition's main attraction.
	25.263	Paris is a city in the U.S. state of Texas. It is in Lamar County, Texas. It had a population of 25,171 in 2010. It has been called the "Second Largest Paris in the World". It has a replica of the Eiffel Tower.
	24.059	Paris is a city in the U.S. state of Tennessee. It had a population of 25,171 in 2010. It has been called the "World's Biggest Fish Fry". It has a 70-foot replica of the Eiffel Tower.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.812	The Eiffel Tower (French: La Tour Eiffel, ], IPA pronunciation: "EYE-full" English; "eh-FEHL" French) is a landmark in Paris. It was built between 1887 and 1889 for the Exposition Universelle 

('The Eiffel Tower (French: La Tour Eiffel, ], IPA pronunciation: "EYE-full" English; "eh-FEHL" French) is a landmark in Paris. It was built between 1887 and 1889 for the Exposition Universelle (World Fair). The Tower was the Exposition\'s main attraction.',
 'Paris is a city in the U.S. state of Texas. It is in Lamar County, Texas. It had a population of 25,171 in 2010. It has been called the "Second Largest Paris in the World". It has a replica of the Eiffel Tower.',
 'Paris is a city in the U.S. state of Tennessee. It had a population of 25,171 in 2010. It has been called the "World\'s Biggest Fish Fry". It has a 70-foot replica of the Eiffel Tower.',
 'The Eiffel Tower (French: La Tour Eiffel, ], IPA pronunciation: "EYE-full" English; "eh-FEHL" French) is a landmark in Paris. It was built between 1887 and 1889 for the Exposition Universelle (World Fair). The Tower was the Exposition\'s main attraction.',
 'Alexandre Gustave Eiffel (December 15, 1832\xa0– December 27, 1923; , ) was 

In [None]:
# Demo: a "which" question containing the abbreviation "US"
search(query = "Which US president was killed?")

Input question: Which US president was killed?
Top-3 lexical search (BM25) hits
	10.179	Lyndon Baines Johnson (August 27, 1908 – January 22, 1973) was a member of the Democratic Party and the 36th president of the United States serving from 1963 to 1969. Johnson took over as president when President Kennedy was killed in November 1963. He was then re-elected in the 1964 election.
	10.091	Lech Kaczyński, the fourth President of the Republic of Poland, died on 10 April 2010. He died in a plane crash outside of Smolensk, Russia. The plane was a Tu-154 belonging to the Polish Air Force. The crash killed all 96 on board. His wife, Maria Kaczyńska, was also among those killed.
	9.791	Jacobo Majluta Azar (October 9, 1934 – March 2, 1996) was a Dominican politician. He was Vice President of the Dominican Republic during the Antonio Guzmán Fernández presidency between 1978 to 1982. He became President of the Dominican Republic after Guzmán Fernández killed himself in 1982. He was president for 

('Lyndon Baines Johnson (August 27, 1908 – January 22, 1973) was a member of the Democratic Party and the 36th president of the United States serving from 1963 to 1969. Johnson took over as president when President Kennedy was killed in November 1963. He was then re-elected in the 1964 election.',
 'Lech Kaczyński, the fourth President of the Republic of Poland, died on 10 April 2010. He died in a plane crash outside of Smolensk, Russia. The plane was a Tu-154 belonging to the Polish Air Force. The crash killed all 96 on board. His wife, Maria Kaczyńska, was also among those killed.',
 'Jacobo Majluta Azar (October 9, 1934 – March 2, 1996) was a Dominican politician. He was Vice President of the Dominican Republic during the Antonio Guzmán Fernández presidency between 1978 to 1982. He became President of the Dominican Republic after Guzmán Fernández killed himself in 1982. He was president for a month between July to August 1982.',
 "John F. Kennedy was the 35th President of the United

In [None]:
# Demo: a question written without a trailing question mark
search(query="When is Chinese New Year")

Input question: When is Chinese New Year
Top-3 lexical search (BM25) hits
	18.743	Chinese New Year, known in China as the SpringFestival and in Singapore as the LunarNewYear, is a holiday on and around the new moon on the first day of the year in the traditional Chinese calendar. This calendar is based on the changes in the moon and is only sometimes changed to fit the seasons of the year based on how the Earth moves around the sun. Because of this, Chinese New Year is never on January1. It moves around between January21 and February20.
	18.527	New Year in Japan is one of the most important festivals. Unlike the Chinese New Year, it is held on January 1.
	15.789	The CCTV New Year's Gala (Simplified Chinese: 中国中央电视台春节联欢晚会; Traditional Chinese: 中國中央電視台春節聯歡晚會; Pinyin: "Zhōngguó zhōngyāng diànshìtái chūnjié liánhuān wǎnhuì") is a Chinese New Year special produced by China Central Television. It was presented by Zhao Zhongxiang.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0

('Chinese New Year, known in China as the SpringFestival and in Singapore as the LunarNewYear, is a holiday on and around the new moon on the first day of the year in the traditional Chinese calendar. This calendar is based on the changes in the moon and is only sometimes changed to fit the seasons of the year based on how the Earth moves around the sun. Because of this, Chinese New Year is never on January1. It moves around between January21 and February20.',
 'New Year in Japan is one of the most important festivals. Unlike the Chinese New Year, it is held on January 1.',
 'The CCTV New Year\'s Gala (Simplified Chinese: 中国中央电视台春节联欢晚会; Traditional Chinese: 中國中央電視台春節聯歡晚會; Pinyin: "Zhōngguó zhōngyāng diànshìtái chūnjié liánhuān wǎnhuì") is a Chinese New Year special produced by China Central Television. It was presented by Zhao Zhongxiang.',
 'Chinese New Year, known in China as the SpringFestival and in Singapore as the LunarNewYear, is a holiday on and around the new moon on the first

In [None]:
!pip install gradio==3.0.9
import gradio as gr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio==3.0.9
  Downloading gradio-3.0.9-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 8.3 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 46.9 MB/s 
Collecting uvicorn
  Downloading uvicorn-0.18.2-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 5.8 MB/s 
[?25hCollecting paramiko
  Downloading paramiko-2.11.0-py2.py3-none-any.whl (212 kB)
[K     |████████████████████████████████| 212 kB 69.1 MB/s 
Collecting pycryptodome
  Downloading pycryptodome-3.15.0-cp35-abi3-manylinux2010_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 51.4 MB/s 
[?25hCollecting python-multipart
  Downloading python-multipart-0.0.5.tar.gz (32 kB)
Collecting anal

In [None]:
# Web UI around `search`: one free-text query box in, nine text boxes out
# (three hits for each of the three retrieval stages, in the order listed below).
demo = gr.Interface(
    fn=search,
    inputs='text',
    outputs=[
        gr.outputs.Textbox(label=f'{stage} hit {rank}')
        for stage in ('lexical search (BM25)', 'Bi-Encoder Retrieval', 'Cross-Encoder Re-ranker')
        for rank in (1, 2, 3)
    ],
    layout='vertical',
)

# share=True exposes the app through a temporary public URL; debug=True keeps
# this cell running so that errors and logs stay visible in the notebook.
demo.launch(share=True, debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://16213.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


Input question: President of North Rhine-Westphalia
Top-3 lexical search (BM25) hits
	15.571	Münster is one of the five "Regierungsbezirke" of North Rhine-Westphalia, Germany, in the north of the state.
	15.262	The Hagenbach is a river in North Rhine-Westphalia, Germany.
	14.809	The Rhein-Erft-Kreis is a district in the west of North Rhine-Westphalia, Germany.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.657	Kurt Hans Biedenkopf (born 28 January 1930) is a German politician. He was the 1st Minister President of the Free State of Saxony (one of Germany's federal states) from 1990 until 2002. He was the 54th President of the Bundesrat in 1999/2000. He was born in Ludwigshafen, Germany.
	0.634	Ulrich Hahnen (10 May 1952 in Krefeld – 10 January 2016) was a German politician. He was a member of the Social Democratic Party (SPD). He served as a deputy of the Landtag of North Rhine-Westphalia from 2010 until his death. In 1994, he became a member of the City Council of Krefe

In [None]:
from sklearn.metrics import ndcg_score, dcg_score
import numpy as np

# Ideal grade list used as the normalisation target: one row per evaluated
# query, best grade first. Later cells reuse this array.
true_relevance = np.asarray([[3,2,1], [3,2,1], [3,2,1], [3,2,1], [3,2,1], [3,2,1], [3,2,1], [3,2,1], [3,2,1], [3,2,1]])

# Grades given to the BM25 hits, one row per query, columns in retrieved order
# (hit 1, hit 2, hit 3). NOTE(review): assumed to be manual 0-3 relevance
# judgements of the returned passages - confirm.
lexical_relevance_score = np.asarray([[3,0,0], [3,3,1], [3,2,0], [0,0,3], [3,2,2], [1,3,1], [2,2,3], [1,2,0], [3,0,0], [1,0,0]])

# dcg_score(y_true, y_score) sums the gains in y_true in the order induced by
# y_score. The gains must therefore be the judged grades, and the ordering must
# encode the retrieved positions (hit 1 > hit 2 > hit 3). Passing the grades as
# y_score instead re-sorts the ideal list by grade and averages ties, so even a
# result list graded [0, 0, 0] would receive a high score.
n_queries, n_hits = lexical_relevance_score.shape
rank_scores = np.tile(np.arange(n_hits, 0, -1), (n_queries, 1))

dcg = dcg_score(lexical_relevance_score, rank_scores)
print("For Lexical")
print("DCG score : ", dcg)

# DCG of the ideal list taken in its own (already descending) order.
idcg = dcg_score(true_relevance, true_relevance)
print("IDCG score : ", idcg)

ndcg = dcg / idcg
print("nDCG score : ", ndcg)

# ndcg_score normalises every row by the best possible ordering of that row's
# own grades, so this value can differ from the ratio above, which normalises
# by the fixed ideal list in true_relevance.
print("nDCG score (from function) : ", ndcg_score(lexical_relevance_score, rank_scores))

In [None]:
# Grades given to the bi-encoder hits, one row per query, columns in retrieved
# order (hit 1, hit 2, hit 3). NOTE(review): assumed to be manual 0-3 relevance
# judgements of the returned passages - confirm.
bi_encoder_relevance_score = np.asarray([[3,1,0], [3,1,3], [3,1,1], [3,0,2], [0,0,0], [3,1,1], [3,1,2], [2,1,3], [2,2,1], [3,0,1]])

# dcg_score(y_true, y_score) sums the gains in y_true in the order induced by
# y_score. The gains must therefore be the judged grades, and the ordering must
# encode the retrieved positions (hit 1 > hit 2 > hit 3). Passing the grades as
# y_score instead re-sorts the ideal list by grade and averages ties, so the
# all-zero row above would still receive a high score.
n_queries, n_hits = bi_encoder_relevance_score.shape
rank_scores = np.tile(np.arange(n_hits, 0, -1), (n_queries, 1))

dcg = dcg_score(bi_encoder_relevance_score, rank_scores)
print("For Bi-Encoder")
print("DCG score : ", dcg)

# DCG of the ideal list taken in its own (already descending) order.
idcg = dcg_score(true_relevance, true_relevance)
print("IDCG score : ", idcg)

ndcg = dcg / idcg
print("nDCG score : ", ndcg)

# ndcg_score normalises every row by the best possible ordering of that row's
# own grades, so this value can differ from the ratio above, which normalises
# by the fixed ideal list in true_relevance.
print("nDCG score (from function) : ", ndcg_score(bi_encoder_relevance_score, rank_scores))

In [None]:
# Grades given to the cross-encoder re-ranked hits, one row per query, columns
# in returned order (hit 1, hit 2, hit 3). NOTE(review): assumed to be manual
# 0-3 relevance judgements of the returned passages - confirm.
cross_encoder_relevance_score = np.asarray([[3,2,0], [3,3,1], [3,2,1], [3,0,2], [0,0,0], [3,1,1], [3,2,2], [3,2,1], [3,2,2], [3,1,1]])

# dcg_score(y_true, y_score) sums the gains in y_true in the order induced by
# y_score. The gains must therefore be the judged grades, and the ordering must
# encode the returned positions (hit 1 > hit 2 > hit 3). Passing the grades as
# y_score instead re-sorts the ideal list by grade and averages ties, so the
# all-zero row above would still receive a high score.
n_queries, n_hits = cross_encoder_relevance_score.shape
rank_scores = np.tile(np.arange(n_hits, 0, -1), (n_queries, 1))

dcg = dcg_score(cross_encoder_relevance_score, rank_scores)
print("For cross-encoder")
print("DCG score : ", dcg)

# DCG of the ideal list taken in its own (already descending) order.
idcg = dcg_score(true_relevance, true_relevance)
print("IDCG score : ", idcg)

ndcg = dcg / idcg
print("nDCG score : ", ndcg)

# ndcg_score normalises every row by the best possible ordering of that row's
# own grades, so this value can differ from the ratio above, which normalises
# by the fixed ideal list in true_relevance.
print("nDCG score (from function) : ", ndcg_score(cross_encoder_relevance_score, rank_scores))