In [1]:
!pip install beir



In [2]:
! pip install datasets transformers



In [3]:
from utils_index import *

In [4]:
from utils_addcontext import *

# Part 1: Adding DBPedia contexts into our SQuADv2 dataset

In [5]:
# Download the BEIR DBPedia archive into the local "datasets" folder and load
# the corpus, queries and relevance judgements (qrels) of its test split.

dataset = "dbpedia"
# The archive is named "<dataset>-entity.zip". The URL previously had no
# placeholder, so .format(dataset) was a silent no-op; the resulting URL is unchanged.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}-entity.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

datasets/dbpedia-entity.zip:   0%|          | 0.00/610M [00:00<?, ?iB/s]

  0%|          | 0/4635922 [00:00<?, ?it/s]

In [6]:
# Load the SQuAD v2 dataset. The cell output shows it is downloaded and
# prepared on the first call and reused from the local cache afterwards.
# NOTE(review): the variable name `datasets` would shadow the `datasets`
# package if that module is imported by name anywhere in this notebook — confirm.

datasets = load_dataset("squad_v2")

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Include unique DBPedia contexts into SQuADv2 dataset

# Pick DBPedia entries out of the corpus using the relevance judgements
# (presumably the passages judged relevant — verify in utils_addcontext).
all_good_entries = get_all_good_entries(qrels, corpus)
# Convert the loaded SQuADv2 data into the intermediate structure consumed below.
dico = transform_into_dico(datasets)
all_questions, all_contexts, all_titles, all_answers = create_list_of_all(dico)
# Append the DBPedia entries after the SQuAD contexts. Only the context list
# grows, so it ends up longer than the question / title / answer lists.
all_contexts = all_contexts + all_good_entries

In [13]:
# Gather the four parallel lists into a single DataFrame. Every list is wrapped
# in a Series so that columns of different lengths are aligned on the index and
# the shorter ones are padded with NaN (the appended DBPedia rows only carry a context).

df = pd.DataFrame(
    {
        column_name: pd.Series(column_values)
        for column_name, column_values in (
            ('question', all_questions),
            ('context', all_contexts),
            ('title', all_titles),
            ('answers', all_answers),
        )
    }
)
df

Unnamed: 0,question,context,title,answers
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,Normans,"{'text': ['France', 'France', 'France', 'Franc..."
1,Who was the duke in the battle of Hastings?,"The Norman dynasty had a major political, cult...",Normans,"{'text': ['William the Conqueror', 'William th..."
2,What is the original meaning of the word Norman?,"The English name ""Normans"" comes from the Fren...",Normans,"{'text': ['Viking', 'Norseman, Viking', 'Norse..."
3,When was the Duchy of Normandy founded?,"In the course of the 10th century, the initial...",Normans,"{'text': ['911', '911', '911'], 'answer_start'..."
4,Who upon arriving gave the original viking set...,"Before Rollo's arrival, its populations did no...",Normans,"{'text': ['Rollo', 'Rollo', 'Rollo'], 'answer_..."
...,...,...,...,...
22495,,The World Meteorological Organization (WMO) is...,,
22496,,The World Veterans Federation (WVF) is the wor...,,
22497,,World Vision International is an Evangelical C...,,
22498,,ZF Electronics GmbH (formerly known as Cherry ...,,


In [14]:
# Collect the unique questions. `q_a` is the companion structure that is later
# handed to get_ranks / MMR_test (presumably the mapping from each question to
# its expected context — TODO confirm in the helper module).
unique_questions, q_a = get_all_unique_questions(all_questions, df)

In [16]:
# Peek at the first five unique questions.
unique_questions[:5]

['In what country is Normandy located?',
 'Who was the duke in the battle of Hastings?',
 'What is the original meaning of the word Norman?',
 'When was the Duchy of Normandy founded?',
 'Who upon arriving gave the original viking settlers a common identity?']

# Part 2: Testing different asymmetric similarity models

We compare the models by computing, for each one, the mean reciprocal rank (MRR) and the time needed to encode and rank all contexts.

In [None]:
%%time
model_distilbert_v4 = SentenceTransformer('msmarco-distilbert-base-v4')

similarity_distilbert_v4 = index_model_cosine_similarity(model_distilbert_v4, unique_questions, df.context.to_list())
rank_distilbert_v4 = get_ranks(similarity_distilbert_v4, unique_questions, q_a)
result_distilbert_v4 = map(inverse, rank_distilbert_v4)
# Compute MRR
print(sum(list(result_distilbert_v4)) / len(rank_distilbert_v4))

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

0.7372709088377434
CPU times: user 10min 49s, sys: 3.29 s, total: 10min 53s
Wall time: 10min 57s


In [None]:
%%time
model_distilbert_v3 = SentenceTransformer('msmarco-distilbert-base-v3')

similarity_distilbert_v3 = index_model_cosine_similarity(model_distilbert_v3, unique_questions, df.context.to_list())
rank_distilbert_v3 = get_ranks(similarity_distilbert_v3, unique_questions, q_a)
result_distilbert_v3 = map(inverse, rank_distilbert_v3)
# Compute MRR
print(sum(list(result_distilbert_v3)) / len(rank_distilbert_v3))

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

0.7473299182677691
CPU times: user 10min 51s, sys: 2.39 s, total: 10min 53s
Wall time: 10min 56s


In [None]:
%%time
model_roberta_v3 = SentenceTransformer('msmarco-roberta-base-v3')

similarity_roberta_v3 = index_model_cosine_similarity(model_roberta_v3, unique_questions, df.context.to_list())
rank_roberta_v3 = get_ranks(similarity_roberta_v3, unique_questions, q_a)
result_roberta_v3 = map(inverse, rank_roberta_v3)
# Compute MRR
print(sum(list(result_roberta_v3)) / len(rank_roberta_v3))

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

0.6636146515360238
CPU times: user 12min 47s, sys: 3.01 s, total: 12min 50s
Wall time: 12min 54s


In [None]:
%%time
model_distilbert_tas = SentenceTransformer('msmarco-distilbert-base-tas-b')

similarity_distilbert_tas = index_model_dot_product_similarity(model_distilbert_tas, unique_questions, df.context.to_list())
rank_distilbert_tas = get_ranks(similarity_distilbert_tas, unique_questions, q_a)
result_distilbert_tas = map(inverse, rank_distilbert_tas)
# Compute MRR
print(sum(list(result_distilbert_tas)) / len(rank_distilbert_tas))

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

0.8203144880079584
CPU times: user 10min 40s, sys: 2.3 s, total: 10min 42s
Wall time: 10min 46s


In [None]:
%%time
model_roberta_firstp = SentenceTransformer('msmarco-roberta-base-ance-firstp')

similarity_roberta_firstp = index_model_dot_product_similarity(model_roberta_firstp, unique_questions, df.context.to_list())
rank_roberta_firstp = get_ranks(similarity_roberta_firstp, unique_questions, q_a)
result_roberta_firstp = map(inverse, rank_roberta_firstp)
# Compute MRR
print(sum(list(result_roberta_firstp)) / len(rank_roberta_firstp))

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

0.7139793512717401
CPU times: user 13min 11s, sys: 3.19 s, total: 13min 14s
Wall time: 13min 19s


The model "msmarco-distilbert-base-tas-b" yields the best MRR (0.820) and is also the fastest of the five models, so we use it for the rest of the notebook.

# Part 3: Using a Nearest Neighbors approximation to speed up our searchable index

We use the FAISS library for our searchable index.

In [None]:
import faiss
from faiss import normalize_L2

In [None]:
%%time
# Build the searchable index over all contexts with the model chosen in Part 2,
# query it with every unique question (10 results each) and evaluate the hits.
# %%time wraps the whole cell, so model loading and index construction are included.
model_distilbert_tas = SentenceTransformer('msmarco-distilbert-base-tas-b') # best performance overall
index = create_index(df.context.to_list(), model_distilbert_tas, df)
# D, I: presumably the scores and the positions of the retrieved contexts,
# following the usual FAISS search convention — confirm in utils_index.
D, I = doc_search(unique_questions, model_distilbert_tas, index, num_results=10)
# NOTE(review): the helper is spelled "MMR" although the notebook reports MRR, and
# the saved cell output shows no metric value — confirm how the result is reported.
MMR_test(I, unique_questions, q_a)

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

CPU times: user 1min 45s, sys: 832 ms, total: 1min 46s
Wall time: 1min 49s


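The MRR drops slightly, but the search is much faster: about 1 min 49 s of wall time instead of 10 min 46 s for the exhaustive comparison with the same model.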

In [None]:
# We test our index
# Sanity-check the index on the first unique question (the helper name suggests
# it returns the ten best-matching contexts — confirm in utils_index).
top10 = get_top10_context(unique_questions[0], model_distilbert_tas, index, df.context.to_list())

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show every retrieved context on its own line.
for passage in top10:
    print(passage)

The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Denmark (/ˈdɛnmɑrk/; Danish: Danmark [ˈd̥ænmɑɡ̊]) is a country in Northern Europe. The southernmost of the Nordic countries, it is located southwest of Sweden and south of Norway, and bordered to the south by Germany. Denmark forms part of the cultural regi