In [None]:
!pip install beir
!pip install tensorflow-text
!pip install hnswlib

In [None]:
import numpy as np
import tqdm
from beir import util
import itertools
import random
import json
import os
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
import hnswlib

  from tqdm.autonotebook import tqdm


In [None]:

def download_dataset(folder_path: str, dataset_name: str = "dbpedia-entity"):
	"""Download a BEIR dataset archive and load its test split.

	Args:
		folder_path: Directory handed to ``util.download_and_unzip`` as the
			destination for the archive.
		dataset_name: Archive name (without ``.zip``) under the BEIR datasets
			URL. Defaults to ``"dbpedia-entity"``, the value that used to be
			hard-coded here, so existing calls behave as before.

	Returns:
		The ``(corpus, queries, qrels)`` tuple returned by
		``GenericDataLoader(...).load(split="test")``.
	"""
	url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
	data_path = util.download_and_unzip(url, folder_path)
	corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
	return corpus, queries, qrels


def create_smaller_corpus(corpus: dict, qrels: dict, folder_path: str, number_thrash: int, seed=None):
	"""Build a reduced corpus: every judged document plus a random sample of the rest.

	The reduced corpus is also written, one JSON object per line, to
	``<folder_path>/dbpedia-entity/corpus.jsonl``.
	NOTE(review): this looks like the same file the loader reads after
	download, in which case the full corpus is replaced on disk -- confirm.

	Args:
		corpus: Mapping of document id to a document dict. It is not modified.
		qrels: Mapping of query id to a mapping of judged document ids.
		folder_path: Root data folder; the ``dbpedia-entity`` sub-folder is
			created if it does not exist.
		number_thrash: How many unjudged documents to add. Values larger than
			the number of unjudged documents are clamped to that number.
		seed: Optional seed for the sampling. With ``None`` the module-level
			``random`` generator is used, as before.

	Returns:
		A dict of document id to a copy of the document carrying an extra
		``"_id"`` field, sampled documents first, then judged documents.
	"""
	# Judged ids in first-seen order, without duplicates.
	judged_ids = list(dict.fromkeys(itertools.chain.from_iterable(qrels.values())))
	judged_lookup = set(judged_ids)

	# Walk the corpus in its own order (instead of a set difference) so that a
	# fixed seed always draws the same documents.
	unjudged_ids = [doc_id for doc_id in corpus if doc_id not in judged_lookup]

	# random.sample raises when asked for more items than exist; cap the request.
	sample_size = min(number_thrash, len(unjudged_ids))
	rng = random if seed is None else random.Random(seed)
	kept_ids = rng.sample(unjudged_ids, sample_size)
	# Some judged ids may be missing from the corpus; keep only the known ones.
	kept_ids += [doc_id for doc_id in judged_ids if doc_id in corpus]

	# Copy each document before adding "_id" so the caller's corpus is left untouched.
	simple_corpus = {doc_id: {**corpus[doc_id], "_id": doc_id} for doc_id in tqdm.tqdm(kept_ids)}

	print("Saving corpus")
	output_dir = os.path.join(folder_path, "dbpedia-entity")
	os.makedirs(output_dir, exist_ok=True)
	with open(os.path.join(output_dir, "corpus.jsonl"), "w", encoding="utf-8") as file:
		for document in tqdm.tqdm(simple_corpus.values()):
			json.dump(document, file)
			file.write("\n")
	return simple_corpus


# Seed for the global `random` generator, which is what draws the filler
# documents; without it every run samples a different corpus.
RANDOM_SEED = 42
# When True, keep only the judged documents plus `number_trash` random others.
is_clean_corpus = True
number_trash = 100000

random.seed(RANDOM_SEED)
corpus, queries, qrels = download_dataset("./data")
if is_clean_corpus:
	corpus = create_smaller_corpus(corpus, qrels, "./data", number_trash)


  0%|          | 0/140724 [00:00<?, ?it/s]

100%|██████████| 140724/140724 [00:00<00:00, 1136990.85it/s]


Saving corpus


100%|██████████| 140724/140724 [00:01<00:00, 72570.98it/s]


In [None]:
# Report how many entries each loaded collection holds.
for message, collection in (
    ("size of corpus is", corpus),
    ("size of queries is", queries),
    ("size of qrels is", qrels),
):
    print(message, len(collection))

size of corpus is 140724
size of queries is 400
size of qrels is 400


We have 3 values.

**-Corpus** is a dictionary holding the description of each DBpedia page, which may represent an event, a person, a culture or anything else. Each element contains the title of a page and its associated text.

**-Queries** represents a series of requests made by users. Each one can be a question, a name, or a sentence describing what the user is looking for.

**-Qrels** represents the candidate answers judged for each query. Each value is a mark between 0 and 2: the higher the mark, the better the answer.

In [None]:
# The corpus is keyed by document id, so its keys are the id list. Reading the
# keys directly (rather than a per-document "_id" field) also works when the
# corpus comes straight from the loader without the reduction step.
list_id = list(corpus)
list_id[:10]


100%|██████████| 140724/140724 [00:00<00:00, 1096336.81it/s]


['<dbpedia:Kreischa>',
 '<dbpedia:Distinguished_Service_Cross_(United_States)>',
 '<dbpedia:Ap_Lei_Chau_Bridge>',
 '<dbpedia:Avalon_(Stargate_SG-1)>',
 '<dbpedia:Yoshirō_Fujimura>',
 '<dbpedia:Sun_Tiantian>',
 '<dbpedia:Crenshaw_Christian_Center>',
 '<dbpedia:96th_Air_Division>',
 '<dbpedia:Ferdinánd_Pálffy>',
 '<dbpedia:Civil_Rights_Act_of_1991>']

In [None]:
# Document texts, in the same order as iterating over the corpus keys.
list_corpus = [document["text"] for document in tqdm.tqdm(corpus.values())]
list_corpus[:4]

100%|██████████| 140724/140724 [00:00<00:00, 1044837.65it/s]


['Kreischa is a municipality in the Sächsische Schweiz-Osterzgebirge district, Saxony, Germany. It directly borders the Saxon capital Dresden and consists of 14 districts.Kreischa was first mentioned in 1282 in the name Heinricus de Kryschowe. The name could be derived from an Old Slavic word meaning "crooked" or "lame".',
 'The Distinguished Service Cross is the second highest military award that can be given to a member of the United States Army (and previously, the United States Army Air Forces), for extreme gallantry and risk of life in actual combat with an armed enemy force.  Actions that merit the Distinguished Service Cross must be of such a high degree that they are above those required for all other U.S. combat decorations but do not meet the criteria for the Medal of Honor.',
 'Ap Lei Chau Bridge (traditional Chinese: 鴨脷洲大橋; simplified Chinese: 鸭脷洲大桥; pinyin: Yālìzhōu Dàqiáo; Cantonese Yale: ngaap3 lei6 jau1 daai6 kiu4) is a highway bridge in Hong Kong connecting the island 

We will use msmarco-distilbert-base-v4, because it appears to be the model with the best score on TREC-DL 2019.

In [None]:
from sentence_transformers import SentenceTransformer
from requests.auth import HTTPBasicAuth

# Name of the sentence-transformers checkpoint used to embed documents and queries.
MODEL_NAME = "msmarco-distilbert-base-v4"
model_transformers = SentenceTransformer(MODEL_NAME)

In [None]:
# Query texts in the iteration order of the `queries` mapping.
list_queries = list(queries.values())
list_queries[:10]

['Szechwan dish food cuisine',
 'roman architecture',
 'finland car industry manufacturer saab sisu',
 'france second world war normandy',
 'social network group selection',
 'D-Day normandy invasion',
 'web ranking scoring algorithm',
 'virtual museums',
 'Indian food',
 'composer museum']

In [None]:
# Embedding the whole corpus is the slowest step of the notebook; show the
# encoder's progress bar so the cell does not look stuck.
embedded_corpus = model_transformers.encode(list_corpus, show_progress_bar=True)

In [None]:
import torch

# Exact (brute-force) baseline: score every query against every document with
# cosine similarity and keep the k best documents.
k = 100
# Convert the corpus matrix once instead of on every query.
corpus_tensor = torch.as_tensor(embedded_corpus)
list_ap_cos_sim = []
# Iterate over (id, text) pairs: looking the id up from the text afterwards is
# quadratic and returns the wrong id when two queries share the same text.
for id_querie, q in tqdm.tqdm(queries.items()):
  judged_ids = qrels[id_querie]
  n = min(k, len(judged_ids))
  if n == 0:
    # No judged document for this query: the ratio is undefined, skip it.
    continue
  embedded_queries = model_transformers.encode(q)
  cos_scores = util.cos_sim(embedded_queries, corpus_tensor)[0]
  # topk raises when asked for more items than there are documents.
  top_results = torch.topk(cos_scores, k=min(k, cos_scores.shape[0]))
  id_results = {list_id[idx] for idx in top_results.indices.tolist()}
  # NOTE(review): every document listed in qrels counts as a hit here,
  # including any judged with grade 0 -- confirm this is intended.
  number_okay = sum(1 for doc_id in judged_ids if doc_id in id_results)
  # Share of the judged documents (capped at k) found in the top k.
  ap = number_okay / n
  list_ap_cos_sim.append(ap)



100%|██████████| 400/400 [01:54<00:00,  3.49it/s]


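Nous calculons le score (appelé MAP dans ce notebook) de la façon suivante. Strictement parlant, il s'agit d'un rappel plafonné sur les 100 premiers résultats plutôt que d'une véritable MAP, car le rang des résultats n'est pas pris en compte.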

**1-** Nous calculons les 100 meilleures prédictions du modèle pour une requête nommée q.

**2-** Nous comptons le nombre de documents présents dans la prédiction qui sont également présents dans qrels (c'est-à-dire combien sont des vrais positifs).

**3-** Nous divisons le nombre de vrais positifs par le nombre attendu, pour avoir un score compris entre 0 et 1. 0 veut dire que le modèle n'a prédit que des articles n'ayant rien à voir avec la requête, 1 veut dire que le modèle a prédit tous les articles intéressants.

Une fois les prédictions faites pour chaque requête, nous calculons la moyenne.



In [None]:
# Mean of the per-query scores obtained with the exact cosine-similarity search.
sum(list_ap_cos_sim)/len(list_ap_cos_sim)

0.416559928295235

In [None]:
def _cached_index_matches(candidate, vectors, n_probe=16):
    """Return True when `candidate` appears to have been built from `vectors`.

    Checks the element count, then compares a few evenly spaced stored vectors
    with the corresponding rows of `vectors` by direction (cosine close to 1),
    which holds whether or not the index stores normalised vectors.
    This is a spot check, not a proof that every row matches.
    """
    if candidate.get_current_count() != len(vectors):
        return False
    if len(vectors) == 0:
        return True
    probe = np.linspace(0, len(vectors) - 1, num=min(len(vectors), n_probe), dtype=int)
    try:
        stored = np.asarray(candidate.get_items(probe.tolist()), dtype=np.float32)
    except RuntimeError:
        # A probed label is absent from the cached index.
        return False
    expected = np.asarray(vectors[probe], dtype=np.float32)
    norms = np.linalg.norm(stored, axis=1) * np.linalg.norm(expected, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        cosine = np.sum(stored * expected, axis=1) / norms
    # NaN (zero-length vector) compares False, which forces a rebuild.
    return bool(np.all(cosine > 0.999))


# Take the dimension from the embeddings instead of hard-coding 768.
embedding_dim = embedded_corpus.shape[1]
index_path = "./hnswlib_index"
index = hnswlib.Index(space='cosine', dim=embedding_dim)

# Labels in the index are row positions of `embedded_corpus`, so an index saved
# from a different (or differently ordered) corpus would silently return the
# wrong documents. Only reuse the file when it matches the current embeddings.
# The check does not look at M / ef_construction: delete the file to rebuild
# with other parameters.
index_ready = False
if os.path.exists(index_path):
    print("Loading index...")
    try:
        index.load_index(index_path, max_elements=len(embedded_corpus))
        index_ready = _cached_index_matches(index, embedded_corpus)
    except RuntimeError:
        index_ready = False
    if not index_ready:
        print("Cached index does not match the current embeddings, rebuilding")
        # An index object cannot be initialised twice; start from a fresh one.
        index = hnswlib.Index(space='cosine', dim=embedding_dim)

if not index_ready:
    ### Create the HNSWLIB index
    print("Start creating HNSWLIB index")
    index.init_index(max_elements = len(embedded_corpus), ef_construction = 100, M = 48)

    # Label every vector with its row position so results map back to list_id.
    index.add_items(embedded_corpus, np.arange(len(embedded_corpus)))

    print("Saving index to:", index_path)
    index.save_index(index_path)

Start creating HNSWLIB index
Saving index to: ./hnswlib_index


En prenant le modèle hnswlib (car c'est celui que le rédacteur de l'article nous conseille), nous avons plusieurs paramètres sur lesquels jouer.
ef_construction est un paramètre qui influe sur la qualité de la prédiction, mais également sur le temps. Plus sa valeur augmente, plus la qualité et le temps augmentent (jusqu'à un certain point).

In [None]:
import torch

# Approximate retrieval: same evaluation as the exact search, but the k
# nearest documents come from the HNSW index.
k = 100
list_ap = []
# Iterate over (id, text) pairs: looking the id up from the text afterwards is
# quadratic and returns the wrong id when two queries share the same text.
for id_querie, q in tqdm.tqdm(queries.items()):
  judged_ids = qrels[id_querie]
  n = min(len(judged_ids), k)
  if n == 0:
    # No judged document for this query: the ratio is undefined, skip it.
    continue
  embedded_querie = model_transformers.encode(q)
  # knn_query returns (labels, distances); never ask for more neighbours than
  # the index holds.
  labels, _distances = index.knn_query(embedded_querie, k=min(k, index.get_current_count()))
  # Labels are row positions, which list_id maps back to document ids.
  id_results = {list_id[idx] for idx in labels[0].tolist()}
  # NOTE(review): every document listed in qrels counts as a hit here,
  # including any judged with grade 0 -- confirm this is intended.
  number_okay = sum(1 for doc_id in judged_ids if doc_id in id_results)
  # Share of the judged documents (capped at k) found in the top k.
  ap = number_okay / n
  list_ap.append(ap)

100%|██████████| 400/400 [00:03<00:00, 105.99it/s]


In [None]:
# Mean of the per-query scores obtained with the HNSW approximate search.
sum(list_ap)/len(list_ap)

0.41514155565144584

Après plusieurs tests dans lesquels nous avons fait varier les paramètres M et ef_construction, nous trouvons que le meilleur résultat est obtenu avec un ef_construction égal au nombre de résultats retournés (100).
La bibliothèque nous conseille un M situé aux alentours de 48. Effectivement, c'est le meilleur compromis que nous ayons obtenu.

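Le résultat de l'ANN est similaire à celui de la similarité cosinus exacte. Néanmoins, on constate une nette accélération : d'après les débits affichés ci-dessus (3,49 it/s contre 105,99 it/s), la recherche est environ 30 fois plus rapide.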
