#**Introduction to Natural Language Processing 2 Lab03**

##**Create a searchable index**

In [None]:
!pip install beir
!pip install -U sentence-transformers

import random
import time
import torch
import faiss
import numpy as np
from beir import util, LoggingHandler
import logging
import pathlib, os
from beir.datasets.data_loader import GenericDataLoader

We download the dataset with Beir library

In [2]:
# Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Getting our dpedia dataset
dataset = "dbpedia-entity"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(data_path))
data_path = "datasets/dbpedia-entity"
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

/content/datasets/dbpedia-entity.zip:   0%|          | 0.00/610M [00:00<?, ?iB/s]

Dataset downloaded here: /content/datasets/dbpedia-entity


  0%|          | 0/4635922 [00:00<?, ?it/s]

In [13]:
print((len(corpus),type(corpus)))
print((len(queries),type(queries)))
print((len(qrels),type(qrels)))

(114877, <class 'dict'>)
(400, <class 'dict'>)
(400, <class 'dict'>)


1. Data is divided in three parts :


*   corpus is a dictionary with three fields _id with unique document identifier, title with document title (optional) and text with document paragraph or passage. Each text for the corpus are answers to questions.
*   queries is a dictionary with two fields _id with unique query identifier and text with query text. Each query is a question.
*   qrels is  a  dictionary , i.e. the query-id. The value of this dictionnary is also one. The second dictionary have information about corpus-id and score in this order. It links a question to an answer.

2. To ease the problem, extract all the document from the corpus which are relevant to at least one query. Then, add 100K random documents which are not relevant to any query.

To do this part we made 3 function:
- The first one to get all valid_id
- The second one to get 100k non valid id
- The last one to merge our new dataset

In [10]:
def good_id(data):
  """ 
    Take qrels dictionary and return a set containing all id that are present 
    in at least 1 query  

    Parameters
    ----------
    data : <class 'dict'>

    Returns
    -------
    res : <class 'set'>
  
  """
  res = set()
  for values in data.values():
    res.update({k: v for k,v in values.items() if v != 0})
  return res

data_searched = good_id(qrels)
print("Nb of valid id: ",len(data_searched))

def no_result_id(data):
  """ 
    Take 100k random data from the corpus that have no quey search

    Parameters
    ----------
    data : <class 'set'>

    Returns
    -------
    outputs : <class 'set'>
  
  """
  res = set(corpus.keys()) - data
  return set(random.sample(list(res),100000))

data_not_searched = no_result_id(data_searched)
print("100k data with no valid id: "+ str(len(data_not_searched))) #Sum of non valid id

def get_data(data, keys):
  """ 
    Take a union dataset from our previous function and return 
    our new corpus

    Parameters
    ----------
    data : <class 'dict'>
    keys : <class 'set'>

    Returns
    -------
    res_corpus : <class 'dict'>
  
  """
  res_corpus = corpus
  delete = set(corpus) - keys
  for keys in delete:
    del res_corpus[keys]
  return res_corpus

dataset = data_searched.union(data_not_searched)
data = get_data(corpus,dataset)
print(len(data)) #Sum of our new dataset

Nb of valid id:  14877
100k data with no valid id: 100000
114877


3. Embed the reduced corpus and the queries using the chosen model

For this part we will test 4 different model, and take our best model to test the ANN version

In [None]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('msmarco-distilroberta-base-v2')
model_mini_v3_12 = SentenceTransformer('msmarco-MiniLM-L-12-v3')
model_mini_v3_6 = SentenceTransformer('msmarco-MiniLM-L-6-v3')
model_roberta_v3 = SentenceTransformer('msmarco-roberta-base-v3')
L_model = [model,model_mini_v3_12,model_mini_v3_6,model_roberta_v3]

This next part take 25 Minutes with GPU in collabs because some models are really heavy 

In [12]:
corpus_value = [text["text"] for text in data.values()]

def load_model(model_list):
  """ 
    Load every model froom our list

    Parameters
    ----------
    model_list: <class 'list'>

    Returns
    -------
    res : <class 'list'>
  
  """
  res= []
  for model in model_list:
    corpus_embeddings = model.encode(corpus_value, convert_to_tensor=True)
    print("1 done")
    res.append(corpus_embeddings)
  return res

print(type(L_model))
List_model = load_model(L_model)
print(type(List_model))
queries_val = queries.values()

<class 'list'>
1 done
1 done
1 done
1 done
<class 'list'>


In [14]:
List_of_scores = []
List_of_times = [] 

def Map_time(model,corpus_emb,corpus_val,queries_val,mini):
  """ 
    Compute The MAP and time for every query
    
    Parameters
    ----------
    model: <class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
    corpus_emb: <class 'torch.Tensor'>
    corpus_val: <class 'list'>
    queries_val: <class 'dict_values'>
    mini: int  

  """
  top_k = min(mini,len(corpus_val))
  for query in queries_val:
    score = []
    ti = []
    start_time = time.time()
    query_emb= model.encode(query, convert_to_tensor=True)

    cos_scores = util.cos_sim(query_emb, corpus_emb)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    result = top_results[0].tolist()
    List_of_scores.append(result)
    end_time = time.time()
    ti = end_time-start_time
    List_of_times.append(ti)

for model,corp_emb in zip(L_model,List_model):
  Map_time(model,corp_emb,corpus_value,queries_val,100)

In [16]:
def print_result(queries_val):
  """ 
    Print our result for the differents models

    Parameters
    ----------
    queries_val: dict_values
  
  """
  cont = 0
  for query in queries_val:
    print("\n\n-------------------------------------------\n\n")
    print("Query: ",query)
    print("\n Score Model distilberta: ",sum(List_of_scores[cont])/100)
    print("Time Model distilberta: ",List_of_times[cont])
    print("\n Score Model mini_v3_12: ",sum(List_of_scores[400+cont])/100)
    print("Time Model distilberta: ",List_of_times[400+cont])
    print("\n Score Model mini_v3_6: ",sum(List_of_scores[800+cont])/100)
    print("Time Model distilberta: ",List_of_times  [800+cont])
    print("\n Score Model roberta v3: ",sum(List_of_scores[1200+cont])/100)
    print("Time Model distilberta: ",List_of_times[1200+cont])
    cont = cont + 1
print_result(queries_val)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m

Query:  circus mammals

 Score Model distilberta:  0.462085117995739
Time Model distilberta:  0.0138092041015625

 Score Model mini_v3_12:  0.34524727463722227
Time Model distilberta:  0.017473697662353516

 Score Model mini_v3_6:  0.3582668137550354
Time Model distilberta:  0.009717702865600586

 Score Model roberta v3:  0.2913888895511627
Time Model distilberta:  0.01878952980041504


-------------------------------------------


Query:  Works by Charles Rennie Mackintosh

 Score Model distilberta:  0.44340875923633577
Time Model distilberta:  0.01418924331665039

 Score Model mini_v3_12:  0.4206217816472054
Time Model distilberta:  0.022380590438842773

 Score Model mini_v3_6:  0.41722067564725873
Time Model distilberta:  0.010225534439086914

 Score Model roberta v3:  0.36951981782913207
Time Model distilberta:  0.02305746078491211


-------------------------------------------


Query:  Mov

After testing our models, we figure out that the first one is the best. It is the most cost-effictive between speed and efficiency.

##**Approximate nearest neighbours**

In [23]:
def ANN_faiss(corpus_emb,model,emb_size,cluster,probe,Scores):
  """ 
    Use the faiss approach to speed up the algorithms

    Parameters
    ----------
    model: <class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
    corpus_emb: <class 'torch.Tensor'>
    emb_size: int
    cluster: int
    probe: int
    Scores: list

  """

  embedding_size = emb_size    #Size of embeddings
  top_k_hits = 100        #Output k hits
  n_clusters = cluster

  quantizer = faiss.IndexFlatIP(embedding_size)
  index = faiss.IndexIVFFlat(quantizer, embedding_size, 
                             n_clusters, faiss.METRIC_INNER_PRODUCT)
  index.nprobe = probe

  #print("Start creating FAISS index")
  # First, we need to normalize vectors to unit length
  corpus_embeddings = corpus_emb.cpu()
  corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1)[:, None]
  # Then we train the index to find a suitable clustering
  index.train(corpus_embeddings)
  # Finally we add all embeddings to the index
  index.add(corpus_embeddings)

  for query in queries_val:
    start_time = time.time()
    question_embedding = model.encode(query)
    #FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    question_embedding = np.expand_dims(question_embedding, axis=0)
    # Search in FAISS. It returns a matrix with distances and corpus ids.
    distances, corpus_ids = index.search(question_embedding, top_k_hits)
    # We extract corpus ids and scores for the first query
    hits = [{'corpus_id': id, 'score': score} for id, score in zip(corpus_ids[0], distances[0])]
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    end_time = time.time()
    #print("\nInput question:", query)
    #print("\nResults (after {:.3f} seconds):".format(end_time-start_time))
    L_score = [d['score'] for d in hits]
    result = sum(L_score)/top_k_hits
    #print("Score: ",result)
    Scores.append(result)


Next we want to change differents parameters to see if we can get a good combination:
- First we will change the number of clusters
- Sexond we will change the number of neighbors

In [24]:
Clusters = [128,256,512,1024,2048]
for cluster in Clusters:
  Scores = []
  ANN_faiss(List_model[0],L_model[0],768,cluster,10,Scores)
  print("\n------------------------------------\n")
  print("Nb de cluster: ",cluster)
  print("Score: ",sum(Scores)/len(Scores))



------------------------------------

Nb de cluster:  128
Score:  0.44146069394759807

------------------------------------

Nb de cluster:  256
Score:  0.4390012654125689

------------------------------------

Nb de cluster:  512
Score:  0.43631949677914383

------------------------------------

Nb de cluster:  1024
Score:  0.43246338729113376

------------------------------------

Nb de cluster:  2048
Score:  0.42738040986880654


In [25]:
probe = [5,10,15,20]
for p in probe:
  Scores = []
  ANN_faiss(List_model[0],L_model[0],768,1024,p,Scores)
  print("\n------------------------------------\n")
  print("Nb de Neigbourgh: ",probe)
  print("Score: ",sum(Scores)/len(Scores))


------------------------------------

Nb de Neigbourgh:  [5, 10, 15, 20]
Score:  0.42030490662008496

------------------------------------

Nb de Neigbourgh:  [5, 10, 15, 20]
Score:  0.43246338729113376

------------------------------------

Nb de Neigbourgh:  [5, 10, 15, 20]
Score:  0.43757598299831174

------------------------------------

Nb de Neigbourgh:  [5, 10, 15, 20]
Score:  0.44026761355362837


We can observed that the model is more efficient with low cluster but also more efficient with high probe.