In [1]:
import ir_datasets
import ir_measures
import numpy as np
from kannolo import SparsePlainHNSWf16

### Load Data

In [2]:
# MS MARCO SPLADE (sparse)
# Directory holding the query arrays (components.npy, values.npy, offsets.npy).
# The trailing slash is required: file names are appended to it by plain
# string concatenation when the arrays are loaded.
queries_path = '/data3/silvio/datasets_numpy/queries/ms_marco_splade/'
# Location of the serialized index handed to SparsePlainHNSWf16.load.
# NOTE(review): the file name suggests efConstruction=2000, M=32 and an
# inner-product metric — confirm against the script that built the index.
# NOTE(review): both paths are absolute and machine-specific; adjust them
# before running the notebook on another host.
index_path = "/data3/silvio/indexes/kannolo/kannolo_sparse_efc_2000_m_32_metric_ip"

In [3]:
# Read the three query arrays from disk in one pass over their file names:
# components and values are indexed through the boundaries stored in offsets.
queries_components, queries_values, queries_offsets = (
    np.load(queries_path + array_file)
    for array_file in ('components.npy', 'values.npy', 'offsets.npy')
)

# Deserialize the prebuilt sparse HNSW index.
index = SparsePlainHNSWf16.load(index_path)

In [369]:
# Pick two queries by their position in the query arrays.
query_id_1 = 1500
query_id_2 = 5000

# Entry i of the offsets array marks where query i starts; entry i + 1 marks
# where it ends. Build each query's span once and reuse it for both arrays.
span_1 = slice(queries_offsets[query_id_1], queries_offsets[query_id_1 + 1])
span_2 = slice(queries_offsets[query_id_2], queries_offsets[query_id_2 + 1])

query_components_1 = queries_components[span_1]
query_values_1 = queries_values[span_1]
query_components_2 = queries_components[span_2]
query_values_2 = queries_values[span_2]

### Search Queries

In [370]:
# Set search parameters
# Both values are forwarded positionally to index.search (third and fourth
# argument respectively).
k = 10  # presumably the number of results returned per query — confirm against the kannolo docs
efSearch = 1000  # presumably the HNSW search-time candidate list size — confirm against the kannolo docs

In [371]:
# Perform search: one call per query; each call returns a (distances, ids) pair.
dists_1, ids_1 = index.search(query_components_1, query_values_1, k, efSearch)
dists_2, ids_2 = index.search(query_components_2, query_values_2, k, efSearch)

# Arrange the results as one row of k entries per query. The row width is
# taken from k rather than a literal 10, so changing k in the parameter cell
# cannot leave the reshape out of sync (which would raise or mis-shape rows).
dists_1 = dists_1.reshape(-1, k)
ids_1 = ids_1.reshape(-1, k)
dists_2 = dists_2.reshape(-1, k)
ids_2 = ids_2.reshape(-1, k)

### Collect Results

In [372]:
# ir_datasets identifier of the collection used for texts and relevance judgments.
ir_dataset_string = "msmarco-passage/dev/small"
# Load the dataset through the identifier above so the id lives in one place
# (previously the literal was repeated here and the variable was unused).
dataset = ir_datasets.load(ir_dataset_string)

In [373]:
# Materialise the query list once instead of re-reading the whole query
# iterator for every lookup.
# NOTE(review): this indexes queries_iter() positionally with the same ids
# used for the numpy query arrays — confirm both share the same ordering.
dev_queries = list(dataset.queries_iter())
query_passage_1 = dev_queries[query_id_1].text
query_passage_2 = dev_queries[query_id_2].text

In [374]:
# Positional view over the document collection.
documents_passages = dataset.docs_iter()[:]

# Turn the ids in the first (only) result row of each query into passage text.
# NOTE(review): ids returned by the index are used directly as positions in
# docs_iter() — confirm the index was built in collection order.
results_1 = [
    documents_passages[int(doc_position)].text
    for doc_position in ids_1[0]
]
results_2 = [
    documents_passages[int(doc_position)].text
    for doc_position in ids_2[0]
]

### Evaluation

In [375]:
# Metric used for the evaluation, parsed from its textual name (it is
# reported as RR@10 in the printed results).
# ir_measures is imported together with the other dependencies in the
# notebook's first cell, keeping all imports in one place.
ir_measure = ir_measures.parse_measure("MRR@10")

In [None]:
# Remapping the query ids for metric evaluation: translate the positional
# query indices into the dataset's own query_id values, which are the keys
# used to match against the relevance judgments.
# The query list is materialised once here (instead of once per lookup) so
# the cell stays self-contained without re-reading the iterator twice.
dev_queries = list(dataset.queries_iter())
real_query_id_1 = dev_queries[query_id_1].query_id
real_query_id_2 = dev_queries[query_id_2].query_id

In [None]:
# Convert the first result row of each query into ir_measures.ScoredDoc
# records: (query id, document id as a string, score).
# NOTE(review): the values returned by the index are passed as scores as-is —
# confirm that larger values mean "more relevant" for this index's metric.
results_for_metric_1 = [
    ir_measures.ScoredDoc(real_query_id_1, str(doc_id), score)
    for score, doc_id in zip(dists_1[0], ids_1[0])
]

results_for_metric_2 = [
    ir_measures.ScoredDoc(real_query_id_2, str(doc_id), score)
    for score, doc_id in zip(dists_2[0], ids_2[0])
]

In [None]:
# Relevance judgments (qrels) of the dataset.
qrels = dataset.qrels

# Keep only the judgments that belong to each of the two selected queries.
qrel_1 = [
    judgment
    for judgment in qrels
    if judgment.query_id == real_query_id_1
]
qrel_2 = [
    judgment
    for judgment in qrels
    if judgment.query_id == real_query_id_2
]

In [389]:
# Compute the MRR@10 metric for each query separately, reporting each value
# right after it has been computed.
metric_query_1 = ir_measures.calc_aggregate([ir_measure], qrel_1, results_for_metric_1)
print("Metric evaluation for query 1", metric_query_1)

metric_query_2 = ir_measures.calc_aggregate([ir_measure], qrel_2, results_for_metric_2)
print("Metric evaluation for query 2", metric_query_2)

Metric evaluation for query 1 {RR@10: 0.14285714285714285}
Metric evaluation for query 2 {RR@10: 0.0}


### Display Results

In [386]:
query_passage_1

'temperature in clearwater florida per month'

In [387]:
results_1

['Clearwater: Annual Weather Averages. July is the hottest month in Clearwater with an average temperature of 83 °F (28 °C) and the coldest is January at 58 °F (14 °C) with the most daily sunshine hours at 11 in July. The wettest month is June with an average of 133.3mm of rain. The best month to swim in the sea is in August when the average sea temperature is 86 °F (30 °C).',
 'Clearwater: Annual Weather Averages. July is the hottest month in Clearwater with an average temperature of 28 °C (83 °F) and the coldest is January at 14 °C (58 °F) with the most daily sunshine hours at 11 in July. The wettest month is June with an average of 133.3mm of rain. The best month to swim in the sea is in August when the average sea temperature is 30 °C (86 °F).',
 'Clearwater: Annual Weather Averages. July is the hottest month in Clearwater with an average temperature of 28°C (83°F) and the coldest is January at 14°C (58°F) with the most daily sunshine hours at 11 in July. The wettest month is June 

##### Sparse representations better capture the specific question thanks to word matching.

In [110]:
query_passage_2

'definition of dignity for kids'

In [111]:
results_2

['For many children, “ dignity ” is an unfamiliar term. You have a wonderful opportunity to talk to the children in your life about the dignity that resides in each of us and to help them develop the values and character traits that exemplify a dignified life.hildren need to hear in your own words what dignity means to you and why it is something you value. Talk about people you admire for living dignified lives. Public figures such as Mother Teresa and Martin Luther King, Jr. may first come to mind, but don’t overlook those in your immediate circle of family and friends.',
 'dignity. Possession of respect from others or self-respect; the life and rights of an individual being respected by the group. Some believe that a terminally ill patient or one in a vegetative state should be allowed to terminate their life; to die with dignity.',
 'Freebase(5.00 / 1 vote)Rate this definition: Dignity. Dignity is a term used in moral, ethical, legal, and political discussions to signify that a bei

##### In this case, sparse representations are fooled by the matching word: "kids" is associated with "children", so the result is considered relevant. However, the relevant document containing an actual definition of dignity is absent from the results.