In [2]:
from seismic import SeismicIndex
import numpy as np
import json
import ir_measures
import ir_datasets
from ir_measures import *

Build the index

In [2]:

# Document collection handed to SeismicIndex.build (a .jsonl file).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
json_input_file = "/data4/lvenuta/splade/data/docs_anserini.jsonl"

# Build the index from the collection; no options are passed, so whatever
# configuration the library applies by default is used.
# NOTE(review): this is the expensive step — the recorded output reports 378
# (presumably seconds) just to read the collection.
index = SeismicIndex.build(json_input_file)


Building the index...
Configuration { pruning: GlobalThreshold { n_postings: 3500, max_fraction: 1.5 }, blocking: RandomKmeans { centroid_fraction: 0.1, min_cluster_size: 2, clustering_algorithm: RandomKmeansInvertedIndexApprox { doc_cut: 15 } }, summarization: EnergyPreserving { summary_energy: 0.4 }, knn: KnnConfiguration { nknn: 0, knn_path: None }, batched_indexing: None }
Reading the collection..
Number of rows: 8841823
Elapsed time to read the number of rows 10 s
Elapsed time to read the collection 378
	Distributing and Pruning postings 44 secs
	Number of posting lists: 28679
	Building summaries 23 secs


In [3]:
# Gather the summary figures first, then report them.
n_docs = index.len
avg_nnz = index.nnz / n_docs  # total non-zero entries divided by document count

print("Number of documents: ", n_docs)
print("Avg number of non-zero components: ", avg_nnz)
print("Dimensionality of the vectors: ", index.dim)

# Let the index print its own space-usage report.
index.print_space_usage_byte()

Number of documents:  8841823
Avg number of non-zero components:  119.96379434422064
Dimensionality of the vectors:  28679
Space Usage:
	Forward Index: 4313529200 Bytes
	Posting Lists: 2245566745 Bytes
	Knn: 0 Bytes
	Total: 6559095945 Bytes


Load queries

In [4]:
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
queries_path = "/data4/lvenuta/splade/data/queries_anserini.tsv"

# Despite the .tsv extension, the file is parsed as JSON Lines: one JSON object
# per line, each providing an 'id' and a 'vector' mapping (component -> value).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and blank lines (e.g. a trailing newline) are skipped instead of
# making json.loads raise.
with open(queries_path, 'r', encoding='utf-8') as f:
    queries = [json.loads(line) for line in f if line.strip()]

MAX_TOKEN_LEN = 30
# Fixed-width unicode dtype. NumPy silently truncates strings longer than
# MAX_TOKEN_LEN characters, so ids/components must fit within this width.
string_type  = f'U{MAX_TOKEN_LEN}'

queries_ids = np.array([q['id'] for q in queries], dtype=string_type)

# One array of component names and one parallel array of float32 values per
# query; both are taken from the same dict, so their orders line up.
query_components = [
    np.array(list(q['vector'].keys()), dtype=string_type) for q in queries
]
query_values = [
    np.array(list(q['vector'].values()), dtype=np.float32) for q in queries
]

Perform the search

In [6]:

# Run every loaded query against the index in a single batched call.
# The evaluation cell iterates `results` as one list of
# (query_id, score, doc_id) tuples per query.
results = index.batch_search(
    queries_ids=queries_ids,
    query_components=query_components,
    query_values=query_values,
    k=10,  # presumably the number of hits returned per query (evaluation uses RR@10) — TODO confirm
    query_cut=20,  # NOTE(review): Seismic search-time tuning knob; check the library docs for exact semantics
    heap_factor=0.7,  # NOTE(review): Seismic search-time tuning knob; check the library docs for exact semantics
    sorted=True,  # presumably asks for hits ordered by score — TODO confirm
    n_knn=0,  # the index build log shows nknn: 0, i.e. no kNN data was built
)

Evaluation

In [7]:
# Flatten the per-query result lists into ir_measures records. Each hit is
# unpacked as (query_id, score, doc_id) and passed to ScoredDoc in the order
# (query_id, doc_id, score), hence the reordering.
ir_results = [
    ir_measures.ScoredDoc(query_id, doc_id, score)
    for r in results
    for (query_id, score, doc_id) in r
]

# Relevance judgments for the 'msmarco-passage/dev/small' dataset.
qrels = ir_datasets.load('msmarco-passage/dev/small').qrels

# The measure is referenced through the module so this cell does not depend on
# the wildcard `from ir_measures import *` being in effect.
ir_measures.calc_aggregate([ir_measures.RR@10], qrels, ir_results)

{RR@10: 0.37996532041661046}