In [1]:
from seismic import SeismicIndex

## Indexing

### Building

We can build the index either from a jsonl file or from a compressed `.tar.gz` archive containing the jsonl file.

In [2]:
# Root directory holding the input collection; change this single value to
# point the cell at a different location.
DATA_DIR = "/data4/lvenuta/splade/data"

# Collection as a plain jsonl file.
json_input_file = f"{DATA_DIR}/docs_anserini.jsonl"
# Compressed .tar.gz archive containing the jsonl file.
compressed_input_file = f"{DATA_DIR}/documents.tar.gz"

We can use the default configuration by specifying only the input file, or set each parameter explicitly.

In [3]:
# Build with the default configuration: only the input file is given.
index = SeismicIndex.build(json_input_file)


Building the index...
Configuration { pruning: GlobalThreshold { n_postings: 3500, max_fraction: 1.5 }, blocking: RandomKmeans { centroid_fraction: 0.1, min_cluster_size: 2, clustering_algorithm: RandomKmeansInvertedIndexApprox { doc_cut: 15 } }, summarization: EnergyPreserving { summary_energy: 0.4 }, knn: KnnConfiguration { nknn: 0, knn_path: None }, batched_indexing: None }
Reading the collection..
Number of rows: 8841823
Elapsed time to read the number of rows 10 s
Elapsed time to read the collection 388
	Distributing and Pruning postings 41 secs
	Number of posting lists: 28679
	Building summaries 23 secs


In [4]:
# Build from the compressed archive, spelling out the indexing parameters
# explicitly. The first four values match those printed for the default
# configuration; batched_indexing is the only one that differs from it.
index = SeismicIndex.build(
    compressed_input_file,
    n_postings=3500,
    centroid_fraction=0.1,
    min_cluster_size=2,
    summary_energy=0.4,
    batched_indexing=10_000_000,
)


Building the index...
Configuration { pruning: GlobalThreshold { n_postings: 3500, max_fraction: 1.5 }, blocking: RandomKmeans { centroid_fraction: 0.1, min_cluster_size: 2, clustering_algorithm: RandomKmeansInvertedIndexApprox { doc_cut: 15 } }, summarization: EnergyPreserving { summary_energy: 0.4 }, knn: KnnConfiguration { nknn: 0, knn_path: None }, batched_indexing: Some(10000000) }
Reading the collection..
Number of rows: 8841823
Elapsed time to read the number of rows 114 s
Elapsed time to read the collection 615
	Distributing and Pruning postings 51 secs
	Number of posting lists: 28679
	Building summaries 22 secs


By setting the nknn parameter we can build the knn graph together with the index.

In [4]:
# Passing nknn asks the builder to compute the knn graph together with the
# index (the build log then shows an extra "Computing KNN" step).
index = SeismicIndex.build(
    json_input_file,
    n_postings=3500,
    centroid_fraction=0.1,
    min_cluster_size=2,
    summary_energy=0.4,
    nknn=10,
    batched_indexing=10_000_000,
)


Building the index...
Configuration { pruning: GlobalThreshold { n_postings: 3500, max_fraction: 1.5 }, blocking: RandomKmeans { centroid_fraction: 0.1, min_cluster_size: 2, clustering_algorithm: RandomKmeansInvertedIndexApprox { doc_cut: 15 } }, summarization: EnergyPreserving { summary_energy: 0.4 }, knn: KnnConfiguration { nknn: 10, knn_path: None }, batched_indexing: Some(10000000) }
Reading the collection..
Number of rows: 8841823
Elapsed time to read the number of rows 12 s
Elapsed time to read the collection 380648795
	Distributing and Pruning postings 51 secs
	Number of posting lists: 28679
	Building summaries 23 secs
	Computing KNN
572 secs


If we also set knn_path, a precomputed knn graph is added to the index instead.
In this case, the nknn parameter allows us to add only a subset of the knn graph (with fewer neighbors per entry).

In [3]:
# Precomputed knn graph to attach to the index while building it.
knn_path = "/data4/lvenuta/splade/wrapped_data/splade.3500_10.knn.seismic"

# With knn_path set, nknn selects how many neighbours per entry are taken
# from the file (here 5 out of the 10 it stores, as the build log reports).
index = SeismicIndex.build(
    json_input_file,
    n_postings=3500,
    centroid_fraction=0.1,
    min_cluster_size=2,
    summary_energy=0.4,
    knn_path=knn_path,
    nknn=5,
    batched_indexing=10_000_000,
)

Number of threads in Rayon pool: 64

Building the index...
Configuration { pruning: GlobalThreshold { n_postings: 3500, max_fraction: 1.5 }, blocking: RandomKmeans { centroid_fraction: 0.1, min_cluster_size: 2, clustering_algorithm: RandomKmeansInvertedIndexApprox { doc_cut: 15 } }, summarization: EnergyPreserving { summary_energy: 0.4 }, knn: KnnConfiguration { nknn: 5, knn_path: Some("/data4/lvenuta/splade/wrapped_data/splade.3500_10.knn.seismic") }, batched_indexing: Some(10000000) }
Reading the collection..
Number of rows: 8841823
Elapsed time to read the number of rows 10 s
Elapsed time to read the collection 449
	Distributing and Pruning postings 53 secs
	Number of posting lists: 28679
	Building summaries 22 secs
Reading KNN from file: /data4/lvenuta/splade/wrapped_data/splade.3500_10.knn.seismic
Number of vectors: 8841823
Number of neighbors in the file: 10
We only take 5 neighbors per element
5 secs


Once the index is constructed, we can serialize and store it in a file.


In [None]:
# Destination for the serialized index.
# NOTE(review): the Loading section reads this same path with an
# ".index.seismic" suffix, so save() presumably appends it — confirm.
index_path = "/data4/lvenuta/splade/wrapped_data/splade.bin.3500_0.4_0.1"

index.save(index_path)

### Loading

We may load a serialized index to query it.

In [3]:
# Path of a previously serialized index.
index_path = "/data4/lvenuta/splade/wrapped_data/splade.bin.3500_0.4_0.1.index.seismic"

# Deserialize the index so that it can be queried.
index = SeismicIndex.load(index_path)

In [4]:
# Basic collection statistics exposed by the loaded index.
n_docs = index.len
avg_nnz = index.nnz / n_docs

print("Number of documents: ", n_docs)
print("Avg number of non-zero components: ", avg_nnz)
print("Dimensionality of the vectors: ", index.dim)

# Breakdown of the space used by each part of the index, in bytes.
index.print_space_usage_byte()

Number of documents:  8841823
Avg number of non-zero components:  119.96379434422064
Dimensionality of the vectors:  28679
Space Usage:
	Forward Index: 4313529200 Bytes
	Posting Lists: 2169336756 Bytes
	Knn: 0 Bytes
	Total: 6482865956 Bytes


### KNN Graph

Given an inverted index, we can build a knn graph and attach it to the index with the build_knn function.
It is also possible to serialise the graph and then link it to another index with the load_knn function.

In [None]:
# Number of neighbours passed to build_knn.
nknn = 10
# Base path for the serialized graph.
# NOTE(review): later cells read this path with a ".knn.seismic" suffix, so
# save_knn() presumably appends it — confirm.
knn_path = "/data4/lvenuta/splade/wrapped_data/splade.3500_10"

# Build the graph on the current index, then persist it.
index.build_knn(nknn)
index.save_knn(knn_path)

When adding the knn graph, we can either load the full graph or specify how many neighbours to keep for each entry of the index.

In [7]:
# NOTE(review): index_path is assigned here but not used in this cell.
index_path = "/data4/lvenuta/splade/wrapped_data/splade.bin.3500_0.4_0.1.index.seismic"
knn_path = "/data4/lvenuta/splade/wrapped_data/splade.3500_10.knn.seismic"

# Load the full knn graph.
# NOTE(review): the prose above calls this function load_knn — confirm the binding name.
index.load_knn2(knn_path)


Reading KNN from file: /data4/lvenuta/splade/wrapped_data/splade.3500_10.knn.seismic
Number of vectors: 8841823
Number of neighbors in the file: 10


In [8]:
# Number of neighbours to keep per entry (a subset of the 10 stored in the file).
nknn = 5
# Load a partial graph, limited to nknn neighbours per entry.
index.load_knn2(knn_path, nknn)

Reading KNN from file: /data4/lvenuta/splade/wrapped_data/splade.3500_10.knn.seismic
Number of vectors: 8841823
Number of neighbors in the file: 10
We only take 5 neighbors per element


### Perform the search

Prepare the data to perform the search

In [9]:
import json

import numpy as np

# NOTE(review): the file carries a .tsv extension but is parsed as JSON Lines
# (one JSON object per line).
file_path = "/data4/lvenuta/splade/data/queries_anserini.tsv"

# Read with an explicit UTF-8 encoding rather than the platform default, and
# skip blank lines (e.g. a trailing newline) so json.loads does not raise.
with open(file_path, "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f if line.strip()]

# Fixed-width unicode dtype used for ids and tokens; numpy silently truncates
# strings longer than MAX_TOKEN_LEN characters.
MAX_TOKEN_LEN = 30
string_type = f"U{MAX_TOKEN_LEN}"

queries_ids = np.array([q["id"] for q in queries], dtype=string_type)

# One array of tokens and one array of float32 weights per query; the two
# lists are aligned by position with queries_ids.
query_components = []
query_values = []

for query in queries:
    vector = query["vector"]
    query_components.append(np.array(list(vector.keys()), dtype=string_type))
    query_values.append(np.array(list(vector.values()), dtype=np.float32))

We can run a single search or a parallel batch search.

In [11]:
# Single-query search using the first prepared query.
results = index.search(
    query_id=str(queries_ids[0]),  # numpy string scalar converted to a plain Python str
    query_components=query_components[0],
    query_values=query_values[0],
    k=10,
    query_cut=20,
    heap_factor=0.7,
    n_knn=0,
    sorted=True,
)

In [15]:

# Batch search over all prepared queries, with the same search parameters as
# the single-query call.
# NOTE(review): num_threads=1 presumably runs the batch on one thread; raise it
# for an actually parallel search — confirm against the bindings.
results = index.batch_search(
    queries_ids=queries_ids,
    query_components=query_components,
    query_values=query_values,
    k=10,
    query_cut=20,
    heap_factor=0.7,
    n_knn=0,
    sorted=True,
    num_threads=1,
)

## Evaluation of results



Evaluation of the results with the ir_measures package for the chosen dataset.

In [16]:
import ir_measures
import ir_datasets

# Relevance judgements of the dataset used for the evaluation.
qrels = ir_datasets.load('msmarco-passage/dev/small').qrels

# Flatten the per-query result lists into ScoredDoc records. Each hit is
# unpacked as (query_id, score, doc_id), while ScoredDoc is built as
# (query_id, doc_id, score), hence the reordering.
ir_results = [
    ir_measures.ScoredDoc(query_id, doc_id, score)
    for per_query_hits in results
    for (query_id, score, doc_id) in per_query_hits
]


In [17]:
# Import only the measure that is used; a star import would pull every public
# name of ir_measures into the notebook namespace and hide where RR comes from.
from ir_measures import RR

# Reciprocal rank at cutoff 10, aggregated over all queries.
ir_measures.calc_aggregate([RR@10], qrels, ir_results)

{RR@10: 0.3798996566152729}

# Raw Seismic Index


The input file is in Seismic's inner format (this means that we have to provide a method to produce documents and queries in that format).

In [3]:
from seismic import SeismicIndexRaw

In [4]:
# Collection already encoded in the Seismic inner format.
input_path = "/data4/lvenuta/splade/data/documents.bin"

# Only the input file is given, so the configuration is left to the builder.
index = SeismicIndexRaw.build(input_path)


Building the index...
Configuration { pruning: GlobalThreshold { n_postings: 3500, max_fraction: 1.5 }, blocking: RandomKmeans { centroid_fraction: 0.1, min_cluster_size: 2, clustering_algorithm: RandomKmeansInvertedIndexApprox { doc_cut: 15 } }, summarization: EnergyPreserving { summary_energy: 0.4 }, knn: KnnConfiguration { nknn: 0, knn_path: None }, batched_indexing: None }
	Distributing and Pruning postings 43 secs
	Number of posting lists: 28679
	Building summaries 23 secs


In [7]:
# Queries already encoded in the Seismic inner format.
query_path = "/data4/lvenuta/splade/data/queries.bin"

# Batch search that takes the queries file directly rather than in-memory
# arrays.
results = index.batch_search(
    query_path,
    k=10,
    query_cut=3,
    heap_factor=0.9,
    n_knn=0,
    sorted=True,
)