# A detailed how-to of Seismic.

## This Jupyter notebook provides more detailed documentation on how to use Seismic and all of its functionality.

## For questions, feel free to open a GitHub issue.

In [11]:
# ONLY ONCE
!python -m pip install pyseismic-lsr




In [1]:
from seismic import SeismicIndex, SeismicDataset

# 1. Indexing

## 1.0 Building from in-memory vectors

### We can build an index from numpy vectors loaded in memory. 

In [2]:
import json
import numpy as np

import seismic
from seismic import SeismicDataset # -> SeismicDataset allows to incrementally add vectors and then build the SeismicIndex.

### Load the vectors into the dataset. 

## 1.1 Building from `jsonl` or `tar.gz`

### We can build the index either from a `jsonl` file or from a compressed `tar.gz` archive containing the `jsonl` file.

In [3]:
# Path of the jsonl file the index is built from.
json_input_file = "mMARCO/Dataset_Json/English/mMARCO_english_vectors_multi_unicoil_xlmr_topk.jsonl"
# Path of a tar.gz archive containing the jsonl file. Kept as an empty
# placeholder (fill in before use) so that the cell building from
# `compressed_input_file` does not fail with a NameError.
compressed_input_file = ""

### We can use the default configuration by specifying only the input file or choose each of the parameters.

In [2]:
# NOTE(review): possibly no longer needed? (translated from the original German note)
# NOTE(review): the recorded output of this cell is an ImportError: the installed
# `seismic` package does not export `SeismicModel`, so this cell cannot run as written.

from seismic import SeismicModel
import json

model = SeismicModel.from_pretrained("seismic-base")   # or multilingual model later

input_path = "mMARCO/Dataset_Json/English/mMARCO_collection_converted-english.jsonl"         # original file
output_path = "mMARCO/Dataset_Json/English/mMARCO_collection_converted-english_vectors.jsonl"  # same documents plus a "vector" field

# Only the first MAX_DOCS documents of the input file are processed.
MAX_DOCS = 10

# The encoding is given explicitly so that reading the (multilingual) text does
# not depend on the platform's default encoding.
with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
    for i, line in enumerate(fin):
        if i >= MAX_DOCS:
            break  # stop after MAX_DOCS documents
        doc = json.loads(line)
        text = doc["contents"]

        # compute sparse vector
        # NOTE(review): presumably returns a dict of term: weight (it must be
        # JSON-serializable for json.dumps below) — confirm against the model API.
        sparse_vec = model.encode({"contents": text})

        # add vector field
        doc["vector"] = sparse_vec

        # write one JSON object per line
        fout.write(json.dumps(doc) + "\n")

ImportError: cannot import name 'SeismicModel' from 'seismic' (/home/eli/Multilingual_IE_Seismic/.venv/lib/python3.10/site-packages/seismic/__init__.py)

In [None]:
# Build the index from the jsonl file, passing only the input path
# (no other build parameter is specified here).
index = SeismicIndex.build(json_input_file)

In [6]:
from carbontracker.tracker import CarbonTracker  # only used by the commented-out alternative below
from codecarbon import EmissionsTracker

#tracker1 = CarbonTracker(epochs=1, decimal_precision=8) # Carbontracker

# A single CodeCarbon tracker instance is enough; creating a second one would
# only discard the first.
tracker = EmissionsTracker() # CodeCarbon

tracker.start()
try:
    # Compute intensive code goes here
    index = SeismicIndex.build(json_input_file)
finally:
    # Always stop the tracker, even when the build raises.
    tracker.stop()


[codecarbon INFO @ 00:12:27] [setup] RAM Tracking...
[codecarbon INFO @ 00:12:27] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 00:12:29] CPU Model on constant consumption mode: AMD Ryzen 9 5900X 12-Core Processor
[codecarbon INFO @ 00:12:29] [setup] GPU Tracking...
[codecarbon INFO @ 00:12:29] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 00:12:29] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 00:12:29] >>> Tracker's metadata:
[codecarbon INFO @ 00:12:29]   Platform system: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35
[codecarbon INFO @ 00:12:29]   Python version: 3.10.12
[codecarbon INFO @ 00:12:29]   CodeCarbon version: 3.1.1
[codecarbon INFO @ 00:12

: 

In [None]:
# Build from the compressed archive with the build parameters spelled out.
# `compressed_input_file` must be defined (path of the tar.gz archive) before
# this cell is run.
index = SeismicIndex.build(
    compressed_input_file,
    n_postings=3500,
    centroid_fraction=0.1,
    min_cluster_size=2,
    summary_energy=0.4,
    batched_indexing=10_000_000,
)

### By setting the `nknn` parameter we can build the knn graph together with the index.

In [None]:
# Build the index from the jsonl file and, because `nknn` is given, build the
# knn graph together with it.
index = SeismicIndex.build(
    json_input_file,
    n_postings=3500,
    centroid_fraction=0.1,
    min_cluster_size=2,
    summary_energy=0.4,
    nknn=10,
    batched_indexing=10_000_000,
)

### If we also set `knn_path` (details on how to obtain it are given below), we can add a precomputed knn graph to the index. In this case, the `nknn` parameter allows us to add only a subset of the knn graph (with fewer neighbors).

In [None]:
# Placeholder: path of a previously saved knn graph (fill in before running).
knn_path = ""

# Build the index and attach the precomputed knn graph found at `knn_path`;
# `nknn` selects how many neighbours of that graph are added.
index = SeismicIndex.build(
    json_input_file,
    n_postings=3500,
    centroid_fraction=0.1,
    min_cluster_size=2,
    summary_energy=0.4,
    knn_path=knn_path,
    nknn=5,
    batched_indexing=10_000_000,
)

### Once the index is constructed, we can serialize and store it in a file.

In [4]:
# Destination of the serialized index. The relative path is written once;
# the earlier value had the same path pasted twice back-to-back.
index_path = "mMARCO/BM25_results/English_Index"

# `index` must already exist (run one of the build cells first); otherwise
# this line raises NameError.
index.save(index_path)

NameError: name 'index' is not defined

## 1.2 Loading

### We may want to load a serialized index to query it.

In [None]:
# Placeholder: set this to the path of a previously saved index.
# Note that this rebinds `index_path`, replacing any value set in an earlier cell.
index_path = ""

# Rebind `index` to the index deserialized from `index_path`.
index = SeismicIndex.load(index_path)

In [None]:
# Print a few basic statistics of the current `index`.
print("Number of documents: ", index.len)
# Average = total non-zero count divided by the number of documents.
print("Avg number of non-zero components: ", index.nnz / index.len)
print("Dimensionality of the vectors: ", index.dim)

# NOTE(review): presumably reports the index's space usage in bytes — confirm
# against the library documentation.
index.print_space_usage_byte()

# 2. kNN Graph

### Given an inverted index, we can build a knn graph and attach it to the index with the `build_knn` function. It is also possible to serialize the graph and link it to another index with the `load_knn` function.

In [None]:
# Placeholder: file the knn graph is written to (fill in before running).
knn_path = ""
# Value passed to build_knn (the number of neighbours, judging by the name).
nknn = 10

# Build the knn graph on the current index, then serialize it.
index.build_knn(nknn)
index.save_knn(knn_path)

### When adding the knn graph, we can either load the full knn graph or specify a subset of the neighbours to keep for each entry of the index.

In [None]:
# Placeholders: fill in before running.
index_path = ""  # NOTE(review): assigned here but not used in this cell
knn_path = ""

# Load the full knn graph stored at `knn_path` into the current `index`.
index.load_knn(knn_path)

In [None]:
# Number passed as second argument to load_knn to request a partial graph.
nknn = 5

# Load a partial graph: same file as before (`knn_path` comes from an earlier
# cell), restricted through `nknn`.
index.load_knn(knn_path, nknn)

# 3. Perform the search

### Prepare the data to perform the search

In [None]:
import numpy as np
import json

# Placeholder: path of the queries file (one JSON object per line).
file_path = ""

# Read every query. The encoding is explicit so that decoding does not depend
# on the platform default, and blank lines are skipped because json.loads
# would raise on them.
with open(file_path, 'r', encoding='utf-8') as f:
    queries = [json.loads(line) for line in f if line.strip()]

MAX_TOKEN_LEN = 30
# Fixed-width NumPy unicode dtype. NOTE: NumPy silently truncates any string
# longer than MAX_TOKEN_LEN characters when it is cast to this dtype, so
# raise MAX_TOKEN_LEN if ids or tokens can be longer.
string_type  = f'U{MAX_TOKEN_LEN}'

# Each query object provides an 'id' and a 'vector' mapping.
queries_ids = np.array([q['id'] for q in queries], dtype=string_type)

# One array per query: the keys of the 'vector' mapping as strings and the
# matching values as float32, in the same order.
query_components = [
    np.array(list(q['vector'].keys()), dtype=string_type) for q in queries
]
query_values = [
    np.array(list(q['vector'].values()), dtype=np.float32) for q in queries
]

### We can run a single search or a parallel batch search

In [None]:
# Search the index with the first query only.
# NOTE(review): the meaning of k / query_cut / heap_factor / n_knn is not
# visible in this notebook — see the Seismic documentation before tuning them.
results = index.search(
    query_id=str(queries_ids[0]),  # NumPy string converted to a plain Python str
    query_components=query_components[0],
    query_values=query_values[0],
    k=10,
    query_cut=20,
    heap_factor=0.7,
    n_knn=0,
    sorted=True,  # passed explicitly even though it is the default value
)

In [None]:
# Search the index with all queries at once; `results` is rebound to the
# output of the batch call.
# NOTE(review): the meaning of k / query_cut / heap_factor / n_knn is not
# visible in this notebook — see the Seismic documentation before tuning them.
results = index.batch_search(
    queries_ids=queries_ids,
    query_components=query_components,
    query_values=query_values,
    k=10,
    query_cut=20,
    heap_factor=0.7,
    n_knn=0,
    sorted=True,  # passed explicitly even though it is the default value
    num_threads=1,
)

# 4. Evaluation of results



### Evaluation of the results with the `ir_measures` library

In [None]:
import ir_measures
import ir_datasets

# add your ir_dataset dataset string id below, e.g., "beir/quora/test"
ir_dataset_string = ""

# Flatten the per-query result lists into one list of ScoredDoc records.
# Each hit is unpacked as (query_id, score, doc_id) and re-ordered to the
# (query_id, doc_id, score) argument order used for ScoredDoc.
ir_results = [
    ir_measures.ScoredDoc(qid, did, sc)
    for hits in results
    for (qid, sc, did) in hits
]
# Relevance judgements of the selected dataset.
qrels = ir_datasets.load(ir_dataset_string).qrels

In [None]:
# Metric to compute, written as a string in ir_measures' naming scheme.
measure_to_compute = "RR@10"

# The string is parsed into a measure object explicitly, which avoids the
# namespace-polluting `from ir_measures import *`; `ir_measures` itself is
# imported in the previous cell.
ir_measures.calc_aggregate(
    [ir_measures.parse_measure(measure_to_compute)],
    qrels,
    ir_results,
)

# 5. Raw Seismic Index


### Raw Seismic Index: input a file in the Seismic internal format, i.e., as the plain Rust index. See how to use the script `scripts/convert_json_to_inner_format.py`

In [None]:
from seismic import SeismicIndexRaw

In [None]:
# Placeholder: path of the input file in the Seismic internal format.
input_path = ""

# Rebinds `index` to a raw index built from `input_path`.
index = SeismicIndexRaw.build(input_path)

In [None]:
# Placeholder: path of the queries file; for the raw index the queries are
# passed as a file path rather than as in-memory arrays.
query_path = ""

# NOTE(review): the meaning of k / query_cut / heap_factor / n_knn is not
# visible in this notebook — see the Seismic documentation before tuning them.
results = index.batch_search(
    query_path,
    k=10,
    query_cut=3,
    heap_factor=0.9,
    n_knn=0,
    sorted=True
)