# First Experiments With Toy Dataset
# (Following the instructions in SeismicGuide.ipynb)

In [1]:
from seismic import SeismicIndex, SeismicDataset # use python 3.12.0 as kernel


# 1. Indexing

## 1.0 Building from in-memory vectors

### We can build an index from numpy vectors loaded in memory. 

In [2]:
import json
import numpy as np

import seismic
from seismic import SeismicDataset # -> SeismicDataset allows to incrementally add vectors and then build the SeismicIndex.

### Load the vectors into the dataset. 

In [3]:
document_path = "./toy_dataset/documents.jsonl"  # JSONL file: one document per line

dataset = SeismicDataset()
string_type = seismic.get_seismic_string()  # dtype used for the token (component) arrays

# Parse every JSONL record and add its sparse vector to the dataset.
# From each record only "id" and "vector" (a token -> weight mapping) are read.
# The encoding is given explicitly: without it open() falls back to the platform
# default (e.g. cp1252 on Windows), which can mis-decode non-ASCII tokens.
with open(document_path, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. an empty line at the end of the file
        line_data = json.loads(line)
        vector = line_data["vector"]
        # keys() and values() of the same dict iterate in the same order,
        # so ks[i] is the token whose weight is vs[i].
        ks = np.array(list(vector.keys()), dtype=string_type)
        vs = np.array(list(vector.values()), dtype=np.float32)
        dataset.add_document(str(line_data["id"]), ks, vs)

In [4]:
# Print the first 10 raw lines of the document file to check what is in the dataset.
# The encoding is explicit so the preview does not depend on the platform default.
with open(document_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        print(line)
        if line_number >= 10:
            break  # stop after the first 10 lines

{"id": 0, "content": "The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.", "vector": {"was": 0.6476961374282837, "were": 0.2976420819759369, "also": 0.045809537172317505, "their": 0.022212404757738113, "said": 0.04674193635582924, "over": 0.7201237678527832, "only": 0.7370892763137817, "people": 0.1946154683828354, "many": 0.5773153901100159, "because": 0.5232481360435486, "war": 0.02412080205976963, "life": 0.4349733293056488, "thought": 0.6931471824645996, "went": 0.09228714555501938, "york": 0.9680526852607727, "within": 0.175953671336174, "great": 0.09584271162748337, "started": 0.18654440343379974, "death": 0.43244171142578125, "died": 0.055094651877880096, "wanted": 0.0009760859538801014, "army": 0.13840232789

### Build the SeismicIndex from the SeismicDataset

In [None]:
# Build the index and measure how long the build takes.
import time  # standard-library timing utilities

# perf_counter() is a monotonic clock with the highest available resolution and is
# intended for measuring short durations. time.time() reports wall-clock time, which
# may have a coarser resolution and can jump when the system clock is adjusted.
start_time = time.perf_counter()

# The index is built with default parameters; check them with
# help(SeismicIndex.build_from_dataset).
index = SeismicIndex.build_from_dataset(dataset)

# End the timer
end_time = time.perf_counter()

# Calculate and print the duration
duration = end_time - start_time
print(f"\nIndex completed in {duration:.8f} seconds ({duration*1000:.8f} milliseconds).")

# NOTE: a single measurement is noisy. For a stable figure, repeat the build several
# times and report the mean, e.g. six earlier runs gave
# (12.01 + 12.01 + 11.01 + 11.03 + 11.00 + 12.01) / 6 = 11.51 ms on average.


Index completed in 0.01201081 seconds (12.01081276 milliseconds).


In [16]:
# Store the index on disk so that its size can be checked. With the toy dataset the
# stored index is 772,285 bytes (the KB figure is typically rounded, so compare bytes).
# NOTE(review): this is an absolute, machine-specific path; adjust it before running
# the notebook elsewhere.
# The loading cell reads "TestIndex.index.seismic", so save() presumably appends the
# ".index.seismic" suffix to this path - TODO confirm.

index_path = "C:/Users/elisa/OneDrive/Dokumente/VS_Studio_Project/Index/TestIndex"

index.save(index_path)

print(f"Index saved to: {index_path}")

Index saved to: C:/Users/elisa/OneDrive/Dokumente/VS_Studio_Project/Index/TestIndex


## 1.2 Loading

### We may want to load a serialized index to query it.

In [19]:
# Path of the serialized index file. Unlike the path passed to index.save(), it
# carries the ".index.seismic" suffix.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
index_path = "C:/Users/elisa/OneDrive/Dokumente/VS_Studio_Project/Index/TestIndex.index.seismic"

index = SeismicIndex.load(index_path) # replaces `index` with the previously stored index

In [20]:
# Summarise the loaded index: document count, average sparsity and dimensionality.
n_docs = index.len
print("Number of documents: ", n_docs)
print("Avg number of non-zero components: ", index.nnz / n_docs)
print("Dimensionality of the vectors: ", index.dim)

# Ask the index to report its own space usage.
index.print_space_usage_byte()

Number of documents:  20
Avg number of non-zero components:  145.0
Dimensionality of the vectors:  1396


# 3. Perform the search

### Prepare the data to perform the search

In [23]:
# Load the toy queries and convert them into the arrays passed to the search calls.
# NOTE(review): queries for another collection (e.g. MS MARCO) would need the same
# JSONL layout with "id" and "vector" fields - confirm before swapping datasets.

import numpy as np
import json

file_path = "./toy_dataset/queries.jsonl"  # "." is the current directory, ".." is one level up

queries = []
# The encoding is explicit: without it open() uses the platform default
# (e.g. cp1252 on Windows), which can mis-decode non-ASCII tokens.
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an empty line at the end of the file
        queries.append(json.loads(line))

MAX_TOKEN_LEN = 30
# Fixed-width unicode dtype; NumPy silently truncates strings longer than this width.
# NOTE(review): the document-loading cell obtains its dtype from
# seismic.get_seismic_string(); presumably both must agree - confirm.
string_type = f'U{MAX_TOKEN_LEN}'

queries_ids = np.array([q['id'] for q in queries], dtype=string_type)

# One array of tokens and one array of weights per query. keys() and values() of the
# same dict iterate in the same order, so the two arrays are aligned.
query_components = [np.array(list(q['vector'].keys()), dtype=string_type) for q in queries]
query_values = [np.array(list(q['vector'].values()), dtype=np.float32) for q in queries]

### We can run a single search or a parallel batch search

In [24]:
# Run a single search using the first query of the loaded query set.
# NOTE(review): the parameter descriptions below are assumptions - confirm them with
# help(index.search).
results = index.search(
    query_id=str(queries_ids[0]),
    query_components=query_components[0],
    query_values=query_values[0],
    k=10, # presumably the number of results to return
    query_cut=20, # presumably how many query components are considered
    heap_factor=0.7, # presumably tunes the accuracy/speed trade-off
    n_knn=0,
    sorted=True, # specified explicitly even though it is the default value
)

In [25]:
# Run all loaded queries in one batch call. This rebinds `results`, replacing the
# value assigned by the single-search cell.
# NOTE(review): the parameter descriptions below are assumptions - confirm them with
# help(index.batch_search).
results = index.batch_search(
    queries_ids=queries_ids,
    query_components=query_components,
    query_values=query_values,
    k=10, # presumably the number of results to return per query
    query_cut=20, # presumably how many query components are considered
    heap_factor=0.7, # presumably tunes the accuracy/speed trade-off
    n_knn=0,
    sorted=True, # specified explicitly even though it is the default value
    num_threads=1,
)

# 4. Evaluation of results



### Evaluation of the results with the `ir_measures` library

In [26]:
# NOTE: the qrels loaded below are the QUORA ground truth. The results above come from
# the toy dataset, so the evaluation is only meaningful if the index was built on QUORA;
# presumably this mismatch is why the metric cell reports 0.0.
import ir_measures
import ir_datasets

# add your ir_dataset dataset string id below, e.g., "beir/quora/test"
ir_dataset_string = "beir/quora/test" # TODO: choose qrels that match the indexed collection

# Flatten the per-query result lists into ScoredDoc records. Each result entry is
# unpacked as (query_id, score, doc_id), while ScoredDoc takes (query_id, doc_id, score).
ir_results = [ir_measures.ScoredDoc(query_id, doc_id, score) for r in results for (query_id, score, doc_id) in r]
qrels = ir_datasets.load(ir_dataset_string).qrels

In [27]:
# Import only the measure that is used. `from ir_measures import *` hides where names
# come from and can silently shadow variables already defined in the notebook.
from ir_measures import RR

# RR@10 is a measure object (reciprocal rank with a cutoff of 10), not the string "RR@10".
measure_to_compute = RR@10
# Last expression of the cell, so the aggregated score is displayed as the cell output.
ir_measures.calc_aggregate([measure_to_compute], qrels, ir_results)

{RR@10: 0.0}