In [1]:
# Replace with your tenant name from the Vespa Cloud Console
TENANT_NAME = "projetos-cd-msmarco"
# Replace with your application name (does not need to exist yet)
APP_NAME = "hybridsearch"
SCHEMA_NAME = "doc"

In [21]:
# Precomputed minimum and maximum values for fusion weights
BM25_MIN = 1.4562415365049788
BM25_MAX = 68.19806521677549
BM25_RANGE = BM25_MAX - BM25_MIN
DOT_PRODUCT_MIN = 41.41029357910156
DOT_PRODUCT_MAX = 66.20972442626953
DOT_PRODUCT_RANGE = DOT_PRODUCT_MAX - DOT_PRODUCT_MIN
CLOSENESS_MIN = 0.5747992823289696
CLOSENESS_MAX = 0.7822561412418744
CLOSENESS_RANGE = CLOSENESS_MAX - CLOSENESS_MIN

In [22]:
from vespa.package import (
    ApplicationPackage,
    Field,
    Schema,
    Document,
    HNSW,
    RankProfile,
    OnnxModel,
    Component,
    Parameter,
    FieldSet,
    SecondPhaseRanking,
    GlobalPhaseRanking,
    Function,
)

class MySchema(Schema):
    @property
    def schema_to_text(self):
        og_text = super().schema_to_text
        # Conserta o caminho do modelo ONNX para usar barras normais
        og_text = og_text.replace("\\", "/")
        return og_text

fusion_dot_product_expression = (
    f"(bm25(text) - {BM25_MIN}) / {BM25_RANGE} + "
    f"(dot_product - {DOT_PRODUCT_MIN}) / {DOT_PRODUCT_RANGE}"
)
fusion_closeness_expression = (
    f"(bm25(text) - {BM25_MIN}) / {BM25_RANGE} + "
    f"(closeness(field, embedding) - {CLOSENESS_MIN}) / {CLOSENESS_RANGE}"
)

schema = MySchema(
    name=SCHEMA_NAME,
    document=Document(
        fields=[
            Field(
                name="id",
                type="string",
                indexing=[
                    "summary",
                    "attribute",
                ],
            ),
            Field(
                name="text",
                type="string",
                indexing=[
                    "summary",
                    "index",
                ],
                index="enable-bm25",
                bolding=True,
            ),
            Field(
                name="text_token_ids",
                type="tensor<float>(d0[64])",
                indexing=[
                    "input text",
                    "embed tokenizer",
                    "attribute",
                ],
                attribute=["paged"],
                is_document_field=False,
            ),
            Field(
                name="embedding",
                type="tensor<float>(x[384])",
                indexing=[
                    "input text",
                    "embed e5",
                    "attribute",
                    "index",
                ],
                ann=HNSW(
                    distance_metric="angular",
                    max_links_per_node=32,
                    neighbors_to_explore_at_insert=400,
                ),
                is_document_field=False,
            ),
            Field(
                name="colbert",
                type="tensor<int8>(dt{}, x[16])",
                indexing=[
                    "input text",
                    "embed colbert",
                    "attribute",
                ],
                attribute=["paged"],
                is_document_field=False,
            ),
        ]
    ),
    fieldsets=[FieldSet(name="default", fields=["text"])],
    rank_profiles=[
        RankProfile(
            name="bm25",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
            ],
            first_phase="bm25(text)",
        ),
        RankProfile(
            name="closeness",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
            ],
            first_phase="closeness(field, embedding)",
        ),
        RankProfile(
            name="dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
            ],
            functions=[
                Function(name="dot_product", expression="sum(query(q) * attribute(embedding))"),
            ],
            first_phase="dot_product",
        ),
        RankProfile(
            name="fusion-dot-product",
            inherits="dot-product",
            functions=[
                Function(name="fusion_dot_product", expression=fusion_dot_product_expression),
            ],
            first_phase="fusion_dot_product",
        ),
        RankProfile(
            name="fusion-closeness",
            inherits="closeness",
            functions=[
                Function(name="fusion_closeness", expression=fusion_closeness_expression),
            ],
            first_phase="fusion_closeness",
        ),
        RankProfile(
            name="fusion-dot-product-second-phase",
            inherits="fusion-dot-product",
            first_phase="dot_product",
            second_phase=SecondPhaseRanking(
                expression="fusion_dot_product",
                rerank_count=1000,
            ),
        ),
        RankProfile(
            name="fusion-closeness-second-phase",
            inherits="fusion-closeness",
            first_phase="closeness(field, embedding)",
            second_phase=SecondPhaseRanking(
                expression="fusion_closeness",
                rerank_count=1000,
            ),
        ),
        RankProfile(
            name="bm25-colbert",
            inherits="fusion-dot-product-colbert",
            first_phase="bm25(text)",
        ),
        RankProfile(
            name="closeness-colbert",
            inherits="fusion-dot-product-colbert",
            first_phase="closeness(field, embedding)",
        ),
        RankProfile(
            name="dot-product-colbert",
            inherits="fusion-dot-product-colbert",
            first_phase="dot_product",
        ),
        RankProfile(
            name="fusion-dot-product-colbert",
            inherits="fusion-dot-product",
            inputs=[
                ("query(qt)", "tensor<float>(qt{},x[128])"),
                ("query(q)", "tensor<float>(x[384])"),
            ],
            functions=[
                Function(name="cos_sim", expression="cos(distance(field, embedding))"),
                Function(name="max_sim", expression="sum(reduce(sum(query(qt) * unpack_bits(attribute(colbert)), x), max, dt),qt)"),
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="max_sim",
                rerank_count=100,
            ),
            match_features=["max_sim", "cos_sim"],
        ),
        RankProfile(
            name="fusion-closeness-colbert",
            inherits="fusion-dot-product-colbert",
            functions=[
                Function(name="fusion_closeness", expression=fusion_closeness_expression),
            ],
            first_phase="fusion_closeness",
        ),
        RankProfile(
            name="bm25-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            first_phase="bm25(text)",
        ),
        RankProfile(
            name="closeness-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            first_phase="closeness(field, embedding)",
        ),
        RankProfile(
            name="dot-product-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            first_phase="dot_product",
        ),
        RankProfile(
            name="fusion-dot-product-cross-encoder",
            inherits="fusion-dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
                ("query(query_token_ids)",  "tensor<float>(d0[32])"),
            ],
            functions=[
                Function(name="input_ids", expression="tokenInputIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="token_type_ids", expression="tokenTypeIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="attention_mask", expression="tokenAttentionMask(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="cross_encoder", expression="onnx(cross_encoder_model){d0:0,d1:0}")
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="cross_encoder",
                rerank_count=100
            )
        ),
        RankProfile(
            name="fusion-closeness-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            functions=[
                Function(name="fusion_closeness", expression=fusion_closeness_expression),
            ],
            first_phase="fusion_closeness",
        ),
    ],
    models=[
        OnnxModel(
            model_name="cross_encoder_model",
            model_file_path="models/model.onnx",
            inputs={
                "input_ids": "input_ids",
                "attention_mask": "attention_mask",
                "token_type_ids": "token_type_ids",
            },
            outputs={"logits": "logits"},
        ),
    ],
)

package = ApplicationPackage(
    name=APP_NAME,
    schema=[
        schema
    ],
    components=[
        Component(
            id="e5",
            type="hugging-face-embedder",
            parameters=[
                Parameter(
                    "transformer-model",
                    {
                        "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                    },
                ),
                Parameter(
                    "tokenizer-model",
                    {
                        "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
        Component(
            id="colbert",
            type="colbert-embedder",
            parameters=[
                Parameter(
                    "transformer-model",
                    {
                        "url": "https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"
                    },
                ),
                Parameter(
                    "tokenizer-model",
                    {
                        "url": "https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
        Component(
            id="tokenizer",
            type="hugging-face-tokenizer",
            parameters=[
                Parameter(
                    "model",
                    {
                        "url": "https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
    ],
)

In [3]:
import requests
import os

# Check if model file already exists
if os.path.exists("models/model.onnx"):
    print("Model file already exists. Skipping download.")
else:
    # Download the ONNX model file
    url = "https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2/resolve/main/onnx/model.onnx"
    output_path = "models/model.onnx"

    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Download the file with following redirects
    response = requests.get(url, allow_redirects=True)

    # Save the file
    with open(output_path, 'wb') as f:
        f.write(response.content)

    print(f"Model saved to {output_path}")

Model file already exists. Skipping download.


In [25]:
from vespa.deployment import VespaCloud

vespa_cloud = VespaCloud(
    tenant=TENANT_NAME,
    application=APP_NAME,
    application_package=package,
)

Setting application...
Running: vespa config set application projetos-cd-msmarco.hybridsearch.default
Setting target cloud...
Running: vespa config set target cloud

No api-key found for control plane access. Using access token.
Checking for access token in auth.json...
Successfully obtained access token for control plane access.


In [None]:
# Usa .deploy() na primeira vez e .get_application() nas próximas vezes
app = vespa_cloud.deploy()

In [None]:
endpoint = vespa_cloud.get_mtls_endpoint()
endpoint

In [5]:
import random
import ir_datasets
import pickle
import re

In [6]:
random.seed(42)

def load_dataset(input_file):
    with open(input_file, 'rb') as f:
        return pickle.load(f)

data_set = "subset_msmarco_train_0/subset_msmarco_train_0.01_99.pkl"

data = load_dataset(data_set)
queries = data["queries"]
documents = data["docs"]

# Split the queries (queries is a dictionary of {query_id: query_object})
query_ids = list(queries.keys())  # List of query IDs

# Shuffle query IDs to ensure a random split
random.shuffle(query_ids)

# Split into 80% for training, 20% for validation
split_ratio = 0.8
train_query_ids = query_ids[:int(len(query_ids) * split_ratio)]
test_query_ids = query_ids[int(len(query_ids) * split_ratio):]

train_queries = {qid: queries[qid] for qid in train_query_ids}
test_queries = {qid: queries[qid] for qid in test_query_ids}

In [10]:
def remove_control_characters(text):
    # Remove control characters using regex
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

In [11]:
vespa_cli_feed = [
    {
        "put": f"id:{APP_NAME}:{SCHEMA_NAME}::{doc.doc_id}",
        "fields": {
            "text": remove_control_characters(doc.text),
            "id": doc.doc_id,
        }
    }
    for doc in documents.values()
]

# vespa_cli_feed = vespa_cli_feed[:1000]  # Limit to first 1000 documents for testing

# write to jsonl file
import json
import os

def write_jsonl(data, filename):
    with open(filename, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
os.makedirs("data", exist_ok=True)
write_jsonl(vespa_cli_feed, "data/vespa_cli_feed.jsonl")

In [12]:
output_list = !vespa feed data/vespa_cli_feed.jsonl
results = json.loads("".join(output_list))
print(results)

{'feeder.operation.count': 277168, 'feeder.seconds': 42400.531, 'feeder.ok.count': 277168, 'feeder.ok.rate': 6.537, 'feeder.error.count': 0, 'feeder.inflight.count': 0, 'http.request.count': 277173, 'http.request.bytes': 95651198, 'http.request.MBps': 0.002, 'http.exception.count': 0, 'http.response.count': 277173, 'http.response.bytes': 37069725, 'http.response.MBps': 0.001, 'http.response.error.count': 5, 'http.response.latency.millis.min': 141, 'http.response.latency.millis.avg': 2757, 'http.response.latency.millis.max': 5963, 'http.response.code.counts': {'200': 277168, '429': 5}}


In [27]:
import types
from typing import Dict, Iterable
from vespa.application import Vespa

def new_query_many(
        self,
        queries: Iterable[Dict],
        num_connections: int = 1,
        max_concurrent: int = 5,
        client_kwargs: Dict = {},
        **query_kwargs,
    ):
    return Vespa.query_many(
        self,
        queries=queries,
        num_connections=num_connections,
        max_concurrent=max_concurrent,
        client_kwargs=client_kwargs,
        **query_kwargs,
    )

# Patch the Vespa app instance to use the new query_many method
app.query_many = types.MethodType(new_query_many, app)

In [28]:
from vespa.evaluation import VespaEvaluator
from collections.abc import Callable

test_queries_dict = {
    q.query_id: q.text
    for q in test_queries.values()
}

relevant_docs = dict()
for qrel in data["qrels"]:
    relevant_docs[qrel.query_id] = relevant_docs.get(qrel.query_id, set())
    relevant_docs[qrel.query_id].add(qrel.doc_id)

def query_fn_lexical_bm25(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
    }

def query_fn_semantic_bm25(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_bm25(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
    }

def query_fn_semantic_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
    }

def query_fn_semantic_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_fusion_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product",
    }

def query_fn_semantic_fusion_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_fusion_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_fusion_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness",
    }

def query_fn_semantic_fusion_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_fusion_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

query_fn_for_match_and_ranking = {
    "lexical": {
        "bm25": query_fn_lexical_bm25,
        "dot-product": query_fn_lexical_dot_product,
        "closeness": query_fn_lexical_closeness,
        "fusion-dot-product": query_fn_lexical_fusion_dot_product,
        "fusion-closeness": query_fn_lexical_fusion_closeness,
    },
    "semantic": {
        "bm25": query_fn_semantic_bm25,
        "dot-product": query_fn_semantic_dot_product,
        "closeness": query_fn_semantic_closeness,
        "fusion-dot-product": query_fn_semantic_fusion_dot_product,
        "fusion-closeness": query_fn_semantic_fusion_closeness,
    },
    "hybrid": {
        "bm25": query_fn_hybrid_bm25,
        "dot-product": query_fn_hybrid_dot_product,
        "closeness": query_fn_hybrid_closeness,
        "fusion-dot-product": query_fn_hybrid_fusion_dot_product,
        "fusion-closeness": query_fn_hybrid_fusion_closeness,
    },
}

In [29]:
for match in ["lexical", "semantic", "hybrid"]:
    for ranking in ["bm25", "dot-product", "closeness", "fusion-dot-product", "fusion-closeness"]:
        query_fn = query_fn_for_match_and_ranking[match][ranking]
        print(query_fn("example query", 10))
        print(f"Results for match: {match}, ranking: {ranking}")
        evaluator = VespaEvaluator(
            queries=test_queries_dict,
            relevant_docs=relevant_docs,
            vespa_query_fn=query_fn,
            app=app,
            name=f"test-run-{match}-{ranking}",
            accuracy_at_k=[10],
            precision_recall_at_k=[100],
            mrr_at_k=[10],
            ndcg_at_k=[10],
            write_csv=True,
            csv_dir="results",
        )
        
        results = evaluator.run()
        print("Primary metric:", evaluator.primary_metric)
        print("All results:", results)

{'yql': 'select * from sources * where userQuery()', 'query': 'example query', 'hits': 10, 'ranking': 'bm25'}
Results for match: lexical, ranking: bm25
Primary metric: ndcg@10
All results: {'accuracy@10': 0.7261261261261261, 'precision@100': 0.009045045045045046, 'recall@100': 0.8675675675675676, 'mrr@10': 0.5284355784355782, 'ndcg@10': 0.5695866399682291, 'map@100': 0.5286746933809584, 'searchtime_avg': 0.044108108108108106, 'searchtime_q50': 0.043000000000000003, 'searchtime_q90': 0.07200000000000001, 'searchtime_q95': 0.08159999999999991}
{'yql': 'select * from sources * where userQuery()', 'query': 'example query', 'hits': 10, 'ranking': 'dot-product'}
Results for match: lexical, ranking: dot-product
Primary metric: ndcg@10
All results: {'accuracy@10': 0.03423423423423423, 'precision@100': 0.0018378378378378379, 'recall@100': 0.17477477477477477, 'mrr@10': 0.012418847418847419, 'ndcg@10': 0.016567284051945783, 'map@100': 0.015376290483545885, 'searchtime_avg': 0.011882882882882882,

In [31]:
def query_fn_hybrid_fusion_closeness_colbert(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness-colbert",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(qt)": f"embed(colbert,'{query_text}')",
        "timeout": 100,
        "ranking.softtimeout.enable": "false",
    }

def query_fn_hybrid_fusion_closeness_cross_encoder(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness-cross-encoder",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(query_token_ids)": f"embed(tokenizer,'{query_text}')",
        "timeout": 100,
        "ranking.softtimeout.enable": "false",
    }

def query_fn_hybrid_fusion_dot_product_colbert(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product-colbert",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(qt)": f"embed(colbert,'{query_text}')",
        "timeout": 100,
        "ranking.softtimeout.enable": "false",
    }

def query_fn_hybrid_fusion_dot_product_cross_encoder(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product-cross-encoder",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(query_token_ids)": f"embed(tokenizer,'{query_text}')",
        "timeout": 100,
        "ranking.softtimeout.enable": "false",
    }

def query_fn_semantic_bm25_colbert(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25-colbert",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(qt)": f"embed(colbert,'{query_text}')",
        "timeout": 100,
        "ranking.softtimeout.enable": "false",
    }

def query_fn_semantic_bm25_cross_encoder(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25-cross-encoder",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(query_token_ids)": f"embed(tokenizer,'{query_text}')",
        "timeout": 100,
        "ranking.softtimeout.enable": "false",
    }

query_fn_for_match_and_ranking = {
    "hybrid": {
        "fusion-closeness-colbert": query_fn_hybrid_fusion_closeness_colbert,
        "fusion-closeness-cross-encoder": query_fn_hybrid_fusion_closeness_cross_encoder,
        "fusion-dot-product-colbert": query_fn_hybrid_fusion_dot_product_colbert,
        "fusion-dot-product-cross-encoder": query_fn_hybrid_fusion_dot_product_cross_encoder,
    },
    "semantic": {
        "bm25-colbert": query_fn_semantic_bm25_colbert,
        "bm25-cross-encoder": query_fn_semantic_bm25_cross_encoder,
    },
}

for match in query_fn_for_match_and_ranking.keys():
    for ranking in query_fn_for_match_and_ranking[match].keys():
        query_fn = query_fn_for_match_and_ranking[match][ranking]
        print(query_fn("example query", 10))
        print(f"Results for match: {match}, ranking: {ranking}")
        evaluator = VespaEvaluator(
            queries=test_queries_dict,
            relevant_docs=relevant_docs,
            vespa_query_fn=query_fn,
            app=app,
            name=f"test-run-{match}-{ranking}",
            accuracy_at_k=[10],
            precision_recall_at_k=[100],
            mrr_at_k=[10],
            ndcg_at_k=[10],
            write_csv=True,
            csv_dir="results",
        )
        
        results = evaluator.run()
        print("Primary metric:", evaluator.primary_metric)
        print("All results:", results)

{'yql': 'select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))', 'query': 'example query', 'hits': 10, 'ranking': 'fusion-closeness-colbert', 'input.query(q)': "embed(e5,'example query')", 'input.query(qt)': "embed(colbert,'example query')", 'timeout': 100, 'ranking.softtimeout.enable': 'false'}
Results for match: hybrid, ranking: fusion-closeness-colbert
Primary metric: ndcg@10
All results: {'accuracy@10': 0.9243243243243243, 'precision@100': 0.01001801801801802, 'recall@100': 0.9621621621621622, 'mrr@10': 0.7558344058344056, 'ndcg@10': 0.7942731788285473, 'map@100': 0.7526313097208859, 'searchtime_avg': 0.056190990990990994, 'searchtime_q50': 0.055, 'searchtime_q90': 0.078, 'searchtime_q95': 0.085}
{'yql': 'select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))', 'query': 'example query', 'hits': 10, 'ranking': 'fusion-closeness-cross-encoder', 'input.query(q)': "embed(e5,'example query')", 'input.query(quer

In [37]:
# Iterate through the results and print them
import pandas as pd
results_dir = "results"
results_files = [f for f in os.listdir(results_dir) if f.startswith('Vespa-evaluation_test-run')]
final_results = []
for results_file in results_files:
    file_path = os.path.join(results_dir, results_file)
    df = pd.read_csv(file_path)
    # Get the last row of the DataFrame
    last_row = df.iloc[-1]
    # Convert the row to a dictionary and add it to the final results
    final_results.append(last_row.to_dict())

# Create a DataFrame from the final results
final_df = pd.DataFrame(final_results)
# Remove beginning of name column
final_df['name'] = final_df['name'].str[9:]
# Save the final results to a CSV file
final_df.to_csv(os.path.join(results_dir, "final_results.csv"), index=False)
# Print the final results DataFrame
print(final_df)

                                       name  accuracy@10   map@100    mrr@10  \
0                               hybrid-bm25     0.726126  0.528714  0.528468   
1                          hybrid-closeness     0.891892  0.722806  0.725947   
2                        hybrid-dot-product     0.356757  0.216320  0.206825   
3           hybrid-fusion-closeness-colbert     0.924324  0.752631  0.755834   
4     hybrid-fusion-closeness-cross-encoder     0.915315  0.766987  0.768537   
5                   hybrid-fusion-closeness     0.909910  0.741583  0.746616   
6         hybrid-fusion-dot-product-colbert     0.909910  0.757168  0.760482   
7   hybrid-fusion-dot-product-cross-encoder     0.906306  0.767544  0.769526   
8                 hybrid-fusion-dot-product     0.812613  0.606929  0.608690   
9                              lexical-bm25     0.726126  0.528675  0.528436   
10                        lexical-closeness     0.034234  0.015376  0.012419   
11                      lexical-dot-prod

In [12]:
def query_fn_bm25_score(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
        "ranking.listFeatures": "true",
    }

def query_fn_dot_product_score(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "ranking.listFeatures": "true",
    }

def query_fn_closeness_score(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "ranking.listFeatures": "true",
    }

query_fn_score = {
    "bm25": query_fn_bm25_score,
    "dot-product": query_fn_dot_product_score,
    "closeness": query_fn_closeness_score,
}

In [None]:
min_max_values = {
    "bm25": (float("inf"), float("-inf")),
    "dot-product": (float("inf"), float("-inf")),
    "closeness": (float("inf"), float("-inf")),
}

# Calculate min and max for each ranking function
for ranking in ["bm25", "dot-product", "closeness"]:
    print(f"Calculating min and max for ranking function: {ranking}")
    query_fn = query_fn_score[ranking]
    vespa_queries = [query_fn(q.text, 400) for q in train_queries.values()]
    responses = app.query_many(vespa_queries, max_concurrent=10)
    for response in responses:
        for hit in response.hits:
            score = hit['fields']['rankfeatures']['firstPhase']
            min_val, max_val = min_max_values[ranking]
            min_max_values[ranking] = (min(min_val, score), max(max_val, score))
del vespa_queries, responses  # Clear memory

print("Min and max values for each ranking function:")
for ranking, (min_val, max_val) in min_max_values.items():
    print(f"{ranking}: min={min_val}, max={max_val}")

Calculating min and max for ranking function: bm25
Calculating min and max for ranking function: dot-product
Calculating min and max for ranking function: closeness
Min and max values for each ranking function:
bm25: min=1.4562415365049788, max=68.19806521677549
dot-product: min=41.41029357910156, max=66.20972442626953
closeness: min=0.5747992823289696, max=0.7822561412418744


In [30]:
min_max_values = {
    "bm25": (float("inf"), float("-inf")),
    "dot-product": (float("inf"), float("-inf")),
    "closeness": (float("inf"), float("-inf")),
}

# Calculate min and max for each ranking function
for ranking in ["closeness"]:
    print(f"Calculating min and max for ranking function: {ranking}")
    query_fn = query_fn_score[ranking]
    vespa_queries = [query_fn(q.text, 400) for q in train_queries.values()]
    responses = app.query_many(vespa_queries, max_concurrent=10)
    for response in responses:
        for hit in response.hits:
            score = hit['fields']['rankfeatures']['firstPhase']
            min_val, max_val = min_max_values[ranking]
            min_max_values[ranking] = (min(min_val, score), max(max_val, score))
del vespa_queries, responses  # Clear memory

print("Min and max values for each ranking function:")
for ranking, (min_val, max_val) in min_max_values.items():
    print(f"{ranking}: min={min_val}, max={max_val}")

Calculating min and max for ranking function: closeness
Min and max values for each ranking function:
bm25: min=inf, max=-inf
dot-product: min=inf, max=-inf
closeness: min=0.5747992823289696, max=0.7822561412418744


In [15]:
app.query(query_fn_bm25_score("example query", 10)).get_json()

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 3779},
  'coverage': {'coverage': 100,
   'documents': 277168,
   'full': True,
   'nodes': 1,
   'results': 1,
   'resultsFull': 1},
  'children': [{'id': 'id:hybridsearch:doc::msmarco_passage_01_758029229',
    'relevance': 16.660505229338476,
    'source': 'hybridsearch_content',
    'fields': {'sddocname': 'doc',
     'text': 'Uncorrelated Sub-<hi>query</hi>. A uncorrelated sub-<hi>query</hi> is a type of sub-<hi>query</hi> where inner <hi>query</hi> doesn’t depend upon the outer <hi>query</hi> for its execution. It can complete its execution as a standalone <hi>query</hi>. Let us explain uncorrelated sub-<hi>queries</hi> with the help of an <hi>example</hi>.',
     'documentid': 'id:hybridsearch:doc::msmarco_passage_01_758029229',
     'id': 'msmarco_passage_01_758029229',
     'rankfeatures': {'attributeMatch(id)': 0.0,
      'attributeMatch(id).averageWeight': 0.0,
      'attributeMatch(id).completeness':

In [18]:
min_max_values

{'bm25': (1.4562415365049788, 68.19806521677549),
 'dot-product': (41.41029357910156, 66.20972442626953),
 'closeness': (0.5747992823289696, 0.7822561412418744)}