In [1]:
# Replace with your tenant name from the Vespa Cloud Console
TENANT_NAME = "projetos-cd-msmarco"
# Replace with your application name (does not need to exist yet)
APP_NAME = "hybridsearch"
SCHEMA_NAME = "doc"

In [2]:
from vespa.package import (
    ApplicationPackage,
    Field,
    Schema,
    Document,
    HNSW,
    RankProfile,
    OnnxModel,
    Component,
    Parameter,
    FieldSet,
    SecondPhaseRanking,
    GlobalPhaseRanking,
    Function,
)

class MySchema(Schema):
    @property
    def schema_to_text(self):
        og_text = super().schema_to_text
        # Conserta o caminho do modelo ONNX para usar barras normais
        og_text = og_text.replace("\\", "/")
        return og_text

schema = MySchema(
    name=SCHEMA_NAME,
    document=Document(
        fields=[
            Field(
                name="id",
                type="string",
                indexing=[
                    "summary",
                    "attribute",
                ],
            ),
            Field(
                name="text",
                type="string",
                indexing=[
                    "summary",
                    "index",
                ],
                index="enable-bm25",
                bolding=True,
            ),
            Field(
                name="text_token_ids",
                type="tensor<float>(d0[64])",
                indexing=[
                    "input text",
                    "embed tokenizer",
                    "attribute",
                ],
                # attribute=["paged"],
                is_document_field=False,
            ),
            Field(
                name="embedding",
                type="tensor<float>(x[384])",
                indexing=[
                    "input text",
                    "embed e5",
                    "attribute",
                    "index",
                ],
                ann=HNSW(
                    distance_metric="angular",
                    max_links_per_node=32,
                    neighbors_to_explore_at_insert=400,
                ),
                is_document_field=False,
            ),
            Field(
                name="colbert",
                type="tensor<int8>(dt{}, x[16])",
                indexing=[
                    "input text",
                    "embed colbert",
                    "attribute",
                ],
                # attribute=["paged"],
                is_document_field=False,
            ),
        ]
    ),
    fieldsets=[FieldSet(name="default", fields=["text"])],
    rank_profiles=[
        RankProfile(
            name="bm25",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
            ],
            first_phase="bm25(text)",
        ),
        RankProfile(
            name="closeness",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
            ],
            first_phase="closeness(field, embedding)",
        ),
        RankProfile(
            name="dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
            ],
            functions=[
                Function(name="dot_product", expression="sum(query(q) * attribute(embedding))"),
            ],
            first_phase="dot_product",
        ),
        RankProfile(
            name="fusion-dot-product",
            inherits="dot-product",
            functions=[
                Function(name="fusion_dot_product", expression="0.5*bm25(text) + 0.5*dot_product"),
            ],
            first_phase="fusion_dot_product",
        ),
        RankProfile(
            name="fusion-closeness",
            inherits="closeness",
            functions=[
                Function(name="fusion_closeness", expression="0.5*bm25(text) + 0.5*closeness(field, embedding)"),
            ],
            first_phase="fusion_closeness",
        ),
        RankProfile(
            name="fusion-dot-product-second-phase",
            inherits="fusion-dot-product",
            first_phase="dot_product",
            second_phase=SecondPhaseRanking(
                expression="fusion_dot_product",
                rerank_count=1000,
            ),
        ),
        RankProfile(
            name="fusion-closeness-second-phase",
            inherits="fusion-closeness",
            first_phase="closeness(field, embedding)",
            second_phase=SecondPhaseRanking(
                expression="fusion_closeness",
                rerank_count=1000,
            ),
        ),
        RankProfile(
            name="bm25-colbert",
            inherits="fusion-dot-product-colbert",
            first_phase="bm25(text)",
        ),
        RankProfile(
            name="closeness-colbert",
            inherits="fusion-dot-product-colbert",
            first_phase="closeness(field, embedding)",
        ),
        RankProfile(
            name="dot-product-colbert",
            inherits="fusion-dot-product-colbert",
            first_phase="dot_product",
        ),
        RankProfile(
            name="fusion-dot-product-colbert",
            inherits="fusion-dot-product",
            inputs=[
                ("query(qt)", "tensor<float>(qt{},x[128])"),
                ("query(q)", "tensor<float>(x[384])"),
            ],
            functions=[
                Function(name="cos_sim", expression="cos(distance(field, embedding))"),
                Function(name="max_sim", expression="sum(reduce(sum(query(qt) * unpack_bits(attribute(colbert)), x), max, dt),qt)"),
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="max_sim",
                rerank_count=100,
            ),
            match_features=["max_sim", "cos_sim"],
        ),
        RankProfile(
            name="fusion-closeness-colbert",
            inherits="fusion-dot-product-colbert",
            functions=[
                Function(name="fusion_closeness", expression="0.5*bm25(text) + 0.5*closeness(field, embedding)"),
            ],
            first_phase="fusion_closeness",
        ),
        RankProfile(
            name="bm25-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            first_phase="bm25(text)",
        ),
        RankProfile(
            name="closeness-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            first_phase="closeness(field, embedding)",
        ),
        RankProfile(
            name="dot-product-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            first_phase="dot_product",
        ),
        RankProfile(
            name="fusion-dot-product-cross-encoder",
            inherits="fusion-dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
                ("query(query_token_ids)",  "tensor<float>(d0[32])"),
            ],
            functions=[
                Function(name="input_ids", expression="tokenInputIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="token_type_ids", expression="tokenTypeIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="attention_mask", expression="tokenAttentionMask(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="cross_encoder", expression="onnx(cross_encoder_model){d0:0,d1:0}")
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="cross_encoder",
                rerank_count=100
            )
        ),
        RankProfile(
            name="fusion-dot-product-cross-encoder-50",
            inherits="fusion-dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
                ("query(query_token_ids)",  "tensor<float>(d0[32])"),
            ],
            functions=[
                Function(name="input_ids", expression="tokenInputIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="token_type_ids", expression="tokenTypeIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="attention_mask", expression="tokenAttentionMask(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="cross_encoder", expression="onnx(cross_encoder_model){d0:0,d1:0}")
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="cross_encoder",
                rerank_count=50
            )
        ),
        RankProfile(
            name="fusion-dot-product-cross-encoder-25",
            inherits="fusion-dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
                ("query(query_token_ids)",  "tensor<float>(d0[32])"),
            ],
            functions=[
                Function(name="input_ids", expression="tokenInputIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="token_type_ids", expression="tokenTypeIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="attention_mask", expression="tokenAttentionMask(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="cross_encoder", expression="onnx(cross_encoder_model){d0:0,d1:0}")
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="cross_encoder",
                rerank_count=25
            )
        ),
        RankProfile(
            name="fusion-dot-product-cross-encoder-12",
            inherits="fusion-dot-product",
            inputs=[
                ("query(q)", "tensor<float>(x[384])"),
                ("query(query_token_ids)",  "tensor<float>(d0[32])"),
            ],
            functions=[
                Function(name="input_ids", expression="tokenInputIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="token_type_ids", expression="tokenTypeIds(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="attention_mask", expression="tokenAttentionMask(96, query(query_token_ids), attribute(text_token_ids))"),
                Function(name="cross_encoder", expression="onnx(cross_encoder_model){d0:0,d1:0}")
            ],
            first_phase="fusion_dot_product",
            second_phase=SecondPhaseRanking(
                expression="cross_encoder",
                rerank_count=12
            )
        ),
        RankProfile(
            name="fusion-closeness-cross-encoder",
            inherits="fusion-dot-product-cross-encoder",
            functions=[
                Function(name="fusion_closeness", expression="0.5*bm25(text) + 0.5*closeness(field, embedding)"),
            ],
            first_phase="fusion_closeness",
        ),
    ],
    models=[
        OnnxModel(
            model_name="cross_encoder_model",
            model_file_path="models/model.onnx",
            inputs={
                "input_ids": "input_ids",
                "attention_mask": "attention_mask",
                "token_type_ids": "token_type_ids",
            },
            outputs={"logits": "logits"},
        ),
    ],
)

package = ApplicationPackage(
    name=APP_NAME,
    schema=[
        schema
    ],
    components=[
        Component(
            id="e5",
            type="hugging-face-embedder",
            parameters=[
                Parameter(
                    "transformer-model",
                    {
                        "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                    },
                ),
                Parameter(
                    "tokenizer-model",
                    {
                        "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
        Component(
            id="colbert",
            type="colbert-embedder",
            parameters=[
                Parameter(
                    "transformer-model",
                    {
                        "url": "https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"
                    },
                ),
                Parameter(
                    "tokenizer-model",
                    {
                        "url": "https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
        Component(
            id="tokenizer",
            type="hugging-face-tokenizer",
            parameters=[
                Parameter(
                    "model",
                    {
                        "url": "https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
    ],
)

In [48]:
import requests
import os

url = "https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2/resolve/main/onnx/model.onnx"
output_path = "models/model.onnx"

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Download the file with following redirects
response = requests.get(url, allow_redirects=True)

# Save the file
with open(output_path, 'wb') as f:
    f.write(response.content)

print(f"Model saved to {output_path}")

Model saved to models/model.onnx


In [3]:
from vespa.deployment import VespaCloud

vespa_cloud = VespaCloud(
    tenant=TENANT_NAME,
    application=APP_NAME,
    application_package=package,
)

Setting application...
Running: vespa config set application projetos-cd-msmarco.hybridsearch.default
Setting target cloud...
Running: vespa config set target cloud

No api-key found for control plane access. Using access token.
Checking for access token in auth.json...
Successfully obtained access token for control plane access.


In [None]:
# Usa .deploy() na primeira vez e .get_application() nas próximas vezes
app = vespa_cloud.deploy()

In [None]:
endpoint = vespa_cloud.get_mtls_endpoint()
endpoint

In [5]:
import random
import ir_datasets
import pickle
import re

In [6]:
random.seed(42)

def load_dataset(input_file):
    with open(input_file, 'rb') as f:
        return pickle.load(f)

data_set = "subset_msmarco_train_0/subset_msmarco_train_0.01_99.pkl"

data = load_dataset(data_set)
queries = data["queries"]
documents = data["docs"]

# Split the queries (queries is a dictionary of {query_id: query_object})
query_ids = list(queries.keys())  # List of query IDs

# Shuffle query IDs to ensure a random split
random.shuffle(query_ids)

# Split into 80% for training, 20% for validation
split_ratio = 0.8
train_query_ids = query_ids[:int(len(query_ids) * split_ratio)]
test_query_ids = query_ids[int(len(query_ids) * split_ratio):]

train_queries = {qid: queries[qid] for qid in train_query_ids}
test_queries = {qid: queries[qid] for qid in test_query_ids}

In [7]:
def remove_control_characters(text):
    # Remove control characters using regex
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

In [8]:
vespa_cli_feed = [
    {
        "put": f"id:{APP_NAME}:{SCHEMA_NAME}::{doc.doc_id}",
        "fields": {
            "text": remove_control_characters(doc.text),
            "id": doc.doc_id,
        }
    }
    for doc in documents.values()
]

vespa_cli_feed = vespa_cli_feed[:1000]  # Limit to first 1000 documents for testing

# write to jsonl file
import json
import os

def write_jsonl(data, filename):
    with open(filename, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
os.makedirs("data", exist_ok=True)
write_jsonl(vespa_cli_feed, "data/vespa_cli_feed.jsonl")

In [9]:
output_list = !vespa feed data/vespa_cli_feed.jsonl
results = json.loads("".join(output_list))
print(results)

{'feeder.operation.count': 1000, 'feeder.seconds': 167.203, 'feeder.ok.count': 1000, 'feeder.ok.rate': 5.981, 'feeder.error.count': 0, 'feeder.inflight.count': 0, 'http.request.count': 1000, 'http.request.bytes': 355974, 'http.request.MBps': 0.002, 'http.exception.count': 0, 'http.response.count': 1000, 'http.response.bytes': 133698, 'http.response.MBps': 0.001, 'http.response.error.count': 0, 'http.response.latency.millis.min': 1529, 'http.response.latency.millis.avg': 3053, 'http.response.latency.millis.max': 4218, 'http.response.code.counts': {'200': 1000}}


In [9]:
vespa_feed = [
    {"id": doc.doc_id, "fields": {"body": remove_control_characters(doc.text), "id": doc.doc_id}}
    for doc in documents.values()
]

In [10]:
vespa_feed = vespa_feed[151000:]

In [11]:
from vespa.io import VespaResponse, VespaQueryResponse


def callback(response: VespaResponse, id: str):
    if not response.is_successful():
        print(f"Error when feeding document {id}: {response.get_json()}")


app.feed_iterable(vespa_feed, schema="doc", namespace="tutorial", callback=callback)

In [None]:
from vespa.evaluation import VespaEvaluator
from collections.abc import Callable

test_queries_dict = {
    q.query_id: q.text
    for q in test_queries.values()
}

relevant_docs = dict()
for qrel in data["qrels"]:
    relevant_docs[qrel.query_id] = relevant_docs.get(qrel.query_id, set())
    relevant_docs[qrel.query_id].add(qrel.doc_id)

def create_query_fn(match: str, ranking: str) -> Callable:
    query_dict = dict()
    yql = "select * from sources * where "
    if match == "lexical":
        yql = yql + "userQuery()"
    elif match == "semantic":
        yql = yql + "({targetHits:1000}nearestNeighbor(embedding,q))"
    elif match == "hybrid":
        yql = yql + "userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))"
    query_dict["yql"] = yql
    query_dict["ranking"] = ranking
    
    def query_fn(query_text: str, top_k: int) -> dict:
        query_dict["query"] = query_text
        query_dict["hits"] = top_k
        if match == "semantic" or match == "hybrid":
            query_dict["input.query(q)"] = f"embed(e5,'{query_text}')"
        return query_dict

    return query_fn

for match in ["lexical", "semantic", "hybrid"]:
    for ranking in ["bm25", "dot-product", "fusion"]:
        query_fn = create_query_fn(match, ranking)
        print(query_fn("example query", 10))
        print(f"Results for match: {match}, ranking: {ranking}")
        evaluator = VespaEvaluator(
            queries=test_queries_dict,
            relevant_docs=relevant_docs,
            vespa_query_fn=query_fn,
            app=app,
            name=f"test-run-{match}-{ranking}",
            accuracy_at_k=[10],
            precision_recall_at_k=[100],
            mrr_at_k=[10],
            ndcg_at_k=[10],
            write_csv=True
        )
        
        results = evaluator.run()
        print("Primary metric:", evaluator.primary_metric)
        print("All results:", results)

In [None]:
from vespa.evaluation import VespaEvaluator

test_queries_dict = {
    q.query_id: q.text
    for q in test_queries.values()
}

relevant_docs = dict()
for qrel in data["qrels"]:
    relevant_docs[qrel.query_id] = relevant_docs.get(qrel.query_id, set())
    relevant_docs[qrel.query_id].add(qrel.doc_id)

def query_fn_lexical(query_text: str, top_k: int) -> dict:
    return {
        "yql": f"select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
    }

def query_fn_semantic(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_fusion(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

query_fn_for_ranking = {
    "bm25": create_query_fn("lexical", "bm25"),
    "semantic": query_fn_semantic,
    "fusion": query_fn_fusion,
}

for ranking in ["bm25", "semantic", "fusion"]:
    query_fn = query_fn_for_ranking[ranking]
    evaluator = VespaEvaluator(
        queries=test_queries_dict,
        relevant_docs=relevant_docs,
        vespa_query_fn=query_fn,
        app=app,
        name=f"test-run-{ranking}",
        accuracy_at_k=[10],
        precision_recall_at_k=[100],
        mrr_at_k=[10],
        ndcg_at_k=[10],
        write_csv=True
    )
    
    results = evaluator.run()
    print(f"Results for {ranking}:")
    print("Primary metric:", evaluator.primary_metric)
    print("All results:", results)

Results for bm25:
Primary metric: ndcg@10
All results: {'accuracy@10': 0.0018018018018018018, 'precision@100': 5.4054054054054054e-05, 'recall@100': 0.005405405405405406, 'mrr@10': 0.0006006006006006006, 'ndcg@10': 0.0009009009009009009, 'map@100': 0.0006466884563047749, 'searchtime_avg': 0.032095495495495494, 'searchtime_q50': 0.033, 'searchtime_q90': 0.05860000000000003, 'searchtime_q95': 0.066}
Results for semantic:
Primary metric: ndcg@10
All results: {'accuracy@10': 0.3099099099099099, 'precision@100': 0.0067747747747747755, 'recall@100': 0.6522522522522523, 'mrr@10': 0.181061776061776, 'ndcg@10': 0.2091735771194579, 'map@100': 0.19173760828363734, 'searchtime_avg': 0.07447387387387387, 'searchtime_q50': 0.078, 'searchtime_q90': 0.111, 'searchtime_q95': 0.123}
Results for fusion:
Primary metric: ndcg@10
All results: {'accuracy@10': 0.7873873873873873, 'precision@100': 0.009513513513513514, 'recall@100': 0.9126126126126126, 'mrr@10': 0.553958243958244, 'ndcg@10': 0.6044627268568112

In [10]:
from vespa.evaluation import VespaEvaluator
from collections.abc import Callable

test_queries_dict = {
    q.query_id: q.text
    for q in test_queries.values()
}

relevant_docs = dict()
for qrel in data["qrels"]:
    relevant_docs[qrel.query_id] = relevant_docs.get(qrel.query_id, set())
    relevant_docs[qrel.query_id].add(qrel.doc_id)

def query_fn_lexical_bm25(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
    }

def query_fn_semantic_bm25(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_bm25(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "bm25",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
    }

def query_fn_semantic_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
    }

def query_fn_semantic_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_fusion_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product",
    }

def query_fn_semantic_fusion_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_fusion_dot_product(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_lexical_fusion_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery()",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness",
    }

def query_fn_semantic_fusion_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

def query_fn_hybrid_fusion_closeness(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness",
        "input.query(q)": f"embed(e5,'{query_text}')",
    }

query_fn_for_match_and_ranking = {
    "lexical": {
        "bm25": query_fn_lexical_bm25,
        "dot-product": query_fn_lexical_dot_product,
        "closeness": query_fn_lexical_closeness,
        "fusion-dot-product": query_fn_lexical_fusion_dot_product,
        "fusion-closeness": query_fn_lexical_fusion_closeness,
    },
    "semantic": {
        "bm25": query_fn_semantic_bm25,
        "dot-product": query_fn_semantic_dot_product,
        "closeness": query_fn_semantic_closeness,
        "fusion-dot-product": query_fn_semantic_fusion_dot_product,
        "fusion-closeness": query_fn_semantic_fusion_closeness,
    },
    "hybrid": {
        "bm25": query_fn_hybrid_bm25,
        "dot-product": query_fn_hybrid_dot_product,
        "closeness": query_fn_hybrid_closeness,
        "fusion-dot-product": query_fn_hybrid_fusion_dot_product,
        "fusion-closeness": query_fn_hybrid_fusion_closeness,
    },
}

In [None]:
for match in ["lexical", "semantic", "hybrid"]:
    for ranking in ["dot-product", "closeness", "fusion-dot-product", "fusion-closeness"]:
        query_fn = query_fn_for_match_and_ranking[match][ranking]
        print(query_fn("example query", 10))
        print(f"Results for match: {match}, ranking: {ranking}")
        evaluator = VespaEvaluator(
            queries=test_queries_dict,
            relevant_docs=relevant_docs,
            vespa_query_fn=query_fn,
            app=app,
            name=f"test-run-{match}-{ranking}",
            accuracy_at_k=[10],
            precision_recall_at_k=[100],
            mrr_at_k=[10],
            ndcg_at_k=[10],
            write_csv=True,
            csv_dir="results",
        )
        
        results = evaluator.run()
        print("Primary metric:", evaluator.primary_metric)
        print("All results:", results)

{'yql': 'select * from sources * where userQuery()', 'query': 'example query', 'hits': 10, 'ranking': 'dot-product'}
Results for match: lexical, ranking: dot-product
Primary metric: ndcg@10
All results: {'accuracy@10': 0.025225225225225224, 'precision@100': 0.001135135135135135, 'recall@100': 0.10540540540540541, 'mrr@10': 0.009010439010439011, 'ndcg@10': 0.011827080158329223, 'map@100': 0.010302619363496066, 'searchtime_avg': 0.03615315315315316, 'searchtime_q50': 0.025, 'searchtime_q90': 0.07920000000000005, 'searchtime_q95': 0.10889999999999986}
{'yql': 'select * from sources * where userQuery()', 'query': 'example query', 'hits': 10, 'ranking': 'closeness'}
Results for match: lexical, ranking: closeness
Primary metric: ndcg@10
All results: {'accuracy@10': 0.025225225225225224, 'precision@100': 0.001135135135135135, 'recall@100': 0.10540540540540541, 'mrr@10': 0.009010439010439011, 'ndcg@10': 0.011827080158329223, 'map@100': 0.010302619363496066, 'searchtime_avg': 0.0219639639639639

In [None]:
def query_fn_hybrid_fusion_closeness_colbert(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness-colbert",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(qt)": f"embed(colbert,'{query_text}')",
    }

def query_fn_hybrid_fusion_closeness_cross_encoder(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-closeness-cross-encoder",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(query_token_ids)": f"embed(tokenizer,'{query_text}')",
    }

def query_fn_hybrid_fusion_dot_product_cross_encoder(query_text: str, top_k: int) -> dict:
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion-dot-product-cross-encoder-12",
        "input.query(q)": f"embed(e5,'{query_text}')",
        "input.query(query_token_ids)": f"embed(tokenizer,'{query_text}')",
    }

query_fn_for_match_and_ranking["hybrid"]["fusion-closeness-colbert"] = query_fn_hybrid_fusion_closeness_colbert
query_fn_for_match_and_ranking["hybrid"]["fusion-closeness-cross-encoder"] = query_fn_hybrid_fusion_closeness_cross_encoder
query_fn_for_match_and_ranking["hybrid"]["fusion-dot-product-cross-encoder"] = query_fn_hybrid_fusion_dot_product_cross_encoder

for match in ["hybrid"]:
    for ranking in ["fusion-dot-product-cross-encoder"]:
        query_fn = query_fn_for_match_and_ranking[match][ranking]
        print(query_fn("example query", 10))
        print(f"Results for match: {match}, ranking: {ranking}")
        evaluator = VespaEvaluator(
            queries=test_queries_dict,
            relevant_docs=relevant_docs,
            vespa_query_fn=query_fn,
            app=app,
            name=f"test-run-{match}-{ranking}",
            accuracy_at_k=[10],
            precision_recall_at_k=[100],
            mrr_at_k=[10],
            ndcg_at_k=[10],
            write_csv=True,
            csv_dir="results",
        )
        
        results = evaluator.run()
        print("Primary metric:", evaluator.primary_metric)
        print("All results:", results)

In [24]:
def query_fn_lexical(query_text: str, top_k: int) -> dict:
    return {
        "yql": f"select * from sources * where userQuery()",
        "ranking": "bm25",
        "query": query_text,
        "hits": top_k,
    }

query_fn_lexical_bm25 = create_query_fn("lexical", "bm25")

# Check for difference in test_queries_dict
for query_id, query_text in test_queries_dict.items():
    first_query = query_fn_lexical_bm25(query_text, 10)
    second_query = query_fn_lexical(query_text, 10)
    if first_query != second_query:
        print(f"Difference found for query ID {query_id}:")
        print("First query:", first_query)
        print("Second query:", second_query)

In [46]:
app.get_data(schema=SCHEMA_NAME, data_id="msmarco_passage_03_746693842").json

{'pathId': '/document/v1/doc/doc/docid/msmarco_passage_03_746693842',
 'id': 'id:doc:doc::msmarco_passage_03_746693842'}