In [1]:
# Vespa Cloud tenant, passed to VespaCloud as `tenant` further down.
# Replace with your tenant name from the Vespa Cloud Console.
TENANT_NAME = "projetos-cd-msmarco"
# Application name, used both for the ApplicationPackage and for VespaCloud.
# Replace with your application name (it does not need to exist yet).
APP_NAME = "hybridsearch"

In [2]:
from vespa.package import (
    ApplicationPackage,
    Field,
    Schema,
    Document,
    HNSW,
    RankProfile,
    Component,
    Parameter,
    FieldSet,
    GlobalPhaseRanking,
    Function,
)

# --- Document fields -------------------------------------------------------
# Stored id, a BM25-enabled body, and an embedding computed from the body.
id_field = Field(name="id", type="string", indexing=["summary"])

body_field = Field(
    name="body",
    type="string",
    indexing=["index", "summary"],
    index="enable-bm25",
    bolding=True,
)

embedding_field = Field(
    name="embedding",
    type="tensor(x[384])",
    # Indexing steps, in order: read the body, embed it, index it, keep it as an attribute.
    indexing=["input body", "embed", "index", "attribute"],
    ann=HNSW(distance_metric="angular"),
    # The embedding is produced by the indexing steps rather than supplied in the feed.
    is_document_field=False,
)

# --- Rank profiles ---------------------------------------------------------
# Lexical profile: exposes bm25(body) as the function `bm25sum` and ranks by it.
bm25_profile = RankProfile(
    name="bm25",
    inputs=[("query(q)", "tensor(x[384])")],
    functions=[Function(name="bm25sum", expression="bm25(body)")],
    first_phase="bm25sum",
)

# Semantic profile: ranks by closeness between query(q) and the embedding field.
semantic_profile = RankProfile(
    name="semantic",
    inputs=[("query(q)", "tensor(x[384])")],
    first_phase="closeness(field, embedding)",
)

# Hybrid profile: inherits `bm25` (and thereby `bm25sum`), then fuses both
# signals with reciprocal rank fusion in the global phase.
fusion_profile = RankProfile(
    name="fusion",
    inherits="bm25",
    inputs=[("query(q)", "tensor(x[384])")],
    first_phase="closeness(field, embedding)",
    global_phase=GlobalPhaseRanking(
        expression="reciprocal_rank_fusion(bm25sum, closeness(field, embedding))",
        rerank_count=1000,
    ),
)

# --- Schema ----------------------------------------------------------------
doc_schema = Schema(
    name="doc",
    document=Document(fields=[id_field, body_field, embedding_field]),
    fieldsets=[FieldSet(name="default", fields=["body"])],
    rank_profiles=[bm25_profile, semantic_profile, fusion_profile],
)

# --- Embedder component ----------------------------------------------------
# Hugging Face embedder `e5`; model and tokenizer are referenced by URL.
e5_embedder = Component(
    id="e5",
    type="hugging-face-embedder",
    parameters=[
        Parameter(
            "transformer-model",
            {
                "url": "https://github.com/vespa-engine/sample-apps/raw/master/examples/model-exporting/model/e5-small-v2-int8.onnx"
            },
        ),
        Parameter(
            "tokenizer-model",
            {
                "url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/examples/model-exporting/model/tokenizer.json"
            },
        ),
    ],
)

# --- Application package ---------------------------------------------------
package = ApplicationPackage(
    name=APP_NAME,
    schema=[doc_schema],
    components=[e5_embedder],
)

In [4]:
from vespa.deployment import VespaCloud

# Vespa Cloud handle for the application identified by TENANT_NAME / APP_NAME,
# carrying the application package built earlier in the notebook.
vespa_cloud = VespaCloud(
    tenant=TENANT_NAME,
    application=APP_NAME,
    application_package=package,
)

Setting application...
Running: vespa config set application projetos-cd-msmarco.hybridsearch.default
Setting target cloud...
Running: vespa config set target cloud

No api-key found for control plane access. Using access token.
Checking for access token in auth.json...
Successfully obtained access token for control plane access.


In [5]:
# Use .deploy() the first time and .get_application() on subsequent runs.
# NOTE(review): on a fresh kernel with no prior deployment this cell presumably
# fails until .deploy() has been run once — confirm before relying on Run All.
app = vespa_cloud.get_application()

Only region: aws-us-east-1c available in dev environment.
Found mtls endpoint for hybridsearch_container
URL: https://e9584422.d28ec201.z.vespa-app.cloud/
Application is up!


In [5]:
# Look up the application's mTLS endpoint; the bare name on the last line
# makes the notebook display it as the cell result.
endpoint = vespa_cloud.get_mtls_endpoint()
endpoint

Found mtls endpoint for hybridsearch_container
URL: https://e9584422.d28ec201.z.vespa-app.cloud/


'https://e9584422.d28ec201.z.vespa-app.cloud/'

In [6]:
import random
import ir_datasets
import pickle
import re

In [7]:
# Seed Python's global `random` generator so the shuffle-based query split
# performed later in this cell is repeatable.
random.seed(42)

def load_dataset(input_file):
    """Read one pickled object from ``input_file`` and return it.

    Args:
        input_file: Path of the pickle file; it is opened in binary mode.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.
    """
    with open(input_file, "rb") as handle:
        payload = pickle.load(handle)
    return payload

# Pickled dataset; the loaded dict is read through its "queries" and "docs"
# keys here (and "qrels" later in the notebook).
data_set = "subset_msmarco_train_0/subset_msmarco_train_0.01_99.pkl"
data = load_dataset(data_set)
queries, documents = data["queries"], data["docs"]

# `queries` maps query_id -> query object; shuffle the ids for a random split.
query_ids = list(queries)
random.shuffle(query_ids)

# The first 80% of the shuffled ids form the training split and the remaining
# 20% the held-out test split; the cut point is computed once.
split_ratio = 0.8
split_index = int(len(query_ids) * split_ratio)
train_query_ids, test_query_ids = query_ids[:split_index], query_ids[split_index:]

train_queries = {qid: queries[qid] for qid in train_query_ids}
test_queries = {qid: queries[qid] for qid in test_query_ids}

In [8]:
def remove_control_characters(text):
    """Return ``text`` with all C0/C1 control characters removed.

    Whitespace controls (tab, line feed, carriage return, vertical tab, form
    feed) are replaced by a single space instead of being deleted, so words
    separated only by a line break or tab are not fused into one token
    (``"a\\nb"`` becomes ``"a b"``, not ``"ab"``). Every other character in
    the ranges U+0000-U+001F and U+007F-U+009F is dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; it contains no control characters.
    """
    # Collapse each run of whitespace controls into one space first ...
    spaced = re.sub(r'[\t\n\r\x0B\x0C]+', ' ', text)
    # ... then strip whatever control characters remain.
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', spaced)

In [9]:
# Index of the first document to feed.
# NOTE(review): 151000 presumably resumes a previously interrupted feed —
# confirm, and set to 0 to feed the whole corpus.
FEED_START_OFFSET = 151000

# Build the feed and apply the offset in a single step so this cell is
# idempotent: it always derives `vespa_feed` from `documents`. (Slicing
# `vespa_feed` with itself in a separate cell dropped another 151000
# documents every time that cell was re-run.) Only the documents that will
# actually be fed are cleaned and converted.
vespa_feed = [
    {"id": doc.doc_id, "fields": {"body": remove_control_characters(doc.text), "id": doc.doc_id}}
    for doc in list(documents.values())[FEED_START_OFFSET:]
]

In [11]:
from vespa.io import VespaResponse, VespaQueryResponse


def callback(response: VespaResponse, id: str):
    """Report a failed feed operation on stdout; stay silent on success.

    Args:
        response: Response object for one feed operation.
        id: Id of the document the response belongs to.
    """
    if response.is_successful():
        return
    print(f"Error when feeding document {id}: {response.get_json()}")


# Feed every entry of `vespa_feed` into schema "doc" under namespace
# "tutorial"; `callback` is passed so unsuccessful operations get printed.
app.feed_iterable(vespa_feed, schema="doc", namespace="tutorial", callback=callback)

In [12]:
from vespa.evaluation import VespaEvaluator

# query_id -> query text for every held-out test query.
test_queries_dict = {query.query_id: query.text for query in test_queries.values()}

# query_id -> set of relevant doc_ids, gathered from all qrels in the dataset.
relevant_docs = {}
for qrel in data["qrels"]:
    relevant_docs.setdefault(qrel.query_id, set()).add(qrel.doc_id)

def query_fn_lexical(query_text: str, top_k: int) -> dict:
    """Build the query body for lexical retrieval with the ``bm25`` profile.

    Args:
        query_text: The user query, matched through ``userQuery()``.
        top_k: Number of hits to request.

    Returns:
        Dict of query parameters (``yql``, ``query``, ``hits``, ``ranking``).
    """
    return dict(
        yql="select * from sources * where userQuery()",
        query=query_text,
        hits=top_k,
        ranking="bm25",
    )

def query_fn_semantic(query_text: str, top_k: int) -> dict:
    """Build the query body for nearest-neighbor retrieval with the ``semantic`` profile.

    Args:
        query_text: The user query; sent as the ``query`` parameter and
            embedded server-side into ``query(q)``.
        top_k: Number of hits to request.

    Returns:
        Dict of query parameters (``yql``, ``query``, ``hits``, ``ranking``,
        ``input.query(q)``).
    """
    return {
        "yql": "select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "semantic",
        # Reference the `query` parameter instead of splicing the raw text into
        # the embed() expression: parentheses, commas or quotes inside the query
        # can otherwise change how the expression is parsed.
        "input.query(q)": "embed(@query)",
    }

def query_fn_fusion(query_text: str, top_k: int) -> dict:
    """Build the query body for hybrid retrieval with the ``fusion`` profile.

    Candidates come from ``userQuery()`` OR the nearest-neighbor operator.

    Args:
        query_text: The user query; sent as the ``query`` parameter for
            lexical matching and embedded server-side into ``query(q)``.
        top_k: Number of hits to request.

    Returns:
        Dict of query parameters (``yql``, ``query``, ``hits``, ``ranking``,
        ``input.query(q)``).
    """
    return {
        "yql": "select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q))",
        "query": query_text,
        "hits": top_k,
        "ranking": "fusion",
        # Reference the `query` parameter instead of splicing the raw text into
        # the embed() expression: parentheses, commas or quotes inside the query
        # can otherwise change how the expression is parsed.
        "input.query(q)": "embed(@query)",
    }

# Rank-profile name -> function that builds the query body for that profile.
query_fn_for_ranking = dict(
    bm25=query_fn_lexical,
    semantic=query_fn_semantic,
    fusion=query_fn_fusion,
)

# Evaluate each profile on the test queries, in insertion order
# (bm25, semantic, fusion), and print its metrics.
for ranking, query_fn in query_fn_for_ranking.items():
    evaluator = VespaEvaluator(
        queries=test_queries_dict,
        relevant_docs=relevant_docs,
        vespa_query_fn=query_fn,
        app=app,
        name=f"test-run-{ranking}",
        accuracy_at_k=[10],
        precision_recall_at_k=[100],
        mrr_at_k=[10],
        ndcg_at_k=[10],
        write_csv=True,
    )
    results = evaluator.run()

    print(f"Results for {ranking}:")
    print("Primary metric:", evaluator.primary_metric)
    print("All results:", results)

Results for bm25:
Primary metric: ndcg@10
All results: {'accuracy@10': 0.7261261261261261, 'precision@100': 0.009045045045045046, 'recall@100': 0.8675675675675676, 'mrr@10': 0.5284355784355782, 'ndcg@10': 0.5695866399682291, 'map@100': 0.5286580403643053, 'searchtime_avg': 0.030329729729729735, 'searchtime_q50': 0.03, 'searchtime_q90': 0.051000000000000004, 'searchtime_q95': 0.056}
Results for semantic:
Primary metric: ndcg@10
All results: {'accuracy@10': 0.8738738738738738, 'precision@100': 0.009729729729729731, 'recall@100': 0.9333333333333333, 'mrr@10': 0.715070785070785, 'ndcg@10': 0.7487514746276558, 'map@100': 0.7112547552156819, 'searchtime_avg': 0.10021261261261262, 'searchtime_q50': 0.106, 'searchtime_q90': 0.122, 'searchtime_q95': 0.126}
Results for fusion:
Primary metric: ndcg@10
All results: {'accuracy@10': 0.8738738738738738, 'precision@100': 0.00990990990990991, 'recall@100': 0.9504504504504504, 'mrr@10': 0.6935242385242383, 'ndcg@10': 0.7312417771698385, 'map@100': 0.689