In [14]:
from pathlib import Path
from typing import Iterator

import pandas as pd
from vespa.deployment import VespaDocker
from vespa.package import (
    ApplicationPackage,
    Component,
    Document,
    Field,
    FieldSet,
    FirstPhaseRanking,
    Function,
    GlobalPhaseRanking,
    HNSW,
    Parameter,
    RankProfile,
    Schema,
    SecondPhaseRanking,
)

import pyvespa_functions as pf

In [15]:
# Docker deployment helper; the application package is deployed through it in
# a later cell, which is also where `app` (the handle used for feeding and
# querying) gets bound. `app` is deliberately NOT assigned here: binding it to
# a bare ApplicationPackage was always overwritten by the deploy step, and
# re-running this cell after deployment would clobber the live handle.
vespa_docker = VespaDocker()

# In case you want to remove the container, uncomment the following lines
# vespa_docker.container.stop()
# vespa_docker.container.remove()

In [16]:
# Vespa application definition: one "doc" schema (lexical text fields plus two
# embedding fields), six rank profiles, and an "e5" embedder component.
package = ApplicationPackage(
            name="findmypasta",
            schema=[
                Schema(
                    name="doc",
                    document=Document(
                        fields=[
                            # Integer id, stored as an attribute and returned in summaries.
                            Field(
                                name="id",
                                type="int",
                                indexing=["attribute", "summary"]
                            ),
                            # Text fields below are indexed with BM25 enabled.
                            Field(
                                name="title",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25"
                            ),
                            Field(
                                name="description",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25"
                            ),
                            # Array-of-string fields, BM25-enabled, with bolding turned on.
                            Field(
                                name="tags",
                                type="array<string>",
                                indexing=["index", "summary"],
                                index="enable-bm25",
                                bolding=True,
                            ),
                            Field(
                                name="steps",
                                type="array<string>",
                                indexing=["index", "summary"],
                                index="enable-bm25",
                                bolding=True,
                            ),
                            Field(
                                name="ingredients",
                                type="array<string>",
                                indexing=["index", "summary"],
                                index="enable-bm25",
                                bolding=True,
                            ),
                            # Full text of the recipe; also the input of the `embedding` field.
                            Field(name="body",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25", bolding=True
                            ),
                            # 384-dim float tensor produced by the `embed` step of the indexing
                            # pipeline from `body`; HNSW index with angular distance.
                            # is_document_field=False: presumably declares the field outside the
                            # document block (synthetic field) — confirm against pyvespa docs.
                            Field(name="embedding", type="tensor<float>(x[384])",
                                indexing=["input body", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False
                            ),
                            # `body` split into an array of strings; input of `body_split_embedding`.
                            # NOTE(review): not part of the "default" fieldset below.
                            Field(
                                name = "body_split",
                                type = "array<string>",
                                indexing = ["index", "summary"],
                                index = "enable-bm25",
                                bolding = True,
                            ),
                            # Mapped tensor (dimension p) of 384-dim vectors embedded from
                            # `body_split` — presumably one vector per array element; confirm.
                            Field(
                                name="body_split_embedding",
                                type="tensor<float>(p{},x[384])",
                                indexing=["input body_split", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False,
                            ),
                        ]
                    ),
                    fieldsets=[
                        FieldSet(name="default", fields=["title", "body", "tags", "steps", "description", "ingredients"]),
                    ],
                    rank_profiles=[
                        # Ranks by closeness of query(q) to the whole-body embedding.
                        # Also serves as the parent of the two two-phase profiles below.
                        RankProfile(
                            name="semantic", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase="closeness(field, embedding)"
                        ),
                        # Ranks by cos(distance) on the whole-body embedding.
                        RankProfile(
                            name="multi_index_cos_body",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            inherits="default",
                            first_phase="cos(distance(field,embedding))", # this can be computed
                            # match_features=["closest(embedding)"],
                        ),
                        # Ranks by cos(distance) on the split-body embedding and exposes
                        # closest(body_split_embedding) as a match feature.
                        RankProfile(
                            name="multi_index_cos_body_split",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            inherits="default",
                            first_phase="cos(distance(field,body_split_embedding))",
                            match_features=["closest(body_split_embedding)"],
                        ),
                        # Ranks by closeness on the split-body embedding.
                        # The profile name (including its spelling) is referenced by later cells.
                        RankProfile(
                            name="multi_index_closseness_body_split",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase="closeness(field, body_split_embedding)"
                        ),
                        # Two-phase profile: first phase is cos(distance) on the body embedding;
                        # second phase adds the average per-chunk similarity and a log-BM25 term.
                        RankProfile(
                            name="multi_index_body_split_bm25",
                            inherits="semantic",
                            functions=[
                                # L2-normalizes query(q) and body_split_embedding along x, sums their
                                # product over x (one value per p cell), then averages over p.
                                Function(
                                    name="avg_fields_similarity",
                                    expression="""reduce(
                                                sum(l2_normalize(query(q),x) * l2_normalize(attribute(body_split_embedding),x),x),
                                                avg,
                                                p
                                            )""",
                                ),
                            ],
                            first_phase=FirstPhaseRanking(
                                expression="cos(distance(field,embedding))"
                            ),
                            # NOTE(review): log(bm25(title) + bm25(description)) evaluates log(0) when
                            # both BM25 scores are 0 (e.g. a hit retrieved only through
                            # nearestNeighbor) — confirm this is intended, or consider log(1 + ...).
                            second_phase=SecondPhaseRanking(
                                expression="firstPhase + avg_fields_similarity() + log( bm25(title) + bm25(description))"
                            ),
                            match_features=[
                                "firstPhase",
                                "bm25(title)",
                                "avg_fields_similarity",
                            ],
                        ),
                        # Same as the profile above but without the BM25 term in the second phase.
                        RankProfile(
                            name="multi_index_body_split_avg_fields_similarity",
                            inherits="semantic",
                            functions=[
                                # Same expression as avg_fields_similarity in the previous profile.
                                Function(
                                    name="avg_fields_similarity",
                                    expression="""reduce(
                                                sum(l2_normalize(query(q),x) * l2_normalize(attribute(body_split_embedding),x),x),
                                                avg,
                                                p
                                            )""",
                                ),
                            ],
                            first_phase=FirstPhaseRanking(
                                expression="cos(distance(field,embedding))"
                            ),
                            second_phase=SecondPhaseRanking(
                                expression="firstPhase + avg_fields_similarity()"
                            ),
                            match_features=[
                                "firstPhase",
                                "bm25(title)",
                                "avg_fields_similarity",
                            ],
                        )
                    ]
                )
            ],
            components=[
                # Hugging Face embedder registered under id "e5"; model and tokenizer
                # are fetched from the intfloat/e5-small-v2 repository URLs below.
                Component(
                    id="e5",
                    type="hugging-face-embedder",
                    parameters=[
                        Parameter(
                            name="transformer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                            },
                        ),
                        Parameter(
                            name="tokenizer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                            },
                        ),
                    ],
                ),
            ]
        )


In [17]:
# Deploy the package through the Docker helper; `app` is the handle that the
# later cells use for feeding (feed_iterable) and querying (query / syncio).
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 30/300 seconds...
Using plain http against endpoin

In [18]:
# df = pd.read_csv('../archive/RAW_recipes.csv')

# # rename columns
# df = df.rename(columns={'id': 'id',
#                         'name': 'title',
#                         'description': 'description',
#                         'tags': 'tags',
#                         'steps': 'steps',
#                         'ingredients': 'ingredients'}
#                     )


# df.head()

In [19]:
from vespa.io import VespaQueryResponse, VespaResponse

def callback(response: VespaResponse, id: str):
    """Report a failed feed operation on stdout.

    Args:
        response: Result of feeding one document; only ``is_successful()``
            and ``get_json()`` are used.
        id: Identifier of the document the response belongs to.

    Successful responses produce no output.
    """
    if response.is_successful():
        return
    failure_payload = response.get_json()
    print(f"Error when feeding document {id}: {failure_payload}")

def feed(data_to_feed):
    """Convert each row of ``data_to_feed`` to a Vespa document and feed it.

    Rows are mapped with ``to_vespa_format`` (row-wise ``apply``) and the
    resulting documents are sent to the "doc" schema in the "findmypasta"
    namespace; failures are reported through ``callback``.
    """
    vespa_documents = data_to_feed.apply(to_vespa_format, axis=1)
    app.feed_iterable(
        vespa_documents,
        schema="doc",
        namespace="findmypasta",
        callback=callback,
    )

def to_vespa_format(x):
    """Map one source record to the ``{"id", "fields"}`` feed structure.

    ``x`` is indexed by key; the source key "name" becomes the "title" field,
    every other field keeps the name of its source key.
    """
    doc_id = x["id"]
    # (source key, document field) pairs, in the order the fields are emitted.
    source_to_field = (
        ("id", "id"),
        ("name", "title"),
        ("description", "description"),
        ("tags", "tags"),
        ("steps", "steps"),
        ("ingredients", "ingredients"),
        ("body", "body"),
    )
    fields = {field: x[source] for source, field in source_to_field}
    return {"id": doc_id, "fields": fields}

In [20]:
import json

def from_file_generator(path: str = "vespa_feed2.jsonl") -> Iterator[dict]:
    """Yield one parsed JSON document per non-blank line of a JSON Lines file.

    Args:
        path: Location of the feed file. Defaults to ``"vespa_feed2.jsonl"``,
            so existing zero-argument calls keep reading the same file.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised on first iteration).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as feed_file:
        for line in feed_file:
            # Skip blank lines (e.g. a trailing empty line) instead of failing to parse them.
            if line.strip():
                yield json.loads(line)

In [21]:
NUMBER_OF_DOCS = 10000
MATCH_ALL_YQL = 'select * from sources * where true'

# Feed only when the index is empty, then report how many documents are indexed.
documents = app.query(yql=MATCH_ALL_YQL)
if documents.number_documents_indexed <= 0:
    # Alternative source: feed(df[:NUMBER_OF_DOCS])
    app.feed_iterable(
        from_file_generator(),
        schema="doc",
        namespace="findmypasta",
        callback=callback,
    )
    # Re-query so the count reflects what was just fed.
    documents = app.query(yql=MATCH_ALL_YQL)

print(f"Number of documents fed: {documents.number_documents_indexed}")



Error when feeding document 12441: {'Exception': "Error in document 'id:findmypasta:doc::12441' - could not parse field 'body_split' of type 'Array<string>': The string field value contains illegal code point 0xC: The string field value contains illegal code point 0xC", 'id': 12441, 'message': 'Exception during feed_data_point'}
Number of documents fed: 9999


In [27]:
from vespa.io import VespaQueryResponse

# model_to_ranking_dict = {
#     "multi_index_cos_body": "multi_index_cos_body",
#     "multi_index_cos_body_split": "multi_index_cos_body_split",
#     "multi_index_closseness_body_split": "multi_index_closseness_body_split",
#     "multi_index_body_split_bm25": "multi_index_body_split_bm25",
#     "multi_index_body_split_avg_fields_similarity": "multi_index_body_split_avg_fields_similarity"
# }

model = "multi_index_body_split_avg_fields_similarity"

# NOTE(review): "daugheter" looks like a typo for "daughter". It is left as-is
# because changing the text changes the lexical (userQuery) match — confirm intent.
query = "my daugheter is allergic to eggs and milk, what can I cook for her?"

# Retrieve with both nearestNeighbor operators plus the lexical userQuery(),
# ranked by the profile selected above. A single session is sufficient; nesting
# a second `app.syncio(...)` only shadows the first and leaves it idle.
with app.syncio(connections=1) as session:
    response: VespaQueryResponse = session.query(
        yql="""select * from sources * where rank({targetHits:1000}nearestNeighbor(body_split_embedding,q),
                                                {targetHits:1000}nearestNeighbor(embedding,q),
                                                userQuery()) limit 5""",
        query=query,
        ranking=model,
        body={
            # Name the embedder ("e5", the component declared in the package) and
            # quote the text, matching the form used by the evaluation cell, so the
            # comma inside the query cannot be taken for the embedder-id separator.
            "input.query(q)": f'embed(e5, "{query}")'
        },
    )
assert response.is_successful()

print(response.hits)

# Print each hit's title followed by every top-level key of the hit.
for hit in response.hits:
    print(f"{hit['fields']['title']}")
    for key, value in hit.items():
        print(f"{key}: {value}")
    print('\n\n')

[{'id': 'id:findmypasta:doc::367865', 'relevance': -0.1734251390937488, 'source': 'findmypasta_content', 'fields': {'matchfeatures': {'bm25(title)': 1.0374410702880272, 'firstPhase': -0.9999876894265599, 'avg_fields_similarity': 0.8265625503328111}, 'sddocname': 'doc', 'ingredients': ['cream', 'water', 'parmesan cheese', 'sea salt', 'pepper', 'mustard greens', '<hi>eggs</hi>', 'butter', 'olive oil flavored cooking spray'], 'body_split': ['green eggs  kid friendly   high in iron', 'Recipe posted on: 2009-04-25', 'Tags: 15-minutes-or-less, time-to-make, course, main-ingredient, preparation, very-low-carbs, breakfast, eggs-dairy, vegetables, easy, beginner-cook, dietary, low-sodium, low-calorie, low-carb, low-in-something, 3-steps-or-less', 'Description: my daughter and i are both anemic and were looking for easy and yummy ways to incorporate more iron into our diet. i\'ll admit they don\'t look really appetizing, but these eggs are light and fluffy and are great accompanied by dr. seuss\

In [23]:
# Load the evaluation queries from Recipe_Search_Questions.xlsx
# (columns used by the evaluation loop: Query, Tipo, Descrição).
# Questions.xlsx used to be read first and was immediately overwritten by this
# file, so only this read is kept.
import pandas as pd
questions = pd.read_excel('../input/Recipe_Search_Questions.xlsx')

questions.head()

Unnamed: 0,Tipo,Descrição,Query
0,Keywords,Pergunta simples,grilled cheese sandwich recipe
1,Keywords,Pergunta simples,mango smoothie
2,Semantica,Pergunta média,gluten-free bread without yeast
3,Semantica,Pergunta média,low carb dessert for diabetics
4,Semantica,Pergunta difícil,traditional Japanese breakfast for a family


In [24]:
from vespa.io import VespaQueryResponse
import json

# Assumes 'questions' is a DataFrame with columns ['Query', 'Tipo', 'Descrição']


# Rank profiles to evaluate: label -> Vespa rank-profile name.
# Set a value to None to skip that profile.
model_to_ranking_dict = {
    "multi_index_cos_body": "multi_index_cos_body",
    "multi_index_cos_body_split": "multi_index_cos_body_split",
    "multi_index_closseness_body_split": "multi_index_closseness_body_split",
    "multi_index_body_split_bm25": "multi_index_body_split_bm25",
    "multi_index_body_split_avg_fields_similarity": "multi_index_body_split_avg_fields_similarity"
}

# Column order of the exported spreadsheets.
RESULT_COLUMNS = ['Tipo', 'Descrição', 'Query', 'id', 'title']
# A partial result file is written every this many queries.
CHECKPOINT_EVERY = 100

# Both nearestNeighbor operators plus the lexical userQuery(), top 5 hits.
EVAL_YQL = (
    "select * from sources * where rank("
    "{targetHits:1000}nearestNeighbor(body_split_embedding,q), "
    "{targetHits:1000}nearestNeighbor(embedding,q), "
    "userQuery()) limit 5"
)


def _results_frame(records):
    """Build the results table (fixed column order) from accumulated hit records."""
    return pd.DataFrame(records, columns=RESULT_COLUMNS)


for selected_model in model_to_ranking_dict.values():
    # Skip disabled entries (the value itself is the rank-profile name).
    if selected_model is None:
        continue

    output_name = 'output_2/results_working_' + selected_model + '.xlsx'
    # Make sure the output directory exists before the first checkpoint is written.
    Path(output_name).parent.mkdir(parents=True, exist_ok=True)

    # Hit records are collected in a list and turned into a DataFrame once,
    # instead of concatenating a one-row frame per hit.
    records = []

    # One session per rank profile is reused for all of its queries.
    with app.syncio(connections=1) as session:
        query_rows = zip(questions['Query'], questions['Tipo'], questions['Descrição'])
        # enumerate keeps the checkpoint counter advancing even when a query fails.
        for i, (input_query, tipo, descricao) in enumerate(query_rows):
            # save a checkpoint each CHECKPOINT_EVERY queries
            if i % CHECKPOINT_EVERY == 0:
                _results_frame(records).to_excel(output_name, index=False)

            try:
                response: VespaQueryResponse = session.query(
                    yql=EVAL_YQL,
                    ranking=selected_model,
                    query=input_query,
                    body={
                        # Only query(q) is sent: it is the single input the rank
                        # profiles declare, and "e5" is the only embedder component
                        # in the application package.
                        "input.query(q)": f'embed(e5, "{input_query}")',
                    },
                    hits=5
                )
            except Exception as e:
                # Best effort: report the failing query and move on to the next one.
                print(f"Error with query '{input_query}': {e}")
                continue

            if not response.is_successful():
                # Include the response payload so the failure reason is visible.
                print(f"Error with query '{input_query}': {response.get_json()}")
                continue

            for hit in response.hits:
                fields = hit['fields']
                records.append({
                    'id': fields.get('id', None),
                    'title': fields.get('title', None),
                    'Query': input_query,
                    'Tipo': tipo,
                    'Descrição': descricao,
                })

    # Sort by question type, then query, and export the final results.
    data = _results_frame(records).sort_values(by=['Tipo', 'Query'])
    data.to_excel(output_name, index=False)