In [1]:
import pyvespa_functions as pf
import pandas as pd
from vespa.deployment import VespaDocker

from vespa.package import ApplicationPackage, Document, Field, FieldSet,\
                            RankProfile, Schema, Function, HNSW,\
                            FirstPhaseRanking, SecondPhaseRanking,\
                            Component, Parameter, GlobalPhaseRanking

In [2]:
vespa_docker = VespaDocker()
app = ApplicationPackage(name="findmypasta")

# In case you want to remove the container, uncomment the following line
# vespa_docker.container.stop()
# vespa_docker.container.remove()

In [3]:
package = ApplicationPackage(
            name="findmypasta",
            schema=[
                Schema(
                    name="doc",
                    document=Document(
                        fields=[
                            Field(
                                name="id",
                                type="int",
                                indexing=["attribute", "summary"]
                            ),
                            Field(
                                name="title",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25"
                            ),
                            Field(
                                name="description",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25"
                            ),
                            Field(
                                name="tags",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25",
                                bolding=True,
                            ),
                            Field(
                                name="steps",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25",
                                bolding=True,
                            ),
                            Field(
                                name="ingredients",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25",
                                bolding=True,
                            ),
                            Field(name="body",
                                type="string",
                                indexing=["index", "summary"],
                                index="enable-bm25", bolding=True
                            ),
                            Field(
                                name="embedding_body",
                                type="tensor<float>(x[384])",
                                indexing=["input title . \" \" . input body", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False
                            ),
                            Field(
                                name="embedding_tags",
                                type="tensor<float>(x[384])",
                                indexing=["input title . \" \" . input tags", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False
                            ),
                            Field(
                                name="embedding_steps",
                                type="tensor<float>(x[384])",
                                indexing=["input title . \" \" . input steps", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False
                            ),
                            Field(
                                name="embedding_description",
                                type="tensor<float>(x[384])",
                                indexing=["input title . \" \" . input description", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False
                            ),
                            Field(
                                name="embedding_ingredients",
                                type="tensor<float>(x[384])",
                                indexing=["input title . \" \" . input ingredients", "embed", "index", "attribute"],
                                ann=HNSW(distance_metric="angular"),
                                is_document_field=False
                            ),
                        ]
                    ),
                    fieldsets=[
                        FieldSet(name="default", fields=["title", "body", "tags", "steps", "description", "ingredients"]),
                    ],
                    rank_profiles=[
                        RankProfile(
                            name="hybrid_all_fields_separeted", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            functions=[
                                Function(
                                    name="fusion",
                                    expression="bm25(title) + bm25(tags) + bm25(steps) + bm25(description) + bm25(ingredients)"
                                )
                            ],
                            first_phase=FirstPhaseRanking(expression="closeness(field, embedding_body)"),
                            global_phase=GlobalPhaseRanking(expression="fusion"),
                            match_features = [
                                "bm25(title)",
                                "bm25(tags)",
                                "bm25(steps)",
                                "bm25(description)",
                                "bm25(ingredients)",
                                "fusion"
                            ]
                        ),
                        RankProfile(
                            name="hybrid_body", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase=FirstPhaseRanking(expression="closeness(field, embedding_body)"),
                            global_phase=GlobalPhaseRanking(expression="reciprocal_rank_fusion(bm25(title), closeness(field, embedding_body))"),
                            match_features = [
                                "bm25(title)",
                                "closeness(field, embedding_body)",
                            ]
                        ),
                        RankProfile(
                            name="hybrid_steps_description", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase=FirstPhaseRanking(expression="closeness(field, embedding_body)"),
                            global_phase=GlobalPhaseRanking(expression="reciprocal_rank_fusion(bm25(title), bm25(steps), closeness(field, embedding_description))"),
                            match_features = [
                                "bm25(title)",
                                "bm25(steps)",
                                "closeness(field, embedding_description)",
                            ]
                        ),
                        RankProfile(
                            name="hybrid_guided_by_colbert_1", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase=FirstPhaseRanking(expression="closeness(field, embedding_body)"),
                            global_phase=GlobalPhaseRanking(expression="reciprocal_rank_fusion(bm25(title), closeness(field, embedding_description), closeness(field, embedding_ingredients))"),
                            match_features = [
                                "bm25(title)",
                                "closeness(field, embedding_description)",
                                "closeness(field, embedding_ingredients)",
                            ]
                        ),
                        RankProfile(
                            name="hybrid_guided_by_colbert_2", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase=FirstPhaseRanking(expression="closeness(field, embedding_body)"),
                            global_phase=GlobalPhaseRanking(expression="reciprocal_rank_fusion(bm25(title), closeness(field, embedding_description), closeness(field, embedding_ingredients), closeness(field, embedding_steps))"),
                            match_features = [
                                "bm25(title)",
                                "closeness(field, embedding_description)",
                                "closeness(field, embedding_ingredients)",
                                "closeness(field, embedding_steps)",
                            ]
                        ),
                        RankProfile(
                            name="semantic_only_body", 
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase=FirstPhaseRanking(expression="closeness(field, embedding_body)"),
                            match_features = [
                                "closeness(field, embedding_body)",
                            ]
                        ),
                    ]
                )
            ],
            components=[
                Component(
                    id="e5",
                    type="hugging-face-embedder",
                    parameters=[
                        Parameter(
                            name="transformer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                            },
                        ),
                        Parameter(
                            name="tokenizer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                            },
                        ),
                    ],
                ),
            ]
        )


In [4]:
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 30/300 seconds...
Using plain http against endpoin

In [5]:
# df = pd.read_csv('../archive/RAW_recipes.csv')

# # rename columns
# df = df.rename(columns={'id': 'id',
#                         'name': 'title',
#                         'description': 'description',
#                         'tags': 'tags',
#                         'steps': 'steps',
#                         'ingredients': 'ingredients'}
#                     )


# df.head()

In [6]:
df = pd.read_csv('food_dataset.csv')

df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,title,body
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,60-minutes-or-less ; time-to-make ; course ; m...,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,make a choice and proceed with recipe ; depend...,autumn is my favorite time of year to cook! th...,winter squash ; mexican seasoning ; mixed spic...,7,arriba baked winter squash mexican style,arriba baked winter squash mexican style\n\n...
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,30-minutes-or-less ; time-to-make ; course ; m...,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,preheat oven to 425 degrees f ; press dough in...,this recipe calls for the crust to be prebaked...,prepared pizza crust ; sausage patty ; eggs ; ...,6,a bit different breakfast pizza,a bit different breakfast pizza\n\nRecipe pos...
2,all in the kitchen chili,112140,130,196586,2005-02-25,time-to-make ; course ; preparation ; main-dis...,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,brown ground beef in large pot ; add chopped o...,this modified version of 'mom's' chili was a h...,ground beef ; yellow onions ; diced tomatoes ;...,13,all in the kitchen chili,all in the kitchen chili\n\nRecipe posted on:...
3,alouette potatoes,59389,45,68585,2003-04-14,60-minutes-or-less ; time-to-make ; course ; m...,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,place potatoes in a large pot of lightly salte...,"this is a super easy, great tasting, make ahea...",spreadable cheese with garlic and herbs ; new ...,11,alouette potatoes,alouette potatoes\n\nRecipe posted on: 2003-0...
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,weeknight ; time-to-make ; course ; main-ingre...,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,mix all ingredients& boil for 2 1 / 2 hours ;...,my dh's amish mother raised him on this recipe...,tomato juice ; apple cider vinegar ; sugar ; s...,8,amish tomato ketchup for canning,amish tomato ketchup for canning\n\nRecipe p...


In [7]:
df.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'title', 'body'],
      dtype='object')

In [8]:
from vespa.io import VespaQueryResponse, VespaResponse

def callback(response: VespaResponse, id: str):
        if not response.is_successful():
                print(f"Error when feeding document {id}: {response.get_json()}")

def feed(data_to_feed):
        vespa_feed_slice = data_to_feed.apply(to_vespa_format, axis=1)

        app.feed_iterable(vespa_feed_slice, schema="doc", namespace="findmypasta", callback=callback)

def to_vespa_format(x):
#     "id"
#         "title"
#         "description"
#         "tags"
#         "steps"
#         "ingredients"
#         "body"
    return {"id": x["id"], 
            "fields": { 
                "id": x["id"],
                "title": x["name"],
                "description": x["description"],
                "tags": x["tags"],
                "steps": x["steps"],
                "ingredients": x["ingredients"],
                "body": x["body"],
                }
        }

In [9]:
NUMBER_OF_DOCS = 10000

# number of fed documents
documents = app.query(yql='select * from sources * where true')
if documents.number_documents_indexed > 0:
    print(f"Number of documents fed: {documents.number_documents_indexed}")
else:
    feed(df[:NUMBER_OF_DOCS])
    
    documents = app.query(yql='select * from sources * where true')
    print(f"Number of documents fed: {documents.number_documents_indexed}")



Error when feeding document 12441: {'Exception': "Error in document 'id:findmypasta:doc::12441' - could not parse field 'description' of type 'string': The string field value contains illegal code point 0xC: The string field value contains illegal code point 0xC", 'id': 12441, 'message': 'Exception during feed_data_point'}
Number of documents fed: 9999


In [14]:
from vespa.io import VespaQueryResponse

model_to_ranking_dict = {
    "hybrid_all_fields_separeted": "hybrid_all_fields_separeted",
    "hybrid_body": "hybrid_body",
    "hybrid_steps_description": "hybrid_steps_description",
    "hybrid_guided_by_colbert_1": "hybrid_guided_by_colbert_1",
    "hybrid_guided_by_colbert_2": "hybrid_guided_by_colbert_2",
    "semantic_only_body": "semantic_only_body",
}

model = "semantic_only_body"




with app.syncio(connections=1) as session:
    query="Vegan meat"
    with app.syncio(connections=1) as session:
            response:VespaQueryResponse = session.query(
                yql="""select * from sources * where rank({targetHits:1000}nearestNeighbor(embedding_description,q),
                                                        {targetHits:1000}nearestNeighbor(embedding_ingredients,q), 
                                                        {targetHits:1000}nearestNeighbor(embedding_steps,q),
                                                        {targetHits:1000}nearestNeighbor(embedding_tags,q),
                                                        {targetHits:1000}nearestNeighbor(embedding_body,q),
                                                        userQuery()) limit 5""",
                query=query,
                ranking=model,
                body = {
                "input.query(q)": f"embed({query})"
                }
            )
    assert(response.is_successful())

print(response.hits)

for hit in response.hits:
    print(f"{hit['fields']['title']}")
    for key, value in hit.items():
        print(f"{key}: {value}")
    print('\n\n')

[{'id': 'id:findmypasta:doc::522117', 'relevance': 0.6240735019665155, 'source': 'findmypasta_content', 'fields': {'matchfeatures': {'closeness(field,embedding_body)': 0.6240735019665155}, 'sddocname': 'doc', 'body': 'alan s vegetarian <hi>vegan</hi> bean and rice burger mashup\n\nRecipe posted on: 2015-05-19\n\nTags: time-to-make, course, main-ingredient, cuisine, preparation, for-large-groups, healthy, main-dish, beans, rice, mexican, low-fat, <hi>vegan</hi>, vegetarian, dietary, low-cholesterol, low-saturated-fat, black-beans, low-in-something, pasta-rice-and-grains, brown-rice, number-of-servings, 4-hours-or-less, burgers\n\nDescription: black beans, pinto beans and brown rice mixed with breadcrumbs, picante sauce and seasoned with chili powder, cumin, garlic, oregano, cayenne pepper, salt, pepper.\ni use this in bulk.\nroll up your sleeves, and when you wash your hands, you might want to wash up to your elbows because you might be elbow deep in this when mixing it. \ni like to fre

In [11]:
# loading the Questions.xlsx and answering each question query
import pandas as pd
questions = pd.read_excel('../input/Questions.xlsx')
questions = pd.read_excel('../input/Recipe_Search_Questions.xlsx')

questions.head()

Unnamed: 0,Tipo,Descrição,Query
0,Keywords,Pergunta simples,grilled cheese sandwich recipe
1,Keywords,Pergunta simples,mango smoothie
2,Semantica,Pergunta média,gluten-free bread without yeast
3,Semantica,Pergunta média,low carb dessert for diabetics
4,Semantica,Pergunta difícil,traditional Japanese breakfast for a family


In [16]:
from vespa.io import VespaQueryResponse
import json

# Supondo que 'questions' é um DataFrame com colunas ['Query', 'Tipo', 'Descrição']

model_to_ranking_dict = {
    "hybrid_all_fields_separeted": "hybrid_all_fields_separeted",
    "hybrid_body": "hybrid_body",
    "hybrid_steps_description": "hybrid_steps_description",
    "hybrid_guided_by_colbert_1": "hybrid_guided_by_colbert_1",
    "hybrid_guided_by_colbert_2": "hybrid_guided_by_colbert_2",
    "semantic_only_body": "semantic_only_body",
}


for selected_model in model_to_ranking_dict.values():
    data = pd.DataFrame(columns=['id', 'title', 'Query', 'Tipo', 'Descrição'])
    output_name = 'output/results_working_' + selected_model + '.xlsx'
    if model_to_ranking_dict[selected_model] is not None:
        i = 0
        for input_query in questions['Query']:
            # save a checkpoint each 100 queries
            if i % 100 == 0:
                data.to_excel(output_name, index=False)

            with app.syncio(connections=1) as session:
                try:
                    response:VespaQueryResponse = session.query(
                        yql="""select * from sources * where rank({targetHits:1000}nearestNeighbor(embedding_description,q),
                                        {targetHits:1000}nearestNeighbor(embedding_ingredients,q), 
                                        {targetHits:1000}nearestNeighbor(embedding_steps,q),
                                        {targetHits:1000}nearestNeighbor(embedding_tags,q),
                                        {targetHits:1000}nearestNeighbor(embedding_body,q),
                                        userQuery()) limit 5""",
                        ranking=selected_model,
                        query=input_query, 
                        body={
                            "input.query(q)": f'embed(e5, "{input_query}")',
                            "input.query(qt)": f'embed(colbert, "{input_query}")',
                        },
                        hits=5
                    )
                    assert response.is_successful()
                except Exception as e:
                    print(f"Error with query '{input_query}': {e}")
                    continue


                for hit in response.hits:
                    record = {}
                    for field in ['id', 'title']:
                        record[field] = hit['fields'].get(field, None)
                    record["Query"] = input_query
                    record["Tipo"] = questions[questions['Query'] == input_query]['Tipo'].values[0]
                    record["Descrição"] = questions[questions['Query'] == input_query]['Descrição'].values[0]
                    
                    # get the
                    data = pd.concat([data, pd.DataFrame([record])], ignore_index=True)

            i += 1

        # Sorting
        data = data.sort_values(by=['Tipo', 'Query'])

        # reordering columns
        data = data[['Tipo', 'Descrição', 'Query', 'id', 'title']]

        # exporting to excel
        data.to_excel(output_name, index=False)