In [17]:
import pyvespa_functions as pf
import pandas as pd
from vespa.deployment import VespaDocker

from vespa.package import ApplicationPackage, Document, Field, FieldSet,\
                            RankProfile, Schema, Function, HNSW,\
                            FirstPhaseRanking, SecondPhaseRanking,\
                            Component, Parameter

In [18]:
vespa_docker = VespaDocker()
app = ApplicationPackage(name="findmypasta")

# In case you want to remove the container, uncomment the following line
# vespa_docker.container.stop()
# vespa_docker.container.remove()

In [19]:
package = ApplicationPackage(
            name="findmypasta",
            schema=[
                Schema(
                    name="doc",
                    document=Document(
                        fields=[
                            Field(name="id", type="string", indexing=["summary"]),
                            Field(name="title", type="string", indexing=["index", "summary"]),
                            Field(name="steps", type="array<string>", indexing=["summary", "index"]),
                            Field(
                                name="embedding",
                                type="tensor<bfloat16>(steps{}, x[384])",
                                indexing=[
                                    "input steps",
                                    'for_each { (input title || "") . " " . ( _ || "") }',
                                    "embed e5",
                                    "attribute",
                                ],
                                attribute=["distance-metric: angular"],
                                is_document_field=False,
                            ),
                            Field(
                                name="colbert",
                                type="tensor<int8>(steps{}, token{}, v[16])",
                                indexing=["input steps", "embed colbert steps", "attribute"],
                                is_document_field=False,
                            ),
                        ]
                    ),
                    rank_profiles=[
                        RankProfile(
                            name="colbert_local",
                            inputs=[
                                ("query(q)", "tensor<float>(x[384])"),
                                ("query(qt)", "tensor<float>(querytoken{}, v[128])"),
                            ],
                            functions=[
                                Function(name="cos_sim", expression="closeness(field, embedding)"),
                                Function(
                                    name="max_sim_per_steps",
                                    expression="""
                                        sum(
                                            reduce(
                                                sum(
                                                    query(qt) * unpack_bits(attribute(colbert)) , v
                                                ),
                                                max, token
                                            ),
                                            querytoken
                                        )
                                    """,
                                ),
                                Function(
                                    name="max_sim_local", expression="reduce(max_sim_per_steps, max, steps)"
                                ),
                            ],
                            first_phase=FirstPhaseRanking(expression="cos_sim"),
                            second_phase=SecondPhaseRanking(expression="max_sim_local"),
                            match_features=["cos_sim", "max_sim_local", "max_sim_per_steps"],
                        ),
                        RankProfile(
                            name="colbert_global",
                            inputs=[
                                ("query(q)", "tensor<float>(x[384])"),
                                ("query(qt)", "tensor<float>(querytoken{}, v[128])"),
                            ],
                            functions=[
                                Function(name="cos_sim", expression="closeness(field, embedding)"),
                                Function(
                                    name="max_sim_cross_steps",
                                    expression="""
                                        sum(
                                            reduce(
                                                sum(
                                                    query(qt) *  unpack_bits(attribute(colbert)) , v
                                                ),
                                                max, token, steps
                                            ),
                                            querytoken
                                        )
                                        """
                                ),
                                Function(
                                    name="max_sim_global", expression="reduce(max_sim_cross_steps, max)"
                                ),
                            ],
                            first_phase=FirstPhaseRanking(expression="cos_sim"),
                            second_phase=SecondPhaseRanking(expression="max_sim_global", rerank_count=5),
                            match_features=[
                            "cos_sim",
                            "max_sim_global",
                            "max_sim_cross_steps",
                            ],
                        )
                    ]
                )
            ],
            components=[
                Component(
                    id="e5",
                    type="hugging-face-embedder",
                    parameters=[
                        Parameter(
                            name="transformer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                            },
                        ),
                        Parameter(
                            name="tokenizer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                            },
                        ),
                    ],
                ),
                Component(
                    id="colbert",
                    type="colbert-embedder",
                    parameters=[
                        Parameter(
                            name="transformer-model",
                            args={
                                "url": "https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"
                            },
                        ),
                        Parameter(
                            name="tokenizer-model",
                            args={
                                "url": "https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"
                            },
                        ),
                    ],
                ),
            ]
        )


In [20]:
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 30/300 seconds...
Using plain http against endpoin

In [21]:
df = pd.read_csv('../archive/RAW_recipes.csv')
df['steps'] = df['steps'].apply(lambda x: x[1:-1].replace("'", "").split(', '))

# rename columns
df = df.rename(columns={'name': 'title', 'steps': 'steps', 'ingredients': 'ingredients'})
df.head()

Unnamed: 0,title,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [22]:
from vespa.io import VespaQueryResponse, VespaResponse

def callback(response: VespaResponse, id: str):
        if not response.is_successful():
                print(f"Error when feeding document {id}: {response.get_json()}")

def feed(data_to_feed):
        vespa_feed_slice = data_to_feed.apply(to_vespa_format, axis=1)

        app.feed_iterable(vespa_feed_slice, schema="doc", namespace="findmypasta", callback=callback)

def to_vespa_format(x):
    return {"id": x["id"], "fields": { "title": x["title"], "steps": x["steps"], "id": x["id"]}}

In [23]:
# number of fed documents
documents = app.query(yql='select * from sources * where true')
if documents.number_documents_indexed > 0:
    print(f"Number of documents fed: {documents.number_documents_indexed}")
else:
    feed(df[:1000])
    
    documents = app.query(yql='select * from sources * where true')
    print(f"Number of documents fed: {documents.number_documents_indexed}")



Error when feeding document 368257: {'Exception': 'Out of range float values are not JSON compliant', 'id': 368257, 'message': 'Exception during feed_data_point'}
Number of documents fed: 999


In [32]:
from vespa.io import VespaQueryResponse

query="vegan meat"
with app.syncio(connections=1) as session:
    response:VespaQueryResponse = session.query(
        yql="select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q))",
        ranking="colbert_local",
        query=query, 
        body={
            "input.query(q)": f'embed(e5, "{query}")',
            "input.query(qt)": f'embed(colbert, "{query}")',
        },
    )
for hit in response.hits:
    for key, value in hit.items():
        print(f"{key}: {value}")
# assert(response.is_successful())
# for hit in response.hits:
#     record = {}
#     for field in ['id', 'title', 'steps']:
#         record[field] = hit['fields'][field]
#     print(record)

id: id:findmypasta:doc::279052
relevance: 93.73028564453125
source: findmypasta_content
fields: {'matchfeatures': {'cos_sim': 0.6456877741230913, 'max_sim_local': 93.73028564453125, 'max_sim_per_steps': {'type': 'tensor<float>(steps{})', 'cells': {'0': 43.09580993652344, '1': 36.71079635620117, '2': 24.26297378540039, '3': 32.31108856201172, '4': 19.398595809936523, '5': 32.56810760498047, '6': 93.73028564453125, '7': 33.17496871948242, '8': 20.560468673706055, '9': 26.467790603637695, '10': 19.496294021606445, '11': 29.603023529052734, '12': 31.67087745666504, '13': 17.117517471313477, '14': 16.595781326293945, '15': 44.35263442993164, '16': 34.76308059692383, '17': 33.315940856933594, '18': 23.643291473388672, '19': 16.029190063476562}}}, 'sddocname': 'doc', 'documentid': 'id:findmypasta:doc::279052', 'id': '279052', 'title': 'cheesy  vegan lasagna', 'steps': ['slice carrots and steam ', 'set aside', 'saute parsley and garlic for 1 minute in an oiled pan', 'add carrots and veggie bro

In [33]:
response.hits[0]

{'id': 'id:findmypasta:doc::279052',
 'relevance': 93.73028564453125,
 'source': 'findmypasta_content',
 'fields': {'matchfeatures': {'cos_sim': 0.6456877741230913,
   'max_sim_local': 93.73028564453125,
   'max_sim_per_steps': {'type': 'tensor<float>(steps{})',
    'cells': {'0': 43.09580993652344,
     '1': 36.71079635620117,
     '2': 24.26297378540039,
     '3': 32.31108856201172,
     '4': 19.398595809936523,
     '5': 32.56810760498047,
     '6': 93.73028564453125,
     '7': 33.17496871948242,
     '8': 20.560468673706055,
     '9': 26.467790603637695,
     '10': 19.496294021606445,
     '11': 29.603023529052734,
     '12': 31.67087745666504,
     '13': 17.117517471313477,
     '14': 16.595781326293945,
     '15': 44.35263442993164,
     '16': 34.76308059692383,
     '17': 33.315940856933594,
     '18': 23.643291473388672,
     '19': 16.029190063476562}}},
  'sddocname': 'doc',
  'documentid': 'id:findmypasta:doc::279052',
  'id': '279052',
  'title': 'cheesy  vegan lasagna',
  '

In [26]:
# from vespa.io import VespaQueryResponse
# import json

# # Supondo que 'questions' é um DataFrame com colunas ['Query', 'Tipo', 'Descrição']
# data = pd.DataFrame(columns=['id', 'title', 'Query', 'Tipo', 'Descrição'])

# model_to_ranking_dict = {
#     "bm25_2": "bm25_2",
#     "bm25_3": "bm25_3", 
#     "semantic_2": "semantic_2",
#     "semantic_3": "semantic_3",
#     "hybrid_2": "hybrid_2",
#     "hybrid_3": "hybrid_3",
# }

# selected_model = "bm25_2"

# assert selected_model in model_to_ranking_dict.keys()

# output_name = 'output/Results_' + selected_model + '_extraQuestions' + '.xlsx'

# if model_to_ranking_dict[selected_model] is not None:
#     i = 0
#     for input_query in questions['Query']:
#         # save a checkpoint each 100 queries
#         if i % 100 == 0:
#             data.to_excel(output_name, index=False)

#         with app.syncio(connections=1) as session:
#             try:
#                 response: VespaQueryResponse = session.query(
#                     yql="select * from sources * where userQuery() limit 5",
#                     query=input_query,
#                     ranking=model_to_ranking_dict[selected_model],
#                     # body={
#                     #     "input.query(q)": f"embed({input_query})",
#                     #     "timeout": "30s"  # Aumentar o tempo limite para 10 segundos
#                     # }
#                 )
#                 assert response.is_successful()
#             except Exception as e:
#                 print(f"Error with query '{input_query}': {e}")
#                 continue

#             for hit in response.hits:
#                 record = {}
#                 for field in ['id', 'title']:
#                     record[field] = hit['fields'].get(field, None)
#                 record["Query"] = input_query
#                 record["Tipo"] = questions[questions['Query'] == input_query]['Tipo'].values[0]
#                 record["Descrição"] = questions[questions['Query'] == input_query]['Descrição'].values[0]
#                 data = pd.concat([data, pd.DataFrame([record])], ignore_index=True)

#         i += 1

#     # Sorting
#     data = data.sort_values(by=['Tipo', 'Query'])

#     # reordering columns
#     data = data[['Tipo', 'Descrição', 'Query', 'id', 'title']]

#     # exporting to excel
#     data.to_excel(output_name, index=False)
