# Avaliação de modelos em massa

O objetivo deste notebook é criar diversos modelos para entender quais campos são mais importantes para o retrieval de documentos.

In [1]:
import pyvespa_functions as pf
# from build_dataset import make_food_dataset
import pandas as pd
from vespa.deployment import VespaDocker


from pathlib import Path
import json
import pandas as pd
import ast
import numpy as np

from vespa.package import (
    ApplicationPackage,
    Component,
    Parameter,
    Field,
    HNSW,
    RankProfile,
    Function,
    FirstPhaseRanking,
    SecondPhaseRanking,
    FieldSet,
    DocumentSummary,
    Summary,
)

In [2]:
# Download locations of the embedder assets (an int8 ONNX export of
# e5-small-v2 and its tokenizer, judging by the file names).
_E5_MODEL_URL = "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"
_E5_TOKENIZER_URL = "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"

# Hugging Face embedder component registered under the id "e5-small-q".
e5_embedder = Component(
    id="e5-small-q",
    type="hugging-face-embedder",
    parameters=[
        Parameter("transformer-model", {"url": _E5_MODEL_URL}),
        Parameter("tokenizer-model", {"url": _E5_TOKENIZER_URL}),
    ],
)

# Application package that every later cell adds fields and rank profiles to.
app_package = ApplicationPackage(name="findmypasta", components=[e5_embedder])

In [3]:
def _attribute_field(name, field_type="string"):
    """Field stored as an attribute and returned in the summary."""
    return Field(name=name, type=field_type, indexing=["attribute", "summary"])


def _bm25_field(name, field_type="string", **extra):
    """Text field indexed with BM25 enabled and returned in the summary."""
    return Field(
        name=name,
        type=field_type,
        indexing=["index", "summary"],
        index="enable-bm25",
        **extra,
    )


def _embedding_field(name, source, tensor_type):
    """Synthetic (non-document) field embedding `source`, HNSW-indexed with angular distance."""
    return Field(
        name=name,
        type=tensor_type,
        indexing=[f"input {source}", "embed", "index", "attribute"],
        ann=HNSW(distance_metric="angular"),
        is_document_field=False,
    )


# One 384-wide vector per document vs. one 384-wide vector per array element.
_SINGLE_VECTOR = "tensor<float>(x[384])"
_VECTOR_PER_ITEM = "tensor<float>(p{},x[384])"

# The order below is significant: later cells derive the field-name lists
# (and therefore the ranking expressions) from the schema's field order.
schema_fields = [
    _attribute_field("id", "int"),
    _bm25_field("title"),
    _bm25_field("description"),
    Field(name="minutes", type="string", indexing=["summary"]),
    _attribute_field("n_steps"),
    _attribute_field("n_ingredients"),
    _attribute_field("submitted"),
    _bm25_field("body", bolding=True),
    _bm25_field("tags", "array<string>", bolding=True),
    _bm25_field("steps", "array<string>", bolding=True),
    _bm25_field("ingredients", "array<string>", bolding=True),
    _embedding_field("embedding", "body", _SINGLE_VECTOR),
    _embedding_field("description_embeddings", "description", _SINGLE_VECTOR),
    _embedding_field("tag_embeddings", "tags", _VECTOR_PER_ITEM),
    _embedding_field("step_embeddings", "steps", _VECTOR_PER_ITEM),
    _embedding_field("ingredient_embeddings", "ingredients", _VECTOR_PER_ITEM),
    # Disabled experiment: a per-line embedding of the body
    # ("body_split_embedding", input "body_split", same per-item tensor type).
]
app_package.schema.add_fields(*schema_fields)

# Names of every field now registered on the document, in declaration order.
fields_names = [field.name for field in app_package.schema.document.fields]
print(len(fields_names))

16


In [4]:
# Metadata fields that must not take part in keyword (BM25) ranking.
remove = ["id", "minutes", "n_steps", "n_ingredients", "submitted"]

# Partition the schema fields in one pass: anything with "embedding" in its
# name is a semantic field; every other field not listed in `remove` is a
# keyword field. Schema order is kept in both lists.
keywords_fields = []
semantic_fields = []
for name in fields_names:
    if "embedding" in name:
        semantic_fields.append(name)
    elif name not in remove:
        keywords_fields.append(name)

print(keywords_fields)
print(semantic_fields)

['title', 'description', 'body', 'tags', 'steps', 'ingredients']
['embedding', 'description_embeddings', 'tag_embeddings', 'step_embeddings', 'ingredient_embeddings']


In [5]:
def _leave_one_out(fields):
    """Map "all_fields" to `fields` and "without_<f>" to `fields` minus `f`, for each `f`."""
    combinations = {"all_fields": fields}
    combinations.update(
        {
            f"without_{dropped}": [kept for kept in fields if kept != dropped]
            for dropped in fields
        }
    )
    return combinations


# Ablation sets: the full field list plus one variant per removed field.
keywords_fields_combinations = _leave_one_out(keywords_fields)
semantic_fields_combinations = _leave_one_out(semantic_fields)

# Show the combination names, keyword sets first and semantic sets second.
print(*keywords_fields_combinations, sep="\n")
print(*semantic_fields_combinations, sep="\n")



all_fields
without_title
without_description
without_body
without_tags
without_steps
without_ingredients
all_fields
without_embedding
without_description_embeddings
without_tag_embeddings
without_step_embeddings
without_ingredient_embeddings


In [14]:
# Keyword ranking expressions: the sum of bm25(<field>) over each combination.
keywords_fields_expressions = {
    key: " + ".join(f"bm25({field})" for field in combination)
    for key, combination in keywords_fields_combinations.items()
}
for expression in keywords_fields_expressions.values():
    print(expression)


# Semantic ranking expressions, one summed term per embedding field.
# NOTE(review): these strings are built and printed only; the notebook does
# not register them as rank profiles, and the term format looks unverified
# against Vespa's ranking-expression syntax -- confirm before using.
semantic_fields_expressions = {
    key: " + ".join(
        f"sum(query({field}, embedding, sum),{{x: 0, y: 0}})"
        for field in combination
    )
    for key, combination in semantic_fields_combinations.items()
}
for expression in semantic_fields_expressions.values():
    print(expression)

bm25(title) + bm25(description) + bm25(body) + bm25(tags) + bm25(steps) + bm25(ingredients)
bm25(description) + bm25(body) + bm25(tags) + bm25(steps) + bm25(ingredients)
bm25(title) + bm25(body) + bm25(tags) + bm25(steps) + bm25(ingredients)
bm25(title) + bm25(description) + bm25(tags) + bm25(steps) + bm25(ingredients)
bm25(title) + bm25(description) + bm25(body) + bm25(steps) + bm25(ingredients)
bm25(title) + bm25(description) + bm25(body) + bm25(tags) + bm25(ingredients)
bm25(title) + bm25(description) + bm25(body) + bm25(tags) + bm25(steps)
sum(query(embedding, embedding, sum),{x: 0, y: 0}) + sum(query(description_embeddings, embedding, sum),{x: 0, y: 0}) + sum(query(tag_embeddings, embedding, sum),{x: 0, y: 0}) + sum(query(step_embeddings, embedding, sum),{x: 0, y: 0}) + sum(query(ingredient_embeddings, embedding, sum),{x: 0, y: 0})
sum(query(description_embeddings, embedding, sum),{x: 0, y: 0}) + sum(query(tag_embeddings, embedding, sum),{x: 0, y: 0}) + sum(query(step_embeddings, 

In [7]:
# Register one first-phase BM25 rank profile per keyword field combination,
# named "bm25_<combination key>".
# NOTE(review): semantic_fields_expressions is not registered here.
for profile_suffix, ranking_expression in keywords_fields_expressions.items():
    bm25_profile = RankProfile(
        name=f"bm25_{profile_suffix}",
        first_phase=FirstPhaseRanking(expression=ranking_expression),
    )
    app_package.schema.add_rank_profile(bm25_profile)

In [8]:
# List what is registered on the schema; per the recorded output, iterating
# rank_profiles prints the profile names.
for registered_profile in app_package.schema.rank_profiles:
    print(registered_profile)

bm25_all_fields
bm25_without_title
bm25_without_description
bm25_without_body
bm25_without_tags
bm25_without_steps
bm25_without_ingredients


In [9]:
# Deploy the application package through VespaDocker (default settings).
# Per the recorded output, this waits for the configuration server and then
# for the application status endpoint at http://localhost:8080.
vespa_docker = VespaDocker()
# `app` is whatever deploy() returns -- presumably the handle used to
# query/feed the running application; confirm against the pyvespa docs.
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Waiting for configuration server, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status

In [11]:
def _parse_list_field(value):
    """Return a recipe list column (tags, steps, ingredients) as a list of strings.

    The columns hold the ``repr`` of a Python list (e.g. ``"['a', 'b']"``).
    They are parsed with ``ast.literal_eval`` so that apostrophes inside an
    item survive and double-quoted items do not keep their quotes. Values
    that are already lists/tuples are accepted as-is; strings that are not
    valid literals fall back to the previous strip/split heuristic.
    """
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    # Legacy heuristic, kept for strings that are not valid Python literals.
    return value.strip("[]").replace("'", "").split(', ')


def recipe_file_body_lines(recipe, complementary_data = None):
    """
    Build the multi-line text body describing a recipe.

    Args:
        recipe: Mapping-like record (dict or DataFrame row) providing
            'name', 'submitted', 'tags', 'description', 'minutes',
            'ingredients', 'n_steps' and 'steps'. The three list columns
            may be stringified lists or real lists.
        complementary_data: Currently unused; kept for backward
            compatibility (it was meant to carry review data).

    Returns:
        The recipe body as a single string with one section per line.

    Note:
        ``recipe`` is not modified; the list columns are parsed into local
        variables only.
    """
    tags = _parse_list_field(recipe['tags'])
    steps = _parse_list_field(recipe['steps'])
    ingredients = _parse_list_field(recipe['ingredients'])

    # Section texts are kept verbatim (including the trailing space before
    # two of the line breaks) so the generated body keeps its known layout.
    sections = [
        recipe['name'],
        "Recipe posted on: " + str(recipe['submitted']),
        "Tags: " + ', '.join(tags),
        "Description: " + recipe['description'],
        "This recipe takes " + str(recipe['minutes']) + " minutes to be done.",
        "For this recipe you will need the ingredients: ",
        ', '.join(ingredients),
        "The " + str(recipe["n_steps"]) + " steps to make this recipe are: ",
        ', '.join(steps),
    ]
    return '\n'.join(sections)

In [12]:
# Row-wise adapter so recipe_file_body_lines can be handed to DataFrame.apply
def apply_recipe_file_body_lines(recipe_row):
    """Return the body text for one recipe row.

    Forwards the row to recipe_file_body_lines without complementary data.
    """
    body = recipe_file_body_lines(recipe_row)
    return body

In [13]:
# Load the raw recipes, drop every row with a missing value, and renumber
# the index from zero.
df = pd.read_csv('archive/RAW_recipes.csv').dropna().reset_index(drop=True)

# Build the body first: it reads the raw `minutes`, `submitted` and `n_steps`
# values, which are replaced by full sentences right below.
df['body'] = df.apply(apply_recipe_file_body_lines, axis=1)
df['body_split'] = df['body'].str.split('\n')

# Turn the numeric/date columns into self-describing sentences.
df['minutes'] = "This recipe takes " + df['minutes'].astype(str) + " minutes to be done."
df['submitted'] = 'Recipe submitted on: ' + df["submitted"]
df['n_steps'] = 'Number of steps to make this recipe: ' + df['n_steps'].astype(str)
df['n_ingredients'] = 'Number of ingredients: ' + df['n_ingredients'].astype(str)

# `tags`, `steps`, `description` and `ingredients` are used exactly as loaded.
df['title'] = df['name']

# Components of the Vespa document id ("id:<namespace>:<document_type>::<id>").
namespace = "recipes"
document_type = "findmypasta"

# Convert one DataFrame row into the put operation written to the Vespa feed
def to_vespa_format(x):
    """Return the Vespa ``put`` operation for recipe row ``x``.

    The document id is built from the module-level ``namespace`` and
    ``document_type`` plus the row's ``id``. The stringified list columns
    (tags, steps, ingredients) are parsed into real lists with
    ``ast.literal_eval``; every other column is copied through unchanged.
    """
    document_id = f"id:{namespace}:{document_type}::{x['id']}"

    # Keys are inserted in the order they appear in the emitted JSON.
    fields = {"id": x["id"], "title": x["name"]}
    for list_column in ("tags", "steps"):
        fields[list_column] = ast.literal_eval(x[list_column])
    fields["description"] = x["description"]
    fields["ingredients"] = ast.literal_eval(x["ingredients"])
    # NOTE(review): "body_split" is sent although the schema cell shown in
    # this notebook declares no such field -- confirm the feed accepts it.
    for plain_column in ("minutes", "n_steps", "n_ingredients", "submitted", "body", "body_split"):
        fields[plain_column] = x[plain_column]

    return {"put": document_id, "fields": fields}

# Build one put operation per recipe row.
vespa_feed = list(df.apply(to_vespa_format, axis=1))
# Only the first 10000 operations are exported.
vespa_feed_slice = vespa_feed[:10000]
# Write the feed as JSON Lines: one operation per line.
with open("vespa_feed2.jsonl", "w") as feed_file:
    feed_file.writelines(json.dumps(operation) + "\n" for operation in vespa_feed_slice)