## Initialization of the Vespa app

In [3]:
import pyvespa_functions as pf
# from build_dataset import make_food_dataset
import pandas as pd
from vespa.deployment import VespaDocker

# Build the application package for the "semantic-search" app type using the
# project helper module.
package = pf.create_package(app_type="semantic-search")

# Deploy the package through VespaDocker; `app` is the handle that the later
# cells use for feeding and querying (the deploy log below shows the endpoint
# at http://localhost:8080).
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 30/300 seconds...
Using plain http against endpoin

## Loading the dataset used to feed vespa

In [2]:
# Try to load the cached dataset from disk; if it is missing, rebuild it from
# the raw archive files and cache it for the next run.
try:
    df = pd.read_csv('input/food_dataset.csv')
except FileNotFoundError:
    # Imported lazily: the builder is only needed on a cache miss. The
    # top-of-notebook import is commented out, so without this import the
    # rebuild path fails with NameError.
    import os
    from build_dataset import make_food_dataset

    recipe_dtypes = {
        "contributor_id": "string",
        "name": "string",
        "id": "string",
        "minutes": "int",
        "tags": "string",
        "nutrition": "string",
        "n_steps": "int",
        "n_ingredients": "int",
        "steps": "string",
        "description": "string",
        "ingredients": "string",
        "submitted": "string"
    }

    # Raw recipes and reviews, with incomplete rows removed.
    df = pd.read_csv('archive/RAW_recipes.csv', dtype=recipe_dtypes)
    df = df.dropna().reset_index(drop=True)
    df_reviews = pd.read_csv('archive/RAW_interactions.csv')
    df_reviews = df_reviews.dropna().reset_index(drop=True)

    # Add the body for each recipe.
    # NOTE(review): presumably this yields the 'recipe_body' and 'title'
    # columns used below; confirm against build_dataset.
    df = make_food_dataset(df, df_reviews)
    df = df.dropna().reset_index(drop=True)

    # Keep only the columns that are fed to Vespa.
    df = df.rename(columns={'recipe_body': 'body'})
    df = df[['id', 'body', 'title']]

    # Save the dataset as a CSV file; to_csv does not create missing
    # directories, so make sure the target directory exists first.
    os.makedirs('input', exist_ok=True)
    df.to_csv('input/food_dataset.csv', index=False)

df.head()

Unnamed: 0,id,body,title,text
0,137739,arriba baked winter squash mexican style\n\n...,arriba baked winter squash mexican style,arriba baked winter squash mexican style arr...
1,31490,a bit different breakfast pizza\n\nRecipe pos...,a bit different breakfast pizza,a bit different breakfast pizza a bit differe...
2,112140,all in the kitchen chili\n\nRecipe posted on:...,all in the kitchen chili,all in the kitchen chili all in the kitchen ...
3,59389,alouette potatoes\n\nRecipe posted on: 2003-0...,alouette potatoes,alouette potatoes alouette potatoes\n\nRecip...
4,44061,amish tomato ketchup for canning\n\nRecipe p...,amish tomato ketchup for canning,amish tomato ketchup for canning amish toma...


## Feeding Vespa, or reporting the number of documents already fed

In [42]:
# Feed the corpus only when nothing is indexed yet, then report how many
# documents the application holds.
documents = app.query(yql='select * from sources * where true')
if not documents.number_documents_indexed > 0:
    # Empty index: feed every row of df, then query again for the new count.
    feeder = pf.VespaFeeder(app)
    feeder.feed(df)
    documents = app.query(yql='select * from sources * where true')

print(f"Number of documents fed: {documents.number_documents_indexed}")


Number of documents fed: 226654


## Questions df

In [48]:
# Load the evaluation questions; the later cells read the 'Query', 'Tipo' and
# 'Descrição' columns of this frame. (A previous read of 'input/Questions.xlsx'
# was immediately overwritten by this one, so it has been removed.)
questions = pd.read_excel('input/Recipe_Search_Questions.xlsx')
questions

Unnamed: 0,Tipo,Descrição,Query
0,Keywords,Pergunta simples,grilled cheese sandwich recipe
1,Keywords,Pergunta simples,mango smoothie
2,Semantica,Pergunta média,gluten-free bread without yeast
3,Semantica,Pergunta média,low carb dessert for diabetics
4,Semantica,Pergunta difícil,traditional Japanese breakfast for a family
5,Semantica,Pergunta difícil +,What kind of soup can I make with butternut sq...
6,Keywords,Pergunta simples,recipe for chicken curry
7,Keywords,Pergunta simples,how to make iced tea
8,Semantica,Pergunta média,vegan options for a Thanksgiving dinner
9,Semantica,Pergunta difícil,What can I cook with quinoa and kale for a nut...


## Model evaluation

Running the next cells evaluates the chosen model and writes an output file containing the top 5 answers for each question.

In [60]:
# Maps each selectable model name to the `ranking` value passed to the Vespa
# query. None marks a model that is not evaluated through Vespa (the TF-IDF
# baseline is computed locally in its own cell).
# The keys are the short names the other cells compare `selected_model`
# against ("semantic" here, "tfidf" in the TF-IDF cell); with the previous
# long keys the membership assert in the evaluation cell could never pass.
model_to_ranking_dict = {
    "bm25": "bm25",
    "semantic": "semantic",
    "hybrid": "fusion",
    "tfidf": None
}

# Must be one of the keys of model_to_ranking_dict.
selected_model = "semantic"

In [61]:
import os

from vespa.io import VespaQueryResponse

# Columns collected for every hit, in collection order.
result_columns = ['id', 'title', 'body', 'Query', 'Tipo', 'Descrição']

# Start from an empty frame so `data` exists even when the selected model has
# no Vespa ranking (ranking value None).
data = pd.DataFrame(columns=result_columns)

assert selected_model in model_to_ranking_dict.keys(), (
    f"Unknown model {selected_model!r}; expected one of {list(model_to_ranking_dict)}"
)

output_name = 'output/Results_Full_'+selected_model+'.xlsx'
# to_excel does not create missing directories.
os.makedirs(os.path.dirname(output_name), exist_ok=True)

ranking = model_to_ranking_dict[selected_model]
if ranking is not None:
    # Hits are accumulated as plain dicts and turned into a DataFrame once,
    # instead of growing a DataFrame with pd.concat on every hit.
    records = []

    # One session is reused for all queries instead of opening one per query.
    with app.syncio(connections=1) as session:
        # Iterate the question rows directly so each hit is labelled with the
        # Tipo/Descrição of its own row, even if a query text is duplicated.
        for i, (input_query, tipo, descricao) in enumerate(
            zip(questions['Query'], questions['Tipo'], questions['Descrição'])
        ):
            # Save a checkpoint every 100 queries.
            if i % 100 == 0:
                pd.DataFrame(records, columns=result_columns).to_excel(output_name, index=False)

            # NOTE(review): the YQL retrieves by nearestNeighbor only; text
            # rank profiles such as "bm25" may also need a userQuery() clause.
            # Confirm against the rank profiles defined in pyvespa_functions.
            response: VespaQueryResponse = session.query(
                yql="select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q)) limit 5",
                query=input_query,
                ranking=ranking,
                body={
                    "input.query(q)": f"embed({input_query})"
                }
            )
            assert response.is_successful(), f"Vespa query failed for: {input_query!r}"

            for hit in response.hits:
                record = {field: hit['fields'][field] for field in ['id', 'title', 'body']}
                record["Query"] = input_query
                record["Tipo"] = tipo
                record["Descrição"] = descricao
                records.append(record)

    data = pd.DataFrame(records, columns=result_columns)

    # Sorting
    data = data.sort_values(by=['Tipo', 'Query'])

    # Reordering columns
    data = data[['Tipo', 'Descrição', 'Query', 'id', 'title', 'body']]

    # Exporting to Excel
    data.to_excel(output_name, index=False)

## TF-IDF model evaluation

In [62]:
if selected_model == "tfidf":
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel

    def setup_tfidf(documents):
        """Fit a TF-IDF model on the concatenated title and body of each document.

        Args:
            documents: DataFrame with 'title' and 'body' string columns.

        Returns:
            (tfidf_vectorizer, tfidf_matrix): the fitted vectorizer and the
            document-term matrix, one row per row of `documents`.
        """
        # Build the text locally instead of writing a 'text' column into the
        # caller's frame (`documents` is the shared `df`).
        text = documents['title'] + " " + documents['body']

        # Create the TF-IDF vectorizer and fit it to the document texts
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(text)

        return tfidf_vectorizer, tfidf_matrix

    def find_top_hits(tfidf_vectorizer, tfidf_matrix, query, top_n=5):
        """Return the row positions of the `top_n` documents most similar to `query`.

        Positions refer to rows of `tfidf_matrix`, ordered from most to least
        similar.
        """
        # Transform the query to the same TF-IDF vector space as the documents
        query_tfidf = tfidf_vectorizer.transform([query])

        # Dot product between the query and every document; this equals the
        # cosine similarity when the TF-IDF rows are L2-normalised
        # (TfidfVectorizer's default).
        similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()

        # argsort is ascending: take the last top_n and reverse them
        return similarities.argsort()[-top_n:][::-1]

    documents = df

    # Set up TF-IDF
    tfidf_vectorizer, tfidf_matrix = setup_tfidf(documents)

    # Hits are accumulated as plain dicts and turned into a DataFrame once,
    # instead of growing a DataFrame with pd.concat on every hit.
    tfidf_columns = ['id', 'title', 'body', 'Query', 'Tipo', 'Descrição']
    records = []

    # Iterate the question rows directly so each hit is labelled with the
    # Tipo/Descrição of its own row, even if a query text is duplicated.
    for input_query, tipo, descricao in zip(
        questions['Query'], questions['Tipo'], questions['Descrição']
    ):
        # Find top hits for the query
        top_indices = find_top_hits(tfidf_vectorizer, tfidf_matrix, input_query)

        for index in top_indices:
            # top_indices are row positions in the TF-IDF matrix, so use
            # positional (.iloc) lookups rather than index labels (.loc).
            record = {field: documents[field].iloc[index] for field in ['id', 'title', 'body']}
            # Add query details
            record["Query"] = input_query
            record["Tipo"] = tipo
            record["Descrição"] = descricao
            records.append(record)

    # Explicit columns keep the sort below valid even when there are no hits.
    data_tfidf = pd.DataFrame(records, columns=tfidf_columns)

    # Sorting
    data_tfidf = data_tfidf.sort_values(by=['Tipo', 'Query'])

    # Reordering columns
    data_tfidf = data_tfidf[['Tipo', 'Descrição', 'Query', 'id', 'title', 'body']]

    # Exporting to Excel
    data_tfidf.to_excel(output_name, index=False)



# ...

In [51]:
# Run one ad-hoc semantic query and print the top hits, one dict per hit.
input_query = "How to make a pizza"

with app.syncio(connections=1) as session:
    response: VespaQueryResponse = session.query(
        yql="select * from sources * where ({targetHits:1000}nearestNeighbor(embedding,q)) limit 5",
        query=input_query,
        ranking="semantic",
        body={"input.query(q)": f"embed({input_query})"},
    )
assert response.is_successful()

for hit in response.hits:
    # Keep only the displayed fields, then tag the hit with the query text.
    record = {field: hit['fields'][field] for field in ('id', 'title', 'body')}
    record["Query"] = input_query
    print(record)

{'id': '248174', 'title': 'how to make pizza', 'body': 'how to make pizza\n\nRecipe posted on: 2007-08-22\n\nTags: 60-minutes-or-less, time-to-make, course, preparation, lunch, main-dish, easy, pizza, dietary\n\nDescription: pizza was originally started in naples, italy but has now become a popular type of food, it can go with almost anything and creating your own could taste better than the ones you buy in the shop. this article will give you 2 different ways of creating your pizza, one method is faster than the other.!\n\nThis recipe takes 60 minutes to be done.\n\nFor this recipe you will need the ingredients: \ntomato paste\nham\ncheese\nolive\nsausage\n\nThe 66 steps to make this recipe are: \npreheat your oven to about 180 degrees\nbuy ready-made pizza bases \nsuch as mccains ready-made pizza bases if you are short on time\nif you have plenty of time \nthen you can just get dough\n"to make the method faster \ndont get a huge pizza if youre only serving yourself or two"\n"you dont