In [21]:
import pyvespa_functions as pf
from build_dataset import make_food_dataset
import pandas as pd
from vespa.deployment import VespaDocker

package = pf.create_package(app_type="text-search")

vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Application is up!
Finished deployment.


In [22]:
# tenta carregar o dataset do arquivo, se não conseguir, cria um novo
try:
    df = pd.read_csv('input/food_dataset.csv')
except FileNotFoundError:
    types = {
        "contributor_id": "string",
        "name": "string",
        "id": "string",
        "minutes": "int",
        "tags": "string",
        "nutrition": "string",
        "n_steps": "int",
        "n_ingredients": "int",
        "steps": "string",
        "description": "string",
        "ingredients": "string",
        "submitted": "string"
    }

    df = pd.read_csv('archive/RAW_recipes.csv', dtype=types)
    df = df.dropna()
    df = df.reset_index(drop=True)
    df_reviews = pd.read_csv('archive/RAW_interactions.csv')
    df_reviews = df_reviews.dropna()
    df_reviews = df_reviews.reset_index(drop=True)

    # df = df[:5000]
    df = make_food_dataset(df, df_reviews) # add the body for each recipe
    df = df.dropna()
    df = df.reset_index(drop=True)
    # selecting only the body and id columns
    df.rename(columns={'recipe_body': 'body'}, inplace=True)
    df = df[['id', 'body', 'title']]
    # save the df as a csv file
    df.to_csv('input/food_dataset.csv', index=False)

df.head()

Formatting recipes: 100%|██████████| 226657/226657 [1:03:52<00:00, 59.14it/s]


Unnamed: 0,id,body,title
0,137739,arriba baked winter squash mexican style\n\n...,arriba baked winter squash mexican style
1,31490,a bit different breakfast pizza\n\nRecipe pos...,a bit different breakfast pizza
2,112140,all in the kitchen chili\n\nRecipe posted on:...,all in the kitchen chili
3,59389,alouette potatoes\n\nRecipe posted on: 2003-0...,alouette potatoes
4,44061,amish tomato ketchup for canning\n\nRecipe p...,amish tomato ketchup for canning


In [23]:
feeder = pf.VespaFeeder(app)
feeder.feed(df)

VBox(children=(IntProgress(value=0, description='Progress:', layout=Layout(width='50%'), max=226657), Label(va…

Error when feeding document 12441: {'Exception': "Error in document 'id:findmypasta:doc::12441' - could not parse field 'body' of type 'string': The string field value contains illegal code point 0xC: The string field value contains illegal code point 0xC", 'id': '12441', 'message': 'Exception during feed_data_point'}
Error when feeding document 465219: {'Exception': "Error in document 'id:findmypasta:doc::465219' - could not parse field 'body' of type 'string': The string field value contains illegal code point 0x10: The string field value contains illegal code point 0x10", 'id': '465219', 'message': 'Exception during feed_data_point'}


In [24]:
# loading the Questions.xlsx and answering each question query
questions = pd.read_excel('input/Questions.xlsx')
questions

Unnamed: 0,Tipo,Descrição,Query
0,Keywords,Pergunta simples,chocolate cake recipe
1,Keywords,Pergunta simples,strogonoff with rice
2,Keywords,Pergunta simples,fresh lemonade
3,Semantica,Pergunta média,pasta without eggs
4,Keywords,Pergunta simples,apple pie
5,Keywords,Pergunta simples,Brûlée Cream
6,Semantica,Pergunta média,how to make a pizza without an oven
7,Semantica,Pergunta média,pancake without flour and milk
8,Semantica,Pergunta difícil,healthy recipe for quick lunch
9,Semantica,Pergunta difícil,what can I make for a romantic dinner


In [25]:
from vespa.io import VespaQueryResponse

data = pd.DataFrame(columns=['id', 'title', 'body', 'Query', 'Tipo', 'Descrição'])

i=0
for input_query in questions['Query']:

    # save a checkpoint each 100 queries
    if i % 100 == 0:
        data.to_excel('output/Results_Bm25.xlsx', index=False)

    with app.syncio(connections=1) as session:
        response:VespaQueryResponse = session.query(
            yql="select * from sources * where userQuery() limit 5", 
            query=input_query, 
            ranking="bm25"
        )
    assert(response.is_successful())
    
    for hit in response.hits:
        record = {}
        for field in ['id', 'title', 'body']:
            record[field] = hit['fields'][field]
        record["Query"] = input_query
        record["Tipo"] = questions[questions['Query'] == input_query]['Tipo'].values[0]
        record["Descrição"] = questions[questions['Query'] == input_query]['Descrição'].values[0]
        data = pd.concat([data, pd.DataFrame([record])])

    i+=1

# Sorting
data = data.sort_values(by=['Tipo', 'Query'])

# reordering columns
data = data[['Tipo', 'Descrição', 'Query', 'id', 'title', 'body']]

# exporting to excel
data.to_excel('output/Results_Bm25.xlsx', index=False)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def setup_tfidf(documents):
    # Combine title and body for TF-IDF processing
    documents['text'] = documents['title'] + " " + documents['body']
    
    # Create the TF-IDF vectorizer and fit to the document texts
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents['text'])
    
    return tfidf_vectorizer, tfidf_matrix

def find_top_hits(tfidf_vectorizer, tfidf_matrix, query, top_n=5):
    # Transform the query to the same TF-IDF vector space as the documents
    query_tfidf = tfidf_vectorizer.transform([query])
    
    # Calculate cosine similarities between the query and all documents
    cosine_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()
    
    # Get the top N matching documents
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    
    return top_indices

documents = df

# Set up TF-IDF
tfidf_vectorizer, tfidf_matrix = setup_tfidf(documents)

# DataFrame to store results
data_tfidf = pd.DataFrame()

for input_query in questions['Query']:
    # Find top hits for the query
    top_indices = find_top_hits(tfidf_vectorizer, tfidf_matrix, input_query)
    
    for index in top_indices:
        record = {}
        # Extract details from documents
        for field in ['id', 'title', 'body']:
            record[field] = documents.loc[index, field]
        # Add query details
        record["Query"] = input_query
        record["Tipo"] = questions[questions['Query'] == input_query]['Tipo'].values[0]
        record["Descrição"] = questions[questions['Query'] == input_query]['Descrição'].values[0]
        
        # Append to the result DataFrame
        data_tfidf = pd.concat([data_tfidf, pd.DataFrame([record])])


# Sorting
data_tfidf = data_tfidf.sort_values(by=['Tipo', 'Query'])

# reordering columns
data_tfidf = data_tfidf[['Tipo', 'Descrição', 'Query', 'id', 'title', 'body']]

# exporting to excel
data_tfidf.to_excel('output/Results_Tfidf.xlsx', index=False)
