## The goal is to compare different models

For this task we already have a set of evaluated query–recipe pairs. We load them and use them to calculate a score for each model's search results.

In [1]:
import pandas as pd
import re
import os

# Columns a graded row must provide to be merged into the review bank.
REVIEW_COLUMNS = ["Tipo", "Descrição", "Query", "Receita", "Evaluator", "Nota"]


def load_bank(path):
    """Read a review spreadsheet, or return an empty frame when the file does not exist yet.

    Only a missing file is treated as "no reviews so far". Any other read
    failure (corrupt workbook, missing Excel engine) is left to surface,
    because silently falling back to an empty frame would hide every stored
    rating and let a later export overwrite the bank.

    Args:
        path: Location of the .xlsx file.

    Returns:
        pandas.DataFrame: The sheet contents, or an empty DataFrame.
    """
    try:
        return pd.read_excel(path)
    except FileNotFoundError:
        return pd.DataFrame()


def graded_extra_reviews(bank):
    """Select the rows of the missing-reviews bank that already carry a grade.

    The ``title`` column is renamed to ``Receita`` and only ``REVIEW_COLUMNS``
    are kept, so the result can be appended to the review bank.

    Args:
        bank: The missing-reviews bank as loaded from disk.

    Returns:
        pandas.DataFrame: Graded rows restricted to ``REVIEW_COLUMNS``; an
        empty frame with those columns when ``bank`` lacks any of them.
    """
    renamed = bank.rename(columns={"title": "Receita"})
    if not set(REVIEW_COLUMNS).issubset(renamed.columns):
        return pd.DataFrame(columns=REVIEW_COLUMNS)
    # `column != None` is True for every element (NaN included) and filters
    # nothing; notna() is the actual "has a grade" test.
    return renamed.loc[renamed["Nota"].notna(), REVIEW_COLUMNS]


# load the review bank (empty when no review has been stored yet)
review_bank = load_bank('reviews/review_bank.xlsx')

# load the missing reviews bank and merge the rows that have been graded since
# the last run; the loaded bank itself is kept intact for the later export
missing_reviews_bank = load_bank("reviews/missing_reviews_bank.xlsx")
extra_reviews = graded_extra_reviews(missing_reviews_bank)
if not extra_reviews.empty:
    review_bank = pd.concat([review_bank, extra_reviews], ignore_index=True)


# the function for getting the ratings for pre-evaluated query-recipe pairs
def lookup_rating(query, recipe):
    """Return the stored rating for a query/recipe pair, or None when there is none.

    Reads the module-level ``review_bank`` frame. Rows without a grade are
    ignored, so an ungraded entry never hides a graded one for the same pair.
    A rating whose ``Evaluator`` is "Person" takes precedence over any other;
    otherwise the first graded row for the pair is used.

    Args:
        query: Query text, compared against the ``Query`` column.
        recipe: Recipe title, compared against the ``Receita`` column.

    Returns:
        The ``Nota`` value of the selected row, or None when the bank has no
        graded row for the pair (including when the bank is empty or lacks
        the expected columns).
    """
    required_columns = {"Query", "Receita", "Nota", "Evaluator"}
    if not required_columns.issubset(review_bank.columns):
        # e.g. the empty fallback frame used when no bank file exists yet
        return None

    is_pair = (review_bank["Query"] == query) & (review_bank["Receita"] == recipe)
    ratings = review_bank.loc[is_pair, ["Nota", "Evaluator"]].dropna(subset=["Nota"])
    if ratings.empty:
        return None

    # If there is a human evaluation, it gets the preference
    person_ratings = ratings.loc[ratings["Evaluator"] == "Person", "Nota"]
    if not person_ratings.empty:
        return person_ratings.iloc[0]
    return ratings["Nota"].iloc[0]

In [2]:
# getting all the files in the output folder that are in the format Results_<model>.xlsx
# A single pattern both filters the files and captures the model name, so the
# filter and the name extraction cannot drift apart.
pattern = re.compile(r"Results_(.*)\.xlsx$")

# os.listdir returns names in arbitrary order; sorting case-insensitively keeps
# the report order reproducible across machines.
model_results_paths = [
    os.path.join('output', file)
    for file in sorted(os.listdir('output'), key=str.lower)
    if pattern.match(file)
]

models = {}
for model_result_path in model_results_paths:
    model_name = pattern.match(os.path.basename(model_result_path)).group(1)

    result_df = pd.read_excel(model_result_path)
    # attach the stored rating of each query/recipe pair (missing when the pair is unrated)
    result_df["Nota"] = result_df.apply(lambda row: lookup_rating(row['Query'], row['title']), axis=1)

    models[model_name] = result_df


In [3]:
# Report, for every model, how many of its results lack a rating and the mean
# rating of the rated ones, collecting the unrated rows along the way.
missing_frames = []

for model, df in models.items():
    # Keep the rows where Nota is missing
    model_missing_reviews = df[df['Nota'].isnull()]

    # Mean of Nota; mean() skips missing values, so unrated rows do not lower the score
    mean_score = df['Nota'].mean()

    # Print the report
    print(f'Modelo: {model}')
    print(f'Avaliações ausentes: {len(model_missing_reviews)}')
    print(f'Média de pontuação: {mean_score}\n')

    missing_frames.append(model_missing_reviews)

# Concatenate once instead of growing the frame inside the loop; fall back to
# an empty frame when no model results were loaded.
missing_reviews = pd.concat(missing_frames) if missing_frames else pd.DataFrame()

Modelo: Bm25
Avaliações ausentes: 0
Média de pontuação: 2.6

Modelo: bm25_extraQuestions
Avaliações ausentes: 150
Média de pontuação: 3.3454545454545452

Modelo: hybrid
Avaliações ausentes: 0
Média de pontuação: 3.3454545454545452

Modelo: hybrid_extraQuestions
Avaliações ausentes: 150
Média de pontuação: 3.3454545454545452

Modelo: semantic
Avaliações ausentes: 0
Média de pontuação: 3.3454545454545452

Modelo: semantic_extraQuestions
Avaliações ausentes: 150
Média de pontuação: 3.3454545454545452

Modelo: Tfidf
Avaliações ausentes: 0
Média de pontuação: 2.6363636363636362

Modelo: tfidf_extraQuestions
Avaliações ausentes: 150
Média de pontuação: 2.6363636363636362



In [4]:
# Display the model results collected above whose rating is missing
missing_reviews

Unnamed: 0,Tipo,Descrição,Query,id,title,body,Nota
10,Keywords,Pergunta simples,banana bread recipe,319415,best banana bread recipe,best banana bread recipe\n\nRecipe posted on: ...,
11,Keywords,Pergunta simples,banana bread recipe,351947,healthy and tasty banana bread,healthy and tasty banana bread\n\nRecipe poste...,
12,Keywords,Pergunta simples,banana bread recipe,295635,banana barley bread,banana barley bread\n\nRecipe posted on: 2008-...,
13,Keywords,Pergunta simples,banana bread recipe,311380,wonderful banana bread,wonderful banana bread\n\nRecipe posted on: 20...,
14,Keywords,Pergunta simples,banana bread recipe,214551,my favorite banana bread,my favorite banana bread\n\nRecipe posted on: ...,
...,...,...,...,...,...,...,...
195,Semantica,Pergunta média,vegan options for a Thanksgiving dinner,354619,the classic casserole,the classic casserole\n\nRecipe posted on: 200...,
196,Semantica,Pergunta média,vegan options for a Thanksgiving dinner,323651,rice options oamc,rice options oamc\n\nRecipe posted on: 2008-...,
197,Semantica,Pergunta média,vegan options for a Thanksgiving dinner,347401,wonderful whipped cream frosting with flavor ...,wonderful whipped cream frosting with flavor ...,
198,Semantica,Pergunta média,vegan options for a Thanksgiving dinner,61280,quesadillas with options,quesadillas with options\n\nRecipe posted on: ...,


In [5]:
# Prompt for chatbot evaluation
def get_gpt_template(resulting_recipe, query):
    """Build the evaluation prompt for one query/recipe pair.

    The prompt text is a single f-string; the trailing backslash after each
    "Here is ..." label is a line continuation, so the label is followed by
    one line break instead of two.

    Args:
        resulting_recipe: Recipe text inserted verbatim into the prompt.
        query: Query text inserted verbatim into the prompt.

    Returns:
        str: The prompt, asking for a 0-5 relevance grade and a brief
        justification.
    """
    return f"""
        You are a query result validator assistant tasked with evaluating the quality of a given recipe in answering a given query.

        Here is the recipe: \

        {resulting_recipe}

        Here is the query:\

        {query}

        Now evaluate from 0 to 5, the relevance of the recipe for answering the given query, where 0 is unrelated, 1 is poorly related, 2 is a little relevant but miss some important things, 3 is relevant but miss some restrictions, 4 is a relevant recipe that nearly matches all the possible criterias, and 5 is a perfect result, where every possible consideration and restrain included in the query is answered in the recipe. Include both your grading and a brief justificative of the grade.


        Assistant grading:
        Justificative:
    """

# Build one grading prompt per unrated result. A list comprehension is used
# instead of DataFrame.apply(axis=1) because apply hands back a DataFrame (not
# a Series) when the frame has no rows, which cannot be stored in one column.
missing_reviews = missing_reviews.assign(
    gpt_template=[
        get_gpt_template(body, query)
        for body, query in zip(missing_reviews["body"], missing_reviews["Query"])
    ],
    Nota=None,
    Evaluator="GPT",
)

combined_bank = pd.concat([missing_reviews_bank, missing_reviews], ignore_index=True)

# Ratings are looked up by query/recipe pair, so the bank needs each pair only
# once per evaluator. Comparing whole rows would keep the same pair once per
# model because bookkeeping columns (e.g. the result position) differ.
# Graded rows are moved to the front (stable sort) so that de-duplication never
# discards an existing grade; sort_index() then restores the original order.
graded_first = combined_bank["Nota"].isna().sort_values(kind="stable").index
missing_reviews_bank = (
    combined_bank.loc[graded_first]
    .drop_duplicates(subset=["Query", "title", "Evaluator"])
    .sort_index()
)

# export to xlsx (create the folder on a first run, when it does not exist yet)
os.makedirs('reviews', exist_ok=True)
missing_reviews_bank.to_excel('reviews/missing_reviews_bank.xlsx', index=False)