## The goal is to compare different models

For this task we already have a set of evaluated query–recipe pairs. We load them and use them to score each model's search results.

In [1]:
import pandas as pd
import re
import os
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

def extract_rated_reviews(bank):
    """Return the rows of the missing-reviews bank that already carry a rating.

    Args:
        bank: DataFrame read from the missing-reviews workbook. Rows whose
            'Nota' is null have not been rated yet.

    Returns:
        A new DataFrame with only the rated rows, 'title' renamed to 'Receita'
        (the column name `review_bank` uses), restricted to whichever of the
        review-bank columns are present. Empty when `bank` has no 'Nota' column.
    """
    if "Nota" not in bank.columns:
        return bank.iloc[0:0]

    # `!= None` is evaluated element-wise and is True for NaN as well, so it
    # filters nothing; notna() is the actual "has a rating" test.
    rated = bank[bank["Nota"].notna()].rename(columns={"title": "Receita"})

    review_columns = ["Tipo", "Descrição", "Query", "Receita", "Evaluator", "Nota"]
    return rated[[column for column in review_columns if column in rated.columns]]


# Load the bank of already-rated query/recipe pairs; fall back to an empty
# bank (and say so) when the workbook cannot be read.
try:
    review_bank = pd.read_excel('reviews/review_bank.xlsx')
except Exception as exc:
    print(f"Could not load reviews/review_bank.xlsx ({exc!r}); using an empty review bank.")
    review_bank = pd.DataFrame()

# Load the bank of pairs that were missing a rating in earlier runs.
try:
    missing_reviews_bank = pd.read_excel("reviews/missing_reviews_bank.xlsx")
except Exception as exc:
    print(f"Could not load reviews/missing_reviews_bank.xlsx ({exc!r}); using an empty bank.")
    missing_reviews_bank = pd.DataFrame()

# Pairs that have been rated since they were recorded as missing join the
# review bank. This happens outside the try block so that a processing problem
# cannot discard the bank that was just loaded.
extra_reviews = extract_rated_reviews(missing_reviews_bank)
if not extra_reviews.empty:
    review_bank = pd.concat([review_bank, extra_reviews])

def lookup_rating(query, recipe):
    """Look up the stored rating for a query/recipe pair in `review_bank`.

    Args:
        query: Value matched against the 'Query' column of `review_bank`.
        recipe: Value matched against the 'Receita' column of `review_bank`.

    Returns:
        The 'Nota' of the first matching row whose 'Evaluator' is "Person" when
        one exists, otherwise the 'Nota' of the first matching row. None when
        the bank lacks the needed columns or holds no row for the pair.
    """
    needed_columns = {"Query", "Receita", "Nota", "Evaluator"}
    if not needed_columns.issubset(review_bank.columns):
        # Covers the empty bank used when the workbook could not be loaded.
        return None

    is_pair = (review_bank["Query"] == query) & (review_bank["Receita"] == recipe)
    ratings = review_bank.loc[is_pair, ["Nota", "Evaluator"]]
    if ratings.empty:
        return None

    # A human evaluation takes precedence over any other evaluator.
    person_notas = ratings.loc[ratings["Evaluator"] == "Person", "Nota"]
    if not person_notas.empty:
        return person_notas.iloc[0]

    return ratings["Nota"].iloc[0]

In [2]:
# Pick up every workbook in the output folder named Results_<model>.xlsx
pattern = r"Results_.*\.xlsx$"

result_file_names = [name for name in os.listdir('output') if re.match(pattern, name)]
model_results_paths = [os.path.join('output', name) for name in result_file_names]

# model name (the <model> part of the file name) -> results table with a 'Nota' column
models = {}

for model_result_path in model_results_paths:
    file_name = os.path.basename(model_result_path)
    model_name = re.search(r"Results_(.*).xlsx", file_name).group(1)

    result_df = pd.read_excel(model_result_path)
    # Attach the stored rating (None when the pair was never rated) to each result row
    result_df["Nota"] = result_df.apply(
        lambda result_row: lookup_rating(result_row['Query'], result_row['title']),
        axis=1,
    )

    models[model_name] = result_df


In [3]:

def mean_rating_where(results, column, value):
    """Mean 'Nota' over the rows of `results` whose `column` equals `value`.

    Returns NaN when no row matches.
    """
    return results.loc[results[column] == value, 'Nota'].mean()


# One dict per breakdown, each mapping model name -> mean rating.
# The doubled letters in `semanticc_mean` / `keywordd_mean` are kept because
# other cells read these exact names.
simple_mean = {}
media_mean = {}
hard_mean = {}
complex_mean = {}
semanticc_mean = {}
keywordd_mean = {}

# Unrated rows are gathered per model and concatenated once after the loop,
# instead of re-copying a growing frame on every iteration.
missing_review_frames = []

for model in models:
    df = models[model]
    model_missing_reviews = df[df['Nota'].isnull()]
    missing_review_frames.append(model_missing_reviews)

    # Print the report
    print(f'Modelo: {model}')
    print(f'Avaliações ausentes: {len(model_missing_reviews)}')

    # Means by question difficulty ('Descrição' column)
    simple_mean[model] = mean_rating_where(df, 'Descrição', 'Pergunta simples')
    media_mean[model] = mean_rating_where(df, 'Descrição', 'Pergunta média')
    hard_mean[model] = mean_rating_where(df, 'Descrição', 'Pergunta difícil')
    complex_mean[model] = mean_rating_where(df, 'Descrição', 'Pergunta difícil +')

    # Means by question type ('Tipo' column)
    semanticc_mean[model] = mean_rating_where(df, 'Tipo', 'Semantica')
    keywordd_mean[model] = mean_rating_where(df, 'Tipo', 'Keywords')

# The leading empty frame keeps the result an (empty) DataFrame when there are no models.
missing_reviews = pd.concat([pd.DataFrame(), *missing_review_frames])

Modelo: Bm25
Avaliações ausentes: 0
Modelo: bm25_extraQuestions
Avaliações ausentes: 0
Modelo: hybrid
Avaliações ausentes: 0
Modelo: hybrid_extraQuestions
Avaliações ausentes: 0
Modelo: semantic
Avaliações ausentes: 0
Modelo: semantic_extraQuestions
Avaliações ausentes: 0
Modelo: Tfidf
Avaliações ausentes: 0
Modelo: tfidf_extraQuestions
Avaliações ausentes: 0


In [4]:
def calculate_dcg(scores):
    """Discounted cumulative gain of `scores`, taken in the order given.

    The item at 1-based position i contributes (2**score - 1) / log2(i + 1).

    Args:
        scores: One-dimensional array-like of relevance scores (NumPy array,
            list, pandas Series, ...).

    Returns:
        The DCG as a NumPy float32; 0.0 for empty input.
    """
    # asarray lets callers pass any array-like, not only objects exposing `.size`.
    gains = np.power(2.0, np.asarray(scores, dtype=np.float64)) - 1
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(np.divide(gains, discounts), dtype=np.float32)

# model name -> DCG computed over all of that model's ratings
dcg_mean = {}

for model, model_results in models.items():
    # NOTE(review): the ratings are sorted from best to worst before the DCG is
    # taken, so the order in which the model returned its results does not
    # influence the value — confirm this is the intended metric.
    scores_desc = model_results['Nota'].sort_values(ascending=False).to_numpy()

    dcg_mean[model] = calculate_dcg(scores_desc)

    print(f'Modelo: {model}')
    print(f'DCG: {dcg_mean[model]}\n')

Modelo: Bm25
DCG: 223.1525115966797

Modelo: bm25_extraQuestions
DCG: 313.8639831542969

Modelo: hybrid
DCG: 276.5688781738281

Modelo: hybrid_extraQuestions
DCG: 601.5553588867188

Modelo: semantic
DCG: 280.0585632324219

Modelo: semantic_extraQuestions
DCG: 601.5553588867188

Modelo: Tfidf
DCG: 225.56451416015625

Modelo: tfidf_extraQuestions
DCG: 472.3876953125



In [5]:

# Mean rating per model on the simple questions, lowest first
simple_scores = pd.DataFrame(
    list(simple_mean.items()), columns=['Modelo', 'Nota']
).sort_values('Nota')

fig = px.bar(
    simple_scores,
    x='Modelo',
    y='Nota',
    color='Nota',
    color_continuous_scale='Viridis',
    hover_data={'Nota': ':.2f'},
    range_color=[0, 5],  # pin the colour scale to the 0-5 rating range
)

# Same 0-5 range on the y axis
fig.update_yaxes(range=[0, 5])
fig.update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')
fig.update_layout(
    title='Notas dos Modelos - Perguntas Simples',
    title_font=dict(size=24, family='Courier', color='black'),
    showlegend=True,
    template='plotly_white'
)
fig.show()

In [6]:

# Mean rating per model on the medium questions, lowest first
media_scores = pd.DataFrame({
    'Modelo': list(media_mean.keys()),
    'Nota': list(media_mean.values()),
})
media_scores = media_scores.sort_values('Nota')

# Colour scale pinned to the 0-5 rating range
fig = px.bar(media_scores, x='Modelo', y='Nota',
             color='Nota', color_continuous_scale='Viridis', range_color=[0, 5],
             hover_data={'Nota': ':.2f'})

layout_options = dict(
    title='Notas dos Modelos - Perguntas Médias',
    title_font=dict(size=24, family='Courier', color='black'),
    showlegend=True,
    template='plotly_white',
)
fig.update_layout(**layout_options)
fig.update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')

fig.update_yaxes(range=[0, 5])
fig.show()

In [7]:
# Mean rating per model on the hard questions, lowest first
hard_scores = pd.DataFrame(list(hard_mean.items()), columns=['Modelo', 'Nota'])
hard_scores = hard_scores.sort_values('Nota')

bar_options = dict(
    x='Modelo',
    y='Nota',
    color='Nota',
    color_continuous_scale='Viridis',
    hover_data={'Nota': ':.2f'},
    range_color=[0, 5],  # min and max of the colour scale
)
fig = px.bar(hard_scores, **bar_options)

fig.update_layout(
    template='plotly_white',
    showlegend=True,
    title='Notas dos Modelos - Perguntas Difíceis',
    title_font=dict(size=24, family='Courier', color='black'),
)
fig.update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')
fig.update_yaxes(range=[0, 5])

fig.show()

In [8]:
# Mean rating per model on the complex questions, lowest first
complex_scores = (
    pd.DataFrame(list(complex_mean.items()), columns=['Modelo', 'Nota'])
    .sort_values('Nota')
)

fig = px.bar(
    complex_scores, x='Modelo', y='Nota', color='Nota',
    color_continuous_scale='Viridis', hover_data={'Nota': ':.2f'},
    range_color=[0, 5],  # colour scale limits
)

# The update_* methods return the figure, so they can be chained
(
    fig.update_layout(
        title='Notas dos Modelos - Perguntas Complexas',
        title_font=dict(size=24, family='Courier', color='black'),
        showlegend=True,
        template='plotly_white',
    )
    .update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')
    .update_yaxes(range=[0, 5])
)
fig.show()

In [9]:

# model name -> {difficulty label -> mean rating}
all_means = {
    model: {
        "simple": simple_mean[model],
        "media": media_mean[model],
        "hard": hard_mean[model],
        "complex": complex_mean[model],
    }
    for model in simple_mean
}

# One row per model, one column per difficulty; rows ordered by the sum of
# their means, with the helper 'Total' column dropped again before plotting
difficulty_means = pd.DataFrame.from_dict(all_means, orient='index')
difficulty_means['Total'] = difficulty_means.sum(axis=1)
difficulty_means = (
    difficulty_means
    .sort_values('Total', ascending=True)
    .drop(columns=['Total'])
)

color_palette = px.colors.qualitative.Pastel

# The frame is passed without x/y; 'index', 'value' and 'variable' are the
# names relabelled for display
fig = px.bar(
    difficulty_means,
    barmode='stack',
    color_discrete_sequence=color_palette,
    labels={'index': 'Modelo', 'value': 'Nota', 'variable': 'Dificuldade'},
)
fig.update_layout(
    title='Notas dos Modelos - Perguntas Simples, Médias, Difíceis e Complexas',
    title_font=dict(size=24, family='Courier', color='black'),
    showlegend=True,
    template='plotly_white',
)

fig.show()

O semantic é melhor quando comparamos as dificuldades das perguntas, em geral. É naturalmente muito parecido com o hybrid, mas em algumas poucas perguntas tem melhor avaliação.

In [10]:

# Mean rating per model on the keyword questions, lowest first
keyword_rows = list(keywordd_mean.items())
keyword_scores = pd.DataFrame(keyword_rows, columns=['Modelo', 'Nota']).sort_values('Nota')

fig = px.bar(
    keyword_scores,
    x='Modelo', y='Nota',
    color='Nota', color_continuous_scale='Viridis',
    range_color=[0, 5],  # colour scale limits
    hover_data={'Nota': ':.2f'},
)

title_font = dict(size=24, family='Courier', color='black')
fig.update_layout(
    title='Notas dos Modelos - Perguntas com Keywords',
    title_font=title_font,
    showlegend=True,
    template='plotly_white',
)
fig.update_yaxes(range=[0, 5])
fig.update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')
fig.show()

In [11]:

# Mean rating per model on the semantic questions, lowest first
semantic_scores = pd.DataFrame(
    {'Modelo': list(semanticc_mean.keys()), 'Nota': list(semanticc_mean.values())}
).sort_values('Nota')

# Colour scale limited to 0-5
fig = px.bar(
    semantic_scores,
    x='Modelo',
    y='Nota',
    color='Nota',
    color_continuous_scale='Viridis',
    hover_data={'Nota': ':.2f'},
    range_color=[0, 5],
)

fig.update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')
fig.update_layout(
    title='Notas dos Modelos - Perguntas semânticas',
    title_font=dict(size=24, family='Courier', color='black'),
    showlegend=True,
    template='plotly_white',
)

fig.update_yaxes(range=[0, 5])
fig.show()

In [12]:
# model name -> {question type -> mean rating}
all_means = {
    model: {
        "semantic": semanticc_mean[model],
        "keyword": keywordd_mean[model],
    }
    for model in semanticc_mean
}

# One row per model, one column per question type; rows ordered by the sum of
# their means, with the helper 'Total' column dropped again before plotting
df = pd.DataFrame.from_dict(all_means, orient='index')
df['Total'] = df.sum(axis=1)
df = df.sort_values('Total', ascending=True).drop(columns=['Total'])

color_palette = px.colors.qualitative.Pastel

# The stacked series here are question types (the 'Tipo' column of the
# results), so the legend is titled 'Tipo' rather than 'Dificuldade'.
fig = px.bar(
    df,
    barmode='stack',
    color_discrete_sequence=color_palette,
    labels={'index': 'Modelo', 'value': 'Nota', 'variable': 'Tipo'},
)

fig.update_layout(
    title='Notas dos Modelos - Perguntas com Keywords e Semânticas',
    title_font=dict(size=24, family='Courier', color='black'),
    showlegend=True,
    template='plotly_white'
)

fig.show()

O semantic é marginalmente melhor do que o hybrid, se destacando também entre os tipos de perguntas. Os valores são muito parecidos, porém tem uma leve melhora nas perguntas semânticas.

In [13]:
# One row per model with its DCG, ordered from lowest to highest
dcg_scores = (
    pd.DataFrame(list(dcg_mean.items()), columns=['Modelo', 'Nota'])
    .sort_values('Nota')
)

# Bar chart coloured by DCG, colour scale limited to 200-700
fig = px.bar(
    dcg_scores,
    x='Modelo',
    y='Nota',
    color='Nota',
    color_continuous_scale='Viridis',
    hover_data={'Nota': ':.2f'},
    range_color=[200, 700],
)

fig.update_traces(hovertemplate='Modelo: %{x}<br>Nota: %{y:.2f}')
fig.update_layout(
    title='Discounted Cumulative Gain (DCG) dos Modelos',
    title_font=dict(size=24, family='Courier', color='black'),
    showlegend=True,
    template='plotly_white',
)

# y axis limits
fig.update_yaxes(range=[0, 700])
fig.show()

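O modelo semântico foi melhor ao avaliarmos o DCG, que é uma métrica relevante para nossos dados. Ela penaliza relevâncias nas posições mais baixas por um fator de desconto logarítmico. Se as respostas mais relevantes estão concentradas no topo dos modelos, o DCG será maior. Observação: no cálculo acima as notas de cada modelo são ordenadas de forma decrescente antes do DCG e o valor não é normalizado; por isso ele cresce com o número de avaliações e só é diretamente comparável entre modelos avaliados com o mesmo conjunto de perguntas.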

In [14]:
# Every per-model metric, keyed by the column it gets in the summary table
metric_columns = {
    'SimpleMean': simple_mean,
    'MediaMean': media_mean,
    'HardMean': hard_mean,
    'ComplexMean': complex_mean,
    'SemanticMean': semanticc_mean,
    'KeywordMean': keywordd_mean,
    'DCG': dcg_mean,
}

# Model names in first-seen order across all metrics
model_order = list(dict.fromkeys(
    model_name for values in metric_columns.values() for model_name in values
))

# Building from Series aligns every metric on the model name, so a metric dict
# with a different order or a missing model cannot shift values onto the wrong row.
df = pd.DataFrame({
    column: pd.Series(values, dtype='float64')
    for column, values in metric_columns.items()
})
df = df.reindex(model_order).rename_axis('Model').reset_index()

df

Unnamed: 0,Model,SimpleMean,MediaMean,HardMean,ComplexMean,SemanticMean,KeywordMean,DCG
0,Bm25,3.8,0.2,3.9,1.2,1.6,3.8,223.152512
1,bm25_extraQuestions,2.388235,1.766667,1.775,1.1,1.615385,2.388235,313.863983
2,hybrid,4.08,1.8,4.1,2.6,2.7,4.08,276.568878
3,hybrid_extraQuestions,3.917647,2.583333,3.45,3.0,2.946154,3.917647,601.555359
4,semantic,4.08,1.8,4.2,2.6,2.733333,4.08,280.058563
5,semantic_extraQuestions,3.917647,2.583333,3.45,3.0,2.946154,3.917647,601.555359
6,Tfidf,3.24,1.133333,4.6,0.2,2.133333,3.24,225.564514
7,tfidf_extraQuestions,3.305882,1.916667,2.925,1.333333,2.092308,3.305882,472.387695


In [15]:
# Display the result rows, gathered across all models, whose 'Nota' is still null
missing_reviews

Unnamed: 0,Tipo,Descrição,Query,id,title,body,Nota


In [16]:
# Prompt used to ask a chatbot to rate a recipe against a query
def get_gpt_template(resulting_recipe, query):
    """Build the evaluation prompt for one recipe/query pair.

    The prompt presents the recipe and the query and asks for a relevance grade
    from 0 to 5 together with a brief justification.

    Args:
        resulting_recipe: Recipe text interpolated into the prompt.
        query: Query text interpolated into the prompt.

    Returns:
        The prompt as a single string, including the indentation and blank
        lines of the template below.
    """
    # The trailing backslashes after the "Here is the ..." headers are line
    # continuations inside the f-string: the newline that follows each one is
    # not part of the resulting text.
    return f"""
        You are a query result validator assistant tasked with evaluating the quality of a given recipe in answering a given query.

        Here is the recipe: \

        {resulting_recipe}

        Here is the query:\

        {query}

        Now evaluate from 0 to 5, the relevance of the recipe for answering the given query, where 0 is unrelated, 1 is poorly related, 2 is a little relevant but miss some important things, 3 is relevant but miss some restrictions, 4 is a relevant recipe that nearly matches all the possible criterias, and 5 is a perfect result, where every possible consideration and restrain included in the query is answered in the recipe. Include both your grading and a brief justificative of the grade.


        Assistant grading:
        Justificative:
    """

# Build one evaluation prompt per unrated row. A plain loop over the two
# columns is used instead of DataFrame.apply(axis=1) because apply returns a
# DataFrame, not a Series, when the frame has no rows.
if missing_reviews.empty:
    gpt_templates = []
else:
    gpt_templates = [
        get_gpt_template(body, query)
        for body, query in zip(missing_reviews['body'], missing_reviews['Query'])
    ]

# Attach the prompt, clear the rating and record the evaluator, producing a new
# frame rather than mutating the one built by the earlier cell in place.
missing_reviews = missing_reviews.assign(
    gpt_template=gpt_templates,
    Nota=None,
    Evaluator="GPT",
)

# Add the new rows to the bank of pending reviews, dropping exact duplicates
missing_reviews_bank = pd.concat([missing_reviews_bank, missing_reviews]).drop_duplicates()

# Export to xlsx, creating the folder first so a fresh checkout does not fail here
os.makedirs('reviews', exist_ok=True)
missing_reviews_bank.to_excel('reviews/missing_reviews_bank.xlsx', index=False)

In [17]:
# Report whether any result row still lacks a rating; when some do, run the
# helper that fills them in
if not missing_reviews.empty:
    print(f"{len(missing_reviews)} reviews are missing.")

    # Imported only on this branch, i.e. only when it is actually needed
    from fill_in_missing_reviews import fill_in_missing_reviews
    fill_in_missing_reviews()
else:
    print("All reviews were evaluated!")

All reviews were evaluated!
