In [21]:
import requests
import gdown
import tarfile
from bs4 import BeautifulSoup
import json
import time
import random
from tqdm import tqdm
from rich.pretty import pprint
import os

import collections
import re
import string
import unicodedata

from datasets import Dataset
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
import ragas.evaluation as ragas_eval

from IPython.display import Markdown, display
import pickle
import pandas as pd

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")
embedder = HuggingFaceEmbeddings()

# 1. Dataset Format

In [22]:
def extract_evidence_and_answer(text):
    try:
        # Ajustando a expressão regular para ser mais flexível com espaços e quebras de linha
        pattern = r"Evidence:\s*(.*?)\s*\n+\s*Answer:\s*(.*?)(?=.', ChatCompletion)"
        matches = re.search(pattern, text, re.DOTALL)

        if not matches:
            # Se não encontrar, tentar uma abordagem menos específica
            pattern_loose = r"Evidence:\s*(.*?)\s*Answer:\s*(.*)"
            matches = re.search(pattern_loose, text, re.DOTALL)
            if not matches:
                raise ValueError("As seções 'Evidence' e 'Answer' não puderam ser encontradas no texto fornecido.")

        evidence = matches.group(1)
        answer = matches.group(2)

        return {
            "evidence": evidence.strip(),
            "answer": answer.strip()
        }

    except Exception as e:
        print(f"Erro ao extrair os componentes: {e}")
        return None

In [23]:
# opens the pickle file
with open("../input/Raw Text/Rag Data Experiments/experiment_4/respostas_obtidas.pickle_9", 'rb') as input_file:
    # loads the pickle file into a pandas DataFrame
    dataset = pd.read_pickle(input_file)

In [24]:
# iterando sobre todos os dados do FAQ atualizado para formatá-los
dataset_organized = []
for sample in dataset:
    if 'resposta_obtida' in sample and isinstance(sample['resposta_obtida'], tuple):
        
        text = sample['resposta_obtida'][0]  # pegando Evidence e Answer das respostas do RAG - pega so o primeiro elemento da tupla, pra nao pegar o ChatCompletion
        extracted_data = extract_evidence_and_answer(text)  # segmentando Evidence e Answer em formato dict 
        
        if extracted_data:
            organized_sample = {
                'question': sample['pergunta'],  # pergunta do FAQ
                'ground_truths': [sample['resposta_esperada']],  # resposta esperada do FAQ
                'answer': extracted_data['answer'],  # resposta obtida, segmentada answer
                'contexts': [extracted_data['evidence']]
            }
            dataset_organized.append(organized_sample)  # novo dataset organizado

Erro ao extrair os componentes: As seções 'Evidence' e 'Answer' não puderam ser encontradas no texto fornecido.
Erro ao extrair os componentes: As seções 'Evidence' e 'Answer' não puderam ser encontradas no texto fornecido.
Erro ao extrair os componentes: As seções 'Evidence' e 'Answer' não puderam ser encontradas no texto fornecido.


In [25]:
# Salvando o dataset organizado em formato JSON
file_path = '../input/Raw Text/Rag Data Experiments/experiment_4/RAG_dataset_IIRCformat.json'
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(dataset_organized, file, ensure_ascii=False, indent=4)

print(f'Dataset organizado salvo com sucesso em {file_path}!')

Dataset organizado salvo com sucesso em ../input/Raw Text/Rag Data Experiments/experiment_4/RAG_dataset_IIRCformat.json!


# RAGAS Implementation

In [26]:
from datasets import Dataset
import time

def evaluate_per_sample(dataset_organized):
    # Cria um Dataset para uso com a biblioteca 'ragas'
    formated_dataset = Dataset.from_list(dataset_organized)

    for i, sample in enumerate(dataset_organized):
        # Seleciona cada amostra individualmente para avaliação
        chunk = formated_dataset.select([i])
        
        # Avalia a amostra com as métricas definidas
        result = ragas_eval.evaluate(dataset=chunk, metrics=[answer_relevancy, faithfulness, context_recall, context_precision], llm=llm, embeddings=embedder)
        
        # Atualiza o dicionário da amostra com os resultados das métricas
        sample.update({
            'answer_relevancy': result['answer_relevancy'],
            'faithfulness': result['faithfulness'],
            'context_recall': result['context_recall'],
            'context_precision': result['context_precision']
        })
        
        print(f"Evaluating sample {i + 1}/{len(dataset_organized)}: {result}")
        time.sleep(60)  # Sleep 60 seconds after each sample evaluation to avoid overloading the system

    return dataset_organized

In [27]:
# dataset_organized should be defined and filled with the required data before calling this function
updated_dataset = evaluate_per_sample(dataset_organized)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


Evaluating sample 1/37: {'answer_relevancy': 0.4912, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:41<00:00, 10.28s/it]


Evaluating sample 2/37: {'answer_relevancy': 0.5059, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.29it/s]


Evaluating sample 3/37: {'answer_relevancy': 0.2579, 'faithfulness': 1.0000, 'context_recall': 0.1250, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.84it/s]


Evaluating sample 4/37: {'answer_relevancy': 0.8210, 'faithfulness': 1.0000, 'context_recall': 0.3333, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:45<00:00, 11.43s/it]


Evaluating sample 5/37: {'answer_relevancy': 0.4668, 'faithfulness': 1.0000, 'context_recall': 0.2000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Evaluating sample 6/37: {'answer_relevancy': 0.3452, 'faithfulness': 1.0000, 'context_recall': 0.6667, 'context_precision': 0.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


Evaluating sample 7/37: {'answer_relevancy': 0.5125, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:17<00:00,  4.32s/it]


Evaluating sample 8/37: {'answer_relevancy': 0.6970, 'faithfulness': 1.0000, 'context_recall': 0.3333, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


Evaluating sample 9/37: {'answer_relevancy': 0.1249, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:18<00:00,  4.60s/it]


Evaluating sample 10/37: {'answer_relevancy': 0.3004, 'faithfulness': 1.0000, 'context_recall': 0.5000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:17<00:00,  4.47s/it]


Evaluating sample 11/37: {'answer_relevancy': 0.4881, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


Evaluating sample 12/37: {'answer_relevancy': 0.3914, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:05<00:00,  1.45s/it]


Evaluating sample 13/37: {'answer_relevancy': 0.4911, 'faithfulness': 1.0000, 'context_recall': 0.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]


Evaluating sample 14/37: {'answer_relevancy': 0.3736, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.15it/s]


Evaluating sample 15/37: {'answer_relevancy': 0.4870, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.72it/s]


Evaluating sample 16/37: {'answer_relevancy': 0.2273, 'faithfulness': 1.0000, 'context_recall': 0.2000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.97it/s]


Evaluating sample 17/37: {'answer_relevancy': 0.1102, 'faithfulness': 1.0000, 'context_recall': 0.3333, 'context_precision': 0.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:25<00:00,  6.47s/it]


Evaluating sample 18/37: {'answer_relevancy': 0.2575, 'faithfulness': 1.0000, 'context_recall': 0.2500, 'context_precision': 0.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


Evaluating sample 19/37: {'answer_relevancy': 0.4650, 'faithfulness': 0.5000, 'context_recall': 0.8000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.76s/it]


Evaluating sample 20/37: {'answer_relevancy': 0.2759, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Evaluating sample 21/37: {'answer_relevancy': 0.4704, 'faithfulness': 1.0000, 'context_recall': 0.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.20it/s]


Evaluating sample 22/37: {'answer_relevancy': 0.0796, 'faithfulness': 1.0000, 'context_recall': 0.2500, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:09<00:00,  2.32s/it]


Evaluating sample 23/37: {'answer_relevancy': 0.3458, 'faithfulness': 0.5000, 'context_recall': 0.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.51it/s]


Evaluating sample 24/37: {'answer_relevancy': 0.3992, 'faithfulness': 1.0000, 'context_recall': 0.2000, 'context_precision': 0.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]


Evaluating sample 25/37: {'answer_relevancy': 0.2542, 'faithfulness': 1.0000, 'context_recall': 0.6667, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.87s/it]


Evaluating sample 26/37: {'answer_relevancy': 0.4405, 'faithfulness': 1.0000, 'context_recall': 0.3333, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:08<00:00,  2.04s/it]


Evaluating sample 27/37: {'answer_relevancy': 0.2849, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]


Evaluating sample 28/37: {'answer_relevancy': 0.2479, 'faithfulness': 0.5000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.93it/s]


Evaluating sample 29/37: {'answer_relevancy': 0.1624, 'faithfulness': 1.0000, 'context_recall': 0.3333, 'context_precision': 0.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:27<00:00,  6.89s/it]


Evaluating sample 30/37: {'answer_relevancy': 0.5792, 'faithfulness': 0.8333, 'context_recall': 0.7500, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  2.08it/s]


Evaluating sample 31/37: {'answer_relevancy': 0.3498, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.83s/it]


Evaluating sample 32/37: {'answer_relevancy': 0.1005, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


Evaluating sample 33/37: {'answer_relevancy': 0.3923, 'faithfulness': 1.0000, 'context_recall': 0.0000, 'context_precision': 0.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:06<00:00,  1.70s/it]


Evaluating sample 34/37: {'answer_relevancy': 0.5795, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:40<00:00, 10.11s/it]


Evaluating sample 35/37: {'answer_relevancy': 0.6043, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.89s/it]


Evaluating sample 36/37: {'answer_relevancy': 0.4114, 'faithfulness': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it]


Evaluating sample 37/37: {'answer_relevancy': 0.5764, 'faithfulness': 1.0000, 'context_recall': 0.8571, 'context_precision': 1.0000}


In [28]:
# Salvando o dataset organizado em formato JSON
file_path = '../input/Raw Text/dataset/RAG_FinalDataset_experiment_4.json'
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(updated_dataset, file, ensure_ascii=False, indent=5)

print(f'Dataset organizado salvo com sucesso em {file_path}!')

Dataset organizado salvo com sucesso em ../input/Raw Text/dataset/RAG_FinalDataset_experiment_4.json!


In [29]:
import pandas as pd
df = pd.read_json(file_path)
df

Unnamed: 0,question,ground_truths,answer,contexts,answer_relevancy,faithfulness,context_recall,context_precision
0,"Como ocorrem as atividades de cooperação, pesq...",[Nos termos do art. 1° da Deliberação CONSU-A-...,"mediante a celebração de convênios, contratos ...",[De acordo com o trecho 'A atuação da Universi...,0.491248,1.0,1.0,1.0
1,Qual é o procedimento para a celebração de con...,"[Na Unicamp, a celebração de convênios, contra...","O procedimento para a celebração de convênios,...",[De acordo com o trecho 'CAPÍTULO I – DISPOSIÇ...,0.505855,1.0,1.0,1.0
2,Qual é o sistema utilizado para a tramitação d...,[Os documentos essenciais estão elencados no a...,Processos administrativos eletrônicos.,[De acordo com o trecho 'As propostas de convê...,0.257911,1.0,0.125,1.0
3,O que é o Plano de Aplicação de Recursos?,[O Plano de Aplicação de Recursos é o document...,O Plano de Aplicação de Recursos é o documento...,[De acordo com o trecho '[nan1]: O Plano de Ap...,0.821026,1.0,0.333333,1.0
4,Quem pode ser executor de um convênio e quais ...,"[Nos termos do art. 18, §1° da Deliberação CON...",Os executores de um convênio são servidores at...,[De acordo com o trecho '1º – Poderão figurar...,0.466763,1.0,0.2,1.0
5,Quem é a autoridade competente para assinatura...,"[Como regra, a autoridade competente para assi...",As autoridades competentes para assinatura dos...,[De acordo com o trecho 'Artigo 9º – Fica del...,0.345204,1.0,0.666667,0.0
6,Existe uma tramitação simplificada para aprova...,"[Sim, o art. 7º da Deliberação CONSU-A-016/202...",sim,[De acordo com o trecho 'Seção III – Das Trami...,0.512463,1.0,1.0,1.0
7,Quando se deve utilizar um Termo Aditivo a um ...,[Em razão da inexistência de uma única lei ou ...,Quando se deseja alterar os termos do convênio...,"[De acordo com o trecho 'Em suma, independente...",0.696983,1.0,0.333333,1.0
8,É possível que o convênio preveja o pagamento ...,"[Sim, a concessão de bolsas estímulo à inovaçã...",sim,[De acordo com o trecho 'Artigo 1º - A Unicam...,0.124859,1.0,1.0,1.0
9,Como ocorre a tramitação de convênio com insti...,[A tramitação de convênio que só tenha a funda...,Formalizar nos termos do artigo 2º e seguir a ...,[De acordo com o trecho 'O processo de convêni...,0.300364,1.0,0.5,1.0


In [30]:
print(df['answer_relevancy'].mean())
print(df['faithfulness'].mean())
print(df['context_recall'].mean())
print(df['context_precision'].mean())

0.3883226907319026
0.9549549549549549
0.6251930501930502
0.8378378377540538
