# Use Ragas to evaluate the OpenAI Assistant

**Please note that this test consumes a large number of OpenAI API tokens. Read it carefully and pay attention to how many requests you send.**

## 1. Prepare environment and data

Before starting, you must set OPENAI_API_KEY in your environment variables.

Install pip dependencies

In [None]:
# ! python -m pip install openai beir pandas ragas==0.0.17

Download the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/) if it does not already exist locally. We convert it into a ragas-friendly format that is easier to process, based on this [script](https://github.com/explodinggradients/ragas/blob/main/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb).

In [1]:
import json
import pandas as pd
import os
from tqdm import tqdm
from datasets import Dataset
from beir import util


def _read_jsonl(path):
    """Read a JSON-lines file into a DataFrame, one row per non-blank line."""
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(path, encoding="utf-8") as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


def prepare_fiqa_without_answer(knowledge_path):
    """Load the BEIR FiQA dataset and reshape it into question/ground-truth tables.

    The archive is downloaded and unpacked into ``knowledge_path`` unless
    ``fiqa.zip`` is already present there. For every qrels split, each query
    is paired with the text of all corpus passages linked to it.

    Args:
        knowledge_path: Directory that holds (or will receive) ``fiqa.zip``
            and the extracted ``fiqa`` folder.

    Returns:
        A dict keyed by split name (``"dev"``, ``"test"``, ``"train"``). Each
        value is a DataFrame with one row per query and two columns:
        ``question`` (the query text) and ``ground_truths`` (list of the
        linked passage texts).
    """
    dataset_name = "fiqa"

    if not os.path.exists(os.path.join(knowledge_path, f"{dataset_name}.zip")):
        url = (
            "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/"
            f"{dataset_name}.zip"
        )
        util.download_and_unzip(url, knowledge_path)

    data_path = os.path.join(knowledge_path, dataset_name)

    corpus_df = (
        _read_jsonl(os.path.join(data_path, "corpus.jsonl"))
        .rename(columns={"_id": "corpus-id", "text": "ground_truth"})
        .drop(columns=["title", "metadata"])
    )
    # IDs are stored as strings in the JSON files but as integers in the qrels
    # TSVs; cast so the merges below match on equal types.
    corpus_df["corpus-id"] = corpus_df["corpus-id"].astype(int)

    queries_df = (
        _read_jsonl(os.path.join(data_path, "queries.jsonl"))
        .rename(columns={"_id": "query-id", "text": "question"})
        .drop(columns=["metadata"])
    )
    queries_df["query-id"] = queries_df["query-id"].astype(int)

    final_split_df = {}
    for split in ("dev", "test", "train"):
        qrels = pd.read_csv(
            os.path.join(data_path, "qrels", f"{split}.tsv"), sep="\t"
        ).drop(columns=["score"])

        df = queries_df.merge(qrels, on="query-id").merge(corpus_df, on="corpus-id")

        # Select the needed columns explicitly so apply() never operates on
        # the grouping column itself.
        grouped = df.groupby("query-id")[["question", "ground_truth"]].apply(
            lambda x: pd.Series(
                {
                    # Every row of a group carries the same question text, so
                    # the first one is as good as any (and deterministic).
                    "question": x["question"].iloc[0],
                    "ground_truths": x["ground_truth"].tolist(),
                }
            )
        )

        # The query-id index is not part of the returned table.
        final_split_df[split] = grouped.reset_index(drop=True)

    return final_split_df


# Working directory for the downloaded dataset and the generated knowledge file.
knowledge_datas_path = './knowledge_datas'
fiqa_path = os.path.join(knowledge_datas_path, 'fiqa_doc.txt')

# makedirs with exist_ok=True is a no-op when the directory already exists.
os.makedirs(knowledge_datas_path, exist_ok=True)

# Filled by the assistant query loop later in the file: one entry per question.
contexts_list = []
answer_list = []

final_split_df = prepare_fiqa_without_answer(knowledge_datas_path)

# The same split supplies both the knowledge documents and the evaluation set.
split = 'test'

# Flatten the per-question passage lists into one list of documents.
docs = []
for ds in final_split_df[split]["ground_truths"]:
    docs.extend(ds)
print(len(docs))

# Write the knowledge file as UTF-8 explicitly: it is uploaded as raw bytes
# later, so its encoding must not depend on the platform default.
docs_str = '\n'.join(docs)
with open(fiqa_path, 'w', encoding='utf-8') as f:
    f.write(docs_str)

question_list = final_split_df[split]["question"].to_list()
ground_truth_list = final_split_df[split]["ground_truths"].to_list()

1706


Now we have the question list and the ground truth list, and the knowledge documents have been written to `fiqa_path`.


## 2. Building RAG using OpenAI assistant

The code below extracts the context content from the annotations returned by OpenAI.

In [None]:
import time
from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable,
# which must be set before this cell runs.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))


class OpenAITimeoutException(Exception):
    """Exception type that the retry loop in get_answer_contexts_from_assistant
    catches in order to start another attempt after a timeout."""


def get_content_from_retrieved_message(message):
    """Split an assistant message into its plain answer text and cited contexts.

    Args:
        message: A thread message whose first content part exposes
            ``text.value`` (the answer string) and ``text.annotations``.

    Returns:
        A ``(answer, contexts)`` tuple. ``answer`` is the message text with
        the marker text of every annotation removed. ``contexts`` is the list
        of ``quote`` values taken from annotations that carry a
        ``file_citation``, or ``['empty context.']`` when there are none.
    """
    message_content = message.content[0].text
    # Work on a local copy of the text so the caller's message object is not
    # modified as a side effect.
    answer = message_content.value
    contexts = []
    for annotation in message_content.annotations:
        answer = answer.replace(annotation.text, '')
        # Not every annotation has a file_citation attribute.
        file_citation = getattr(annotation, 'file_citation', None)
        if file_citation:
            contexts.append(file_citation.quote)
    if not contexts:
        # Placeholder so downstream code always receives a non-empty list.
        contexts = ['empty context.']
    return answer, contexts


def try_get_answer_contexts(assistant_id, question, timeout_seconds=120, poll_interval_seconds=1.0):
    """Ask the assistant one question in a throwaway thread and return its reply.

    A new thread containing only ``question`` is created, a run is started on
    it, and the run is polled until it completes. The thread is deleted
    afterwards, whether or not the attempt succeeded.

    Args:
        assistant_id: ID of the assistant that should answer.
        question: User question sent as the single message of the new thread.
        timeout_seconds: Maximum time to wait for the run to complete.
        poll_interval_seconds: Pause between two consecutive status checks.

    Returns:
        A ``(contexts, answer)`` tuple; note that the order is the reverse of
        what ``get_content_from_retrieved_message`` returns.

    Raises:
        OpenAITimeoutException: The run did not complete within
            ``timeout_seconds``.
        RuntimeError: The run ended as failed/cancelled/expired, the thread
            holds no reply besides the question, or the thread deletion was
            not confirmed.
    """
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": question,
            }
        ]
    )
    thread_id = thread.id
    try:
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
        )
        deadline = time.monotonic() + timeout_seconds
        while run.status != 'completed':
            # These states are final; waiting for the timeout would be pointless.
            if run.status in ('failed', 'cancelled', 'expired'):
                raise RuntimeError(f"OpenAI run ended with status '{run.status}'")
            if time.monotonic() > deadline:
                # Raise the dedicated type so the caller's retry logic can
                # tell a timeout apart from other failures.
                raise OpenAITimeoutException("OpenAI retrieving answer Timeout！")
            # Pause between polls instead of hammering the API in a tight loop.
            time.sleep(poll_interval_seconds)
            run = client.beta.threads.runs.retrieve(
                thread_id=thread_id,
                run_id=run.id
            )

        messages = client.beta.threads.messages.list(
            thread_id=thread_id
        )
        # Expect the user question plus at least one reply.
        if len(messages.data) <= 1:
            raise RuntimeError("OpenAI thread contains no assistant reply")
        # data[0] is treated as the assistant's reply (the list endpoint is
        # expected to return the newest message first — confirm against API docs).
        res, contexts = get_content_from_retrieved_message(messages.data[0])
    finally:
        # Always remove the temporary thread, including on timeout or error,
        # so failed attempts do not leave threads behind.
        response = client.beta.threads.delete(thread_id)
    if response.deleted is not True:
        raise RuntimeError(f"OpenAI thread {thread_id} was not deleted")
    return contexts, res


def get_answer_contexts_from_assistant(question, assistant_id, timeout_seconds=120, retry_num=6):
    """Query the assistant, retrying on timeout, and never raise to the caller.

    Args:
        question: Question text to send to the assistant.
        assistant_id: ID of the assistant that should answer.
        timeout_seconds: Time limit handed to each individual attempt.
        retry_num: Maximum number of attempts.

    Returns:
        An ``(answer, contexts)`` tuple. If no attempt succeeds, ``answer`` is
        ``'failed. please retry.'`` and ``contexts`` is a one-element list
        holding that same text.
    """
    failure_text = 'failed. please retry.'
    answer = failure_text
    retrieved = [failure_text]
    try:
        for _attempt in range(retry_num):
            try:
                retrieved, answer = try_get_answer_contexts(assistant_id, question, timeout_seconds)
            except OpenAITimeoutException:
                # Only a timeout leads to another attempt.
                print('OpenAI retrieving answer Timeout, retry...')
            else:
                break
    except Exception as err:
        # Any other error ends the attempts; report it and fall through to
        # return whatever values are currently held.
        print(err)
    return answer, retrieved

Build assistant and upload knowledge files.

In [3]:
# Upload the knowledge file. The context manager closes the local file handle
# once the upload call has returned.
with open(fiqa_path, "rb") as knowledge_file:
    file = client.files.create(
        file=knowledge_file,
        purpose='assistants'
    )

# Create the assistant with the retrieval tool and attach the uploaded file
assistant = client.beta.assistants.create(
    instructions="You are a customer support chatbot. You must use your retrieval tool to retrieve relevant knowledge to best respond to customer queries.",
    model="gpt-4-1106-preview",
    tools=[{"type": "retrieval"}],
    file_ids=[file.id]
)

  0%|          | 0/648 [03:45<?, ?it/s]

KeyboardInterrupt



## 3. Start Ragas Evaluation

Note that this step consumes a large number of OpenAI API tokens: every question you ask and every evaluation sends requests to the OpenAI service. Please keep an eye on your token consumption. If you only want to run a small number of tests, you can modify the code to reduce the test size.

In [None]:
# Start from empty lists so that re-running this cell (for example after an
# interrupted run) cannot leave stale entries that would misalign the answers
# and contexts with question_list.
answer_list = []
contexts_list = []

# One assistant request per question; failures are recorded as placeholder
# values by get_answer_contexts_from_assistant rather than raised.
for question in tqdm(question_list):
    answer, contexts = get_answer_contexts_from_assistant(question, assistant.id)
    # Uncomment to inspect each reply while the loop runs:
    # print(f'answer = {answer}')
    # print(f'contexts = {contexts}')
    # print('=' * 80)
    answer_list.append(answer)
    contexts_list.append(contexts)

You can choose which metrics you want to evaluate.


In [None]:
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision, answer_similarity

# Assemble the evaluation dataset: one column per collected list.
eval_columns = {
    "question": question_list,
    "contexts": contexts_list,
    "answer": answer_list,
    "ground_truths": ground_truth_list,
}
ds = Dataset.from_dict(eval_columns)

# Metrics to compute. Uncomment further entries to evaluate them as well;
# each one adds more requests to the OpenAI service.
selected_metrics = [
    context_precision,
    # context_recall,
    # faithfulness,
    # answer_relevancy,
    # answer_similarity,
    # answer_correctness,  # not imported in this cell; import it before enabling
]

result = evaluate(ds, metrics=selected_metrics)
print(result)