<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/low_level/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Building Evaluation from Scratch

We show how you can build evaluation modules from scratch. This includes both evaluation of the final generated response (where the output is plain text), as well as the evaluation of retrievers (where the output is a ranked list of items).

We have in-house modules in our [Evaluation](https://gpt-index.readthedocs.io/en/latest/core_modules/supporting_modules/evaluation/root.html) section.

## Setup

We load some data and define a very simple RAG query engine that we'll evaluate (uses top-k retrieval).

In [None]:
# %pip install llama-index-readers-file pymupdf
# %pip install llama-index-llms-openai

In [None]:
import logging
import sys
import pandas as pd
import numpy as np
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

from pathlib import Path
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter, SimpleNodeParser
from llama_index.llms.openai import OpenAI


In [None]:

import dotenv
import yaml
from easydict import EasyDict

# Load environment variables (e.g. the Azure OpenAI settings read further
# down in this notebook) from the shared .env file.
dotenv.load_dotenv('../.env')

# Load the prompt/config defaults. The context manager closes the file handle
# (the original left it open) and the explicit encoding keeps any non-ASCII
# prompt text from depending on the platform's default encoding.
with open("defaults.yaml", encoding="utf-8") as config_file:
    config = EasyDict(yaml.safe_load(config_file))
config

In [None]:
from datasets import load_dataset
import os

# Re-use the local CSV snapshot when it exists; otherwise download the
# ArabicMMLU test split once and cache it as CSV for the next run.
if not os.path.exists('ArabicMMLU.csv'):
    arabicMMLU = load_dataset('MBZUAI/ArabicMMLU')
    arabicmmlu_df = arabicMMLU['test'].to_pandas()
    arabicmmlu_df.to_csv('ArabicMMLU.csv', index=False)
else:
    arabicmmlu_df = pd.read_csv('ArabicMMLU.csv')

arabicmmlu_df

In [None]:
from llama_index.core.node_parser import SimpleNodeParser

# Set a large chunk size (presumably so that each question is kept in a
# single node). Passed as an int: the original float literal 1e6 relied on the
# library coercing it into its integer chunk_size setting.
node_parser = SimpleNodeParser.from_defaults(chunk_size=1_000_000)



def convert_qa_to_string(row, prompt_template=None) -> str:
    """Render one multiple-choice row as a prompt string.

    Args:
        row: Mapping-like record (e.g. a ``pandas.Series``) with the keys
            ``"Question"``, ``"Option 1"`` .. ``"Option 5"`` and
            ``"Answer Key"`` (a letter ``"A"``-``"E"``).
        prompt_template: ``str.format`` template with ``{question}``,
            ``{options}`` and ``{correct_answer}`` placeholders. Defaults to
            ``config.MCQ_PROMPT``.

    Returns:
        The formatted prompt text.

    Raises:
        KeyError: If a column is missing or the answer key is not ``A``-``E``.
    """
    if prompt_template is None:
        prompt_template = config.MCQ_PROMPT

    # Drop missing options *before* converting to str. The original converted
    # first, so None/NaN became the literal strings "None"/"nan" and were
    # never filtered out of the option list.
    raw_options = [row[f"Option {i}"] for i in range(1, 6)]
    options = [str(value).strip() for value in raw_options if not pd.isna(value)]
    options_str = "\n".join(
        f"{i+1}. {x}" for i, x in enumerate(options)
    )

    answer_key_map = {
        "A": 0,
        "B": 1,
        "C": 2,
        "D": 3,
        "E": 4,
    }
    # Read the correct option straight from the row so that filtering missing
    # options above can never shift which option the answer key points at.
    answer_position = answer_key_map[row["Answer Key"]]
    correct_answer_str = str(row[f"Option {answer_position + 1}"]).strip()

    return prompt_template.format(
        question=row['Question'],
        options=options_str,
        correct_answer=correct_answer_str
    )


# Keep only the Law subject and wrap every row in a Document whose text is
# the rendered MCQ prompt and whose metadata is the raw row.
law_rows = arabicmmlu_df.query('Subject=="Law"')
documents = [
    Document(text=convert_qa_to_string(record), metadata=record.to_dict())
    for _, record in law_rows.iterrows()
]

arabicmmlu_nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)
print(arabicmmlu_nodes[0].text)

In [None]:
import json

# Read the MOJ regulations dump (UTF-8 JSON) into memory.
moj_regulations = json.loads(
    Path("../../data/raw/ArabLegalEval/MOJ_Regulations.json").read_text(encoding='utf8')
)

def listify_dict(d):
    """Turn a ``{key: dict}`` mapping into a list of dicts tagged with their key.

    Every value is shallow-copied and its key is stored under ``'name'``
    (overwriting an existing ``'name'`` entry). A top-level ``'name'`` key is
    skipped, which lets the function be applied to its own output. Unlike the
    original implementation, the caller's nested dicts are not modified.

    Args:
        d: Mapping whose values (other than the ``'name'`` entry) are dicts.

    Returns:
        A list of new dicts, in the mapping's iteration order.
    """
    return [{**v, 'name': k} for k, v in d.items() if k != 'name']


for regulation in moj_regulations:
    subjects = regulation['Subjects']
    # An empty 'Subjects' mapping falls through to an empty chapter list.
    first_value = next(iter(subjects.values()), {})
    if 'description' in first_value:  # has no chapters
        # The articles sit directly under 'Subjects'; wrap them in a single
        # default chapter so every regulation has the same two-level shape.
        subjects = {'الباب الأول أحكام عامة': subjects}
    # FIXME: listifying drops the chapter keys, which carry some metadata.
    # Result: a list of chapters, each of which is a list of article dicts.
    regulation['Subjects'] = list(map(listify_dict, listify_dict(subjects)))
    # Drop all tables. The original loop unpacked each chapter list as
    # `k, v`, which raises unless a chapter holds exactly two articles and
    # otherwise only ever inspected the second one.
    for chapter in regulation['Subjects']:
        for article in chapter:
            # TODO: append the table as markdown to 'description' instead of discarding it
            article.pop('tables', None)


## Flatten the regulations into one row per article.
import pandas as pd

# Step 1: expand each regulation's 'Details' dict into top-level columns.
df_regs = pd.DataFrame(moj_regulations)
df_regs = pd.concat([
    df_regs.drop(columns=['Details']),
    pd.json_normalize(df_regs['Details'].tolist()),
], axis=1)

# Step 2: 'Subjects' is a list of chapters, each a list of articles, so two
# explode() calls are needed to reach one article dict per row.
df_regs = df_regs.explode('Subjects').explode('Subjects')

# Step 3: expand the article dicts into columns. explode() leaves duplicate
# index labels behind, so both sides are re-indexed positionally before the
# column-wise concat: reset_index() on the left, and a plain list (which
# json_normalize numbers 0..n-1) on the right.
flat_df = pd.concat([
    df_regs.drop(columns=['Subjects']).reset_index(),
    pd.json_normalize(df_regs['Subjects'].tolist(), max_level=1),
], axis=1)

flat_df

In [None]:
import os
import json
import pandas as pd
import glob



# Root folder that is searched recursively for the circular JSON files below.
directory = '../../data/processed/Legal Data/وزارة العدل'  # Change this to your directory path

# Presumably the field names present in each JSON record — TODO confirm.
# NOTE(review): `keys` is not referenced by any code visible in this notebook;
# confirm whether it is still needed.
keys = [
    'preprocess_script_git_hash',
    'untrustworthy_git_hash',
    'schema_version',
    'source_entity',
    'origin_url',
    'serial_number',
    'original_file_path',
    'document_type',
    'circular_topic',
    'circular_number',
    'title',
    'issue_date',
    'effective_date',
    'expiration_date',
    'confidentiality',
    'languages',
    'contents',
]
# keys = ['original_file_path', 'contents', 'circular_topics']  # Change these to the keys you're interested in

def clean_text_content(string):
    """Remove the ``## text:`` header marker and flatten control whitespace.

    Every occurrence of the marker (``## text:`` followed by a newline and
    ``---``) is removed, each newline, carriage return and tab becomes a
    single space, and the result is trimmed at both ends.
    """
    flattened = string.replace("""## text:\n---""", "")
    for control_char in ('\n', '\r', '\t'):
        flattened = flattened.replace(control_char, ' ')
    return flattened.strip()

def _load_json(path):
    """Read one JSON file as UTF-8, closing the handle afterwards."""
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


def _extract_main_text(contents):
    """Return the text of the first content entry, or None if absent or not longer than 20 characters."""
    # An empty list used to raise IndexError on `x[0]`; treat it as "no text".
    if not isinstance(contents, list) or not contents:
        return None
    first_entry = contents[0]
    text = first_entry.get('text') if isinstance(first_entry, dict) else None
    if isinstance(text, str) and len(text) > 20:
        return text
    return None


json_paths = [
    path
    for path in glob.glob(directory + '/**/*.json', recursive=True)
    if os.path.isfile(path)
]
json_objects = [_load_json(path) for path in json_paths]

df = pd.DataFrame(json_objects)
df['paths'] = json_paths

# Reduce the 'contents' column to the text of its first entry.
df['contents'] = df['contents'].apply(_extract_main_text)
# Drop rows without usable text; .copy() makes the filtered frame independent
# so the assignment below does not write into a view of the original.
df = df.dropna(subset=['contents']).copy()
df['contents'] = df['contents'].apply(clean_text_content)


df.shape

In [None]:
df['contents'].iloc[1]

In [None]:

# Chat model served through Azure OpenAI. Every connection setting is read
# from the environment: deployment name, API key, endpoint and API version.
azure_llm_settings = {
    # "model": "gpt-4-32k",
    "engine": os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    "api_key": os.environ['AZURE_OPENAI_API_KEY'],
    "azure_endpoint": os.environ['AZURE_OPENAI_ENDPOINT'],
    "api_version": os.environ['AZURE_OPENAI_API_VERSION'],
}
llm = AzureOpenAI(**azure_llm_settings)
# llm.complete('hi')


In [None]:
# https://Cohere-command-r-plus-uptoi-serverless.eastus2.inference.ai.azure.com
import os

# SECURITY: the API key for this endpoint used to be hard-coded in this cell.
# It is now read from the environment (add AZURE_COHERE_API_KEY to the .env
# file). The previously committed key must be treated as leaked and rotated.
# NOTE: this rebinds `llm`, replacing the Azure OpenAI client created above.
llm = AzureOpenAI(
    # model="gpt-4-32k",
    engine="Cohere-command-r-plus-uptoi",
    api_key=os.environ["AZURE_COHERE_API_KEY"],
    azure_endpoint="https://Cohere-command-r-plus-uptoi-serverless.eastus2.inference.ai.azure.com",
    # api_version=,
)
llm.complete('hi')

In [None]:
node_parser = SentenceSplitter(chunk_size=1024)

In [None]:


from llama_index.embeddings.openai import OpenAIEmbedding



from llama_index.core import Settings
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import PromptTemplate, ServiceContext, StorageContext, VectorStoreIndex, load_index_from_storage



embed_model = OpenAIEmbedding(model='text-embedding-3-large', api_key=os.environ['OPENAI_API_KEY'])


Settings.embed_model = embed_model

In [None]:
# NOTE: the ServiceContext that used to be built here was never used by any
# later cell and the class is deprecated in favour of `Settings`, so it is no
# longer created.

chroma_client = chromadb.PersistentClient(path='./chroma_db')
# Traditional VDB
# get_or_create_collection replaces the former try/except around
# get_collection, which treated *any* error as "collection is missing".
chroma_collection = chroma_client.get_or_create_collection('ArabicMMLU_legal')

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

if chroma_collection.count() > 0:
    # The persistent collection already holds embeddings from an earlier run.
    # Re-use them: rebuilding would re-embed every node and insert duplicates.
    # Delete ./chroma_db (or the collection) to force a rebuild.
    arabicmmlu_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
else:
    arabicmmlu_index = VectorStoreIndex(
        arabicmmlu_nodes,
        storage_context=storage_context,
        embed_model=embed_model,
        use_async=False,
        show_progress=True,
    )

# # Sentence window retrieval
# query_engine_sentence_window = index_sentence_window.as_query_engine(
#     text_qa_template=text_qa_template, similarity_top_k=3, embed_model=embed_model, llm=llm
# )

In [None]:

# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
# chunk_size is given as an int; the original float literal 1e6 relied on the
# library coercing it into its integer setting.
# NOTE: this rebinds `node_parser` and `documents` from the ArabicMMLU cells.
node_parser = SentenceSplitter(chunk_size=1_000_000, chunk_overlap=0)
documents = [
    Document(
        text=row['contents'].strip(),
        # Every column except the text itself travels along as metadata.
        metadata=row.drop('contents').to_dict()
    ) for _, row in df.iterrows()
]

nodes = node_parser.get_nodes_from_documents(
    documents,
    show_progress=True
)

In [None]:
# Item-writing rules for multiple-choice questions. Commented-out entries are
# disabled and therefore absent from the bullet list printed below.
# NOTE(review): the meaning of the '%*' values is not shown in this notebook —
# confirm against the source of this table.
mcq_rules = [
    {'Rule': 'Use single best answer', '%*': 71.0},
    # {'Rule': 'Use format vertically', '%*': 36.0},
    # {'Rule': 'Use sound English', '%*': 64.0},
    # {'Rule': 'Use clear simple vocabulary', '%*': 64.0},
    # {'Rule': 'Avoid cueing or hinging', '%*': 71.0},
    {'Rule': 'Use important significant material', '%*': 93.0},
    {'Rule': 'Use single objective per item', '%*': 71.0},
    # {'Rule': 'Use own novel material', '%*': 36.0},
    # {'Rule': 'Use different thinking levels', '%*': 50.0},
    # {'Rule': 'Avoid tricky items', '%*': 71.0},
    # {'Rule': 'Use stem with vignette and question', '%*': 86.0},
    {'Rule': 'Use positive stem and lead-in', '%*': 86.0},
    # {'Rule': 'Use central idea in the stem', '%*': 86.0},
    # {'Rule': 'Use cover-the-options rule', '%*': 50.0},
    # {'Rule': 'Avoid absolute terms', '%*': 64.0},
    {'Rule': 'Use plausible, homogeneous options with parallel length', '%*': 95.0},
    # {'Rule': 'Use options in logical order without overlapping', '%*': 82.0},
    # {'Rule': 'Avoid aOTA, NOTA or “Complex form”', '%*': 81.0},
    {'Rule': 'Avoid vague terms', '%*': 79.0},
    {'Rule': 'Avoid test-wise item flaws', '%*': 79.0},
]

# Print the active rule names as a "- " bulleted list, one rule per line.
print("- " + "\n- ".join([x['Rule'] for x in mcq_rules]))


In [None]:
# Build one "<Rule>: <Item format>" line per complete row of the rules table.
# The original cell ended the chain at `.to_dict()` and then called a bare
# `apply(...)`, which raises NameError in Python 3; the row-wise lambda has to
# be applied to the DataFrame itself.
rules_str = pd.read_csv('mcq_rules.csv').dropna().apply(
    # for each row, join rule and item format
    lambda row: row['Rule'] + ": " + row["Item format"],
    axis=1,
)
print("\n".join(list(rules_str)))

In [None]:
len(nodes)

## Dataset Generation

We first go through an exercise of generating a synthetic evaluation dataset. We do this by synthetically generating a set of questions from existing context. We then run each question with existing context through a powerful LLM (e.g. GPT-4) to generate a "ground-truth" response.

### Define Functions

We define the functions that we will use for dataset generation:

In [None]:

from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re
from tqdm.auto import tqdm
from multiprocessing.pool import ThreadPool


We define `generate_answers_for_questions` to generate answers from questions given context.

In [None]:
# Single-message chat prompt used to answer one question from its context;
# config.QA_PROMPT is formatted with `context_str` and `query_str`.
question_answer_template = ChatPromptTemplate(
    message_templates=[ChatMessage(content=config.QA_PROMPT, role=MessageRole.USER)]
)

def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> List[str]:
    """Answer every question against the same context, in parallel.

    Args:
        questions: Question strings to answer.
        context: Text injected into the QA prompt as ``context_str``.
        llm: Chat LLM; its ``chat`` method is called once per question.

    Returns:
        The answers, in the same order as ``questions``.
    """

    def generate_answer(question: str) -> str:
        fmt_qa_prompt = question_answer_template.format_messages(
            context_str=context,
            query_str=question,
        )
        return llm.chat(fmt_qa_prompt).message.content

    if not questions:
        return []

    # Use the pool as a context manager so its worker threads are released
    # when the call finishes; the original never closed the pool. The results
    # are fully consumed by list() before the pool is torn down.
    with ThreadPool() as pool:
        return list(
            tqdm(
                pool.imap(generate_answer, questions),
                "generate_answers_for_questions()",
                total=len(questions),
            )
        )

We define `generate_qa_pairs` to generate QA pairs over an entire list of Nodes.

In [None]:
# QUESTION_GEN_USER_TMPL = (
#     "Context information is below.\n"
#     "---------------------\n"
#     "{context_str}\n"
#     "---------------------\n"
#     "Given the context information and not prior knowledge, "
#     "generate the relevant questions. "
# )

# QUESTION_GEN_SYS_TMPL = """\
# You are a Teacher/ Professor. Your task is to setup \
# {num_questions_per_chunk} questions for an upcoming \
# quiz/examination. The questions should be diverse in nature \
# across the document. Restrict the questions to the \
# context information provided.\
# """

# System + user chat prompt for question generation. The templates are
# formatted with `num_questions_per_chunk` and `context_str`.
question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(content=config.QUESTION_GEN_SYS_TMPL, role=MessageRole.SYSTEM),
        ChatMessage(content=config.QUESTION_GEN_USER_TMPL, role=MessageRole.USER),
    ]
)



def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 2,
    delimiter: str = "\n",
    question_gen_template=question_gen_template,
) -> List[Tuple[str, str]]:
    """Generate (question, answer) pairs for every node, in parallel.

    For each node the LLM is first asked for questions about the node's
    content (including metadata); each question is then answered against the
    same content with ``generate_answers_for_questions``.

    Args:
        nodes: Nodes to generate questions from.
        llm: Chat LLM used for both question and answer generation.
        num_questions_per_chunk: Value passed to the question prompt.
        delimiter: Separator used to split the LLM output into questions.
        question_gen_template: Chat prompt template formatted with
            ``num_questions_per_chunk`` and ``context_str``.

    Returns:
        A flat list of ``(question, answer)`` tuples across all nodes.
    """
    # TODO: add support for few shot prompts (using index retriever)
    def process_node(node: BaseNode) -> List[Tuple[str, str]]:
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
        )
        raw_output = llm.chat(fmt_messages).message.content

        cleaned_questions = []
        for fragment in str(raw_output).strip().split(delimiter):
            # Remove list numbering such as "1." or "2)"; stripping first
            # lets the anchor match when the numbering is indented.
            question = re.sub(r"^\d+[\).\s]", "", fragment.strip()).strip()
            # Blank lines between questions used to become empty questions
            # that were still sent to the LLM; skip them.
            if question:
                cleaned_questions.append(question)

        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        return list(zip(cleaned_questions, answers))

    if not nodes:
        return []

    # Context manager releases the worker threads once all nodes are done;
    # list() consumes every result before the pool is torn down.
    with ThreadPool() as pool:
        per_node_pairs = list(
            tqdm(
                pool.imap(process_node, nodes),
                "Generating QA pairs",
                total=len(nodes),
            )
        )

    # flatten
    return [pair for node_pairs in per_node_pairs for pair in node_pairs]


In [None]:
# arabicmmlu_df = arabicmmlu_df.drop('ID', axis=1)
# arabicmmlu_df['text'] = arabicmmlu_df['contents']

In [None]:
(nodes[0])

In [None]:
len(nodes)

In [None]:
# Get up to 10 random indices to select nodes
import random
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population contains.
random_indicies = random.sample(range(len(nodes)), min(10, len(nodes)))
# random_indicies = [1138, 2673, 883, 745, 2670, 2801, 1748, 2999, 632, 72]
random_nodes = [nodes[i] for i in random_indicies]

In [None]:
qa_pairs = generate_qa_pairs(
    random_nodes,
    # nodes,
    llm,
    num_questions_per_chunk=1,
)

In [None]:
for q, a in qa_pairs:
    print(f"Q: {q}\nA: {a}\n")

Converting question-answer pairs into MCQs

In [None]:
# Prompts that turn a (question, answer) pair into a multiple-choice question:
# a direct version and a chain-of-thought version. Both are formatted with
# `question` and `answer`.
qa_to_mcq_template = ChatPromptTemplate(
    message_templates=[ChatMessage(content=config.QA_TO_MCQ_PROMPT, role=MessageRole.USER)]
)
qa_to_mcq_cot_template = ChatPromptTemplate(
    message_templates=[ChatMessage(content=config.QA_TO_MCQ_COT_PROMPT, role=MessageRole.USER)]
)

In [None]:



def convert_quesitons_to_mcqs(
    qa_pairs: List[tuple], mcq_prompt_template: ChatPromptTemplate, llm: OpenAI
) -> List[str]:
    """Convert question-answer pairs into MCQs, in parallel.

    Args:
        qa_pairs: ``(question, answer)`` pairs to convert.
        mcq_prompt_template: Chat prompt template formatted with ``question``
            and ``answer``.
        llm: Chat LLM; its ``chat`` method is called once per pair.

    Returns:
        The raw LLM output for each pair, in the same order as ``qa_pairs``.
    """

    def question_to_mcq(qa_pair) -> str:
        question, answer = qa_pair
        messages = mcq_prompt_template.format_messages(
            question=question,
            answer=answer,
        )
        return llm.chat(messages).message.content

    if not qa_pairs:
        return []

    # Context manager releases the worker threads when the call finishes; the
    # original never closed the pool.
    with ThreadPool() as pool:
        return list(
            tqdm(
                pool.imap(question_to_mcq, qa_pairs),
                "convert_quesitons_to_mcqs()",
                total=len(qa_pairs),
            )
        )

In [None]:
mcqs = convert_quesitons_to_mcqs(
    qa_pairs,
    qa_to_mcq_template,
    llm
)

In [None]:
def format_mcqs(mcqs):
    """Split each raw MCQ string into ``[question, options]``.

    The first line is the question, the second line is skipped, and every
    remaining line is kept as one option.
    """
    def split_example(example):
        lines = example.split('\n')
        return [lines[0], lines[2:]]

    return [split_example(example) for example in mcqs]

formated_mcqs = format_mcqs(mcqs)

In [None]:
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open("mcq_qa_pairs.json", "w", encoding="utf-8") as f:
    json.dump(formated_mcqs, f, indent=4, ensure_ascii=False)

In [None]:
for mcq in mcqs:
    print(mcq)
    print('--------------------------------\n')

With Chain of Thought

In [None]:
mcqs_cot = convert_quesitons_to_mcqs(
    qa_pairs,
    qa_to_mcq_cot_template,
    llm
)

In [None]:
for mcq in mcqs_cot:
    print(mcq)
    print('--------------------------------\n')

In [None]:
def format_mcqs_cot(mcqs_cot):
    """Split chain-of-thought MCQ outputs into ``[question, reasoning, options]``.

    Each output is trimmed of surrounding whitespace and ``#`` characters and
    split on ``####``. The first section is the reasoning; in the second
    section the first line is the question, the second line is skipped, and
    the remaining lines are the options.
    """
    formatted = []
    for raw in mcqs_cot:
        sections = raw.strip().strip('####').split('####')
        reasoning, mcq_lines = sections[0], sections[1].strip().split('\n')
        formatted.append([mcq_lines[0], reasoning, mcq_lines[2:]])
    return formatted

formated_mcqs_cot = format_mcqs_cot(mcqs_cot)

In [None]:
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open("mcq_qa_cot_pairs.json", "w", encoding="utf-8") as f:
    json.dump(formated_mcqs_cot, f, indent=4, ensure_ascii=False)

### Getting Pairs over Dataset

**NOTE**: This can take a long time. For the sake of speed try inputting a subset of the nodes.

In [None]:
# qa_pairs = generate_qa_pairs(
#     # nodes[:1],
#     nodes,
#     llm,
#     question_gen_template=question_gen_template,
#     num_questions_per_chunk=4,
# )

#### For MCQ GENERATION

In [None]:

# System + user chat prompt for direct MCQ generation. The templates are
# formatted with `num_questions_per_chunk`, `context_str` and
# `few_shot_examples`.
mcq_question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(content=config.MCQ_QUESTION_GEN_SYS_TMPL, role=MessageRole.SYSTEM),
        ChatMessage(content=config.MCQ_QUESTION_GEN_USER_TMPL, role=MessageRole.USER),
    ]
)


def generate_mcq_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 10,
    top_k: int = 3,
    delimiter: str = "\n",
    mcq_question_gen_template=mcq_question_gen_template,
) -> List[Tuple[str, str]]:
    """Generate multiple-choice questions with their correct answers.

    For each node the LLM is prompted with the node text (metadata excluded)
    and, when the template contains ``{few_shot_examples}`` and ``top_k > 0``,
    with the ``top_k`` most similar ArabicMMLU questions as few-shot examples.

    Args:
        nodes: Nodes to generate questions from.
        llm: Chat LLM; its ``chat`` method is called once per node.
        num_questions_per_chunk: Value passed to the prompt.
        top_k: Number of few-shot examples to retrieve; ``0`` disables them.
        delimiter: Separator used to split the LLM output into items.
        mcq_question_gen_template: Chat prompt template to format.

    Returns:
        A flat list of two-element ``[question, answer]`` items. ``answer``
        is the text after the correct-answer marker, or ``""`` when an item
        does not contain the marker.
    """
    # TODO: make this from the config
    answer_marker = "الجواب الصحيح: "

    # Whether few-shot retrieval is needed does not depend on the node, so it
    # is decided once instead of re-joining the template for every node.
    template_text = '\n'.join(
        x.content or "" for x in mcq_question_gen_template.message_templates
    )
    use_few_shot = "{few_shot_examples}" in template_text and top_k > 0

    def process_node(node: BaseNode) -> list:
        context_str = node.get_content(metadata_mode="none")
        if use_few_shot:
            arabicmmlu_retriever = arabicmmlu_index.as_retriever(
                similarity_top_k=top_k,
                embed_model=embed_model,
            )
            few_shot_examples_str = "\n------------------------------------------------\n".join([
                x.text for x in arabicmmlu_retriever.retrieve(node.text)
            ])
        else:
            few_shot_examples_str = ""

        fmt_messages = mcq_question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
            few_shot_examples=few_shot_examples_str,
        )
        raw_output = llm.chat(fmt_messages).message.content

        cur_mcq_pairs = []
        for item in str(raw_output).strip().split(delimiter):
            if not item.strip():
                continue
            # partition() always yields exactly one question and one answer.
            # The former split() produced one element when the marker was
            # missing (or three and more when it repeated), which broke
            # callers that unpack each item as `q, a`.
            question, _, answer = item.partition(answer_marker)
            cur_mcq_pairs.append([question, answer])
        return cur_mcq_pairs

    if not nodes:
        return []

    # Context manager releases the worker threads when the call finishes.
    with ThreadPool() as pool:
        per_node_pairs = list(
            tqdm(
                pool.imap(process_node, nodes),
                "Generating QA pairs",
                total=len(nodes),
            )
        )

    # flatten
    return [pair for node_pairs in per_node_pairs for pair in node_pairs]

mcq_pairs = generate_mcq_pairs(
    random_nodes,
    # nodes,
    llm,
    mcq_question_gen_template=mcq_question_gen_template,
    num_questions_per_chunk=4,
    delimiter="####"
)

# Run your async function in the existing event loop

In [None]:
for q, a in mcq_pairs:
    print(f"Q: {q}\nA: {a}\n")

In [None]:
# Generate one MCQ per sampled node for several few-shot sizes and save each
# run to its own file. NOTE: `mcq_pairs` is rebound on every iteration, so
# after the loop it holds the result of the last k only.
for k in [0, 1, 3, 5]:
    mcq_pairs = generate_mcq_pairs(
        random_nodes,
        llm,
        mcq_question_gen_template=mcq_question_gen_template,
        num_questions_per_chunk=1,
        top_k=k,
        delimiter="####"
    )
    # ensure_ascii=False writes non-ASCII text verbatim, so pin the file
    # encoding to UTF-8 instead of relying on the platform default.
    with open(f"mcq_pairs_top_{k}.json", "w", encoding="utf-8") as f:
        json.dump(mcq_pairs, f, indent=4, ensure_ascii=False)


In [None]:
import textwrap


def w(*args, **kwargs):
    """Print like ``print``, with the first positional argument wrapped at 80 columns.

    Remaining positional arguments and all keyword arguments are forwarded to
    ``print`` unchanged. Returns whatever ``print`` returns (``None``).
    """
    wrapped_head = textwrap.fill(args[0], width=80)
    remaining = args[1:]
    return print(wrapped_head, *remaining, **kwargs)

# print(fmt_messages[-1].content)
# print wrapped

# print(textwrap.fill(fmt_messages[-1].content, width=80))
# w(fmt_messages[-1].content)


In [None]:
qa_pairs

#### [Optional] Define save/load

In [None]:
# save
import pickle

# The context manager flushes and closes the file deterministically; the
# original left the handle open.
with open("eval_dataset.pkl", "wb") as f:
    pickle.dump(qa_pairs, f)

In [None]:
# load
import pickle

# NOTE: unpickling can execute arbitrary code; only load files you created.
with open("eval_dataset.pkl", "rb") as f:
    qa_pairs = pickle.load(f)

## Evaluating Generation

In this section we walk through a few methods for evaluating the generated results. At a high-level we use an "evaluation LLM" to measure the quality of the generated results. We do this in both the **with labels** setting and **without labels** setting. 

We go through the following evaluation algorithms:
- **Correctness**: Compares the generated answer against the ground-truth answer.
- **Faithfulness**: Evaluates whether a response is faithful to the contexts (label-free).

### Building a Correctness Evaluator

The correctness evaluator compares the generated answer to the reference ground-truth answer, given the query. We output a score between 1 and 5, where 1 is the worst and 5 is the best.

We do this through a system and user prompt with a chat interface.

In [None]:
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Dict

In [None]:
# System + user chat prompt for the correctness evaluator. The templates are
# formatted with `query`, `reference_answer` and `generated_answer`.
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(content=config.QA_CORRECTNESS_SYS_TMPL, role=MessageRole.SYSTEM),
        ChatMessage(content=config.QA_CORRECTNESS_USER_TMPL, role=MessageRole.USER),
    ]
)

Now that we've defined the prompt template, let's define an evaluation function that feeds the prompt to the LLM and parses the output into a dict of results.

In [None]:
from llama_index.llms.openai import OpenAI


def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: OpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Score a generated answer against a reference answer with an LLM judge.

    Args:
        query_str: The question that was asked.
        reference_answer: Ground-truth answer to compare against.
        generated_answer: Answer produced by the system under evaluation.
        llm: Chat LLM used as the judge.
        threshold: Minimum score for the answer to count as passing.

    Returns:
        A dict with ``"passing"`` (``score >= threshold``), ``"score"``
        (float) and ``"reason"`` (the judge's explanation).

    Raises:
        ValueError: If no numeric score is found on the first output line.
    """
    fmt_messages = eval_chat_template.format_messages(
        llm=llm,
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = str(chat_response.message.content).strip()

    # The judge is expected to put the score on the first line and the
    # reasoning after it. partition() copes with a reply that has no newline
    # and the regex copes with decoration around the number (e.g. "Score: 4"),
    # both of which made the original split()/float() parsing raise.
    score_line, _, reasoning_str = raw_output.partition("\n")
    score_match = re.search(r"\d+(?:\.\d+)?", score_line)
    if score_match is None:
        raise ValueError(
            f"Could not parse a correctness score from LLM output: {raw_output!r}"
        )
    score = float(score_match.group())
    reasoning = reasoning_str.lstrip("\n")

    return {"passing": score >= threshold, "score": score, "reason": reasoning}

Now let's try running this on some sample inputs with a chat model (GPT-4).

In [None]:
# llm = OpenAI(model="gpt-4")

In [None]:

node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes, show_progress=True)
query_engine = index.as_query_engine(llm=llm)

In [None]:
# query_str = "What is the range of parameters for the large language models (LLMs) developed in this work?"
# reference_answer = "The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion."

query_str = (
    "What is the specific name given to the fine-tuned LLMs optimized for"
    " dialogue use cases?"
)
reference_answer = (
    "The specific name given to the fine-tuned LLMs optimized for dialogue use"
    " cases is Llama 2-Chat."
)

In [None]:
generated_answer = str(query_engine.query(query_str))

In [None]:
print(str(generated_answer))

In [None]:
eval_results = run_correctness_eval(
    query_str, reference_answer, generated_answer, llm=llm, threshold=4.0
)
display(eval_results)

### Building a Faithfulness Evaluator

The faithfulness evaluator evaluates whether the response is faithful to any of the retrieved contexts.

This is a step up in complexity from the correctness evaluator. Since the set of contexts can be quite long, they might overflow the context window. We would need to figure out how to implement a form of **response synthesis** strategy to iterate over contexts in sequence.

We have a corresponding tutorial showing you [how to build response synthesis from scratch](https://gpt-index.readthedocs.io/en/latest/examples/low_level/response_synthesis.html). We also have [out-of-the-box response synthesis modules](https://gpt-index.readthedocs.io/en/latest/core_modules/query_modules/response_synthesizers/root.html). In this guide we'll use the out of the box modules.

In [None]:
# EVAL_TEMPLATE = PromptTemplate(config.EVAL_TEMPLATE)

# EVAL_REFINE_TEMPLATE = PromptTemplate(config.EVAL_REFINE_TEMPLATE)


**NOTE**: In the current response synthesizer setup we don't separate out a system and user message for chat endpoints, so we just use our standard `llm.complete` for text completion.

We now define our function below. Since we defined both a standard eval template for a given piece of context but also a refine template for subsequent contexts, we implement our "create-and-refine" response synthesis strategy to obtain the answer.

In [None]:
from llama_index.core.response_synthesizers import Refine
from typing import List, Dict


def run_faithfulness_eval(
    generated_answer: str,
    contexts: List[str],
    llm: OpenAI,
) -> Dict:
    """Check whether a generated answer is supported by the given contexts.

    The contexts are fed through a ``Refine`` response synthesizer using the
    evaluation prompts from the config, with the generated answer as the
    query.

    Args:
        generated_answer: Answer to be checked.
        contexts: Context strings the answer should be grounded in.
        llm: LLM used by the synthesizer.

    Returns:
        A dict with ``"passing"`` (True when the final response contains the
        word "yes", case-insensitively) and ``"reason"`` (the response text).
    """
    refine = Refine(
        llm=llm,
        text_qa_template=PromptTemplate(config.EVAL_TEMPLATE),
        refine_template=PromptTemplate(config.EVAL_REFINE_TEMPLATE),
    )

    response_obj = refine.get_response(generated_answer, contexts)
    response_txt = str(response_obj)

    # Match "yes" as a whole word. The original substring test also passed
    # responses that merely contained the letters, e.g. "eyes" or "yesterday".
    passing = re.search(r"\byes\b", response_txt, flags=re.IGNORECASE) is not None

    return {"passing": passing, "reason": response_txt}

Let's try it out on some data

In [None]:
# use the same query_str, and reference_answer as above
# query_str = "What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?"
# reference_answer = "The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat."

response = query_engine.query(query_str)
generated_answer = str(response)

In [None]:
context_list = [n.get_content() for n in response.source_nodes]
eval_results = run_faithfulness_eval(
    generated_answer,
    contexts=context_list,
    llm=llm,
)
display(eval_results)

## Running Evaluation over our Eval Dataset

Now let's tie the two above sections together and run our eval modules over our eval dataset!

**NOTE**: For the sake of speed/cost we extract a very limited sample.

In [None]:
import random

sample_size = 5
# Never ask for more pairs than exist: random.sample raises ValueError when
# the sample is larger than the population.
qa_pairs_sample = random.sample(qa_pairs, min(sample_size, len(qa_pairs)))

In [None]:
import pandas as pd


def run_evals(qa_pairs: List[Tuple[str, str]], llm: OpenAI, query_engine) -> pd.DataFrame:
    """Run the correctness and faithfulness evaluators over QA pairs.

    Args:
        qa_pairs: ``(question, reference_answer)`` pairs to evaluate.
        llm: LLM used by both evaluators.
        query_engine: Engine whose ``query`` method answers each question; the
            response must expose ``source_nodes``.

    Returns:
        A DataFrame with one row per pair and boolean ``"correctness"`` and
        ``"faithfulness"`` columns.
    """
    results_list = []
    for question, reference_answer in qa_pairs:
        response = query_engine.query(question)
        generated_answer = str(response)
        # Judge against the question of *this* pair; the original passed the
        # notebook-level `query_str`, so every pair was scored against the
        # same unrelated query.
        correctness_results = run_correctness_eval(
            question,
            reference_answer,
            generated_answer,
            llm=llm,
            threshold=4.0,
        )
        # Likewise use the contexts retrieved for this response rather than
        # the stale notebook-level `context_list`.
        contexts = [n.get_content() for n in response.source_nodes]
        faithfulness_results = run_faithfulness_eval(
            generated_answer,
            contexts=contexts,
            llm=llm,
        )
        results_list.append(
            {
                "correctness": correctness_results["passing"],
                "faithfulness": faithfulness_results["passing"],
            }
        )
    return pd.DataFrame(results_list)

In [None]:
evals_df = run_evals(qa_pairs_sample, llm, query_engine)

In [None]:
evals_df["correctness"].mean()

In [None]:
evals_df["faithfulness"].mean()