In [10]:
from langchain_openai import ChatOpenAI 
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.evaluation import load_evaluator
from langchain_core.pydantic_v1 import BaseModel, Field

# from langchain.evaluation.criteria import {
#     CriteriaEvalChain,
#     LabeledCriteriaEvalChain
# }
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
# Pull settings from a local .env file (when one exists) into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is absent or empty.
# Assigning os.getenv(...) straight back into os.environ would instead die with an
# opaque "str expected, not NoneType" TypeError when the variable is missing, and
# does nothing useful when it is already set.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )


In [4]:
# Shared chat model used by every evaluation chain in this notebook.
# temperature=0 is presumably chosen to keep the scores as repeatable as possible.
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
    max_tokens=4000,
)


In [12]:
class ResultScore(BaseModel):
    # Schema handed to llm.with_structured_output(...) by the evaluation chains,
    # so each chain returns an object exposing a single numeric `score`.
    # The description string below is part of the schema; keep it in sync with
    # the 0-to-1 scale the prompts ask for.
    score: float = Field(
        ...,
        description="The score of the result, ranging from 0 to 1 where 1 is the best possible score.",
    )


In [26]:
# Prompt that asks the model to grade a generated answer against a reference
# (ground-truth) answer on a 0-to-1 scale.
correctness_prompt = PromptTemplate(
    template="""
Question: {question}
Ground Truth: {ground_truth}
Generated Answer: {generated_answer}

Evaluate the correctness of the generated answer compared to the ground truth.
Score from 0 to 1, where 1 is perfectly correct and 0 is completely incorrect.
any score between 0 and 1 is acceptable and depends on how correct the generated answer is.

Score:
""",
    input_variables=["question", "ground_truth", "generated_answer"],
)

# Feed the rendered prompt to the LLM and have the reply parsed into ResultScore.
correctness_chain = correctness_prompt | llm.with_structured_output(ResultScore)


def evaluate_correctness(question, ground_truth, generated_answer):
    """Score how well a generated answer matches the reference answer.

    The three arguments are packed into the input mapping expected by
    ``correctness_chain``; the chain is invoked once and the ``score``
    attribute of its result is returned unchanged.

    Args:
        question: The question that was asked.
        ground_truth: The reference answer to grade against.
        generated_answer: The answer being graded.

    Returns:
        The ``score`` reported by the chain. The prompt asks for a value
        from 0 (completely incorrect) to 1 (perfectly correct).
    """
    chain_inputs = {
        "question": question,
        "ground_truth": ground_truth,
        "generated_answer": generated_answer,
    }
    verdict = correctness_chain.invoke(chain_inputs)
    return verdict.score


In [24]:
# Smoke test for evaluate_correctness: the generated answer covers only one of
# the two items listed in the ground truth.
question = "What is the capital of France and Spain?"
ground_truth = "Paris and Barcelona"
generated_answer = "Paris"
score = evaluate_correctness(
    question=question,
    ground_truth=ground_truth,
    generated_answer=generated_answer,
)

In [25]:
score

0.5

In [51]:
# Prompt that asks the model whether the generated answer can be derived from
# the supplied context alone, on a 0-to-1 scale. Several worked examples use
# deliberately counterfactual contexts (e.g. "London is the capital of France")
# to show that the score is judged against the context, not world knowledge.
faithfulness_prompt = PromptTemplate(
    template="""
Question: {question}
Context: {context}
Generated Answer: {generated_answer}

Evaluate if the generated answer to the question can be deduced from the context.
Score from 0 to 1, where 1 is perfectly faithful and 0 is completely unfaithful.
any score between 0 and 1 is acceptable and depends on how faithful the generated answer is.

example:
Question: What are the capitals of France and Spain?
Context: Paris is the capital of France and Madrid is the capital of Spain.
Generated Answer: Paris
in this case the generated answer is faithful to the context so the score should be 1.

example:
Question: What are the capital cities of France and Spain?
Context: London is the capital of France and Barcelona is the capital of Spain.
Generated Answer: London and Barcelona.
in this case the generated answer is faithful to the context so the score should be 1.

example:
Question: What are the capital cities of France and Spain?
Context: Paris is the capital of France and Madrid is the capital of Spain.
Generated Answer: Paris.
in this case the generated answer is faithful to the context so the score should be 1.

example:
Question: What are the capitals of France and Spain?
Context: London is the capital of France and Madrid is the Capital of Spain.
Generated Answer: Paris and Madrid.
in this case the generated answer is based on the pretrained knowledge of the llm and is not faithful to the context so the score should be 0.

example:
Question: What is the capital of France and Spain?
Context: Monkeys like to eat bananas.
Generated Answer: Paris and Madrid.
in this case the generated answer is not based on the context so the score should be 0.
""",
    input_variables=["question", "context", "generated_answer"],
)

# Feed the rendered prompt to the LLM and have the reply parsed into ResultScore.
faithfulness_chain = faithfulness_prompt | llm.with_structured_output(ResultScore)

In [48]:
def evaluate_faithfulness(question, context, generated_answer):
    """Score whether a generated answer can be deduced from the given context.

    The three arguments are packed into the input mapping expected by
    ``faithfulness_chain``; the chain is invoked once and the ``score``
    attribute of its result is returned unchanged.

    Args:
        question: The question that was asked.
        context: The context the answer is supposed to be grounded in.
        generated_answer: The answer being graded.

    Returns:
        The ``score`` reported by the chain. The prompt asks for a value
        from 0 (completely unfaithful) to 1 (perfectly faithful).
    """
    chain_inputs = {
        "question": question,
        "context": context,
        "generated_answer": generated_answer,
    }
    verdict = faithfulness_chain.invoke(chain_inputs)
    return verdict.score

In [49]:
# Smoke test for evaluate_faithfulness: the answer is partial but is fully
# supported by the supplied context.
question = "What are the capital cities of France and Spain?"
context = "France and Spain are two countries in Europe. The capital of France is Paris and the capital of Spain is Madrid."
generated_answer = "Paris"
score = evaluate_faithfulness(
    question=question,
    context=context,
    generated_answer=generated_answer,
)
print(score)

1.0
