
## **LLM Evaluation Framework**
This notebook sets up an evaluation infrastructure for testing LLM performance in a Retrieval-Augmented Generation (RAG) pipeline.
### **Key Components**
- **Vector Database:** ChromaDB for storing and retrieving documents.
- **Embedding Model:** OpenAIEmbeddings for converting text into vector space.
- **Retriever:** Queries the vector database for relevant document snippets.
- **LLM (GPT-4o, GPT-3.5, Gemini):** Generates responses based on retrieved contexts.
- **Evaluation Setup:** Dummy evaluator to test LLM outputs.


In [51]:
#! pip install langchain_community


In [52]:
#! pip install chromadb

In [53]:
#%pip install -qU langchain-google-genai

In [1]:
import os 
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader



doc_path = os.getcwd()
dir = os.path.dirname(os.path.abspath(doc_path))
file_path = os.path.join(dir, "docs", "dummy.txt")

loader = TextLoader(file_path)
documents = loader.load()   

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=20)
splits = text_splitter.split_documents(documents)


# Embed
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Index
retriever = vectorstore.as_retriever()


RuntimeError: Error loading /Users/lilian/docs/dummy.txt

In [46]:
import openai
import datetime
from langsmith import traceable
from langsmith.wrappers import wrap_openai
#from langchain_google_genai import ChatGoogleGenerativeAI
#from langchain_google_genai import GoogleGenerativeAI
import google.generativeai as genai


today = datetime.datetime.now().strftime("%Y-%m-%d")

class Ragdummy:
    def __init__(self, retriever, model: str = "gpt-4o"):
    #def __init__(self, retriever, model: str = "gpt-4o-mini"):
    #def __init__(self, retriever, model: str = "gpt-3.5-turbo"):
    #def __init__(self, retriever, model: str = "gemini-2.0-flash"):    
        self._retriever = retriever
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client())
        #self._client = genai.GenerativeModel(model)
        self._model = model

    #CONTEXT = "answer business questions based on provided documents"
    @traceable
    def get_answer(self, question: str):
        similar = self._retriever.invoke(question)
        
        response = self._client.chat.completions.create(
        #response = self._client.generate_content(
    
            model=self._model,
            messages=[
                {
                    "role": "system",
                    "content": "You are an accomplished AI working for the insights department at a company.Your job is to answer business questions based on provided documents. "
                    "Today is {today}."
                               " Use the following docs to produce a concise answer to the users question"
                               f"## Docs\n\n{similar}"
                     """Please follow these steps to provide a response:

                    1. Carefully review the source material, paying attention to any information that is relevant to answering the question:
                        - Make extensive use of all information given.
                        - It is CRITICAL that you only use information that is explicitly stated above.
                        - Refrain from recommendations, speculation, or extrapolations.
                        - Compare and contrast findings from different sources. Watch out for any apparent conflicts between sources as well as any corroborating information.
                        - Make sure the context of the information given is applicable to the question, such as any specific country, category, or target group the question asks about.

                    2. Write the answer in a professional, well-structured format with headings and inline citations:
                        - Make sure to write a didactically well-structured answer for insights professionals and business stakeholders.
                        - Aim to provide a professional response that will help inform decision-making.
                        - Structure your answer with headings and use concise full text.
                        - Use tables in your response only when needed.
                        - Use lists and bullet points sparingly. Absolutely avoid nesting lists and bullet points.
                        - Ensure to include direct inline citations for all information referenced in your answer. Use a bracketed citation style with source reference like [XXX]. If there are multiple references, provide them in separate brackets each: [XXX][ZZZ]. MAKE SURE TO USE THE REFERENCES EXACTLY AS GIVEN IN THE <reference> TAGS ABOVE.

                    Write all your responses in English.

                    Make sure to use Markdown in your response and structure it in this format:

                    <answer>
                    ...
                    </answer>
            """},
                {"role": "user", "content": question}

            ]
        )

        # Evaluators will expect "answer" and "contexts"
        return {
            #"answer": response.text,
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in similar],
        }

# Example usage (Ensure `retriever` is properly initialized)
rag_dummy = Ragdummy(retriever) 

In [47]:
response = rag_dummy.get_answer("What percentage of UK consumers are plant-based eaters?")
response["answer"]

"<answer>\n9% of UK consumers are plant-based eaters, which includes both vegans and vegetarians [Document(metadata={'source': '/Users/lilian/Documents/LLM Eval Project/docs/dummy.txt'})].\n</answer>"

In [48]:
# RAG chain
def predict_rag_answer(example: dict):
    """Use this for answer evaluation"""
    response = rag_dummy.get_answer(example["question"])
    return {"answer": response["answer"]}

def predict_rag_answer_with_context(example: dict):
    """Use this for evaluation of retrieved documents and hallucinations"""
    response = rag_dummy.get_answer(example["question"])
    return {"answer": response["answer"], "contexts": response["contexts"]}

### Evaluation with GPT 4o

In [40]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        }
    )
]

dataset_name = "Market Logic Dummy"

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="Eval-4o",
    metadata={
    "title":"Report Plant based food in the UK",
    "source": "ProVeg International",
    "publication_date":"2024-09-01T08:20:00Z",
    "summary":"Market analysis of consumer attitudes and behaviors towards plant-based foods in the UK, based on survey conducted as part of the Smart Protein project."
},
)


View the evaluation results for experiment: 'Eval-4o-17704c62' at:
https://smith.langchain.com/o/8a8f5023-8b7e-4c2c-b2bc-eb849b90cbe6/datasets/2148b077-b0a4-439d-acf9-03e55035e9ff/compare?selectedSessions=0d086311-b571-4041-a65d-2c6753c59496




6it [00:41,  6.87s/it]


### Evaluation with GPT 4o-mini

In [37]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        }
    )
]

dataset_name = "Market Logic Dummy"

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="Eval-4o-mini",
    metadata={
    "title":"Report Plant based food in the UK",
    "source": "ProVeg International",
    "publication_date":"2024-09-01T08:20:00Z",
    "summary":"Market analysis of consumer attitudes and behaviors towards plant-based foods in the UK, based on survey conducted as part of the Smart Protein project."
},
)


View the evaluation results for experiment: 'Eval-4o-mini-714673e9' at:
https://smith.langchain.com/o/8a8f5023-8b7e-4c2c-b2bc-eb849b90cbe6/datasets/2148b077-b0a4-439d-acf9-03e55035e9ff/compare?selectedSessions=649c9f07-3cf6-41b3-818a-9cae9a43b8df




6it [00:32,  5.44s/it]


### Evaluation with GPT 3.5

In [43]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        }
    )
]

dataset_name = "Market Logic Dummy"

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="Eval-3.5",
    metadata={
    "title":"Report Plant based food in the UK",
    "source": "ProVeg International",
    "publication_date":"2024-09-01T08:20:00Z",
    "summary":"Market analysis of consumer attitudes and behaviors towards plant-based foods in the UK, based on survey conducted as part of the Smart Protein project."
},
)


View the evaluation results for experiment: 'Eval-3.5-cb93cddb' at:
https://smith.langchain.com/o/8a8f5023-8b7e-4c2c-b2bc-eb849b90cbe6/datasets/2148b077-b0a4-439d-acf9-03e55035e9ff/compare?selectedSessions=bfdd343a-d572-4df7-bf8f-335cdb239489




6it [00:27,  4.55s/it]


### Evaluation with Gemini 1.0

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        }
    )
]

dataset_name = "Market Logic Dummy"

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="Eval-gemini-1.5-pro",
    metadata={
    "title":"Report Plant based food in the UK",
    "source": "ProVeg International",
    "publication_date":"2024-09-01T08:20:00Z",
    "summary":"Market analysis of consumer attitudes and behaviors towards plant-based foods in the UK, based on survey conducted as part of the Smart Protein project."
},
)


In [50]:
#%pip install langchain-google-vertexai

### Evaluation with context Answer - GPT 4o

In [44]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        }
    )
]

dataset_name = "Market Logic Dummy"

experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="Eval-3.5b",
    metadata={
    "title":"Report Plant based food in the UK",
    "source": "ProVeg International",
    "publication_date":"2024-09-01T08:20:00Z",
    "summary":"Market analysis of consumer attitudes and behaviors towards plant-based foods in the UK, based on survey conducted as part of the Smart Protein project."
},
)


View the evaluation results for experiment: 'Eval-3.5b-ad418013' at:
https://smith.langchain.com/o/8a8f5023-8b7e-4c2c-b2bc-eb849b90cbe6/datasets/2148b077-b0a4-439d-acf9-03e55035e9ff/compare?selectedSessions=dcdb7d4b-5995-47fd-b18e-3a61f90d9cc7




6it [00:25,  4.20s/it]


### Evaluation with context Answer - GPT 3.5

In [49]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        }
    )
]

dataset_name = "Market Logic Dummy"

experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="Eval-4ob",
    metadata={
    "title":"Report Plant based food in the UK",
    "source": "ProVeg International",
    "publication_date":"2024-09-01T08:20:00Z",
    "summary":"Market analysis of consumer attitudes and behaviors towards plant-based foods in the UK, based on survey conducted as part of the Smart Protein project."
},
)


View the evaluation results for experiment: 'Eval-4ob-99c33a6d' at:
https://smith.langchain.com/o/8a8f5023-8b7e-4c2c-b2bc-eb849b90cbe6/datasets/2148b077-b0a4-439d-acf9-03e55035e9ff/compare?selectedSessions=957744f6-ada2-4045-9662-ae7ae3beee42




6it [00:36,  6.03s/it]


In [None]:

# Sample queries to validate evaluation pipeline
test_queries = [
    "What are the key business insights from the document?",
    "Summarize the main points in a structured format.",
    "Are there any risks mentioned in the document?"
]

# Running evaluations
for query in test_queries:
    result = rag_dummy.get_answer(query)
    print(f"Query: {query}\n")
    print(f"Answer: {result['answer']}\n")
    print(f"Contexts: {result['contexts']}\n")
    print("-" * 80)


In [None]:

# Simple evaluation metrics for responses
def evaluate_response(response, contexts):
    relevance_score = sum(1 for ctx in contexts if ctx in response) / len(contexts) if contexts else 0
    length_score = len(response.split())  # Word count as a rough metric
    return {"relevance_score": relevance_score, "length_score": length_score}

# Run evaluation on test queries
for query in test_queries:
    result = rag_dummy.get_answer(query)
    scores = evaluate_response(result['answer'], result['contexts'])
    print(f"Query: {query}\n")
    print(f"Relevance Score: {scores['relevance_score']}")
    print(f"Response Length: {scores['length_score']} words\n")
    print("-" * 80)
