In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import pandas as pd

#QA
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# dataset
qa_pairs = [{"question":q,"answer":a} for q,a in zip(inputs,outputs)]
df = pd.DataFrame(qa_pairs)

# write 
csv_path = "D:/AI_Projects/RAG/data/goldens.csv"
df.to_csv(csv_path,index=False)


In [3]:
from langsmith import Client

client = Client()
dataset_name = "Multi_Docs_Chats"

#store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Input and expected output pairs for AgenticAIReport",
    
)
# Add examples one by one
for q, a in zip(inputs, outputs):
    client.create_example(
        inputs={"question": q},      # must be ONE dict
        outputs={"answer": a},       # must be ONE dict
        dataset_id=dataset.id
        
    )


In [4]:
import sys
sys.path.append("D:/AI_Projects/RAG")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os
from multi_doc_chat.utils.model_loader import ModelLoader

# simple file adapter for local file paths
class LocalFileAdapter:
    """Adapter for local file paths to work with ChatIngestor."""
    def __init__(self,file_path:str):
        self.path = Path(file_path)
        self.name = self.path.name
    
    def getbuffer(self) -> bytes:
        return self.path.read_bytes()
    
def answer_ai_report_question(
    inputs: dict,
    data_path :str = "D:/AI_Projects/RAG/data/2025 AI engineering Report.txt",
    chunk_size: int = 1000,
chunk_overlap: int = 200,
k: int = 5
) -> dict:
    """
    Answer questions about the AI Engineering Report using RAG.
    
    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve
    
    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}
    """
    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}
        
        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}
        
        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)
        
        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base="faiss_index",
            use_session_dirs=True
        )
        
        #build retriever
        ingestor.built_retriver(
        uploaded_files=[file_adapter],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        k=k
    )
        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"faiss_index/{session_id}"
        loader = ModelLoader()
        
        # create RAG instance and load retreiver
        
        rag = ConversationalRAG(session_id=session_id,model_loader=loader)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
    )
        # get answer
        answer = rag.invoke(question, chat_history=[])
        
        return {"answer":answer}
    
    except Exception as e:
            return {"answer": f"Error: {str(e)}"}
            
            
            
            






In [None]:
#!pip install import-ipynb


Collecting import-ipynb
  Obtaining dependency information for import-ipynb from https://files.pythonhosted.org/packages/ec/62/e0b830773060d2a390aa923dcc8afc680d798bdbdadb6394f760fac62517/import_ipynb-0.2-py3-none-any.whl.metadata
  Using cached import_ipynb-0.2-py3-none-any.whl.metadata (2.3 kB)
Collecting IPython (from import-ipynb)
  Obtaining dependency information for IPython from https://files.pythonhosted.org/packages/f1/df/8ee1c5dd1e3308b5d5b2f2dfea323bb2f3827da8d654abb6642051199049/ipython-9.8.0-py3-none-any.whl.metadata
  Using cached ipython-9.8.0-py3-none-any.whl.metadata (4.5 kB)
Collecting nbformat (from import-ipynb)
  Obtaining dependency information for nbformat from https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl.metadata
  Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting colorama>=0.4.4 (from IPython->import-ipynb)
  Obtaining dependency information f

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Python312\\share'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [5]:
# test the function wuth a sample question
#from notebook.evaluation.answer_ai_report_question
 
#import import_ipynb
#from notebook.evaluation import answer_ai_report_question
test_input = {"question": "For customer-facing applications, which company's models dominate the top rankings?"}
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

{"timestamp": "2025-12-08T12:23:56.060766Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T12:23:56.061767Z", "level": "info", "event": "API key loaded successfully"}
{"config_path": "D:\\AI_Projects\\RAG\\multi_doc_chat\\config\\config.yaml", "timestamp": "2025-12-08T12:23:56.064774Z", "level": "info", "event": "ModelLoader initialized"}
{"session_id": "session_20251208_175356_c2118f26", "temp_dir": "data\\session_20251208_175356_c2118f26", "faiss_dir": "faiss_index\\session_20251208_175356_c2118f26", "sessionized": true, "timestamp": "2025-12-08T12:23:56.068056Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "2025 AI engineering Report.txt", "saved_as": "data\\session_20251208_175356_c2118f26\\d85ccdf4.txt", "timestamp": "2025-12-08T12:23:56.086051Z", "level": "info", "event": "File saved for ingestion"}
{"count": 1, "timestamp": "2025-12-08T12:23:56.090050Z", "level": 

  from .autonotebook import tqdm as notebook_tqdm
Use pytorch device_name: cpu
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
{"error": "module 'faiss' has no attribute 'IndexFlatL2'", "timestamp": "2025-12-08T12:24:48.752624Z", "level": "error", "event": "Failed to build retriever"}


Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: Error: Error in [d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py] at line [888] | Message: Failed to build retriever
Traceback:
Traceback (most recent call last):
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 88, in built_retriver
    vs= fm.load_or_create(texts=texts,metadatas=metas)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 178, in load_or_create
    self.vs = FAISS.from_texts(texts=texts, embedding=self.emb, metadatas=metadatas or [])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py", line 931, in from_texts
    return cls.__from(
           ^^^^^^^^^^^
  File "d:\AI_P

In [None]:
#from langsmith.evaluation import evaluate ,LangChainStringEvaluator 

AttributeError: module 'faiss' has no attribute 'IndexFlatL2'

In [6]:
# Example: Test with all golden questions
print("Testing all the questions from the  dataset")
for i, q in enumerate(inputs,1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(f"Q{i}:{q}")
    print(f"A{i}:{result['answer']}\n")
    print("-" * 80 + "\n") 

{"timestamp": "2025-12-08T12:25:14.811574Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T12:25:14.813571Z", "level": "info", "event": "API key loaded successfully"}
{"config_path": "D:\\AI_Projects\\RAG\\multi_doc_chat\\config\\config.yaml", "timestamp": "2025-12-08T12:25:14.814577Z", "level": "info", "event": "ModelLoader initialized"}
{"session_id": "session_20251208_175514_10352ba9", "temp_dir": "data\\session_20251208_175514_10352ba9", "faiss_dir": "faiss_index\\session_20251208_175514_10352ba9", "sessionized": true, "timestamp": "2025-12-08T12:25:14.817571Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "2025 AI engineering Report.txt", "saved_as": "data\\session_20251208_175514_10352ba9\\79b807a7.txt", "timestamp": "2025-12-08T12:25:14.819571Z", "level": "info", "event": "File saved for ingestion"}
{"count": 1, "timestamp": "2025-12-08T12:25:14.823573Z", "level": 

Testing all the questions from the  dataset


{"error": "module 'faiss' has no attribute 'IndexFlatL2'", "timestamp": "2025-12-08T12:25:18.731106Z", "level": "error", "event": "Failed to build retriever"}
{"timestamp": "2025-12-08T12:25:18.736207Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T12:25:18.738102Z", "level": "info", "event": "API key loaded successfully"}
{"config_path": "D:\\AI_Projects\\RAG\\multi_doc_chat\\config\\config.yaml", "timestamp": "2025-12-08T12:25:18.739097Z", "level": "info", "event": "ModelLoader initialized"}
{"session_id": "session_20251208_175518_3152a3c4", "temp_dir": "data\\session_20251208_175518_3152a3c4", "faiss_dir": "faiss_index\\session_20251208_175518_3152a3c4", "sessionized": true, "timestamp": "2025-12-08T12:25:18.751130Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "2025 AI engineering Report.txt", "saved_as": "data\\session_20251208_175518_3152a3c4\\85668c3f.txt", "time

Q1:For customer-facing applications, which company's models dominate the top rankings?
A1:Error: Error in [d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py] at line [888] | Message: Failed to build retriever
Traceback:
Traceback (most recent call last):
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 88, in built_retriver
    vs= fm.load_or_create(texts=texts,metadatas=metas)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 178, in load_or_create
    self.vs = FAISS.from_texts(texts=texts, embedding=self.emb, metadatas=metadatas or [])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py", line 931, in from_texts
    return cls.__from(
           ^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.

{"error": "module 'faiss' has no attribute 'IndexFlatL2'", "timestamp": "2025-12-08T12:25:22.956650Z", "level": "error", "event": "Failed to build retriever"}
{"timestamp": "2025-12-08T12:25:22.962648Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T12:25:22.963648Z", "level": "info", "event": "API key loaded successfully"}
{"config_path": "D:\\AI_Projects\\RAG\\multi_doc_chat\\config\\config.yaml", "timestamp": "2025-12-08T12:25:22.964648Z", "level": "info", "event": "ModelLoader initialized"}
{"session_id": "session_20251208_175522_1de60844", "temp_dir": "data\\session_20251208_175522_1de60844", "faiss_dir": "faiss_index\\session_20251208_175522_1de60844", "sessionized": true, "timestamp": "2025-12-08T12:25:22.966651Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "2025 AI engineering Report.txt", "saved_as": "data\\session_20251208_175522_1de60844\\e4737d98.txt", "time

Q2:What percentage of respondents are using RAG in some form?
A2:Error: Error in [d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py] at line [888] | Message: Failed to build retriever
Traceback:
Traceback (most recent call last):
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 88, in built_retriver
    vs= fm.load_or_create(texts=texts,metadatas=metas)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 178, in load_or_create
    self.vs = FAISS.from_texts(texts=texts, embedding=self.emb, metadatas=metadatas or [])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py", line 931, in from_texts
    return cls.__from(
           ^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.venv\Lib\site-packages\la

{"error": "module 'faiss' has no attribute 'IndexFlatL2'", "timestamp": "2025-12-08T12:25:27.102762Z", "level": "error", "event": "Failed to build retriever"}


Q3:How often are most respondents updating their models?
A3:Error: Error in [d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py] at line [888] | Message: Failed to build retriever
Traceback:
Traceback (most recent call last):
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 88, in built_retriver
    vs= fm.load_or_create(texts=texts,metadatas=metas)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\AI_Projects/RAG\multi_doc_chat\src\document_ingestion\data_ingestion.py", line 178, in load_or_create
    self.vs = FAISS.from_texts(texts=texts, embedding=self.emb, metadatas=metadatas or [])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.venv\Lib\site-packages\langchain_community\vectorstores\faiss.py", line 931, in from_texts
    return cls.__from(
           ^^^^^^^^^^^
  File "d:\AI_Projects\RAG\.venv\Lib\site-packages\langcha

In [19]:
!pip install -U langsmith


Collecting langsmith
  Obtaining dependency information for langsmith from https://files.pythonhosted.org/packages/b8/6f/d5f9c4f1e03c91045d3675dc99df0682bc657952ad158c92c1f423de04f4/langsmith-0.4.56-py3-none-any.whl.metadata
  Downloading langsmith-0.4.56-py3-none-any.whl.metadata (15 kB)
Collecting httpx<1,>=0.23.0 (from langsmith)
  Obtaining dependency information for httpx<1,>=0.23.0 from https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl.metadata
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson>=3.9.14 (from langsmith)
  Obtaining dependency information for orjson>=3.9.14 from https://files.pythonhosted.org/packages/d4/fb/f05646c43d5450492cb387de5549f6de90a71001682c17882d9f66476af5/orjson-3.11.5-cp312-cp312-win_amd64.whl.metadata
  Downloading orjson-3.11.5-cp312-cp312-win_amd64.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.7 kB ? eta -:

ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'C:\\Python312\\Scripts\\httpx.exe' -> 'C:\\Python312\\Scripts\\httpx.exe.deleteme'


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [24]:
import inspect
from langsmith.evaluation import evaluate
print(inspect.signature(evaluate))

(target: 'Union[TARGET_T, Runnable, EXPERIMENT_T, tuple[EXPERIMENT_T, EXPERIMENT_T]]', /, data: 'Optional[DATA_T]' = None, evaluators: 'Optional[Union[Sequence[EVALUATOR_T], Sequence[COMPARATIVE_EVALUATOR_T]]]' = None, summary_evaluators: 'Optional[Sequence[SUMMARY_EVALUATOR_T]]' = None, metadata: 'Optional[dict]' = None, experiment_prefix: 'Optional[str]' = None, description: 'Optional[str]' = None, max_concurrency: 'Optional[int]' = 0, num_repetitions: 'int' = 1, client: 'Optional[langsmith.Client]' = None, blocking: 'bool' = True, experiment: 'Optional[EXPERIMENT_T]' = None, upload_results: 'bool' = True, error_handling: "Literal['log', 'ignore']" = 'log', **kwargs: 'Any') -> 'Union[ExperimentResults, ComparativeExperimentResults]'


In [10]:
uv pip install --upgrade langsmith langchain langchain-community


Note: you may need to restart the kernel to use updated packages.


d:\AI_Projects\RAG\.venv\Scripts\python.exe: No module named uv


In [None]:
from langsmith.evaluation import evaluate
from langsmith.evaluators import CriteriaEvaluator


qa_evaluator = CriteriaEvaluator("correctness")
dataset_name = "AgenticAIReportGoldens"

# Run evaluation using our RAG function
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-agenticAIReport-qa-rag",
    # Experiment metadata
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

ModuleNotFoundError: No module named 'langsmith.evaluation.criteria'

In [17]:
#from langsmith.evaluation import evaluate
#import inspect
#print(inspect.getsource(evaluate))
import sys
print(sys.executable)

import langsmith
import pkgutil

modules = [m.name for m in pkgutil.iter_modules(langsmith.__path__)]
print(modules)



d:\AI_Projects\RAG\.venv\Scripts\python.exe
['_expect', '_internal', 'anonymizer', 'async_client', 'beta', 'client', 'env', 'evaluation', 'middleware', 'pytest_plugin', 'run_helpers', 'run_trees', 'schemas', 'testing', 'utils', 'uuid', 'wrappers']


In [13]:
import langsmith, inspect, os
print(os.path.dirname(inspect.getfile(langsmith)))
print(langsmith.__version__)

d:\AI_Projects\RAG\.venv\Lib\site-packages\langsmith
0.4.56


In [5]:
#uv pip install --force-reinstall --no-cache-dir "langsmith[all]"




In [None]:
from langsmith.evaluation import evaluate
#from langsmith.beta.evaluation import LLMCriteriaEvaluator

In [1]:
import langchain, langsmith, pydantic, pydantic_core

print(langchain.__version__)        # 0.1.20
print(langsmith.__version__)        # 0.1.58
print(pydantic.__version__)         # 2.12.0
print(pydantic_core.__version__)    # 2.41.x


0.1.20
0.1.58
2.12.0
2.41.1


# Custom Correctness Evaluator

##### creating llm as-a-judge evaluator to assess semantic and factual aligment 

In [9]:
from langsmith.schemas import Run, Example
#from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import os
from langchain_groq import ChatGroq
import os

load_dotenv() 

def correctness_evaluator(run: Run, example: Example) -> dict:
    """

    Custom LLM-as-a-Judge evaluator for correctness.
    
    Correctness means how well the actual model output matches the reference output 
    in terms of factual accuracy, coverage, and meaning.
    
    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs
    
    Returns:
        dict with 'score' (1 for correct, 0 for incorrect) and 'reasoning'
    """
    # Extract actual and expected outputs
    actual_output = run.outputs.get("answer", "")
    expected_output = example.outputs.get("answer", "")
    input_question = example.inputs.get("question", "")
    
    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system","""You are an evaluator whose job is to judge correctness.
    Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.
    - If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
    - If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.
    Do not penalize for stylistic or formatting differences unless they change meaning."""),
           ("human", """<example>
    <input>
    {input}
    </input>

    <output>
    Expected Output: {expected_output}

    Actual Output: {actual_output}
    </output>
    </example>

    Please grade the following agent run given the input, expected output, and actual output.
    Focus only on correctness (semantic and factual alignment).

    Respond with:
    1. A brief reasoning (1-2 sentences)
    2. A final verdict: either "CORRECT" or "INCORRECT"

    Format your response as:
    Reasoning: [your reasoning]
    Verdict: [CORRECT or INCORRECT]""")])
    
    ## initialize llm using groq llm
    

    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.1
    )

    # Create chain and invoke
    chain = eval_prompt | llm 
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })
        
        response_text = response.content
        
        # Parse the response
        reasoning = ""
        verdict = ""
        
        for line in response_text("\n"):
             if line.startswith("Reasoning:"):
                    reasoning = line.replace("Reasoning:", "").strip()
             elif line.startswith("Verdict:"):
                verdict = line.replace("Verdict:", "").strip()
                
        # Convert verdict to score (1 for correct, 0 for incorrect)
        score = 1 if "CORRECT" in verdict.upper() else 0
        
        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }
        
    except Exception as e:
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }
        
    
        
      

### Run evaluation with custome correctness Evaluator

In [12]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Define evaluators - using custom correctness evaluator
evaluators = [correctness_evaluator]

dataset_name = "Multi_Docs_Chats"

 # Run evaluation
experiment_results = evaluate(
    answer_ai_report_question,
    data = dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
     metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "evaluator": "custom_correctness_llm_judge",
        "model": "gemini-2.5-pro",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)
print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

View the evaluation results for experiment: 'agenticAIReport-correctness-eval-bcf92f39' at:
https://smith.langchain.com/o/da78cfb0-6927-449d-96ba-935b2ed752c9/datasets/681ed8e0-1369-45f0-a017-e645d9dbe879/compare?selectedSessions=fd9285dd-42c0-4fce-b5d6-d6174d888d92




0it [00:00, ?it/s]{"timestamp": "2025-12-08T14:47:11.263759Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"timestamp": "2025-12-08T14:47:11.269297Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T14:47:11.270288Z", "level": "info", "event": "API key loaded successfully"}
{"timestamp": "2025-12-08T14:47:11.272880Z", "level": "info", "event": "Loaded GROQ_API_KEY from environment variable"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T14:47:11.274042Z", "level": "info", "event": "API key loaded successfully"}
{"config_path": "D:\\AI_Projects\\RAG\\multi_doc_chat\\config\\config.yaml", "timestamp": "2025-12-08T14:47:11.278040Z", "level": "info", "event": "ModelLoader initialized"}
{"keys": {"GROQ_API_KEY": "gsk_1g..."}, "timestamp": "2025-12-08T14:47:11.278040Z", "level": "info", "event": "API key loaded successfully"}
{"config_path": "D:\\AI_P


Evaluation completed! Check the LangSmith UI for detailed results.





### combine multiple evaluators


##### you can use multiple  evaluators together to get different perspectives on your RAG system's performance.

In [None]:
# Example: Combine custom correctness evaluator with LangChain's built-in evaluators
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Combine custom and built-in evaluators
combined_evaluators = [
    correctness_evaluator,  # Custom LLM-as-a-Judge
    LangChainStringEvaluator("cot_qa"),  # Chain-of-thought QA evaluator
]

# Run evaluation with multiple evaluators
# Uncomment to run:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )