In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
import pandas as pd

# Golden questions about the report (the i-th question pairs with the i-th answer below).
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

# Reference answers, in the same order as `inputs`.
outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# One record per question/answer pair; the dict keys become the CSV columns.
qa_pairs = [
    dict(question=golden_question, answer=golden_answer)
    for golden_question, golden_answer in zip(inputs, outputs)
]
df = pd.DataFrame(qa_pairs)

# Persist the golden set to disk without the positional index column.
csv_path = "D:/AI_Projects/RAG/data/goldens.csv"
df.to_csv(csv_path, index=False)


In [None]:
from langsmith import Client

client = Client()
dataset_name = "Multi_Docs_Chats"

# Reuse the dataset when it already exists so this cell survives
# "Restart Kernel -> Run All"; creating a dataset under a name that is
# already taken is rejected by LangSmith.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Input and expected output pairs for AgenticAIReport",
    )

# Questions already stored in the dataset; used to skip duplicates on re-run.
existing_questions = {
    (example.inputs or {}).get("question")
    for example in client.list_examples(dataset_id=dataset.id)
}

# Add the missing examples one by one.
for q, a in zip(inputs, outputs):
    if q in existing_questions:
        continue
    client.create_example(
        inputs={"question": q},      # must be ONE dict
        outputs={"answer": a},       # must be ONE dict
        dataset_id=dataset.id,
    )


In [None]:
import sys
sys.path.append("D:/AI_Projects/RAG")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os
from multi_doc_chat.utils.model_loader import ModelLoader

# simple file adapter for local file paths
class LocalFileAdapter:
    """Wrap a local file path so it can be handed to ChatIngestor.

    Exposes the two members this notebook relies on: a ``name`` attribute
    (the file's base name) and a ``getbuffer()`` method returning the raw bytes.
    """

    def __init__(self, file_path: str):
        resolved = Path(file_path)
        self.path = resolved
        self.name = resolved.name

    def getbuffer(self) -> bytes:
        """Read the whole file from disk and return its contents as bytes."""
        with self.path.open("rb") as handle:
            return handle.read()
    
def answer_ai_report_question(
    inputs: dict,
    data_path: str = "D:/AI_Projects/RAG/data/2025 AI engineering Report.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5,
) -> dict:
    """
    Answer a question about the AI Engineering Report using RAG.

    Every call ingests the report, builds a fresh session-scoped FAISS index,
    loads a retriever from it, and asks the conversational RAG chain the
    question with an empty chat history.

    Args:
        inputs: Dictionary containing the question, e.g. {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary of the form {"answer": ...}. Failures are never raised:
        a missing question, a missing data file, or any exception is reported
        as a message under the "answer" key.
    """
    try:
        query = inputs.get("question", "")
        if not query:
            return {"answer": "No question provided"}

        # Bail out before any ingestion work if the report is not on disk.
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}

        report_file = LocalFileAdapter(data_path)

        # Ingest the report and build the retriever inside a per-session directory.
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base="faiss_index",
            use_session_dirs=True,
        )
        ingestor.built_retriver(
            uploaded_files=[report_file],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k,
        )

        # The index path is derived from the session id, under the same
        # "faiss_index" base passed to the ingestor above.
        session = ingestor.session_id
        rag = ConversationalRAG(session_id=session, model_loader=ModelLoader())
        rag.load_retriever_from_faiss(
            index_path=f"faiss_index/{session}",
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index"),
        )

        return {"answer": rag.invoke(query, chat_history=[])}

    except Exception as exc:
        # Best-effort: surface the failure as the answer text instead of raising.
        return {"answer": f"Error: {exc}"}
            
            
            
            






In [None]:
# Test the function with a sample question
#from notebook.evaluation.answer_ai_report_question
 
#import import_ipynb
#from notebook.evaluation import answer_ai_report_question
# Smoke test: run the RAG function on one golden question and show the answer.
test_input = {
    "question": "For customer-facing applications, which company's models dominate the top rankings?"
}
result = answer_ai_report_question(test_input)
print(f"Question: {test_input['question']}")
print(f"\nAnswer: {result['answer']}")

In [None]:
#from langsmith.evaluation import evaluate ,LangChainStringEvaluator 

In [None]:
# Run the RAG function over every golden question and print each answer,
# separated by a horizontal rule.
print("Testing all the questions from the  dataset")
for idx, golden_question in enumerate(inputs, start=1):
    test_input = {"question": golden_question}
    result = answer_ai_report_question(test_input)
    print(
        f"Q{idx}:{golden_question}",
        f"A{idx}:{result['answer']}\n",
        "-" * 80 + "\n",
        sep="\n",
    )

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Built-in LLM judge that grades each answer against the reference answer on
# the "correctness" criterion.
# NOTE(review): the judge model is whatever LangChain's evaluator loads by
# default; pass an explicit model via config={"criteria": ..., "llm": ...}
# if a different provider should be used - confirm before running.
qa_evaluator = LangChainStringEvaluator(
    "labeled_criteria",
    config={"criteria": "correctness"},
)

# Must match the dataset created earlier in this notebook.
dataset_name = "Multi_Docs_Chats"

# Run evaluation using our RAG function
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=[qa_evaluator],  # evaluate() expects a sequence of evaluators
    experiment_prefix="test-agenticAIReport-qa-rag",
    # Experiment metadata
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

# Custom Correctness Evaluator

##### Creating an LLM-as-a-judge evaluator to assess semantic and factual alignment

In [None]:
from langsmith.schemas import Run, Example
#from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import os
from langchain_groq import ChatGroq
import os

load_dotenv() 

def _parse_judge_response(response_text):
    """Extract (reasoning, verdict, score) from the judge's formatted reply.

    Looks for lines starting with "Reasoning:" and "Verdict:". The score is 1
    only when the verdict says CORRECT and does not say INCORRECT; a missing
    or unrecognised verdict scores 0.
    """
    reasoning = ""
    verdict = ""
    for raw_line in response_text.splitlines():
        line = raw_line.strip()
        if line.startswith("Reasoning:"):
            reasoning = line[len("Reasoning:"):].strip()
        elif line.startswith("Verdict:"):
            verdict = line[len("Verdict:"):].strip()

    # "INCORRECT" contains "CORRECT", so the negative verdict must be ruled
    # out explicitly before a plain substring match can be trusted.
    normalized = verdict.upper()
    score = 1 if "CORRECT" in normalized and "INCORRECT" not in normalized else 0
    return reasoning, verdict, score


def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.

    Correctness means how well the actual model output matches the reference output
    in terms of factual accuracy, coverage, and meaning.

    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs

    Returns:
        dict with 'key' ("correctness"), 'score' (1 for correct, 0 for incorrect)
        and 'reasoning'; on success also 'comment' holding the raw verdict.
        If invoking or parsing the judge fails, the score is 0 and 'reasoning'
        carries the error message.
    """
    # Outputs/inputs can be None (e.g. when the target run itself failed),
    # so fall back to an empty dict before reading keys.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")

    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system","""You are an evaluator whose job is to judge correctness.
    Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.
    - If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
    - If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.
    Do not penalize for stylistic or formatting differences unless they change meaning."""),
           ("human", """<example>
    <input>
    {input}
    </input>

    <output>
    Expected Output: {expected_output}

    Actual Output: {actual_output}
    </output>
    </example>

    Please grade the following agent run given the input, expected output, and actual output.
    Focus only on correctness (semantic and factual alignment).

    Respond with:
    1. A brief reasoning (1-2 sentences)
    2. A final verdict: either "CORRECT" or "INCORRECT"

    Format your response as:
    Reasoning: [your reasoning]
    Verdict: [CORRECT or INCORRECT]""")])

    # Judge model served through Groq
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.1
    )

    # Create chain and invoke
    chain = eval_prompt | llm
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })

        reasoning, verdict, score = _parse_judge_response(response.content)

        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }

    except Exception as e:
        # Report the failure as a zero score rather than aborting the experiment.
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }
        
    
        
      

### Run evaluation with the custom correctness evaluator

In [None]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Grade every example in the dataset with the custom LLM-as-a-judge evaluator.
dataset_name = "Multi_Docs_Chats"
evaluators = [correctness_evaluator]

# Metadata recorded with the experiment.
# NOTE(review): "model" says gemini-2.5-pro while the judge defined in this
# notebook uses a Groq-hosted Llama model - confirm which model this refers to.
experiment_metadata = dict(
    variant="RAG with FAISS and AI Engineering Report",
    evaluator="custom_correctness_llm_judge",
    model="gemini-2.5-pro",
    chunk_size=1000,
    chunk_overlap=200,
    k=5,
)

experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata=experiment_metadata,
)
print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

### Combine multiple evaluators


##### You can use multiple evaluators together to get different perspectives on your RAG system's performance.

In [None]:
# Example: Combine custom correctness evaluator with LangChain's built-in evaluators
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Two judges applied side by side: the custom LLM-as-a-Judge defined above
# and LangChain's chain-of-thought QA evaluator.
cot_qa_evaluator = LangChainStringEvaluator("cot_qa")
combined_evaluators = [correctness_evaluator, cot_qa_evaluator]

# Run evaluation with multiple evaluators
# Uncomment to run:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )