In [1]:
print("Please see the changes made in the recent edits.")

Please see the changes made in the recent edits.


In [2]:
# !pip install langchain-experimental


In [3]:
# !pip install --upgrade langchain langchain-core langchain-community langchain-experimental langsmith


In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
import pandas as pd

# QA
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
csv_path = "D:\\PW\\LLMOPS\\data\\goldens.csv"
df.to_csv(csv_path, index=False)

In [6]:
from langsmith import Client

client = Client()
dataset_name = "report4_goldens"

# Store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Input and expected output pairs for llmops_agentic_ai_report_goldens",
)
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

{'example_ids': ['42468744-9079-4e6e-b28e-4f0c51b039e1',
  'c22811d4-fbcf-4bc1-a36d-3b82569f9986',
  'a707b616-4f7e-4924-80e2-1fb69b6792f1'],
 'count': 3}

In [7]:
import sys
sys.path.append("D:/PW/LLMOPS")  # Adjust the path as necessary

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Adapter for local file paths to work with ChatIngestor."""
    def __init__(self, file_path: str):
        self.path = Path(file_path)
        self.name = self.path.name
    
    def getbuffer(self) -> bytes:
        return self.path.read_bytes()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "D:/PW/LLMOPS/data/AI Engineering Report 2025.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer questions about the AI Engineering Report using RAG.
    
    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve
    
    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}
    """
    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}
        
        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}
        
        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)
        
        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base="faiss_index",
            use_session_dirs=True
        )
        
        # Build retriever
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )
        
        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"faiss_index/{session_id}"
        
        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )
        
        # Get answer
        answer = rag.invoke(question, chat_history=[])
        return {"answer": answer}
    
    except Exception as e:
        return {"answer": f"Error: {str(e)}"}

In [8]:
# Test the function with a sample question
test_input = {"question": "For customer-facing applications, which company's models dominate the top rankings?"}
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

{"timestamp": "2025-12-01T12:06:35.647804Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-12-01T12:06:35.650889Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-12-01T12:06:35.656098Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_5w...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-12-01T12:06:35.658640Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-12-01T12:06:35.670400Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251201_173635_5b54c275", "temp_dir": "data\\session_20251201_173635_5b54c275", "faiss_dir": "faiss_index\\session_20251201_173635_5b54c275", "sessionized": true, "timestamp": "2025-12-01T12:06:35.677411Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI Engineering Report 2025.txt", "saved_as":

Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: For customer-facing applications, **OpenAI** models dominate the top rankings. They account for 3 of the top 5 and half of the top 10 most popular models in this category.


In [9]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Example: Test with all golden questions
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, 1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(f"Q{i}: {q}")
    print(f"A{i}: {result['answer']}\n")
    print("-" * 80 + "\n")

{"timestamp": "2025-12-01T12:06:41.255416Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-12-01T12:06:41.258305Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-12-01T12:06:41.261227Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_5w...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-12-01T12:06:41.261993Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-12-01T12:06:41.273320Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251201_173641_d97e3203", "temp_dir": "data\\session_20251201_173641_d97e3203", "faiss_dir": "faiss_index\\session_20251201_173641_d97e3203", "sessionized": true, "timestamp": "2025-12-01T12:06:41.278327Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI Engineering Report 2025.txt", "saved_as":

Testing all questions from the dataset:



{"added": 1, "index": "faiss_index\\session_20251201_173641_d97e3203", "timestamp": "2025-12-01T12:06:42.196886Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-12-01T12:06:42.196886Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-12-01T12:06:42.203598Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-12-01T12:06:42.204587Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-12-01T12:06:42.204587Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_5w...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-12-01T12:06:42.204587Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-12-01T12:06:42.210549Z", "level": "info", "event": "YAML config loaded"}
{"provider": "google", "model": "gemini-2.5-fl

Q1: For customer-facing applications, which company's models dominate the top rankings?
A1: For customer-facing applications, **OpenAI** models dominate the top rankings. They account for 3 of the top 5 and half of the top 10 most popular models in this category.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index\\session_20251201_173646_378f1382", "timestamp": "2025-12-01T12:06:47.315069Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-12-01T12:06:47.315069Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-12-01T12:06:47.322308Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-12-01T12:06:47.322308Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-12-01T12:06:47.322308Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_5w...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-12-01T12:06:47.322308Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-12-01T12:06:47.327398Z", "level": "info", "event": "YAML config loaded"}
{"provider": "google", "model": "gemini-2.5-fl

Q2: What percentage of respondents are using RAG in some form?
A2: 70% of respondents are using RAG in some form.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index\\session_20251201_173651_247ac956", "timestamp": "2025-12-01T12:06:52.816000Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-12-01T12:06:52.818023Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-12-01T12:06:52.821470Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-12-01T12:06:52.821470Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-12-01T12:06:52.821470Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_5w...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-12-01T12:06:52.821470Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-12-01T12:06:52.827213Z", "level": "info", "event": "YAML config loaded"}
{"provider": "google", "model": "gemini-2.5-fl

Q3: How often are most respondents updating their models?
A3: More than 50% of respondents update their models at least monthly, with 17% doing so weekly.

--------------------------------------------------------------------------------



In [10]:
import langchain_experimental
print(langchain_experimental.__version__)

0.4.0


In [11]:
dataset_name

'report4_goldens'

In [12]:
# from langsmith.evaluation import evaluate, LangChainStringEvaluator 

# # This line should now work correctly
# qa_evaluator = [LangChainStringEvaluator("cot_qa")]


# # Run full evaluation experiment on the dataset
# experiment_results = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,               
#     evaluators=qa_evaluator,
#     experiment_prefix="agentic-ai-report-rag-eval",
#     metadata={
#         "variant": "RAG + AI Engineering Report QA",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )

# print("\nðŸ“Š Evaluation Complete!")
# print(f"Run URL: {experiment_results.get('url', 'Check LangSmith dashboard.')}")



In [13]:
# pip install --upgrade setuptools

In [None]:
# import pkg_resources
# print('pkg_resources is available')

ModuleNotFoundError: No module named 'pkg_resources'

In [None]:
# # !pip install --upgrade langchain langchain-core langchain-community langchain-experimental langsmith
# import pkg_resources
# print("langchain version:", pkg_resources.get_distribution("langchain").version)
# print("langsmith version:", pkg_resources.get_distribution("langsmith").version)
# print("langchain-core version:", pkg_resources.get_distribution("langchain-core").version)


In [None]:
# from langsmith.evaluation import evaluate, LangChainStringEvaluator

# # Initialize the evaluator class instance with the name "cot_qa"
# # This creates a callable object that LangSmith can use internally.
# qa_evaluator_instance = LangChainStringEvaluator("cot_qa")

# # Pass the instantiated object into the 'evaluators' list
# experiment_results = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,               
#     evaluators=[qa_evaluator_instance], # Pass the instantiated object here
#     experiment_prefix="agentic-ai-report-rag-eval",
#     metadata={
#         "variant": "RAG + AI Engineering Report QA",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )

# print("\nðŸ“Š Evaluation Complete!")
# print(f"Run URL: {experiment_results.get('url', 'Check LangSmith dashboard.')}")


In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Evaluators
qa_evaluator = [LangChainStringEvaluator("cot_qa")]
dataset_name = "AgenticAIReportGoldens"

# Run evaluation using our RAG function
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-agenticAIReport-qa-rag",
    # Experiment metadata
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

# Custom Correctness Evaluator

Creating an LLM-as-a-Judge evaluator to assess semantic and factual alignment

In [17]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.
    
    Correctness means how well the actual model output matches the reference output 
    in terms of factual accuracy, coverage, and meaning.
    
    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs
    
    Returns:
        dict with 'score' (1 for correct, 0 for incorrect) and 'reasoning'
    """
    # Extract actual and expected outputs
    actual_output = run.outputs.get("answer", "")
    expected_output = example.outputs.get("answer", "")
    input_question = example.inputs.get("question", "")
    
    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])
    
    # Initialize LLM (using Gemini as shown in your config)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )
    
    # Create chain and invoke
    chain = eval_prompt | llm
    
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })
        
        response_text = response.content
        
        # Parse the response
        reasoning = ""
        verdict = ""
        
        for line in response_text.split('\n'):
            if line.startswith("Reasoning:"):
                reasoning = line.replace("Reasoning:", "").strip()
            elif line.startswith("Verdict:"):
                verdict = line.replace("Verdict:", "").strip()
        
        # Convert verdict to score (1 for correct, 0 for incorrect)
        score = 1 if "CORRECT" in verdict.upper() else 0
        
        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }
        
    except Exception as e:
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }

# Run Evaluation with Custom Correctness Evaluator

In [18]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Define evaluators - using custom correctness evaluator
evaluators = [correctness_evaluator]

dataset_name = "AgenticAIReportGoldens"

# Run evaluation
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "evaluator": "custom_correctness_llm_judge",
        "model": "gemini-2.5-pro",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'agenticAIReport-correctness-eval-54b52917' at:
https://smith.langchain.com/o/a2a4cbb9-84d3-4488-8aef-3127dcbe0611/datasets/8c90360b-bdb2-4676-b445-5e3eceb4e322/compare?selectedSessions=99df121c-3ed9-4d2a-a2b0-ead0e65317e2




0it [00:00, ?it/s]{"timestamp": "2025-12-01T12:17:33.661410Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-12-01T12:17:33.664114Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-12-01T12:17:33.664114Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_5w...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-12-01T12:17:33.666137Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-12-01T12:17:33.674232Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251201_174733_021b8342", "temp_dir": "data\\session_20251201_174733_021b8342", "faiss_dir": "faiss_index\\session_20251201_174733_021b8342", "sessionized": true, "timestamp": "2025-12-01T12:17:33.679276Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI Engineering Report 2025


Evaluation completed! Check the LangSmith UI for detailed results.





# Optional: Combine Multiple Evaluators


You can use multiple evaluators together to get different perspectives on your RAG system's performance.

In [None]:
# Example: Combine custom correctness evaluator with LangChain's built-in evaluators
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Combine custom and built-in evaluators
combined_evaluators = [
    correctness_evaluator,  # Custom LLM-as-a-Judge
    LangChainStringEvaluator("cot_qa"),  # Chain-of-thought QA evaluator
]

# Run evaluation with multiple evaluators
# Uncomment to run:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )