In [1]:
# Prints a fixed greeting; usable as a quick check that the kernel executes cells.
print("Hello ")

Hello 


In [2]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so later cells
# can read their settings from it (presumably the API keys — TODO confirm).
load_dotenv()

True

In [1]:
import pandas as pd

# Golden questions about the report. `inputs` and `outputs` are index-aligned:
# outputs[i] is the reference answer for inputs[i]. Later cells reuse both names.
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# zip() stops at the shorter list, so a missing answer would silently drop a
# question from the golden set. Fail loudly instead.
if len(inputs) != len(outputs):
    raise ValueError(
        f"Golden set mismatch: {len(inputs)} questions but {len(outputs)} answers"
    )

# Dataset: one {"question", "answer"} record per golden pair
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv (index=False keeps the pandas row index out of the file)
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
csv_path = "/Users/Hp/OneDrive/Desktop/LLM_OPS/data/goldens.csv"
df.to_csv(csv_path, index=False)

In [2]:
from langsmith import Client

client = Client()
dataset_name = "AgenticAIReportGoldens"

# Store the golden pairs in LangSmith.
# Creating a dataset whose name already exists is rejected by LangSmith, and
# re-uploading would duplicate the examples, so only create and upload on the
# first run and reuse the existing dataset afterwards. This keeps the cell
# safe under "Restart & Run All".
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    upload_result = None  # nothing uploaded on this run
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Input and expected output pairs for AgenticAIReport",
    )
    upload_result = client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

# Last expression: shows the upload summary when examples were created.
upload_result

{'example_ids': ['4c0b78dd-b68e-44e3-aa17-2196099ca676',
  '70a138c3-1476-460f-a919-9d865431bd68',
  '386753ef-0d4b-432f-8ab2-cefc7d0bf5d0'],
 'count': 3}

In [3]:
import sys
# Put the project root on the import path so the `multi_doc_chat` package
# imported below can be resolved.
# NOTE(review): absolute, machine-specific path — this cell will not work
# unchanged on another machine.
sys.path.append("/Users/Hp/OneDrive/Desktop/LLM_OPS")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Wrap a file on local disk so it can be handed to ChatIngestor.

    Exposes a ``name`` attribute and a ``getbuffer()`` method, i.e. the shape
    of an uploaded-file object (presumably what ChatIngestor expects — TODO
    confirm against its implementation).
    """

    def __init__(self, file_path: str):
        # Keep both the full path and the bare file name available as attributes.
        location = Path(file_path)
        self.path, self.name = location, location.name

    def getbuffer(self) -> bytes:
        """Return the complete file contents as bytes."""
        with self.path.open("rb") as handle:
            return handle.read()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "/Users/Hp/OneDrive/Desktop/LLM_OPS/data/The 2025 AI Engineering Report.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer a question about the AI Engineering Report using RAG.

    Every call constructs a new ChatIngestor, rebuilds the retriever from
    ``data_path`` and then loads that session's FAISS index into a fresh
    ConversationalRAG before asking the question with an empty chat history.

    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}.
        This function does not raise: a missing/blank question, a missing data
        file, and any exception from the pipeline are all reported as text in
        the "answer" value.
    """
    # Single source of truth for the index directory, shared by the ingestor
    # and the index path below so the two cannot drift apart.
    faiss_base = "faiss_index"

    try:
        # Extract question from inputs. Reject non-string and whitespace-only
        # values up front so they never trigger ingestion or an LLM call.
        question = inputs.get("question", "")
        if not isinstance(question, str) or not question.strip():
            return {"answer": "No question provided"}
        question = question.strip()

        # The source must be a regular file; a directory would pass exists()
        # and only fail later when its bytes are read.
        if not Path(data_path).is_file():
            return {"answer": f"Data file not found: {data_path}"}

        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)

        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base=faiss_base,
            use_session_dirs=True
        )

        # Build retriever ("built_retriver" is the project's method name as spelled)
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )

        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"{faiss_base}/{session_id}"

        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )

        # Get answer (no prior conversation turns)
        answer = rag.invoke(question, chat_history=[])

        return {"answer": answer}

    except Exception as e:
        # NOTE(review): failures come back as an ordinary answer string, so an
        # evaluation run will grade the error text like any other answer.
        return {"answer": f"Error: {str(e)}"}

In [4]:
# Smoke-test the RAG helper on one golden question and print the exchange
sample_question = "For customer-facing applications, which company's models dominate the top rankings?"
test_input = {"question": sample_question}
result = answer_ai_report_question(test_input)
print("Question:", sample_question)
print("\nAnswer:", result["answer"])

{"timestamp": "2025-10-03T14:40:20.893780Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-03T14:40:20.898982Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-03T14:40:20.903462Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_eo...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-03T14:40:20.906511Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-03T14:40:20.922512Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251003_201020_12e63841", "temp_dir": "data\\session_20251003_201020_12e63841", "faiss_dir": "faiss_index\\session_20251003_201020_12e63841", "sessionized": true, "timestamp": "2025-10-03T14:40:20.929194Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Report.txt", "saved_

Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: For customer-facing applications, OpenAI models are dominating the field. Three out of the top five most popular models for these applications are from OpenAI, as well as half of the top ten.


In [5]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

In [6]:
# Run every golden question through the RAG helper and print each Q/A pair,
# followed by an 80-character rule.
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, start=1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(
        f"Q{i}: {q}",
        f"A{i}: {result['answer']}\n",
        "-" * 80 + "\n",
        sep="\n",
    )

Testing all questions from the dataset:



{"timestamp": "2025-10-03T14:41:04.609813Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-03T14:41:04.614348Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-03T14:41:04.617924Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_eo...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-03T14:41:04.620642Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-03T14:41:04.629842Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251003_201104_76c8bb44", "temp_dir": "data\\session_20251003_201104_76c8bb44", "faiss_dir": "faiss_index\\session_20251003_201104_76c8bb44", "sessionized": true, "timestamp": "2025-10-03T14:41:04.639088Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Report.txt", "saved_

Q1: For customer-facing applications, which company's models dominate the top rankings?
A1: Error: Error in [c:\Users\Hp\OneDrive\Desktop\LLM_OPS\.venv\Lib\site-packages\google\api_core\grpc_helpers.py] at line [78] | Message: Invocation error in ConversationalRAG
Traceback:
Traceback (most recent call last):
  File "c:\Users/Hp/OneDrive/Desktop/LLM_OPS\multi_doc_chat\src\document_chat\retrieval.py", line 125, in invoke
    answer = self.chain.invoke(payload)
  File "c:\Users\Hp\OneDrive\Desktop\LLM_OPS\.venv\Lib\site-packages\langchain_core\runnables\base.py", line 3046, in invoke
    input_ = context.run(step.invoke, input_, config)
  File "c:\Users\Hp\OneDrive\Desktop\LLM_OPS\.venv\Lib\site-packages\langchain_google_genai\chat_models.py", line 1334, in invoke
    return super().invoke(input, config, stop=stop, **kwargs)
           ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Hp\OneDrive\Desktop\LLM_OPS\.venv\Lib\site-packages\langchain_core\language_models\cha

{"timestamp": "2025-10-03T14:41:13.964439Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-03T14:41:13.967238Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-03T14:41:13.970053Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_eo...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-03T14:41:13.972546Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-03T14:41:13.984225Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251003_201113_a082944e", "temp_dir": "data\\session_20251003_201113_a082944e", "faiss_dir": "faiss_index\\session_20251003_201113_a082944e", "sessionized": true, "timestamp": "2025-10-03T14:41:13.994822Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Report.txt", "saved_

Q2: What percentage of respondents are using RAG in some form?
A2: According to the provided text, 70% of respondents say they are using RAG (retrieval augmented generation) in one way or another.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-03T14:41:42.290840Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-03T14:41:42.294494Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-03T14:41:42.298422Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_eo...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-03T14:41:42.302403Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-03T14:41:42.316944Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251003_201142_8b89d960", "temp_dir": "data\\session_20251003_201142_8b89d960", "faiss_dir": "faiss_index\\session_20251003_201142_8b89d960", "sessionized": true, "timestamp": "2025-10-03T14:41:42.324620Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Report.txt", "saved_

Q3: How often are most respondents updating their models?
A3: More than 50% of respondents are updating their models at least monthly. Of those, 17% are updating their models on a weekly basis.

--------------------------------------------------------------------------------



In [28]:
import inspect

# Debug aid: print the source of whatever function object the kernel currently
# has bound to `answer_ai_report_question`.
# NOTE(review): the saved output of this cell shows a one-argument definition
# that differs from the function defined earlier in this notebook, so the
# kernel held a redefined/stale function when this ran. Restart the kernel and
# run all cells before trusting evaluation results.
print(inspect.getsource(answer_ai_report_question))

def answer_ai_report_question(example: dict) -> str:
    print("DEBUG INPUT:", example)  # 👈 see what’s being passed in

    question = example.get("question") or example.get("input") or example["prompt"]
    context = example.get("context", "")

    print("DEBUG QUESTION:", question)
    print("DEBUG CONTEXT:", context[:200], "...")  # limit long text

    response = gemini_llm.invoke(
        f"Answer based on context:\n{context}\n\nQ: {question}"
    )

    print("DEBUG RESPONSE:", response)
    return response if isinstance(response, str) else str(response)



In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Configuration ---
# Judge model for the off-the-shelf "cot_qa" evaluator.
# temperature=0 keeps grading as repeatable as possible between runs.
# The API key is read from the environment; the run logs in this notebook show
# GOOGLE_API_KEY being loaded (presumably the variable the client uses — verify
# for your installed langchain_google_genai version).
eval_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

# Hand the judge model to the evaluator through its `config` dictionary.
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        config={"llm": eval_llm}
    )
]

dataset_name = "AgenticAIReportGoldens"

# Run the evaluation: `answer_ai_report_question` (the RAG function from this
# notebook) is called with each example's inputs and its output is graded.
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-AgenticAIReportGoldens-qa-rag",
    # Experiment metadata. These values are recorded for reference only and
    # are not passed to the target function; keep them in sync with its defaults.
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

View the evaluation results for experiment: 'test-AgenticAIReportGoldens-qa-rag-7667671b' at:
https://smith.langchain.com/o/c52f60ad-54d1-4235-9051-676106d86b53/datasets/ef062c10-af21-4b1c-bafb-0b332d90dd4d/compare?selectedSessions=bc26fc1a-c9e7-4583-9d0e-f03fd537c969




3it [00:37, 12.61s/it]


In [17]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def _judge_response_text(content) -> str:
    """Flatten a chat message's ``content`` (a string or a list of parts) to text."""
    if isinstance(content, str):
        return content
    pieces = []
    for part in content or []:
        if isinstance(part, str):
            pieces.append(part)
        elif isinstance(part, dict):
            pieces.append(str(part.get("text", "")))
    return "".join(pieces)


def _parse_judge_response(response_text: str) -> tuple:
    """Pull the ``Reasoning:`` and ``Verdict:`` values out of the judge's reply.

    Tolerates leading whitespace, list numbering and markdown emphasis around
    the labels (e.g. ``**Verdict:** CORRECT``). Returns ``(reasoning, verdict)``;
    either is ``""`` when its label is absent. The last occurrence wins.
    """
    reasoning = ""
    verdict = ""
    for raw_line in response_text.splitlines():
        label, found, value = raw_line.partition(":")
        if not found:
            continue
        label = label.strip(" \t*_`#>-0123456789.").lower()
        value = value.strip().strip("*_`").strip()
        if label == "reasoning":
            reasoning = value
        elif label == "verdict":
            verdict = value
    return reasoning, verdict


def _verdict_to_score(verdict: str) -> int:
    """Map a verdict string to 1 (correct) or 0 (incorrect / unrecognised).

    Compares whole words: a substring test would treat "INCORRECT" as a match
    for "CORRECT".
    """
    words = {word.strip("*_`\"'.,:;!()[]") for word in verdict.upper().split()}
    return 1 if "CORRECT" in words and "INCORRECT" not in words else 0


def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.

    Correctness means how well the actual model output matches the reference output
    in terms of factual accuracy, coverage, and meaning.

    Args:
        run: The Run object containing the actual outputs (read from the "answer" key)
        example: The Example object containing the expected outputs ("answer" key)
            and the original inputs ("question" key)

    Returns:
        dict with 'key' ("correctness"), 'score' (1 for correct, 0 for incorrect),
        'reasoning' and 'comment'. If the judge call raises, score is 0 and the
        error is described in 'reasoning' and 'comment'.
    """
    # Extract actual and expected outputs. outputs/inputs can be None (e.g. a
    # run that produced no outputs), so fall back to empty dicts.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")

    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])

    # Initialize the judge LLM; temperature=0 for repeatable grading
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0
    )

    # Create chain and invoke
    chain = eval_prompt | llm

    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })

        response_text = _judge_response_text(response.content)

        # Parse the response
        reasoning, verdict = _parse_judge_response(response_text)

        # Convert verdict to score (1 for correct, 0 for incorrect or unparseable)
        score = _verdict_to_score(verdict)

        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }

    except Exception as e:
        # NOTE(review): a judge failure is reported as score 0, which is
        # indistinguishable from an incorrect answer in aggregate metrics.
        failure = f"Error during evaluation: {str(e)}"
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": failure,
            "comment": failure
        }

In [18]:
from langsmith.evaluation import evaluate

# Experiment setup: grade the RAG function against the golden dataset using
# only the custom LLM-as-a-judge correctness evaluator.
dataset_name = "AgenticAIReportGoldens"
evaluators = [correctness_evaluator]

# Descriptive metadata stored with the experiment
experiment_metadata = dict(
    variant="RAG with FAISS and AI Engineering Report",
    evaluator="custom_correctness_llm_judge",
    model="gemini-2.5-flash",
    chunk_size=1000,
    chunk_overlap=200,
    k=5,
)

# Run evaluation
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata=experiment_metadata,
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

View the evaluation results for experiment: 'agenticAIReport-correctness-eval-8d421491' at:
https://smith.langchain.com/o/c52f60ad-54d1-4235-9051-676106d86b53/datasets/ef062c10-af21-4b1c-bafb-0b332d90dd4d/compare?selectedSessions=a8fb2829-2688-4585-a0af-95080845a651




3it [00:25,  8.60s/it]



Evaluation completed! Check the LangSmith UI for detailed results.
