In [1]:
print("Hello World")

Hello World


In [7]:
from dotenv import load_dotenv
load_dotenv()


True

In [8]:
# !pip install pandas

In [18]:
import pandas as pd

# QA
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
csv_path = "C:\\utube project\\llmops-by-sunny-saviya\\LLMOps_series\\data\\ai_report_qa_pairs.csv"
df.to_csv(csv_path, index=False)

In [19]:
from langsmith import Client

client = Client()
dataset_name = "LLMopsdatset"

# Store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Input and expected output pairs for AgenticAIReport",
)
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

{'example_ids': ['2d094ee7-fe5d-49bc-87df-fda85c30d229',
  '3b17d1f5-d8e8-41c9-8541-4a5127f4be7f',
  '9f373186-c710-4d02-925c-017086cf3e6b'],
 'count': 3}

In [20]:
import sys
sys.path.append("C:\\utube project\\llmops-by-sunny-saviya\\LLMOps_series")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Adapter for local file paths to work with ChatIngestor."""
    def __init__(self, file_path: str):
        self.path = Path(file_path)
        self.name = self.path.name
    
    def getbuffer(self) -> bytes:
        return self.path.read_bytes()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "C:\\utube project\\llmops-by-sunny-saviya\\LLMOps_series\\data\\AI_and_ML_Agentic_AI_Notes.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer questions about the AI Engineering Report using RAG.
    
    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve
    
    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}
    """
    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}
        
        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}
        
        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)
        
        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base="faiss_index",
            use_session_dirs=True
        )
        
        # Build retriever
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )
        
        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"faiss_index/{session_id}"
        
        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )
        
        # Get answer
        answer = rag.invoke(question, chat_history=[])
        
        return {"answer": answer}
        
    except Exception as e:
        return {"answer": f"Error: {str(e)}"}


In [None]:
# !pip install langchain-google-genai
# !pip install langchain-groq
# !pip install structlog
# !pip install fastapi

In [21]:
# Test the function with a sample question
test_input = {"question": "For customer-facing applications, which company's models dominate the top rankings?"}
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])


{"timestamp": "2026-01-01T13:42:58.615758Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:42:58.616753Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:42:58.616753Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-01T13:42:58.617752Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2026-01-01T13:42:58.619796Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20260101_191258_a0773a4e", "temp_dir": "data\\session_20260101_191258_a0773a4e", "faiss_dir": "faiss_index\\session_20260101_191258_a0773a4e", "sessionized": true, "timestamp": "2026-01-01T13:42:58.621794Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI_and_ML_Agentic_AI_Notes.txt", "saved_as":

Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.


In [None]:
# faiss-cpu is already installed via requirements.txt
# faiss-gpu is not available for Windows via pip
# !pip install faiss-gpu

ERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)
ERROR: No matching distribution found for faiss-gpu


In [22]:
from langsmith.evaluation import evaluate

In [None]:
# !pip install langsmith
# !pip install langchain
# !pip install "langsmith[vcr]"



Collecting vcrpy>=7.0.0 (from langsmith[vcr])
  Downloading vcrpy-8.1.0-py3-none-any.whl.metadata (4.7 kB)
Collecting wrapt (from vcrpy>=7.0.0->langsmith[vcr])
  Downloading wrapt-2.0.1-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Downloading vcrpy-8.1.0-py3-none-any.whl (42 kB)
Downloading wrapt-2.0.1-cp312-cp312-win_amd64.whl (60 kB)
Installing collected packages: wrapt, vcrpy

   ---------------------------------------- 2/2 [vcrpy]

Successfully installed vcrpy-8.1.0 wrapt-2.0.1


In [13]:
answer_ai_report_question

<function __main__.answer_ai_report_question(inputs: dict, data_path: str = 'C:\\utube project\\llmops-by-sunny-saviya\\LLMOps_series\\data\\AI_and_ML_Agentic_AI_Notes.txt', chunk_size: int = 1000, chunk_overlap: int = 200, k: int = 5) -> dict>

In [23]:
# Example: Test with all golden questions
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, 1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(f"Q{i}: {q}")
    print(f"A{i}: {result['answer']}\n")
    print("-" * 80 + "\n")


{"timestamp": "2026-01-01T13:43:13.787161Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:43:13.788157Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:43:13.789156Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-01T13:43:13.790153Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2026-01-01T13:43:13.792153Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20260101_191313_c187c31a", "temp_dir": "data\\session_20260101_191313_c187c31a", "faiss_dir": "faiss_index\\session_20260101_191313_c187c31a", "sessionized": true, "timestamp": "2026-01-01T13:43:13.793634Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI_and_ML_Agentic_AI_Notes.txt", "saved_as":

Testing all questions from the dataset:



HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
{"added": 1, "index": "faiss_index\\session_20260101_191313_c187c31a", "timestamp": "2026-01-01T13:43:14.530181Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2026-01-01T13:43:14.531187Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2026-01-01T13:43:14.532685Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:43:14.532685Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:43:14.533694Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-

Q1: For customer-facing applications, which company's models dominate the top rankings?
A1: OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.

--------------------------------------------------------------------------------



HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
{"added": 1, "index": "faiss_index\\session_20260101_191316_eb0b65cf", "timestamp": "2026-01-01T13:43:16.731219Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2026-01-01T13:43:16.732633Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2026-01-01T13:43:16.732633Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:43:16.732633Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:43:16.732633Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-

Q2: What percentage of respondents are using RAG in some form?
A2: 70% of respondents are using RAG in some form.

--------------------------------------------------------------------------------



HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
{"added": 1, "index": "faiss_index\\session_20260101_191318_9ab27df2", "timestamp": "2026-01-01T13:43:18.837775Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2026-01-01T13:43:18.837775Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2026-01-01T13:43:18.840703Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:43:18.841718Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:43:18.841718Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-

Q3: How often are most respondents updating their models?
A3: More than 50% update their models at least monthly, with 17% doing so weekly.

--------------------------------------------------------------------------------



In [24]:
from langsmith.evaluation import evaluate

# Use evaluator string directly instead of LangChainStringEvaluator
dataset_name = "LLMopsdatset"

# Run evaluation using our RAG function
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    # evaluators=["cot_qa"],  # Pass evaluator string directly
    experiment_prefix="test-agenticAIReport-qa-rag",
    # Experiment metadata
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

View the evaluation results for experiment: 'test-agenticAIReport-qa-rag-bd173ea7' at:
https://smith.langchain.com/o/30627a25-f995-4e3b-8724-5e02aab0f561/datasets/28607798-db0a-4bcd-acf3-dafb101d7667/compare?selectedSessions=c6e7fc18-15c1-4c7c-90d4-d1b144c9a883




0it [00:00, ?it/s]{"timestamp": "2026-01-01T13:43:30.095708Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:43:30.096709Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:43:30.097709Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-01T13:43:30.097709Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2026-01-01T13:43:30.100773Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20260101_191330_aa470523", "temp_dir": "data\\session_20260101_191330_aa470523", "faiss_dir": "faiss_index\\session_20260101_191330_aa470523", "sessionized": true, "timestamp": "2026-01-01T13:43:30.101773Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI_and_ML_Agentic_AI_Notes

## Custom Correctness Evaluator

Creating an LLM-as-a-Judge evaluator to assess semantic and factual alignment


In [25]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.
    
    Correctness means how well the actual model output matches the reference output 
    in terms of factual accuracy, coverage, and meaning.
    
    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs
    
    Returns:
        dict with 'score' (1 for correct, 0 for incorrect) and 'reasoning'
    """
    # Extract actual and expected outputs
    actual_output = run.outputs.get("answer", "")
    expected_output = example.outputs.get("answer", "")
    input_question = example.inputs.get("question", "")
    
    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])
    
    # Initialize LLM (using Gemini as shown in your config)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )
    
    # Create chain and invoke
    chain = eval_prompt | llm
    
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })
        
        response_text = response.content
        
        # Parse the response
        reasoning = ""
        verdict = ""
        
        for line in response_text.split('\n'):
            if line.startswith("Reasoning:"):
                reasoning = line.replace("Reasoning:", "").strip()
            elif line.startswith("Verdict:"):
                verdict = line.replace("Verdict:", "").strip()
        
        # Convert verdict to score (1 for correct, 0 for incorrect)
        score = 1 if "CORRECT" in verdict.upper() else 0
        
        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }
        
    except Exception as e:
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }


### Run Evaluation with Custom Correctness Evaluator


In [27]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Define evaluators - using custom correctness evaluator
evaluators = [correctness_evaluator]

dataset_name = "LLMopsdatset"

# Run evaluation
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "evaluator": "custom_correctness_llm_judge",
        "model": "gemini-2.5-pro",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")


View the evaluation results for experiment: 'agenticAIReport-correctness-eval-9e71e17d' at:
https://smith.langchain.com/o/30627a25-f995-4e3b-8724-5e02aab0f561/datasets/28607798-db0a-4bcd-acf3-dafb101d7667/compare?selectedSessions=f12b561c-8839-4ca3-947b-7056b393efb8




0it [00:00, ?it/s]{"timestamp": "2026-01-01T13:46:03.897233Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2026-01-01T13:46:03.899240Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2026-01-01T13:46:03.900529Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_jV...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2026-01-01T13:46:03.900529Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2026-01-01T13:46:03.903907Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20260101_191603_721554c1", "temp_dir": "data\\session_20260101_191603_721554c1", "faiss_dir": "faiss_index\\session_20260101_191603_721554c1", "sessionized": true, "timestamp": "2026-01-01T13:46:03.905973Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AI_and_ML_Agentic_AI_Notes


Evaluation completed! Check the LangSmith UI for detailed results.





### Optional: Combine Multiple Evaluators

You can use multiple evaluators together to get different perspectives on your RAG system's performance.


In [28]:
# Example: Combine custom correctness evaluator with langsmith built-in evaluators
from langsmith.evaluation import evaluate

# Combine custom and built-in evaluators
# Pass evaluator strings directly instead of using LangChainStringEvaluator
combined_evaluators = [
    correctness_evaluator,  # Custom LLM-as-a-Judge
    "cot_qa",  # Chain-of-thought QA evaluator (as string)
]

# Run evaluation with multiple evaluators
# Uncomment to run:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )