Ragas

In [12]:
from ragas import SingleTurnSample,EvaluationDataset,evaluate
from ragas.metrics import (
    Faithfulness,ResponseRelevancy,LLMContextPrecisionWithReference,LLMContextRecall,NoiseSensitivity,ContextEntityRecall
)

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from ragas.dataset import Dataset
from langchain_core.documents import Document
import warnings
from pydantic import ConfigDict
warnings.filterwarnings("ignore")

In [13]:
import asyncio
import nest_asyncio

def run_async_function(cornjob):
    """Helper Function To Run Async Functions in Jupyter Notebooks"""
    try:
        loop=asyncio.get_event_loop()
        if loop.is_running():
            nest_asyncio.apply()
            return loop.run_until_complete(cornjob)
        else:
            return asyncio.run(cornjob)
    except RuntimeError:
        return asyncio.run(cornjob)

Generator

Faithfullness

In [14]:
test_context=[
    "LLMs are trained on large text datasets and do not have consciousness or personal beliefs."
]

test_response="No, LLMs do not have personal beliefs and only generate text based on learned patterns."



Generate Claims

In [42]:
prompt_template=ChatPromptTemplate.from_template(
    """Given the following response,extract all factual claims as a numbered list.
    Each claim should be single, verifiable statement
    
    Response:{test_response}
    Extract all Factual Claims
    """
)

llm=ChatOpenAI(model='gpt-3.5-turbo')

chain=prompt_template|llm|StrOutputParser()

result=chain.invoke({"test_response":test_response})
print(result)

1. LLMs do not have personal beliefs.
2. LLMs only generate text based on learned patterns.


Match Claims

In [43]:
claims=[
    "LLMs do not have personal beliefs.",
    "LLMs only generate text based on learned patterns.",
    "Today is a Sunny day"
]
prompt_template=ChatPromptTemplate.from_template("""
Given the Claim and Context,verify if the claim is Supported by the context

Claim:{Claim}
Context:{Context}

Answer with:
 - "SUPPORTED" if the context supports the claim
 - "NOT SUPPORTED" if the context doesn't support the claim

 Also give a Brief Explanation
 Verdict: 
"""
)
llm=ChatOpenAI(model='gpt-3.5-turbo')
chain=prompt_template|llm|StrOutputParser()

verification_results=[]

for claim in claims:
    result=chain.invoke(
        {"Claim":claim,"Context":test_context[0]}
    )
    is_supported="SUPPORTED" in result.upper() and "NOT SUPPORTED" not in result.upper()
    verification_results.append(
        {
            "claim":claim,
            "is_supported":is_supported,
            "Explanation":result
        }
    )

print(verification_results)




[{'claim': 'LLMs do not have personal beliefs.', 'is_supported': True, 'Explanation': 'SUPPORTED \n\nExplanation: The context explicitly states that LLMs do not have personal beliefs as they are trained on large text datasets and do not have consciousness. This aligns with the claim that LLMs do not have personal beliefs.'}, {'claim': 'LLMs only generate text based on learned patterns.', 'is_supported': True, 'Explanation': 'SUPPORTED\n\nExplanation: The context clearly states that LLMs do not have consciousness or personal beliefs, indicating that they generate text based on learned patterns and data inputs rather than personal intentions or thoughts. This supports the claim that LLMs only generate text based on learned patterns.'}, {'claim': 'Today is a Sunny day', 'is_supported': False, 'Explanation': 'NOT SUPPORTED\n\nExplanation: The context provided about LLMs being trained on text datasets and lacking consciousness has no relevance to the claim about today being a sunny day. The

Calculate Faithfullness

In [44]:
supported_claims=sum(1 for r in verification_results if r["is_supported"])
total_claims=len(verification_results)

faithfilness_score=supported_claims/total_claims
print(f"{faithfilness_score:.2f}")

0.67


Ragas Implementation

In [40]:
failthfulness_sample=SingleTurnSample(
    user_input="Do Large Language Models have personal beliefs?",
    response=test_response,
    retrieved_contexts=test_context
)

faithfulness_metric=Faithfulness(llm=llm)
faithfulness_score=run_async_function(faithfulness_metric.single_turn_ascore(failthfulness_sample))
print(faithfilness_score)



0.6666666666666666


In [49]:
faithfulness_examples = [
    {
        "name": "Perfect Faithfulness (No hallucinations)",
        "response": "The first Super Bowl was played on January 15, 1967 at the Los Angeles Memorial Coliseum.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum."]
    },
    {
        "name": "Partial Faithfulness (Some hallucinations)",
        "response": "The first Super Bowl was on January 15, 1967. The Green Bay Packers won 35-10 with Bart Starr as MVP.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967."]
    },
    {
        "name": "Zero Faithfulness (Complete hallucination)",
        "response": "The first Super Bowl was held in Miami in 1970 and attracted over 100,000 spectators.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum."]
    }
]

faithfulness_metric=Faithfulness(llm=llm)
for items in faithfulness_examples:
    faithfulness_sample=SingleTurnSample(
        user_input="Tell me about the first Super Bowl",
        response=items["response"],
        retrieved_contexts=items["context"]
    )

    faithfulness_score=run_async_function(faithfulness_metric.single_turn_ascore(faithfulness_sample))

    print(items["name"])
    print(f"{faithfulness_score:.2f}")
    print('*'*50)

Perfect Faithfulness (No hallucinations)
0.50
**************************************************
Partial Faithfulness (Some hallucinations)
0.00
**************************************************
Zero Faithfulness (Complete hallucination)
0.00
**************************************************


Answer Relevance

Hypothetical Question Generation

In [69]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

question="Do Large Language Models have personal beliefs?"
llm_response="No, LLMs do not have personal beliefs and only generate text based on learned patterns."

prompt_template=ChatPromptTemplate.from_template(
    """
From the Answer given generate Hypothecial questions which are Questions to the answer.Strictly ensure that to all the questions generated
they should all have the same answer i.e the answer given

Answer:{Answer}

Construct 3 questions in a numbered manner like
1.
2.
3.
"""
)


llm=ChatOpenAI(model='gpt-3.5-turbo')
chain=prompt_template|llm|StrOutputParser()

result=chain.invoke({"Answer":llm_response})



Cosine SImilarity Formula

In [None]:
import numpy as np
from numpy.linalg import norm
def calculate_cosine_similarity(vec1,vec2):
    vec1=np.array(vec1)
    vec2=np.array(vec2)
    num=np.dot(vec1,vec2)
    dem=norm(vec1)*norm(vec2)
    return num/dem




0.0


Calculate Similarities Of the Original Question to the Questions generated

In [None]:
from langchain_openai import OpenAIEmbeddings
generated_questions=[
    "Do LLMs possess personal beliefs when generating text?",
    "Are LLMs only capable of generating text based on learned patterns?",
    "Can LLMs form their own personal opinions when generating text?"
]
embeddings=OpenAIEmbeddings(model='text-embedding-3-small')
original_embedding=embeddings.embed_query(question)

similarity_index=[]
for items in generated_questions:
    sim=embeddings.embed_query(items)
    cosine=calculate_cosine_similarity(original_embedding,sim)
    similarity_index.append(cosine)





[np.float64(0.5985796899687075), np.float64(0.4147325779462339), np.float64(0.46830935807152213)]


Mean of the Similarities

In [68]:
import numpy as np

a=np.mean(similarity_index)
print(a)

0.49387387532882115


Using the RAGS

In [95]:
response_relevancy_sample=SingleTurnSample(
    user_input=question,
    response=llm_response,
    retrieved_contexts=test_context
)

response_relevancy_metric=ResponseRelevancy(
    llm=llm,embeddings=embeddings
)

response_relevancy_score=run_async_function(response_relevancy_metric.single_turn_ascore(response_relevancy_sample))

print(response_relevancy_score)

0.5850632477874373


Retriever

Context Precision

In [87]:
query="What is an LLM?"
response="An LLM is an AI model trained on large text datasets to generate and understand human language."

chunks=[
    "The Eiffel Tower is located in Paris and was completed in 1889.",
    "Python lists are mutable data structures used to store collections of items.",
    "Large Language Models are trained on massive text data and can generate human-like responses.",
    "LLMs are commonly used for tasks like question answering and text generation."
]



In [83]:
prompt_template=ChatPromptTemplate.from_template(
"""
For the Given Question and Answer verify if the retrieved chunk is relevant or not
Question:{Question},
Answer:{Answer}
Chunk:{Chunk}
If the chunk is relevant give RELEVANT else give IRRELEVANT
"""
)

llm=ChatOpenAI(model='gpt-3.5-turbo')
chain=prompt_template|llm|StrOutputParser()
relevancy_matrix=[]
for chunk in chunks:
    result=chain.invoke(
        {
            "Question":query,
            "Answer":response,
            "Chunk":chunk
        }
    )
    is_relevant="RELEVANT" in result.upper() and "IRRELEVANT" not in result.upper()
    relevancy_matrix.append(is_relevant)

print(relevancy_matrix)

[False, False, True, True]


Precision count

In [84]:

precision_good=[]
relevant_count=0
for k,relevancy in enumerate(relevancy_matrix,1):
    if relevancy:
        relevant_count+=1
        precision_at_k=relevant_count/k
        precision_good.append(precision_at_k)



total_relevancy_matrix=sum(relevancy_matrix)

total_context_precision=sum(precision_good)/sum(relevancy_matrix) if total_relevancy_matrix>0 else 0
print(total_context_precision)

0.41666666666666663


Ragas

In [89]:
bad_sample=[
    "The Eiffel Tower is located in Paris and was completed in 1889.",
    "Python lists are mutable data structures used to store collections of items.",
    "Large Language Models are trained on massive text data and can generate human-like responses.",
    "LLMs are commonly used for tasks like question answering and text generation."
]

good_sample=[
    "Large Language Models are trained on massive text data and can generate human-like responses.",
    "LLMs are commonly used for tasks like question answering and text generation.",
    "The Eiffel Tower is located in Paris and was completed in 1889.",
    "Python lists are mutable data structures used to store collections of items."
]

context_precision_sample_good=SingleTurnSample(
    user_input=query,
    reference=response,
    retrieved_contexts=good_sample
)
context_precision_sample_bad=SingleTurnSample(
    user_input=query,
    reference=response,
    retrieved_contexts=bad_sample
)

context_precision_metric=LLMContextPrecisionWithReference(llm=llm)
good_result=run_async_function(context_precision_metric.single_turn_ascore(context_precision_sample_good))
bad_result=run_async_function(context_precision_metric.single_turn_ascore(context_precision_sample_bad))
print(good_result)
print(bad_result)

0.99999999995
0.4166666666458333


Conext Recall

In [91]:
query="Tell me about Eiffel Tower"
recall_reference="The Eifel Tower is Located in Paris.It was buit in 1889.It is 330 meters tall"

recall_context=[
    "The Eifel Tower is a Landmark Located in Paris,France",
    "The Tower was completed in 1889or the World's Fair"
]
recall_claims=[
    "The Eifel Tower is Located in Paris",
    "It was buit in 1889",
    "It is 330 meters tall"
]
prompt_template=ChatPromptTemplate.from_template(
    """
    Check if the the Following Claims can be attributed to the Context
    Claim:{Claim}
    Context:{Context}

    if the Claim is supported by the Context gives "YES" if not then gine "NO"
    """
)

chain=prompt_template|llm|StrOutputParser()
combined_context="\n".join(recall_context)
claims=[]
for claim in recall_claims:
    result=chain.invoke({
        "Claim":claim,
        "Context":combined_context
    })
    is_supported="YES" in result.upper() and "NO" not in result.upper()
    claims.append(is_supported)
    print(f"Claim is {claim} and is {is_supported}")


Claim is The Eifel Tower is Located in Paris and is True
Claim is It was buit in 1889 and is True
Claim is It is 330 meters tall and is False


Ragas

In [93]:
context_recall_sample=SingleTurnSample(
    user_input=query,
    reference=recall_reference,
    retrieved_contexts=recall_context
)

context_recall_metric=LLMContextRecall(llm=llm)
context_recall_score=run_async_function(context_recall_metric.single_turn_ascore(context_recall_sample))

print(context_recall_score)

0.6666666666666666


Context Entity Recall

In [117]:
entity_reference = "William Shakespeare wrote many famous plays such as Hamlet and Romeo and Juliet in England during the late 16th and early 17th century."
entity_context = [
    "William Shakespeare was an English playwright known for his influential works in literature."
]

prompt_template=ChatPromptTemplate.from_template(
"""
Extract all named entities from the following Text.
Include: PERSON,NAME,DATE,LOCATION and other proper nouns
Text:{Text}
List each entity on a new line with its type:
"""
)

chain = prompt_template|llm|StrOutputParser()
reference_result=chain.invoke({"Text":entity_reference})
context_result=chain.invoke({"Text":entity_context[0]})
print(reference_result)
print('@'*50)
print(context_result)

PERSON: William Shakespeare
LOCATION: England
LOCATION: 16th
LOCATION: 17th century
PLAY: Hamlet
PLAY: Romeo and Juliet
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
PERSON: William Shakespeare
LOCATION: English
DATE: N/A
WORK_OF_ART: literature


In [115]:
reference_entities = {
    "William Shakespeare": "PERSON",
    "Hamlet": "PERSON",
    "Romeo": "PERSON",
    "Juliet": "PERSON",
    "England": "LOCATION",
    "late 16th and early 17th century": "DATE"
}

context_entities = {
    "William Shakespeare": "PERSON",
    "English": "LOCATION",
    "playwright": "PERSON"
}


common_entities_reference_context=reference_entities.keys() and context_entities.keys()
common_entities_reference_context_size=len(common_entities_reference_context)
reference_entities_size=len(reference_entities.keys())
context_entity_recall=common_entities_reference_context_size/reference_entities_size

print(context_entity_recall)




0.5


Ragas

In [119]:
context_entity_recall_sample=SingleTurnSample(
    reference=entity_reference,
    retrieved_contexts=entity_context
)

context_entity_recall_metric=ContextEntityRecall(llm=llm)
context_entity_recall=run_async_function(context_entity_recall_metric.single_turn_ascore(context_entity_recall_sample))
print(context_entity_recall)

0.1999999996


Noise Reduction

In [121]:
noise_question = "What is LIC known for?"
noise_response = "LIC is the largest insurance company in India, known for its vast portfolio. LIC contributes to financial stability."
noise_reference = "LIC is the largest insurance company in India, established in 1956. It is known for managing a large portfolio of investments."

noise_contexts = [
    "LIC was established in 1956 following nationalization.",      
    "LIC is the largest insurance company with huge investments.",     
    "LIC manages substantial funds for financial stability.",           
    "The Indian economy is one of the fastest-growing economies..."     
]

In [122]:
from langchain_core.prompts import ChatMessagePromptTemplate

prompt_template=ChatPromptTemplate.from_template(
"""
For the reference given Below verify if the a given context directly support the reference or not

reference:{reference}
context:{context}

return TRUE if the context directly supports the reference or FALSE if it doesnot directly supports the reference
"""
)
chain=prompt_template|llm|StrOutputParser()
for context in noise_contexts:
    result=chain.invoke(
        {
            "reference":noise_reference,
            "context":context
        }
    )
    print(f"Context{context} {result}")

ContextLIC was established in 1956 following nationalization. True
ContextLIC is the largest insurance company with huge investments. TRUE
ContextLIC manages substantial funds for financial stability. TRUE
ContextThe Indian economy is one of the fastest-growing economies... FALSE


In [130]:
response_claims = [
    ("LIC is the largest insurance company in India", True, "Matches reference"),
    ("LIC is known for its vast portfolio", True, "Matches reference (portfolio)"),
    ("LIC contributes to financial stability", False, "NOT in reference - possible hallucination from noise!")
]

incorrect_count = 0
for claim in response_claims:
    if not claim[1]:
        incorrect_count+=1

noise=incorrect_count/len(response_claims)
print(noise)

0.3333333333333333


RAGAS

In [133]:
noise_sample=SingleTurnSample(
    user_input="What is LIC known for?",
    response="LIC is the largest insurance company in India, known for its vast portfolio. LIC contributes to financial stability.",
    reference="LIC is the largest insurance company in India, established in 1956. It is known for managing a large portfolio of investments.",
    retrieved_contexts=noise_contexts
)

noise_metric=NoiseSensitivity(llm=llm)
noise=run_async_function(noise_metric.single_turn_ascore(noise_sample))

print(noise)

0.3333333333333333


Document

In [15]:
cloudflow_docs = [
    # ============================================================================
    # ARCHITECTURE DOCUMENTS (3)
    # ============================================================================
    Document(
        page_content="""CloudFlow Architecture Overview

CloudFlow is a distributed cloud platform built on microservices architecture. The platform consists of three main layers that work together to provide a robust, scalable infrastructure.

The API Gateway layer handles all incoming requests using OAuth 2.0 authentication and routes them through our service mesh powered by Istio. This layer provides load balancing, SSL termination, and request routing capabilities.

The Service Mesh layer orchestrates communication between microservices, providing service discovery, health checking, and automatic failover. It uses Kubernetes for container orchestration across multiple availability zones.

The Data Storage layer implements a distributed database system with automatic replication across three availability zones. This ensures data durability and supports horizontal scaling based on demand.

CloudFlow guarantees 99.99% uptime SLA with triple redundancy across availability zones. The platform supports horizontal scaling with automatic load balancing, allowing each service to scale independently based on CPU and memory metrics.""",
        metadata={"source": "architecture_overview", "topic": "architecture", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow Scaling Mechanisms

CloudFlow implements sophisticated auto-scaling mechanisms to handle varying workloads efficiently. The platform monitors real-time metrics to make intelligent scaling decisions.

Horizontal Pod Autoscaling (HPA) adjusts the number of pod replicas based on CPU utilization (target: 70%) and memory usage (target: 80%). When these thresholds are exceeded for more than 3 consecutive minutes, the system automatically provisions additional pods.

Vertical scaling adjusts resource allocation for individual services. CloudFlow can increase or decrease CPU and memory limits without downtime, using Kubernetes resource management capabilities.

The platform supports bursting to handle sudden traffic spikes. During burst periods, CloudFlow can temporarily scale up to 500% of baseline capacity for up to 15 minutes before triggering permanent scaling.

Load balancing distributes traffic across all available pods using a weighted round-robin algorithm. Health checks run every 10 seconds, and unhealthy pods are automatically removed from the rotation within 30 seconds.""",
        metadata={"source": "scaling_guide", "topic": "architecture", "difficulty": "advanced"}
    ),
    
    Document(
        page_content="""CloudFlow System Components

CloudFlow's architecture comprises several key components that work in harmony to deliver reliable cloud services.

The Control Plane manages the overall system state, including service registration, configuration management, and orchestration. It runs on a dedicated cluster with five replicas for high availability.

The Data Plane handles actual request processing and data flow. It consists of worker nodes that execute application workloads and process user requests. Each data plane node has 16 CPU cores and 64GB RAM.

The Observability Stack includes Prometheus for metrics collection, Grafana for visualization, and ELK (Elasticsearch, Logstash, Kibana) for log aggregation. Metrics are collected every 15 seconds and retained for 90 days.

The Service Registry maintains a real-time directory of all available services and their endpoints. It uses etcd for distributed consensus and supports automatic service discovery with DNS-based lookups.

The Message Queue system, based on Apache Kafka, handles asynchronous communication between services with guaranteed message delivery and ordering.""",
        metadata={"source": "system_components", "topic": "architecture", "difficulty": "intermediate"}
    ),
    
    # ============================================================================
    # API DOCUMENTATION (4)
    # ============================================================================
    Document(
        page_content="""CloudFlow API Authentication

CloudFlow APIs support two authentication methods: OAuth 2.0 and API Keys. Both methods provide secure access to platform resources.

OAuth 2.0 is recommended for user-facing applications. It supports the Authorization Code flow and provides access tokens valid for 1 hour and refresh tokens valid for 30 days. To implement OAuth 2.0, direct users to the authorization endpoint at https://auth.cloudflow.io/oauth/authorize with your client_id and redirect_uri parameters.

API Keys are ideal for server-to-server communication and background jobs. Each API key has the format "cf_live_" followed by 32 alphanumeric characters. API keys never expire unless explicitly revoked.

To authenticate requests, include your API key in the Authorization header: "Authorization: Bearer YOUR_API_KEY". All API requests must be made over HTTPS; HTTP requests will be rejected with a 403 error.

API keys can be scoped to specific permissions (read, write, admin) and restricted to specific IP addresses for enhanced security. You can manage your API keys through the CloudFlow dashboard or the /api/v1/keys endpoint.""",
        metadata={"source": "api_authentication", "topic": "api", "difficulty": "beginner"}
    ),
    
    Document(
        page_content="""CloudFlow REST API Endpoints

CloudFlow provides a comprehensive REST API with endpoints organized by resource type. All endpoints follow RESTful conventions and return JSON responses.

Base URL: https://api.cloudflow.io/v1

Resources endpoint: GET /api/v1/resources - List all resources with pagination (max 100 per page). Supports filtering by type, status, and creation date.

Resource creation: POST /api/v1/resources - Create a new resource. Required fields: name (string), type (string), config (object). Returns 201 Created on success.

Resource details: GET /api/v1/resources/{id} - Retrieve detailed information about a specific resource by ID.

Resource update: PUT /api/v1/resources/{id} - Update an existing resource. Supports partial updates with PATCH /api/v1/resources/{id}.

Resource deletion: DELETE /api/v1/resources/{id} - Delete a resource. Returns 204 No Content on success. Deleted resources are soft-deleted and can be recovered within 30 days.

All list endpoints support query parameters: limit (default: 25, max: 100), offset (default: 0), sort (default: created_at), order (asc|desc).""",
        metadata={"source": "api_endpoints", "topic": "api", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow API Rate Limiting

CloudFlow implements rate limiting to ensure fair usage and platform stability. Rate limits vary by pricing tier and authentication method.

Standard Tier: 1,000 requests per hour per API key. Burst capacity allows up to 100 requests per minute. Exceeding limits returns HTTP 429 (Too Many Requests).

Premium Tier: 10,000 requests per hour per API key with burst capacity of 500 requests per minute. Premium tier also includes priority request processing.

Enterprise Tier: Custom rate limits negotiated based on usage patterns. Typically starts at 100,000 requests per hour with dedicated infrastructure.

Rate limit headers are included in every response:
- X-RateLimit-Limit: Maximum requests per hour
- X-RateLimit-Remaining: Remaining requests in current window
- X-RateLimit-Reset: Unix timestamp when the limit resets

When rate limited, the Retry-After header indicates how many seconds to wait before retrying. Implement exponential backoff: wait 1s, then 2s, then 4s, etc.

OAuth 2.0 authenticated requests have separate, higher limits: 5,000 requests per hour for Standard tier.""",
        metadata={"source": "api_rate_limits", "topic": "api", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow API Error Codes

CloudFlow APIs use standard HTTP status codes and provide detailed error messages in JSON format to help diagnose issues.

Authentication Errors:
- 401 Unauthorized: Missing or invalid API key. Check the Authorization header.
- 403 Forbidden: Valid API key but insufficient permissions for the requested operation.

Client Errors:
- 400 Bad Request: Invalid request format or missing required fields. The response includes a "details" field explaining what's wrong.
- 404 Not Found: Requested resource doesn't exist. Verify the resource ID.
- 409 Conflict: Request conflicts with current resource state (e.g., duplicate name).
- 422 Unprocessable Entity: Request format is valid but contains semantic errors.
- 429 Too Many Requests: Rate limit exceeded. Check X-RateLimit-Reset header.

Server Errors:
- 500 Internal Server Error: Unexpected server error. CloudFlow team is automatically notified.
- 502 Bad Gateway: Temporary issue with upstream services. Retry after a few seconds.
- 503 Service Unavailable: Scheduled maintenance or system overload. Check status.cloudflow.io.

Error Response Format: {"error": {"code": "error_code", "message": "Human-readable message", "details": {...}}}""",
        metadata={"source": "api_error_codes", "topic": "api", "difficulty": "beginner"}
    ),
    
    # ============================================================================
    # SECURITY DOCUMENTATION (2)
    # ============================================================================
    Document(
        page_content="""CloudFlow Security Features

Security is a top priority at CloudFlow. We implement industry-leading security practices to protect your data and applications.

Encryption: All data is encrypted at rest using AES-256 encryption. Data in transit uses TLS 1.3 with perfect forward secrecy. Encryption keys are rotated every 90 days using AWS KMS.

Network Security: CloudFlow runs in a Virtual Private Cloud (VPC) with strict network segmentation. Public endpoints are protected by Web Application Firewall (WAF) rules that block common attack patterns. DDoS protection is provided by Cloudflare with mitigation capacity up to 50 Gbps.

Access Control: All resources support Role-Based Access Control (RBAC) with customizable roles and permissions. We support integration with external identity providers via SAML 2.0 and OpenID Connect.

Audit Logging: Every API call is logged with timestamp, user identity, IP address, and action taken. Audit logs are immutable and retained for 2 years. You can access logs via the /api/v1/audit-logs endpoint.

Vulnerability Management: CloudFlow undergoes quarterly penetration testing by independent security firms. We maintain a bug bounty program and respond to security reports within 24 hours.""",
        metadata={"source": "security_features", "topic": "security", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow Compliance Standards

CloudFlow maintains compliance with major industry standards and regulations to ensure your data is handled responsibly.

SOC 2 Type II: CloudFlow is SOC 2 Type II certified, demonstrating our commitment to security, availability, and confidentiality. Audit reports are available to enterprise customers under NDA.

GDPR Compliance: CloudFlow is fully compliant with the European Union's General Data Protection Regulation. We support data residency requirements, right to erasure, data portability, and provide Data Processing Agreements (DPA) to all customers.

HIPAA: For healthcare customers, CloudFlow offers HIPAA-compliant infrastructure with Business Associate Agreements (BAA). HIPAA features include enhanced audit logging, encrypted backups, and strict access controls.

ISO 27001: CloudFlow's information security management system is certified to ISO 27001:2013 standards. We maintain comprehensive security policies and undergo annual recertification audits.

PCI DSS: For customers processing payment card data, CloudFlow provides PCI DSS Level 1 certified infrastructure. However, we recommend using dedicated payment processors rather than storing card data.

Data Residency: CloudFlow supports data residency in US, EU, UK, and APAC regions to meet local regulatory requirements.""",
        metadata={"source": "compliance_standards", "topic": "security", "difficulty": "advanced"}
    ),
    
    # ============================================================================
    # PRICING DOCUMENTATION (2)
    # ============================================================================
    Document(
        page_content="""CloudFlow Pricing Tiers

CloudFlow offers three pricing tiers designed to meet the needs of individuals, teams, and enterprises.

Standard Tier ($99/month):
- 1,000 API requests per hour
- 100 GB storage included
- 10 GB bandwidth per month
- Community support via forums
- 99.9% uptime SLA
- Up to 5 team members

Premium Tier ($499/month):
- 10,000 API requests per hour
- 1 TB storage included
- 100 GB bandwidth per month
- Email support with 24-hour response time
- 99.95% uptime SLA
- Up to 25 team members
- Advanced monitoring and alerting
- Custom domain support

Enterprise Tier (Custom pricing):
- Custom API rate limits (100,000+ requests/hour)
- Unlimited storage and bandwidth
- 24/7 phone and email support with 1-hour response time
- 99.99% uptime SLA with service credits
- Unlimited team members
- Dedicated account manager
- Custom integrations and professional services
- Private cloud deployment options

All tiers include: SSL certificates, daily backups, API access, and dashboard analytics. Annual billing provides 15% discount.""",
        metadata={"source": "pricing_tiers", "topic": "pricing", "difficulty": "beginner"}
    ),
    
    Document(
        page_content="""CloudFlow Billing Information

Understanding CloudFlow's billing model helps you manage costs effectively and avoid unexpected charges.

Billing Cycle: Subscriptions are billed monthly on the date you signed up. Annual subscriptions are billed upfront with a 15% discount. Billing date can be changed once per year.

Usage-Based Charges: Beyond included quotas, additional usage is billed at:
- API requests: $0.01 per 1,000 requests
- Storage: $0.10 per GB per month
- Bandwidth: $0.08 per GB
- Backup retention (beyond 30 days): $0.05 per GB per month

Payment Methods: CloudFlow accepts credit cards (Visa, Mastercard, Amex), ACH transfers (US only), and wire transfers for invoices over $1,000. Cryptocurrency payments available for annual plans.

Invoicing: Invoices are emailed on the billing date and available in the dashboard. Enterprise customers receive consolidated monthly invoices with 30-day payment terms.

Upgrades and Downgrades: Upgrade anytime to immediately access higher tier features. Downgrades take effect at the next billing cycle. Prorated credits are applied to your account balance.

Free Trial: New customers get 14-day free trial on Premium tier with no credit card required. Trial includes 1,000 API requests and 10 GB storage.""",
        metadata={"source": "billing_info", "topic": "pricing", "difficulty": "beginner"}
    ),
    
    # ============================================================================
    # BEST PRACTICES DOCUMENTATION (3)
    # ============================================================================
    Document(
        page_content="""CloudFlow Performance Optimization

Following these best practices will help you achieve optimal performance from your CloudFlow applications.

Caching Strategy: Implement caching at multiple levels. Use CloudFlow's built-in Redis cache for frequently accessed data with TTL between 5-60 minutes. Cache API responses on the client side and respect Cache-Control headers.

Request Optimization: Batch multiple operations into single API calls when possible. Use pagination for large result sets (recommended page size: 50-100 items). Implement request compression using gzip to reduce bandwidth.

Connection Management: Reuse HTTP connections with keep-alive headers. Maintain a connection pool with 5-10 concurrent connections per API key. Set appropriate timeouts: connection timeout 10s, read timeout 30s.

Query Efficiency: Use field filtering to request only required data: /resources?fields=id,name,status. Leverage server-side filtering instead of retrieving all data and filtering locally.

Asynchronous Processing: For long-running operations, use CloudFlow's async API endpoints. Poll for results using the returned job_id rather than blocking on the initial request.

CDN Usage: Serve static assets through CloudFlow's global CDN with 150+ edge locations. Configure appropriate cache headers for optimal performance: max-age=3600 for semi-static content.""",
        metadata={"source": "performance_optimization", "topic": "best_practices", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow Monitoring and Observability

Effective monitoring ensures your CloudFlow applications remain healthy and performant.

Metrics Collection: CloudFlow automatically collects key metrics including request rate, error rate, latency (p50, p95, p99), and resource utilization. Access metrics via the dashboard or Metrics API at /api/v1/metrics.

Custom Metrics: Send custom application metrics using the StatsD protocol. CloudFlow aggregates custom metrics every 60 seconds and retains them for 90 days.

Alerting: Configure alerts for critical conditions like error rate >5%, latency >500ms, or approaching rate limits. CloudFlow supports alerting via email, SMS, Slack, PagerDuty, and webhooks.

Distributed Tracing: Enable distributed tracing to track requests across services. CloudFlow supports OpenTelemetry and provides trace visualization in the dashboard. Sample rate: 10% of requests (configurable up to 100%).

Log Management: CloudFlow retains logs for 7 days by default (30 days for Premium, 90 days for Enterprise). Use structured logging with JSON format for better searchability. Maximum log line length: 32KB.

Dashboard Widgets: Create custom dashboards with real-time metrics, SLA compliance, and cost tracking. Share dashboards with team members or embed in external tools using iframe integration.""",
        metadata={"source": "monitoring_observability", "topic": "best_practices", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow Disaster Recovery

CloudFlow implements comprehensive disaster recovery capabilities to protect your data and ensure business continuity.

Backup Strategy: CloudFlow performs automatic daily backups of all data at 2 AM UTC. Backups are encrypted and stored in geographically diverse locations. Retention: 30 days for Standard tier, 90 days for Premium, 1 year for Enterprise.

Point-in-Time Recovery: Enterprise customers can restore data to any point within the retention period with 5-minute granularity. Recovery operations typically complete within 15-30 minutes.

Multi-Region Replication: Enable multi-region replication for critical data. Data is asynchronously replicated to a secondary region within 60 seconds. Failover to secondary region is automatic and takes approximately 5 minutes.

Backup Verification: CloudFlow performs monthly backup restoration tests to ensure data recoverability. Test results are available in your compliance dashboard.

Export Capabilities: Export your data anytime in JSON, CSV, or Parquet format. Full exports are available via the /api/v1/export endpoint. Large exports (>10 GB) are delivered to your S3 bucket.

RTO and RPO: CloudFlow guarantees Recovery Time Objective (RTO) of 4 hours and Recovery Point Objective (RPO) of 1 hour for Enterprise tier. Contact support to initiate disaster recovery procedures.""",
        metadata={"source": "disaster_recovery", "topic": "best_practices", "difficulty": "advanced"}
    ),
    
    # ============================================================================
    # TROUBLESHOOTING DOCUMENTATION (3)
    # ============================================================================
    Document(
        page_content="""Common CloudFlow Errors and Solutions

This guide covers the most common errors encountered when using CloudFlow and their solutions.

Error: "Invalid API Key" (401)
Solution: Verify your API key format starts with "cf_live_" and is exactly 40 characters. Check for extra spaces or newlines. Generate a new API key if the issue persists. API keys are case-sensitive.

Error: "Rate Limit Exceeded" (429)
Solution: Implement exponential backoff in your retry logic. Check X-RateLimit-Reset header to know when limits reset. Consider upgrading to a higher tier if you consistently hit limits. Use batch endpoints to reduce request count.

Error: "Resource Not Found" (404)
Solution: Verify the resource ID is correct and the resource hasn't been deleted. Use the /api/v1/resources endpoint to list available resources. Check if you're using the correct API version (/v1).

Error: "Timeout" (504)
Solution: Increase client timeout to at least 30 seconds. For long-running operations, use async endpoints and poll for results. Check CloudFlow status page for any service degradation.

Error: "Validation Error" (422)
Solution: Review the error details field for specific validation failures. Common issues: missing required fields, invalid data types, values outside allowed ranges. Consult API documentation for correct request format.""",
        metadata={"source": "common_errors", "topic": "troubleshooting", "difficulty": "beginner"}
    ),
    
    Document(
        page_content="""CloudFlow Debugging Guide

When troubleshooting issues with CloudFlow, follow this systematic debugging approach.

Step 1 - Check Service Status: Visit status.cloudflow.io to verify all systems are operational. Subscribe to status updates to receive notifications about incidents and maintenance.

Step 2 - Review API Logs: Access detailed API logs in the CloudFlow dashboard under Analytics > API Logs. Filter by time range, status code, and endpoint. Look for patterns in failed requests.

Step 3 - Enable Debug Mode: Add X-CloudFlow-Debug: true header to requests to receive detailed debug information in responses. Debug mode provides request ID, processing time breakdown, and backend service information.

Step 4 - Test with curl: Isolate issues by testing with curl commands. Example: curl -H "Authorization: Bearer YOUR_API_KEY" -H "X-CloudFlow-Debug: true" https://api.cloudflow.io/v1/resources

Step 5 - Check Network Connectivity: Ensure your network allows outbound HTTPS traffic to *.cloudflow.io on port 443. Verify DNS resolution is working correctly.

Step 6 - Verify SDK Version: If using CloudFlow SDK, ensure you're running the latest version. Outdated SDKs may not support new API features or may have known bugs.

Step 7 - Contact Support: If issues persist, contact CloudFlow support with the request ID from failed requests. Support responds within 24 hours for Standard tier, 4 hours for Premium, 1 hour for Enterprise.""",
        metadata={"source": "debugging_guide", "topic": "troubleshooting", "difficulty": "intermediate"}
    ),
    
    Document(
        page_content="""CloudFlow Support Escalation Process

Understanding CloudFlow's support escalation process ensures your issues are resolved efficiently.

Support Channels:
- Community Forums (All tiers): community.cloudflow.io - Best for general questions, feature requests, and sharing knowledge
- Email Support (Premium & Enterprise): support@cloudflow.io - Include account ID and request ID in subject line
- Phone Support (Enterprise only): +1-888-CLOUDFLOW - Available 24/7 for critical issues
- Slack Channel (Enterprise only): Direct access to engineering team

Issue Severity Levels:
- P0 (Critical): Complete service outage affecting production. Response time: 1 hour for Enterprise, 4 hours for Premium
- P1 (High): Major functionality impaired but workarounds available. Response time: 4 hours for Enterprise, 8 hours for Premium
- P2 (Medium): Minor functionality issues with workarounds. Response time: 24 hours
- P3 (Low): Questions, feature requests, documentation issues. Response time: 48 hours

Escalation Path: If your issue isn't resolved within SLA, it automatically escalates to the next support tier. Enterprise customers can request immediate escalation to engineering team.

Required Information: Include account ID, request ID, error messages, timestamps, steps to reproduce, and expected vs actual behavior. Screenshots and API logs are helpful.""",
        metadata={"source": "support_escalation", "topic": "troubleshooting", "difficulty": "beginner"}
    ),
]

print(f"✓ Created {len(cloudflow_docs)} CloudFlow documentation documents")
print("\nDocument breakdown by category:")
for topic in ["architecture", "api", "security", "pricing", "best_practices", "troubleshooting"]:
    count = len([doc for doc in cloudflow_docs if doc.metadata["topic"] == topic])
    print(f"  - {topic.title()}: {count} documents")

✓ Created 17 CloudFlow documentation documents

Document breakdown by category:
  - Architecture: 3 documents
  - Api: 4 documents
  - Security: 2 documents
  - Pricing: 2 documents
  - Best_Practices: 3 documents
  - Troubleshooting: 3 documents


Splitting

In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter=RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
    length_function=len,
    separators=[
         "\n\n",
        "\n",
        " ",
        ".",
        ",",
    ]
)

split=splitter.split_documents(cloudflow_docs)

Embedding and Vector Store

In [17]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings=OpenAIEmbeddings(model='text-embedding-3-small')

vectorstore=FAISS.from_documents(split,embeddings)

vectorstore_path="FAISS"

vectorstore.save_local(vectorstore_path)



Retriever

In [18]:
retriever=vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k":5
    }
)

query="What is CloudFlow's uptime SLA"

result=retriever.invoke(query)
print(result[0].page_content)

CloudFlow guarantees 99.99% uptime SLA with triple redundancy across availability zones. The platform supports horizontal scaling with automatic load balancing, allowing each service to scale independently based on CPU and memory metrics.


Chat

In [19]:
llm=ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=0,
    max_completion_tokens=500                   # Reasonable for QA system              
)

test_response=llm.invoke("How are you openai guy")
print(test_response)

content="I am an AI assistant created by OpenAI, so I don't have feelings or emotions like a human. But I am here to help and assist you with any questions or tasks you may have. How can I assist you today?" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 47, 'prompt_tokens': 13, 'total_tokens': 60, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-D0O30LswN0o5nSpXs8Z2cB0PFU56L', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019bdfb6-1049-73e1-8ed2-183aa4c7e78a-0' tool_calls=[] invalid_tool_calls=[] usage_metadata={'input_tokens': 13, 'output_tokens': 47, 'total_tokens': 60, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_detail

Prompt Template and helper function

In [20]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda


prompt_template =ChatPromptTemplate.from_template("""You are a helpful assistant for CloudFlow Platform documentation.
Answer the question based on the following context. If you cannot answer based on
the context, say "I don't have enough information to answer that question."

Be concise and accurate. Include specific details like numbers, limits, and technical
specifications when available in the context.

Context:
{context}

Question: {question}

Answer:"""
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

format_docs=RunnableLambda(format_docs)


LCEL

In [21]:
from langchain_core.runnables import RunnablePassthrough

chain=(
    {'context':retriever|format_docs,'question':RunnablePassthrough()}|prompt_template|llm|StrOutputParser()
)

test_questions = [
    "What is CloudFlow's uptime SLA?",
    "How do I authenticate with CloudFlow APIs?",
    "What are the pricing tiers?"
]



for i, question in enumerate(test_questions, 1):
    print(f"\nQuestion {i}: {question}")
    answer = chain.invoke(question)
    print(f"Answer: {answer}")
    print("-" * 80)

print("\n✓ RAG pipeline is working correctly!")


Question 1: What is CloudFlow's uptime SLA?
Answer: CloudFlow's uptime SLA is guaranteed at 99.99% across availability zones.
--------------------------------------------------------------------------------

Question 2: How do I authenticate with CloudFlow APIs?
Answer: To authenticate with CloudFlow APIs, you can use either OAuth 2.0 or API Keys. For OAuth 2.0, direct users to the authorization endpoint with your client_id and redirect_uri parameters. Access tokens are valid for 1 hour, and refresh tokens are valid for 30 days. For API Keys, include your key in the Authorization header with the format "Authorization: Bearer YOUR_API_KEY". Ensure all requests are made over HTTPS to avoid a 403 error.
--------------------------------------------------------------------------------

Question 3: What are the pricing tiers?
Answer: The pricing tiers for CloudFlow are Standard Tier ($99/month), Premium Tier ($499/month), and Enterprise Tier (Custom pricing).
-------------------------------

Ground Truth

In [22]:
test_cases = [
    # ========== SIMPLE FACTUAL (5) ==========
    {
        "question": "What is CloudFlow's uptime SLA?",
        "ground_truth": "CloudFlow guarantees 99.99% uptime SLA with triple redundancy across availability zones."
    },
    {
        "question": "What authentication protocol does CloudFlow use?",
        "ground_truth": "CloudFlow uses OAuth 2.0 for user-facing applications and API keys for server-to-server communication."
    },
    {
        "question": "What is the service mesh technology used by CloudFlow?",
        "ground_truth": "CloudFlow uses Istio as the service mesh technology to orchestrate communication between microservices."
    },
    {
        "question": "What compliance standards does CloudFlow support?",
        "ground_truth": "CloudFlow supports SOC 2 Type II, GDPR, HIPAA, ISO 27001, and PCI DSS Level 1 compliance standards."
    },
    {
        "question": "How long are CloudFlow audit logs retained?",
        "ground_truth": "CloudFlow audit logs are immutable and retained for 2 years."
    },
    
    # ========== MULTI-FACT (4) ==========
    {
        "question": "What are the three main layers of CloudFlow architecture?",
        "ground_truth": "The three main layers are: API Gateway layer (handles authentication and routing), Service Mesh layer (orchestrates microservices), and Data Storage layer (distributed database with replication)."
    },
    {
        "question": "What are CloudFlow's pricing tiers and their API rate limits?",
        "ground_truth": "Standard tier costs $99/month with 1,000 requests/hour, Premium tier costs $499/month with 10,000 requests/hour, and Enterprise tier has custom pricing with 100,000+ requests/hour."
    },
    {
        "question": "What HTTP status codes indicate authentication failures in CloudFlow API?",
        "ground_truth": "401 Unauthorized indicates missing or invalid API key, and 403 Forbidden indicates valid API key but insufficient permissions."
    },
    {
        "question": "What auto-scaling metrics does CloudFlow monitor?",
        "ground_truth": "CloudFlow monitors CPU utilization (target 70%), memory usage (target 80%), and triggers scaling when thresholds are exceeded for more than 3 consecutive minutes."
    },
    
    # ========== PROCEDURAL (3) ==========
    {
        "question": "How do I authenticate with CloudFlow APIs using an API key?",
        "ground_truth": "Include your API key in the Authorization header as 'Authorization: Bearer YOUR_API_KEY'. All requests must be made over HTTPS, and API keys have the format 'cf_live_' followed by 32 alphanumeric characters."
    },
    {
        "question": "How do I handle rate limit errors in CloudFlow?",
        "ground_truth": "When you receive a 429 error, implement exponential backoff in retry logic, check the X-RateLimit-Reset header to know when limits reset, and use the Retry-After header to determine wait time (1s, then 2s, then 4s, etc.)."
    },
    {
        "question": "What steps should I follow to optimize CloudFlow API performance?",
        "ground_truth": "Implement caching with Redis (TTL 5-60 minutes), batch multiple operations into single API calls, use pagination for large result sets (50-100 items), enable request compression with gzip, and maintain a connection pool with 5-10 concurrent connections."
    },
    
    # ========== COMPARISON (2) ==========
    {
        "question": "What's the difference between Standard and Premium tier rate limits?",
        "ground_truth": "Standard tier allows 1,000 requests per hour with 100 requests per minute burst, while Premium tier allows 10,000 requests per hour with 500 requests per minute burst. Premium also includes priority request processing."
    },
    {
        "question": "How does OAuth 2.0 authentication differ from API key authentication in CloudFlow?",
        "ground_truth": "OAuth 2.0 is recommended for user-facing applications with access tokens valid for 1 hour and provides the Authorization Code flow, while API keys are ideal for server-to-server communication, never expire unless revoked, and have a simpler implementation."
    },
    
    # ========== TROUBLESHOOTING (2) ==========
    {
        "question": "What should I do if I receive a 504 timeout error?",
        "ground_truth": "Increase client timeout to at least 30 seconds, use async endpoints for long-running operations and poll for results, and check the CloudFlow status page for any service degradation."
    },
    {
        "question": "How do I debug slow API response times in CloudFlow?",
        "ground_truth": "Add X-CloudFlow-Debug: true header to requests for detailed debug information, review API logs in the dashboard under Analytics > API Logs, test with curl commands, and verify network connectivity to *.cloudflow.io on port 443."
    },
    
    # ========== EDGE CASES (2) ==========
    {
        "question": "What happens if I use an expired OAuth token?",
        "ground_truth": "If you use an expired access token, you'll receive a 401 Unauthorized error. You should use your refresh token to obtain a new access token. Access tokens are valid for 1 hour and refresh tokens are valid for 30 days."
    },
    {
        "question": "Does CloudFlow support blockchain integration?",
        "ground_truth": "I don't have enough information to answer that question."  # Tests 'I don't know' handling
    },
]

print(f"✓ Created {len(test_cases)} test questions with ground truth answers\n")
print("Question breakdown by category:")
print("  - Simple Factual: 5 questions")
print("  - Multi-Fact: 4 questions")
print("  - Procedural: 3 questions")
print("  - Comparison: 2 questions")
print("  - Troubleshooting: 2 questions")
print("  - Edge Cases: 2 questions")

✓ Created 18 test questions with ground truth answers

Question breakdown by category:
  - Simple Factual: 5 questions
  - Multi-Fact: 4 questions
  - Procedural: 3 questions
  - Comparison: 2 questions
  - Troubleshooting: 2 questions
  - Edge Cases: 2 questions


RAGAS

In [None]:
evaluation_dataset={
    "user_input":[],
    "response":[],
    "reference":[],
    "retrieved_contexts":[]
}

# for a csv File
# Save to CSV
#eval_dataset.to_csv("my_eval_dataset.csv")
# # After creating eval_dataset
# with open("my_eval_dataset.pkl", "wb") as f:
#     pickle.dump(eval_dataset, f)

# print("✅ Dataset saved as pickle")
# Load from CSV
#loaded_dataset = EvaluationDataset.from_csv("my_eval_dataset.csv")

for references in test_cases:
    user_input=references['question']
    references=references['ground_truth']

    response=chain.invoke(user_input)
    retriever_docs=retriever.invoke(user_input)
    retriever_docs=[doc.page_content for doc in retriever_docs]


    evaluation_dataset['user_input'].append(user_input)
    evaluation_dataset['response'].append(response)
    evaluation_dataset['reference'].append(references)
    evaluation_dataset['retrieved_contexts'].append(retriever_docs)




print(evaluation_dataset)




{'user_input': ["What is CloudFlow's uptime SLA?", 'What authentication protocol does CloudFlow use?', 'What is the service mesh technology used by CloudFlow?', 'What compliance standards does CloudFlow support?', 'How long are CloudFlow audit logs retained?', 'What are the three main layers of CloudFlow architecture?', "What are CloudFlow's pricing tiers and their API rate limits?", 'What HTTP status codes indicate authentication failures in CloudFlow API?', 'What auto-scaling metrics does CloudFlow monitor?', 'How do I authenticate with CloudFlow APIs using an API key?', 'How do I handle rate limit errors in CloudFlow?', 'What steps should I follow to optimize CloudFlow API performance?', "What's the difference between Standard and Premium tier rate limits?", 'How does OAuth 2.0 authentication differ from API key authentication in CloudFlow?', 'What should I do if I receive a 504 timeout error?', 'How do I debug slow API response times in CloudFlow?', 'What happens if I use an expire

In [170]:
evaluation_docs=[
    SingleTurnSample(
        user_input=evaluation_dataset['user_input'][i],
        response=evaluation_dataset['response'][i],
        reference=evaluation_dataset['reference'][i],
        retrieved_contexts=evaluation_dataset['retrieved_contexts'][i]
    )
    for i in range(0,len(evaluation_dataset['user_input']))
]

eval_set=EvaluationDataset(samples=evaluation_docs)

evaluator_llm=ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=0,
    max_completion_tokens=3000,
    timeout=480,
    n=3
)

evaluator_embedding=OpenAIEmbeddings(
    model='text-embedding-3-small'
)

ragas_llm=LangchainLLMWrapper(evaluator_llm)
ragas_embedding=LangchainEmbeddingsWrapper(evaluator_embedding)

  ragas_llm=LangchainLLMWrapper(evaluator_llm)
  ragas_embedding=LangchainEmbeddingsWrapper(evaluator_embedding)


In [171]:
from ragas.metrics import(
    ContextRecall,
    ContextEntityRecall,
    ContextPrecision,
    Faithfulness,
    NoiseSensitivity,
    AnswerRelevancy
)

metrics=[
    ContextPrecision(llm=ragas_llm),
    ContextEntityRecall(llm=ragas_llm),
    ContextPrecision(llm=ragas_llm),
    Faithfulness(llm=ragas_llm),
    NoiseSensitivity(llm=ragas_llm),
    AnswerRelevancy(llm=ragas_llm,embeddings=evaluator_embedding)
]

result=evaluate(
    dataset=eval_set,
    metrics=metrics
)

print(result)


Evaluating:   0%|          | 0/108 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   1%|          | 1/108 [00:12<22:50, 12.80s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   6%|▌         | 6/108 [00:53<09:52,  5.81s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  15%|█▍        | 16/108 [02:50<13:54,  9.07s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  16%|█▌        | 17/108 [03:06<16:41, 11.01s/it]Exception raised in Job[4]: TimeoutError()
Exception raised in Job[0]: TimeoutError()
Evaluating:  18%|█▊        | 19/108 [03:08<09:34,  6.46s/it]Exception raised in Job[14]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Evaluating:  22%|██▏       | 24/108 [03:19<05:21,  3.83s/it]Exception raised in Job[16]: TimeoutError()
E

{'context_precision': 0.7858, 'context_entity_recall': 0.3833, 'faithfulness': 0.8716, 'noise_sensitivity(mode=relevant)': 0.0000, 'answer_relevancy': 0.7831}


LLM as a Judge

In [24]:
from enum import Enum
class Score(str,Enum):
    """0-3 for creating the Evaluation metrics"""
    no_relevance="0"
    low_relevance="1"
    medium_relevance="2"
    high_relevance="3"

SCORE_DESCRIPTION = (
    "Score as a string between '0' and '3'. "
    "0: No relevance/Not grounded/Poor quality - Completely fails the criterion. "
    "1: Low relevance/Low groundedness/Below average - Minimal adherence to criterion. "
    "2: Medium relevance/Medium groundedness/Good - Mostly meets the criterion. "
    "3: High relevance/High groundedness/Excellent - Fully meets the criterion."
)

Groundedness

In [25]:
from pydantic import BaseModel,Field
from typing import List


class Groundedness(BaseModel):
     """Evaluates if the answer is faithful to retrieved context (no hallucinations)"""
     reasoning:str=Field(...,description="Check if the generated response is derived from the retrieved context ensuring the LLM doesnot Hallucinate.")
     score:Score=Field(...,description=SCORE_DESCRIPTION)

Answer Relevancy

In [26]:
class AnswerRelevancy(BaseModel):
    """Checks if the answer given as response is relevant to the question asked as query"""
    reasoning:str=Field(...,description="Check if the answer given as response is relevant to the question asked as query")
    score:Score=Field(...,description=SCORE_DESCRIPTION)

    

Retrieval Quality

In [27]:
class RetrievalQuality(BaseModel):
    """Evaluates if the retrieved contexts are helpful is responsing the query"""
    reasoning:str=Field(...,description="Evaluate if the contexts retrieved from the sources are relevant and helpful in aswering the query asked by the user")
    score:Score=Field(...,description=SCORE_DESCRIPTION)

RAG EVALUATION

In [28]:
# Complete RAG Evaluation Model
class RAGEvaluation(BaseModel):
    """Complete RAG evaluation with all three metrics"""
    groundedness: Groundedness = Field(
        ...,
        description="Evaluation of answer faithfulness to retrieved context"
    )
    answer_relevance: AnswerRelevancy = Field(
        ...,
        description="Evaluation of answer relevance to the question"
    )
    retrieval_quality: RetrievalQuality = Field(
        ...,
        description="Evaluation of retrieved context relevance to the question"
    )


Judge Prompt

In [None]:


prompt_template="""You are an expert evaluator for Retrieval-Augmented Generation (RAG) systems.
Your role is to assess the quality of RAG system outputs across three key dimensions:

1. **Groundedness**: How faithful is the answer to the retrieved context? Does it contain hallucinations?
2. **Answer Relevance**: How well does the answer address the user's question?
3. **Retrieval Quality**: How relevant are the retrieved contexts to answering the question?

For each dimension:
- Provide detailed step-by-step reasoning
- Assign a score from 0-3 where:
  - 0: Completely fails the criterion
  - 1: Minimal adherence (significant issues)
  - 2: Mostly meets criterion (minor issues)
  - 3: Fully meets criterion (excellent quality)

Be objective, thorough, and consistent in your evaluations."""

print(prompt_template)

You are an expert evaluator for Retrieval-Augmented Generation (RAG) systems.
Your role is to assess the quality of RAG system outputs across three key dimensions:

1. **Groundedness**: How faithful is the answer to the retrieved context? Does it contain hallucinations?
2. **Answer Relevance**: How well does the answer address the user's question?
3. **Retrieval Quality**: How relevant are the retrieved contexts to answering the question?

For each dimension:
- Provide detailed step-by-step reasoning
- Assign a score from 0-3 where:
  - 0: Completely fails the criterion
  - 1: Minimal adherence (significant issues)
  - 2: Mostly meets criterion (minor issues)
  - 3: Fully meets criterion (excellent quality)

Be objective, thorough, and consistent in your evaluations.


Create the Format suitable for LLM judge

In [30]:
def create_judge_prompt(query:str,response:str,context:List[str])->str:
    """
        Create the user prompt for the judge LLM.
    
    Args:
        question: User's question
        retrieved_contexts: List of retrieved document chunks
        generated_answer: RAG system's answer
        
    Returns:
        Formatted prompt string
    """

    context_format="\n\n--\n\n".join(f"Context {i+1}:\n{context}" for i,context in enumerate(context))

    return """Evaluate the following component of RAG
    Question:{query}
    Response:{response}
    context:{context_format}

    Please evaluate the Following component of RAG based on groundedness,answer_relevance,retrieval_quality
    """





Evaluate RAG with Judge

In [36]:
def evaluate_rag_with_judge(query:str,context:List[str],response:str)->RAGEvaluation:
        """
    Evaluate a single RAG interaction using LLM-as-Judge.
    
    Args:
        question: User's question
        retrieved_contexts: List of retrieved document chunks
        generated_answer: RAG system's answer
        
    Returns:
        RAGEvaluation object with scores and reasoning for all metrics
    """
        messages=[
            {"role":"system","content":prompt_template},
            {"role":"user","content":create_judge_prompt(query,context,response)}
        ]

        response=llm.with_structured_output(RAGEvaluation).invoke(messages)

        return response


Test

In [39]:
test_question = "What is CloudFlow's uptime SLA?"
test_answer =chain.invoke(test_question)
test_contexts = [doc.page_content for doc in retriever.invoke(test_question)]


response=evaluate_rag_with_judge(test_question,test_contexts,test_answer)
print(response)

groundedness=Groundedness(reasoning='The groundedness will be evaluated based on how faithfully the answer is derived from the retrieved context without any hallucinations.', score=<Score.medium_relevance: '2'>) answer_relevance=AnswerRelevancy(reasoning="The answer relevance will be assessed based on how well the response addresses the user's question.", score=<Score.high_relevance: '3'>) retrieval_quality=RetrievalQuality(reasoning="The retrieval quality will be evaluated based on the relevance and helpfulness of the retrieved contexts in answering the user's query.", score=<Score.medium_relevance: '2'>)


Ground Truth

In [40]:
# Test cases: question + ground truth answer pairs
test_cases = [
    # ========== SIMPLE FACTUAL (5) ==========
    {
        "question": "What is CloudFlow's uptime SLA?",
        "ground_truth": "CloudFlow guarantees 99.99% uptime SLA with triple redundancy across availability zones."
    },
    {
        "question": "What authentication protocol does CloudFlow use?",
        "ground_truth": "CloudFlow uses OAuth 2.0 for user-facing applications and API keys for server-to-server communication."
    },
    {
        "question": "What is the service mesh technology used by CloudFlow?",
        "ground_truth": "CloudFlow uses Istio as the service mesh technology to orchestrate communication between microservices."
    },
    {
        "question": "What compliance standards does CloudFlow support?",
        "ground_truth": "CloudFlow supports SOC 2 Type II, GDPR, HIPAA, ISO 27001, and PCI DSS Level 1 compliance standards."
    },
    {
        "question": "How long are CloudFlow audit logs retained?",
        "ground_truth": "CloudFlow audit logs are immutable and retained for 2 years."
    },
    
    # ========== MULTI-FACT (4) ==========
    {
        "question": "What are the three main layers of CloudFlow architecture?",
        "ground_truth": "The three main layers are: API Gateway layer (handles authentication and routing), Service Mesh layer (orchestrates microservices), and Data Storage layer (distributed database with replication)."
    },
    {
        "question": "What are CloudFlow's pricing tiers and their API rate limits?",
        "ground_truth": "Standard tier costs $99/month with 1,000 requests/hour, Premium tier costs $499/month with 10,000 requests/hour, and Enterprise tier has custom pricing with 100,000+ requests/hour."
    },
    {
        "question": "What HTTP status codes indicate authentication failures in CloudFlow API?",
        "ground_truth": "401 Unauthorized indicates missing or invalid API key, and 403 Forbidden indicates valid API key but insufficient permissions."
    },
    {
        "question": "What auto-scaling metrics does CloudFlow monitor?",
        "ground_truth": "CloudFlow monitors CPU utilization (target 70%), memory usage (target 80%), and triggers scaling when thresholds are exceeded for more than 3 consecutive minutes."
    },
    
    # ========== PROCEDURAL (3) ==========
    {
        "question": "How do I authenticate with CloudFlow APIs using an API key?",
        "ground_truth": "Include your API key in the Authorization header as 'Authorization: Bearer YOUR_API_KEY'. All requests must be made over HTTPS, and API keys have the format 'cf_live_' followed by 32 alphanumeric characters."
    },
    {
        "question": "How do I handle rate limit errors in CloudFlow?",
        "ground_truth": "When you receive a 429 error, implement exponential backoff in retry logic, check the X-RateLimit-Reset header to know when limits reset, and use the Retry-After header to determine wait time (1s, then 2s, then 4s, etc.)."
    },
    {
        "question": "What steps should I follow to optimize CloudFlow API performance?",
        "ground_truth": "Implement caching with Redis (TTL 5-60 minutes), batch multiple operations into single API calls, use pagination for large result sets (50-100 items), enable request compression with gzip, and maintain a connection pool with 5-10 concurrent connections."
    },
    
    # ========== COMPARISON (2) ==========
    {
        "question": "What's the difference between Standard and Premium tier rate limits?",
        "ground_truth": "Standard tier allows 1,000 requests per hour with 100 requests per minute burst, while Premium tier allows 10,000 requests per hour with 500 requests per minute burst. Premium also includes priority request processing."
    },
    {
        "question": "How does OAuth 2.0 authentication differ from API key authentication in CloudFlow?",
        "ground_truth": "OAuth 2.0 is recommended for user-facing applications with access tokens valid for 1 hour and provides the Authorization Code flow, while API keys are ideal for server-to-server communication, never expire unless revoked, and have a simpler implementation."
    },
    
    # ========== TROUBLESHOOTING (2) ==========
    {
        "question": "What should I do if I receive a 504 timeout error?",
        "ground_truth": "Increase client timeout to at least 30 seconds, use async endpoints for long-running operations and poll for results, and check the CloudFlow status page for any service degradation."
    },
    {
        "question": "How do I debug slow API response times in CloudFlow?",
        "ground_truth": "Add X-CloudFlow-Debug: true header to requests for detailed debug information, review API logs in the dashboard under Analytics > API Logs, test with curl commands, and verify network connectivity to *.cloudflow.io on port 443."
    },
    
    # ========== EDGE CASES (2) ==========
    {
        "question": "What happens if I use an expired OAuth token?",
        "ground_truth": "If you use an expired access token, you'll receive a 401 Unauthorized error. You should use your refresh token to obtain a new access token. Access tokens are valid for 1 hour and refresh tokens are valid for 30 days."
    },
    {
        "question": "Does CloudFlow support blockchain integration?",
        "ground_truth": "I don't have enough information to answer that question."  # Tests 'I don't know' handling
    },
]

print(f"✓ Created {len(test_cases)} test questions with ground truth answers\n")
print("Question breakdown by category:")
print("  - Simple Factual: 5 questions")
print("  - Multi-Fact: 4 questions")
print("  - Procedural: 3 questions")
print("  - Comparison: 2 questions")
print("  - Troubleshooting: 2 questions")
print("  - Edge Cases: 2 questions")

✓ Created 18 test questions with ground truth answers

Question breakdown by category:
  - Simple Factual: 5 questions
  - Multi-Fact: 4 questions
  - Procedural: 3 questions
  - Comparison: 2 questions
  - Troubleshooting: 2 questions
  - Edge Cases: 2 questions


Creating the Dataset

In [50]:


evaluator_dataset=[]
{
    "user_input":[],
    "response":[],
    "reference":[],
    "retriever_context":[]
}

for i,test in enumerate(test_cases):
    user_input=test.get("question")
    response=chain.invoke(user_input)
    reference=test.get("ground_truth")
    retriever_context=retriever.invoke(user_input)
    retriever_docs=[doc.page_content for doc in retriever_context]

    evaluator_dataset.append({
    "user_input":user_input,
    "response":response,
    "reference":reference,
    "retriever_context":retriever_docs
})

print(evaluator_dataset)




[{'user_input': "What is CloudFlow's uptime SLA?", 'response': "CloudFlow's uptime SLA is guaranteed at 99.99% across availability zones.", 'reference': 'CloudFlow guarantees 99.99% uptime SLA with triple redundancy across availability zones.', 'retriever_context': ['CloudFlow guarantees 99.99% uptime SLA with triple redundancy across availability zones. The platform supports horizontal scaling with automatic load balancing, allowing each service to scale independently based on CPU and memory metrics.', 'CloudFlow Pricing Tiers\n\nCloudFlow offers three pricing tiers designed to meet the needs of individuals, teams, and enterprises.\n\nStandard Tier ($99/month):\n- 1,000 API requests per hour\n- 100 GB storage included\n- 10 GB bandwidth per month\n- Community support via forums\n- 99.9% uptime SLA\n- Up to 5 team members\n\nPremium Tier ($499/month):\n- 10,000 API requests per hour\n- 1 TB storage included\n- 100 GB bandwidth per month\n- Email support with 24-hour response time\n- 99

Judge results

In [51]:
judge_results=[]

for items in evaluator_dataset:
    evaluation=evaluate_rag_with_judge(
        query=items["user_input"],
        context=items["retriever_context"],
        response=items["response"]
    )

    judge_results.append(evaluation)

print(judge_results)

[RAGEvaluation(groundedness=Groundedness(reasoning='The groundedness will be evaluated based on how faithfully the answer is derived from the retrieved context without any hallucinations.', score=<Score.medium_relevance: '2'>), answer_relevance=AnswerRelevancy(reasoning="The answer relevance will be assessed based on how well the response addresses the user's question.", score=<Score.high_relevance: '3'>), retrieval_quality=RetrievalQuality(reasoning="The retrieval quality will be evaluated based on the relevance and helpfulness of the retrieved context in answering the user's query.", score=<Score.medium_relevance: '2'>)), RAGEvaluation(groundedness=Groundedness(reasoning='The groundedness will be evaluated based on how faithfully the answer is derived from the retrieved context without any hallucinations.', score=<Score.medium_relevance: '2'>), answer_relevance=AnswerRelevancy(reasoning="The answer relevance will be assessed based on how well the response addresses the user's questio