# RAGAs Integration with LLM Log JSON

This notebook integrates RAGAs metrics into LLM log evaluation using Google's Gemini 2.0 Flash model.

## Metrics Computed:
- **Faithfulness**: Factual consistency with context
- **Answer Relevancy**: Relevance to user query
- **Context Precision**: Proportion of relevant context chunks

## 1. Install Dependencies

In [2]:
!pip install ragas langchain-google-genai langchain-core numpy




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
!pip install Pillow




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 2. Import Libraries

In [4]:
import asyncio
import json
import os
from datetime import date
from getpass import getpass
from typing import List, Dict, Any

import numpy as np

# RAGAs imports
from ragas import SingleTurnSample
from ragas.metrics import Faithfulness, ResponseRelevancy, LLMContextPrecisionWithoutReference
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# LangChain Google GenAI imports
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


## 3. Set Google API Key

In [5]:
# Read the Google API key from the environment; if it is missing, prompt for it
# without echoing the input. Never paste a key into the notebook itself: it is
# saved with the file and leaks as soon as the notebook is committed or shared.
# Any key that was ever hard-coded here must be treated as compromised and revoked.
# Get your API key from: https://ai.google.dev/
if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ").strip()

# Verify the API key is set (an empty prompt answer or the placeholder value does not count)
if not os.getenv("GOOGLE_API_KEY") or os.getenv("GOOGLE_API_KEY") == "your-google-api-key-here":
    print("⚠️  Please provide your actual Google API key via the GOOGLE_API_KEY environment variable")
else:
    print("✅ Google API key is set")

✅ Google API key is set


## 4. RAGAs Evaluator Class

In [6]:
class RAGAsEvaluator:
    """
    RAGAs evaluator for LLM logs using Gemini models.

    Computes three RAGAs metrics per logged sample: faithfulness, answer
    relevancy and context precision (without reference). The system prompt of
    each logged item is used as the single retrieved context.

    Attributes:
        model_name: Name of the Gemini chat model used as the judge LLM.
        request_delay: Seconds to wait before every metric call (rate limiting).
        llm / embeddings: RAGAs wrappers around the LangChain Google clients.
        faithfulness_metric, answer_relevancy_metric, context_precision_metric:
            The configured RAGAs metric objects.
    """

    def __init__(
        self,
        google_api_key: str = None,
        model: str = "gemini-2.0-flash",
        max_tokens: int = 1024,
        request_delay: float = 25.0,
    ):
        """Initialize the RAGAs evaluator with Gemini models.

        Args:
            google_api_key: Optional API key; when given it is exported as the
                GOOGLE_API_KEY environment variable. When omitted, the
                variable must already be set.
            model: Gemini chat model used as the judge LLM.
            max_tokens: Output token limit passed to the chat model.
                NOTE(review): judge responses are JSON; if they are cut off at
                this limit they may fail to parse ("Failed to parse ... from
                completion") and the metric is then recorded as 0.0 — confirm
                and raise the limit if that is what is happening.
            request_delay: Seconds to sleep before each metric call to stay
                under the API rate limit.

        Raises:
            ValueError: If no key is passed and GOOGLE_API_KEY is not set.
        """
        if google_api_key:
            os.environ["GOOGLE_API_KEY"] = google_api_key
        elif not os.getenv("GOOGLE_API_KEY"):
            raise ValueError("Google API key must be provided")

        self.model_name = model
        self.request_delay = request_delay

        # Judge LLM used by all three metrics
        self.llm = LangchainLLMWrapper(
            ChatGoogleGenerativeAI(
                model=model,
                temperature=0.1,
                max_tokens=max_tokens,
                timeout=100  # Changed from request_timeout to timeout
            )
        )

        # Initialize Google embeddings (used by the answer relevancy metric)
        self.embeddings = LangchainEmbeddingsWrapper(
            GoogleGenerativeAIEmbeddings(
                model="models/embedding-001",
                task_type="retrieval_document"
            )
        )

        # Initialize RAGAs metrics
        self.faithfulness_metric = Faithfulness(llm=self.llm)
        self.answer_relevancy_metric = ResponseRelevancy(
            llm=self.llm,
            embeddings=self.embeddings
        )
        self.context_precision_metric = LLMContextPrecisionWithoutReference(
            llm=self.llm
        )

    def load_logs(self, log_file_path: str):
        """Load log data from a UTF-8 encoded JSON file and return the parsed object."""
        with open(log_file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def extract_evaluation_data(self, log_data) -> List[Dict[str, Any]]:
        """Extract evaluation samples from the log structure.

        Accepts either a list of log entries or a dict holding them under
        "logs". Each entry carries "items"; each item carries an "input" list
        (messages with "role" and "context", as in the exported log format)
        and an "expectedOutput" list (messages with "role" and "content").
        Missing/null fields and entries that are not dicts are skipped instead
        of raising.

        Returns:
            One dict per usable item with keys "id", "query" (last user
            message), "context" (last system message, "" if absent) and
            "response" (first assistant message). Items lacking a query or a
            response are dropped.
        """
        evaluation_samples = []

        if isinstance(log_data, list):
            logs = log_data
        elif isinstance(log_data, dict):
            logs = log_data.get("logs") or []
        else:
            logs = []

        for log_entry in logs:
            if not isinstance(log_entry, dict):
                continue

            # `or []` also covers keys that are present but null
            for item in log_entry.get("items") or []:
                if not isinstance(item, dict):
                    continue

                item_id = item.get("id", f"item-{len(evaluation_samples) + 1}")

                # Extract system and user content from the input list
                system_prompt = ""
                user_query = ""
                for input_item in item.get("input") or []:
                    if not isinstance(input_item, dict):
                        continue
                    role = input_item.get("role", "")
                    if role == "system":
                        system_prompt = input_item.get("context") or ""
                    elif role == "user":
                        user_query = input_item.get("context") or ""

                # Extract the expected output (response): first assistant message
                expected_output = ""
                for output_item in item.get("expectedOutput") or []:
                    if isinstance(output_item, dict) and output_item.get("role") == "assistant":
                        expected_output = output_item.get("content") or ""
                        break

                # Only add if we have both query and expected output
                if user_query and expected_output:
                    evaluation_samples.append({
                        "id": item_id,
                        "query": user_query,
                        "context": system_prompt,
                        "response": expected_output
                    })

        return evaluation_samples

    async def _score_metric(self, metric, sample, label: str) -> float:
        """Wait out the rate-limit delay, score one metric, and map failures and NaN to 0.0."""
        await asyncio.sleep(self.request_delay)
        try:
            score = float(await metric.single_turn_ascore(sample))
        except Exception as e:
            # Best effort: a failing metric must not abort the whole run. Note that
            # the 0.0 recorded here is indistinguishable from a genuine zero score.
            print(f"{label} error: {e}")
            return 0.0
        return 0.0 if np.isnan(score) else score

    async def compute_ragas_metrics(self, sample_data: Dict[str, Any]) -> Dict[str, float]:
        """Compute RAGAs metrics for a single sample.

        Args:
            sample_data: Dict with "query", "response" and "context" keys, as
                produced by `extract_evaluation_data`.

        Returns:
            Dict with float scores under "faithfulness", "answer_relevancy"
            and "context_precision". A metric that raises or yields NaN is
            reported as 0.0; context precision is 0.0 without any API call
            when the sample has no context.
        """
        try:
            context = sample_data["context"]
            sample = SingleTurnSample(
                user_input=sample_data["query"],
                response=sample_data["response"],
                retrieved_contexts=[context] if context else []
            )

            faithfulness_score = await self._score_metric(
                self.faithfulness_metric, sample, "Faithfulness"
            )
            answer_relevancy_score = await self._score_metric(
                self.answer_relevancy_metric, sample, "Answer relevancy"
            )
            if context:
                context_precision_score = await self._score_metric(
                    self.context_precision_metric, sample, "Context precision"
                )
            else:
                context_precision_score = 0.0

            return {
                "faithfulness": faithfulness_score,
                "answer_relevancy": answer_relevancy_score,
                "context_precision": context_precision_score
            }

        except Exception as e:
            print(f"Error computing metrics: {e}")
            return {"faithfulness": 0.0, "answer_relevancy": 0.0, "context_precision": 0.0}

    async def evaluate_logs(self, log_file_path: str) -> List[Dict[str, Any]]:
        """Evaluate all samples in the log file, one after another.

        Returns:
            One dict per sample with "id" and the three scores rounded to four
            decimals; an empty list when no usable sample is found.
        """
        log_data = self.load_logs(log_file_path)
        evaluation_samples = self.extract_evaluation_data(log_data)

        if not evaluation_samples:
            print("⚠️ No evaluation samples found. Check your JSON structure.")
            return []

        results = []

        for i, sample in enumerate(evaluation_samples):
            print(f"Evaluating {i+1}/{len(evaluation_samples)}: {sample['id']}")

            metrics = await self.compute_ragas_metrics(sample)

            result = {
                "id": sample["id"],
                "faithfulness": round(metrics["faithfulness"], 4),
                "answer_relevancy": round(metrics["answer_relevancy"], 4),
                "context_precision": round(metrics["context_precision"], 4)
            }

            results.append(result)
            print(f"  Faithfulness: {result['faithfulness']}, Relevancy: {result['answer_relevancy']}, Precision: {result['context_precision']}")

        return results

    def save_results(self, results: List[Dict[str, Any]], output_file: str = "ragas_evaluation_results.json"):
        """Save evaluation results to a UTF-8 JSON file (overwriting it) and report the path."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Results saved to {output_file}")

## 5. Load and Preview Log Data

In [7]:
# Load and preview the log data.
# The file is read as UTF-8 explicitly so the preview does not depend on the
# platform's default encoding (the evaluator's load_logs also reads UTF-8).
# NOTE(review): this cell previews logs.json, while the evaluation cell reads
# sample_logs.json — confirm which file is intended.
with open('logs.json', 'r', encoding='utf-8') as f:
    log_data = json.load(f)

# Check the type and structure of the loaded data
print(f"Type of log_data: {type(log_data)}")

if isinstance(log_data, list):
    print(f"Number of log entries: {len(log_data)}")
    if log_data:
        print("\nFirst log entry structure:")
        print(json.dumps(log_data[0], indent=2)[:500] + "...")
elif isinstance(log_data, dict):
    print(f"Keys in log_data: {list(log_data.keys())}")
    if 'logs' in log_data:
        # Treat a null "logs" value like an empty list, and only index into it when non-empty
        log_entries = log_data['logs'] or []
        print(f"Number of log entries: {len(log_entries)}")
        if log_entries:
            print("\nFirst log entry structure:")
            print(json.dumps(log_entries[0], indent=2)[:500] + "...")
    else:
        # Show the structure if 'logs' key doesn't exist
        print("\nLog data structure:")
        print(json.dumps(log_data, indent=2)[:500] + "...")
else:
    print(f"Unexpected data type: {type(log_data)}")
    print(str(log_data)[:500] + "...")

Type of log_data: <class 'list'>
Number of log entries: 10

First log entry structure:
{
  "metaData": {
    "name": "Cypher Negative Dataset",
    "description": null,
    "created_at": "2025-07-11T13:46:24.846Z",
    "updated_at": "2025-07-11T13:46:24.846Z",
    "item_count": 1,
    "expected_items": "unknown",
    "exported_at": "2025-07-11T13:46:24.846Z"
  },
  "items": [
    {
      "id": "41ae71ea-59db-46b7-af56-d8f5026ead56",
      "status": "ACTIVE",
      "input": [
        {
          "role": "system",
          "context": "\n<objective>\nYou are Cypher Query BotAI for t...


In [8]:
# Initialize the evaluator.
# No key is passed here, so the constructor relies on the GOOGLE_API_KEY
# environment variable and raises ValueError if it is not set.
evaluator = RAGAsEvaluator()
print("✅ RAGAs evaluator initialized")

✅ RAGAs evaluator initialized


## 6. Run Evaluation

In [None]:
# Run the evaluation with sample data (not the empty logs.json)
import asyncio

async def run_evaluation():
    results = await evaluator.evaluate_logs("sample_logs.json")  # Use sample_logs.json instead
    return results

# Run the evaluation
results = await run_evaluation()

print(f"\n=== Evaluation Complete ===")
print(f"Total samples evaluated: {len(results)}")

Evaluating 1/10: 41ae71ea-59db-46b7-af56-d8f5026ead56
  Faithfulness: 0.0, Relevancy: 0.848, Precision: 1.0
Evaluating 2/10: f2530571-175f-479e-a21b-2e096d186cd8
  Faithfulness: 0.0, Relevancy: 0.839, Precision: 1.0
Evaluating 3/10: 77e16750-3981-4751-a862-ea21bbbd2a1d
  Faithfulness: 0.8, Relevancy: 0.0, Precision: 1.0
Evaluating 4/10: 80e8a1e6-6ba2-4b38-9397-56b89564ca00
  Faithfulness: 1.0, Relevancy: 0.8416, Precision: 1.0
Evaluating 5/10: 3a1d050a-502c-4af7-8ed3-9dd55f13b06a
  Faithfulness: 0.2308, Relevancy: 0.8608, Precision: 1.0
Evaluating 6/10: da3e35f3-da5f-4453-9d68-5c2f951b2d85
Faithfulness error: Failed to parse NLIStatementOutput from completion {"statements": [{"statement": "The first code block matches a brand node to a car node.", "reason": "The provided context describes a Cypher Query BotAI designed to transform car-related questions into Cypher queries. It outlines the schema, mandates, strategies, and rules for generating these queries. However, it does not contain

In [10]:
# Summarise the evaluation: per-metric statistics, flagged samples, and the top three samples
print("=== Detailed Analysis ===")
print(f"Total samples evaluated: {len(results)}")

if not results:
    print("No results to analyze.")
else:
    # Per-metric score lists
    faithfulness_scores = [r["faithfulness"] for r in results]
    relevancy_scores = [r["answer_relevancy"] for r in results]
    precision_scores = [r["context_precision"] for r in results]

    print("\n📊 Score Statistics:")
    for metric_label, metric_scores in (
        ("Faithfulness", faithfulness_scores),
        ("Answer Relevancy", relevancy_scores),
        ("Context Precision", precision_scores),
    ):
        print(f"{metric_label} - Mean: {np.mean(metric_scores):.3f}, Std: {np.std(metric_scores):.3f}")

    # Flag samples: faithfulness exactly 0.0, relevancy below 0.7, precision below 0.8
    issue_checks = (
        ("Low/Zero Faithfulness", lambda entry: entry["faithfulness"] == 0.0),
        ("Low Relevancy", lambda entry: entry["answer_relevancy"] < 0.7),
        ("Low Precision", lambda entry: entry["context_precision"] < 0.8),
    )

    print("\n⚠️ Samples with issues:")
    for result in results:
        issues = [issue_label for issue_label, is_flagged in issue_checks if is_flagged(result)]
        if issues:
            print(f"ID {result['id']}: {', '.join(issues)}")

    # Rank by the plain sum of the three scores and show the best three
    def combined_score(entry):
        return entry["faithfulness"] + entry["answer_relevancy"] + entry["context_precision"]

    print("\n✅ Top performing samples:")
    sorted_results = sorted(results, key=combined_score, reverse=True)
    for result in sorted_results[:3]:
        total_score = combined_score(result)
        print(f"ID {result['id']}: Total={total_score:.3f} (F:{result['faithfulness']}, R:{result['answer_relevancy']}, P:{result['context_precision']})")

=== Detailed Analysis ===
Total samples evaluated: 10

📊 Score Statistics:
Faithfulness - Mean: 0.303, Std: 0.421
Answer Relevancy - Mean: 0.743, Std: 0.250
Context Precision - Mean: 1.000, Std: 0.000

⚠️ Samples with issues:
ID 41ae71ea-59db-46b7-af56-d8f5026ead56: Low/Zero Faithfulness
ID f2530571-175f-479e-a21b-2e096d186cd8: Low/Zero Faithfulness
ID 77e16750-3981-4751-a862-ea21bbbd2a1d: Low Relevancy
ID da3e35f3-da5f-4453-9d68-5c2f951b2d85: Low/Zero Faithfulness
ID ba7cb5ba-b87a-491f-a4a7-6ce72b26afd2: Low/Zero Faithfulness
ID f7097108-3b58-415b-b3ce-31db680e1701: Low/Zero Faithfulness
ID 9bd1b731-bd4d-4da0-85b5-33266fbc5245: Low/Zero Faithfulness

✅ Top performing samples:
ID 80e8a1e6-6ba2-4b38-9397-56b89564ca00: Total=2.842 (F:1.0, R:0.8416, P:1.0)
ID 710acf9d-867a-43f4-9873-70ca08989d79: Total=2.751 (F:1.0, R:0.7512, P:1.0)
ID 3a1d050a-502c-4af7-8ed3-9dd55f13b06a: Total=2.092 (F:0.2308, R:0.8608, P:1.0)


In [11]:
# Save the per-sample results and write a summary report next to them
if results:
    # Save the detailed results
    evaluator.save_results(results, "ragas_evaluation_results.json")

    metric_names = ["faithfulness", "answer_relevancy", "context_precision"]
    scores_by_metric = {name: [r[name] for r in results] for name in metric_names}

    # Create a summary report. Statistics are converted to plain floats so the
    # report holds only built-in types when it is serialized.
    summary_report = {
        "evaluation_summary": {
            "total_samples": len(results),
            # Stamp the report with the day it is generated rather than a fixed date
            "evaluation_date": date.today().isoformat(),
            # Prefer the evaluator's configured model name when it exposes one;
            # otherwise fall back to the model this notebook configures.
            "model_used": getattr(evaluator, "model_name", "gemini-2.0-flash"),
            "metrics_computed": metric_names
        },
        "average_scores": {
            name: round(float(np.mean(scores)), 4)
            for name, scores in scores_by_metric.items()
        },
        "score_distribution": {
            name: {
                "min": round(float(np.min(scores)), 4),
                "max": round(float(np.max(scores)), 4),
                "std": round(float(np.std(scores)), 4)
            }
            for name, scores in scores_by_metric.items()
        },
        "detailed_results": results
    }

    # Save summary report
    with open("ragas_summary_report.json", "w", encoding="utf-8") as f:
        json.dump(summary_report, f, indent=2, ensure_ascii=False)

    print("✅ Results saved:")
    print("  - ragas_evaluation_results.json (detailed results)")
    print("  - ragas_summary_report.json (summary report)")

    # Display final summary
    print("\n🎯 Final Summary:")
    print(f"Average Faithfulness: {summary_report['average_scores']['faithfulness']}")
    print(f"Average Answer Relevancy: {summary_report['average_scores']['answer_relevancy']}")
    print(f"Average Context Precision: {summary_report['average_scores']['context_precision']}")

    # Unweighted mean of the three (already rounded) metric averages
    overall_score = (summary_report['average_scores']['faithfulness'] +
                    summary_report['average_scores']['answer_relevancy'] +
                    summary_report['average_scores']['context_precision']) / 3
    print(f"Overall Average Score: {overall_score:.4f}")

else:
    print("❌ No results to save")

Results saved to ragas_evaluation_results.json
✅ Results saved:
  - ragas_evaluation_results.json (detailed results)
  - ragas_summary_report.json (summary report)

🎯 Final Summary:
Average Faithfulness: 0.3031
Average Answer Relevancy: 0.7431
Average Context Precision: 1.0
Overall Average Score: 0.6821
