<a href="https://colab.research.google.com/github/Sidhtang/evaluation-metrics-in-hallucination/blob/main/hallucination_metrics_in_llms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell escape: install the package that provides the
# `google.generativeai` module imported further down.
!pip install google-generativeai



In [None]:
# Fetch the 'punkt_tab' tokenizer data package into the local NLTK data directory.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import json
import os
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import google.generativeai as genai
from typing import Dict, List, Tuple, Union, Optional

# Download necessary NLTK data.
# The sentence tokenizer imported above relies on the Punkt models. Recent NLTK
# releases load them from the 'punkt_tab' resource, while older releases use the
# 'punkt' resource, so make sure both are present instead of only 'punkt'.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)

class RGBHallucinationEvaluator:
    """Evaluates LLM responses for hallucinations using the RGB framework with Gemini 2.0.

    Three dimensions are scored by prompting a Gemini model and parsing its JSON
    answer: retrieval (factual errors), generation (logical inconsistencies) and
    bias (unfounded assumptions). Each dimension yields a score in [0, 1]; the
    overall score is their weighted average.
    """

    # Relative weight of each dimension in the overall score.
    SCORE_WEIGHTS = {"retrieval": 0.4, "generation": 0.4, "bias": 0.2}

    # Score reported whenever the model output is unusable or the call fails.
    FALLBACK_SCORE = 0.5

    # (exclusive upper bound, interpretation) pairs, checked in ascending order.
    _SCORE_BANDS = (
        (0.1, "Excellent: Response contains virtually no hallucinations"),
        (0.3, "Good: Response contains minor hallucinations that don't significantly impact quality"),
        (0.5, "Fair: Response contains moderate hallucinations that somewhat impact quality"),
        (0.7, "Poor: Response contains substantial hallucinations that significantly impact quality"),
    )

    def __init__(self, api_key: Optional[str] = None, model_name: str = 'gemini-2.0-flash'):
        """
        Initialize the evaluator with Gemini 2.0.

        Args:
            api_key: Gemini API key. When omitted, the GEMINI_API_KEY environment
                variable is used instead.
            model_name: Name of the Gemini model used for evaluation.
        """
        # Resolve the key once: explicit argument first, then the environment.
        resolved_key = api_key or os.environ.get("GEMINI_API_KEY")
        if not resolved_key:
            # Keep construction non-fatal (each evaluation already degrades to a
            # neutral fallback score on failure), but say what is actually wrong.
            resolved_key = ""
            print("Warning: No Gemini API key provided (api_key argument or GEMINI_API_KEY "
                  "environment variable); model calls are expected to fail.")
        genai.configure(api_key=resolved_key)

        # Initialize Gemini model - the 2.0 Flash model by default
        self.model = genai.GenerativeModel(model_name)

        # Load bias word lists. Some words appear in more than one category;
        # the heuristic in _evaluate_bias_hallucination de-duplicates them.
        self.bias_indicators = {
            "demographic": ["all", "every", "always", "never", "obviously", "clearly", "certainly", "undoubtedly"],
            "confidence": ["definitely", "absolutely", "without doubt", "undeniably", "unquestionably"],
            "absolutist": ["always", "never", "everyone", "nobody", "impossible", "guaranteed"]
        }

    def evaluate(self, response_text: str, reference_text: Optional[str] = None, query_text: Optional[str] = None) -> Dict:
        """
        Evaluate the LLM response for hallucinations.

        Args:
            response_text: The LLM response to evaluate
            reference_text: Optional reference text to compare against
            query_text: Optional original query that generated the response

        Returns:
            Dictionary containing the overall score, the per-dimension result
            dictionaries and a textual interpretation of the overall score
        """
        # Evaluate each dimension
        dimension_results = {
            "retrieval": self._evaluate_retrieval_hallucination(response_text, reference_text),
            "generation": self._evaluate_generation_hallucination(response_text, query_text),
            "bias": self._evaluate_bias_hallucination(response_text),
        }

        # Calculate overall score (weighted average)
        overall_score = sum(
            weight * dimension_results[name]["score"]
            for name, weight in self.SCORE_WEIGHTS.items()
        )

        # Return comprehensive results
        return {
            "overall_hallucination_score": overall_score,
            "retrieval_hallucination": dimension_results["retrieval"],
            "generation_hallucination": dimension_results["generation"],
            "bias_hallucination": dimension_results["bias"],
            "interpretation": self._interpret_score(overall_score)
        }

    @staticmethod
    def _normalize_score(raw_score) -> float:
        """Coerce a model-reported score to a float clamped to [0, 1].

        Values that cannot be converted to a number (or are NaN) become 0.5.
        """
        try:
            value = float(raw_score)
        except (TypeError, ValueError):
            return 0.5
        if value != value:  # NaN is the only value unequal to itself
            return 0.5
        return min(1.0, max(0.0, value))

    def _parse_gemini_response(self, response_text: str) -> Dict:
        """
        Parse the Gemini model response text into a JSON structure.

        The first JSON object in the text that carries a "score" field is used,
        so surrounding prose, code fences or stray braces after the JSON do not
        break parsing. The returned score is always a float in [0, 1].

        Args:
            response_text: Text response from Gemini

        Returns:
            Dictionary parsed from the JSON in the response, or a fallback
            dictionary with a score of 0.5 when nothing usable is found
        """
        brace_positions = [index for index, char in enumerate(response_text) if char == "{"]
        if not brace_positions:
            # If no JSON structure found, create a basic response
            return {"score": self.FALLBACK_SCORE,
                    "reasoning": "Could not parse structured output from model response"}

        decoder = json.JSONDecoder()
        last_error = None
        for start in brace_positions:
            try:
                # raw_decode stops at the end of the object and ignores whatever follows it.
                candidate, _ = decoder.raw_decode(response_text, start)
            except json.JSONDecodeError as e:
                last_error = e
                continue
            # Requiring "score" avoids mistaking a nested fragment for the result.
            if isinstance(candidate, dict) and "score" in candidate:
                candidate["score"] = self._normalize_score(candidate["score"])
                return candidate

        print(f"JSON parsing error: {last_error or 'no JSON object with a score field found'}")
        print(f"Response text: {response_text}")
        return {"score": self.FALLBACK_SCORE, "reasoning": "Failed to parse model response"}

    def _run_evaluation(self, prompt: str, list_key: str, label: str) -> Dict:
        """
        Send a prompt to the model and return the parsed evaluation.

        Args:
            prompt: Full evaluation prompt
            list_key: Key of the findings list that must exist in the result
            label: Dimension name used in the error message

        Returns:
            Parsed result dictionary, or a fallback with a score of 0.5 and an
            empty findings list when the call or the parsing raises
        """
        try:
            response = self.model.generate_content(prompt)
            result = self._parse_gemini_response(response.text)
        except Exception as e:
            # Deliberately broad: a failure in one dimension must not abort
            # the whole evaluation.
            print(f"Error in {label} hallucination evaluation: {str(e)}")
            return {"score": self.FALLBACK_SCORE, list_key: [], "reasoning": f"Evaluation failed: {str(e)}"}

        # Ensure we have the expected structure
        if list_key not in result:
            result[list_key] = []
        return result

    def _evaluate_retrieval_hallucination(self, response_text: str, reference_text: Optional[str]) -> Dict:
        """
        Evaluate factual errors or made-up information.

        Args:
            response_text: The LLM response to evaluate
            reference_text: Optional reference text to compare against

        Returns:
            Dictionary with retrieval hallucination score and details
        """
        # If reference text is provided, use it for evaluation
        if reference_text:
            prompt = f"""
            You are an expert evaluator of AI-generated content. Analyze the following response for factual errors
            or made-up information (retrieval hallucination) by comparing it to the reference text.

            Reference text:
            {reference_text}

            AI-generated response to evaluate:
            {response_text}

            Identify specific statements in the response that:
            1. Contradict the reference text
            2. Present information not found in the reference text and not common knowledge
            3. Make up specific details, numbers, dates, names, or events

            Then provide:
            - A score from 0 to 1, where 0 means no retrieval hallucination and 1 means severe retrieval hallucination
            - A list of hallucinated statements with explanations

            Return your analysis in JSON format with the following structure:
            {{
                "score": 0.0,
                "hallucinated_statements": [
                    {{
                        "statement": "example statement",
                        "explanation": "explanation of why this is a hallucination"
                    }}
                ],
                "reasoning": "your overall reasoning for the score"
            }}
            """
        else:
            # Without reference, ask the model to evaluate based on common knowledge
            prompt = f"""
            You are an expert evaluator of AI-generated content. Analyze the following response for factual errors
            or made-up information (retrieval hallucination) based on common knowledge.

            AI-generated response to evaluate:
            {response_text}

            Identify specific statements that:
            1. Contradict well-established facts
            2. Present information that is suspiciously specific without citation
            3. Make up specific details, numbers, dates, names, or events
            4. Claim something as fact when it's actually disputed or uncertain

            Then provide:
            - A score from 0 to 1, where 0 means no retrieval hallucination and 1 means severe retrieval hallucination
            - A list of potentially hallucinated statements with explanations

            Return your analysis in JSON format with the following structure:
            {{
                "score": 0.0,
                "hallucinated_statements": [
                    {{
                        "statement": "example statement",
                        "explanation": "explanation of why this may be a hallucination"
                    }}
                ],
                "reasoning": "your overall reasoning for the score"
            }}
            """

        # Get evaluation from Gemini
        return self._run_evaluation(prompt, "hallucinated_statements", "retrieval")

    def _evaluate_generation_hallucination(self, response_text: str, query_text: Optional[str]) -> Dict:
        """
        Evaluate inconsistencies or logical errors in generated content.

        Args:
            response_text: The LLM response to evaluate
            query_text: Optional original query that generated the response

        Returns:
            Dictionary with generation hallucination score and details
        """
        context = f"In response to the query: {query_text}\n\n" if query_text else ""

        prompt = f"""
        You are an expert evaluator of AI-generated content. Analyze the following response for internal inconsistencies,
        logical errors, and self-contradictions (generation hallucination).

        {context}AI-generated response to evaluate:
        {response_text}

        Identify:
        1. Internal contradictions (places where the text contradicts itself)
        2. Logical inconsistencies or errors in reasoning
        3. Non-sequiturs or disconnected chains of reasoning
        4. Claims that don't logically follow from premises

        Then provide:
        - A score from 0 to 1, where 0 means perfect logical consistency and 1 means severe logical inconsistencies
        - A list of inconsistencies with explanations

        Return your analysis in JSON format with the following structure:
        {{
            "score": 0.0,
            "inconsistencies": [
                {{
                    "issue": "description of inconsistency",
                    "explanation": "explanation of the logical problem"
                }}
            ],
            "reasoning": "your overall reasoning for the score"
        }}
        """

        # Get evaluation from Gemini
        return self._run_evaluation(prompt, "inconsistencies", "generation")

    def _evaluate_bias_hallucination(self, response_text: str) -> Dict:
        """
        Evaluate unfounded assumptions or biased framing.

        Args:
            response_text: The LLM response to evaluate

        Returns:
            Dictionary with bias hallucination score and details, plus the
            indicator words found by the keyword heuristic and their count
        """
        # Simple heuristic check for bias indicator words. Words listed in
        # several categories are looked up once so one occurrence counts once.
        unique_words = {}
        for words in self.bias_indicators.values():
            unique_words.update(dict.fromkeys(words))

        lowered_text = response_text.lower()
        total_bias_words = []
        for word in unique_words:
            pattern = r'\b' + re.escape(word) + r'\b'
            total_bias_words.extend([word] * len(re.findall(pattern, lowered_text)))
        bias_word_count = len(total_bias_words)

        # Full analysis with Gemini
        prompt = f"""
        You are an expert evaluator of AI-generated content. Analyze the following response for unfounded assumptions,
        biased framing, and unjustified assertions (bias hallucination).

        AI-generated response to evaluate:
        {response_text}

        Identify:
        1. Unfounded generalizations or assumptions
        2. Biased or one-sided framing of issues
        3. Assertions presented as fact without adequate justification
        4. Subjective judgments presented as objective facts
        5. Use of loaded language or emotional appeals instead of evidence

        Then provide:
        - A score from 0 to 1, where 0 means no bias hallucination and 1 means severe bias hallucination
        - A list of biased statements or unfounded assumptions with explanations

        Return your analysis in JSON format with the following structure:
        {{
            "score": 0.0,
            "biased_elements": [
                {{
                    "statement": "example statement",
                    "issue": "nature of the bias or unfounded assumption",
                    "explanation": "explanation of the issue"
                }}
            ],
            "reasoning": "your overall reasoning for the score"
        }}
        """

        # Get evaluation from Gemini
        result = self._run_evaluation(prompt, "biased_elements", "bias")

        # Add the simple heuristic analysis (also present on the fallback result)
        result["bias_indicators_found"] = total_bias_words
        result["bias_indicator_count"] = bias_word_count
        return result

    def _interpret_score(self, score: float) -> str:
        """
        Interpret the overall hallucination score.

        Args:
            score: Overall hallucination score (0-1)

        Returns:
            String interpretation of the score
        """
        for upper_bound, interpretation in self._SCORE_BANDS:
            if score < upper_bound:
                return interpretation
        return "Very Poor: Response is dominated by hallucinations, making it unreliable"

def main():
    """Evaluate a built-in sample response and print the full and summarized results."""
    # Sample inputs, keyed by the keyword arguments that evaluate() accepts.
    sample_inputs = {
        "response_text": """
    The Great Pyramid of Giza was built in 2560 BCE and stands 481 feet tall. It was constructed by Emperor Napoleon
    during his Egyptian campaign to commemorate his victories. The pyramid contains exactly 2,300,000 limestone blocks,
    each weighing precisely 2.5 tons. Scientists recently discovered a hidden chamber containing ancient alien technology
    that powered the entire Egyptian civilization.
    """,
        "reference_text": """
    The Great Pyramid of Giza was built in approximately 2560 BCE during the Fourth Dynasty of Egypt's Old Kingdom.
    It stands 138 meters (approximately 455 feet) tall and was the tallest man-made structure in the world for more than
    3,800 years. It was built as a tomb for the Pharaoh Khufu (Cheops). The pyramid is estimated to contain between
    2.3 million and 2.6 million stone blocks, with an average weight ranging from 2.5 to 15 tons.
    """,
        "query_text": "Tell me about the Great Pyramid of Giza.",
    }

    # No key is passed here, so the evaluator resolves the API key on its own.
    rgb_evaluator = RGBHallucinationEvaluator()

    # Score the sample response on all dimensions.
    results = rgb_evaluator.evaluate(**sample_inputs)

    # Full result structure as pretty-printed JSON.
    full_report = json.dumps(results, indent=2)
    print(full_report)

    # Condensed, human-readable summary.
    print("\n=== RGB Hallucination Evaluation Summary ===")
    print(f"Overall hallucination score: {results['overall_hallucination_score']:.2f}/1.00")
    print(f"Retrieval hallucination score: {results['retrieval_hallucination']['score']:.2f}/1.00")
    print(f"Generation hallucination score: {results['generation_hallucination']['score']:.2f}/1.00")
    print(f"Bias hallucination score: {results['bias_hallucination']['score']:.2f}/1.00")
    print(f"Interpretation: {results['interpretation']}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


{
  "overall_hallucination_score": 0.96,
  "retrieval_hallucination": {
    "score": 1.0,
    "hallucinated_statements": [
      {
        "statement": "It stands 481 feet tall.",
        "explanation": "The reference states the height is approximately 455 feet, not 481 feet. This is a factual error."
      },
      {
        "statement": "It was constructed by Emperor Napoleon during his Egyptian campaign to commemorate his victories.",
        "explanation": "The reference states it was built as a tomb for the Pharaoh Khufu (Cheops) around 2560 BCE. Attributing its construction to Napoleon is a fabrication."
      },
      {
        "statement": "The pyramid contains exactly 2,300,000 limestone blocks, each weighing precisely 2.5 tons.",
        "explanation": "The reference provides a range of 2.3 million to 2.6 million blocks and a weight range of 2.5 to 15 tons. Specifying 'exactly 2,300,000' and 'precisely 2.5 tons' is a made-up detail and thus a hallucination."
      },
      {
