<a href="https://colab.research.google.com/github/Reemaalt/Detection-of-Hallucination-in-Arabic/blob/main/our%20framework%20code/mysemantic_entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Estimate the probability of each cluster.
- Use Monte Carlo integration to compute semantic entropy.

In [None]:
import json
import os
import numpy as np
import torch
from google.colab import files

In [None]:
def compute_cluster_log_probabilities(clusters):
    """
    Compute the normalized probability p(c|x) of each cluster (equation 2).

    Despite the function name, the returned values are plain probabilities,
    not log-probabilities.

    Args:
        clusters: Sequence of dicts; each must provide the pre-summed negative
            log-likelihood of the cluster under the key
            'total_neg_log_likelihoods_for_cluster'.

    Returns:
        A list with one probability per cluster, in input order, summing to 1.
        If there are no clusters, or no cluster has a finite log-likelihood,
        a warning is printed and a list of zeros (one per cluster) is returned.
    """
    # Work in log space: log-likelihood = -(negative log-likelihood).
    log_likelihoods = [-cluster['total_neg_log_likelihoods_for_cluster'] for cluster in clusters]

    if not log_likelihoods or not np.isfinite(max(log_likelihoods)):
        print("Warning: Total probability is zero. Check input values.")
        return [0.0] * len(log_likelihoods)

    # Subtract the largest log-likelihood before exponentiating. The shift
    # cancels out in the normalization below, but it guarantees the largest
    # term is exp(0) == 1, so the sum can never underflow to zero the way
    # np.exp(-nll) does for large summed NLLs.
    peak = max(log_likelihoods)
    weights = [np.exp(ll - peak) for ll in log_likelihoods]

    # Normalize to ensure sum of probabilities = 1
    total_weight = sum(weights)
    return [w / total_weight for w in weights]

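- The reference code uses `predictive_entropy_rao`, which computes `-np.sum(np.exp(log_probs) * log_probs)`.
  Our code computes `-sum(p * np.log(p) for p in normalized_probs)`.
  The two approaches are mathematically equivalent when `log_probs = np.log(normalized_probs)`, since then `np.exp(log_probs) = normalized_probs`.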

- We implement the semantic entropy calculation as described in the reference code's `predictive_entropy_rao()` function.



In [None]:
def compute_semantic_entropy(probabilities):
    """
    Semantic entropy of a set of normalized cluster probabilities.

    Implements SE(x) = -∑ p(c|x) log p(c|x) (formula 3), using the natural
    logarithm. Entries that are not strictly positive are skipped; when no
    positive entry exists the entropy is 0.0.
    """
    weighted_log_total = 0
    found_positive = False

    for prob in probabilities:
        if prob > 0:
            found_positive = True
            # Accumulate p * log(p); the sign is flipped once at the end.
            weighted_log_total += prob * np.log(prob)

    if not found_positive:
        return 0.0

    return -weighted_log_total


In [None]:
# Step 5: Calculate semantic entropy
def process_clustered_data(clustered_data):
    """
    Compute semantic entropy for every question in the clustered data.

    Args:
        clustered_data: Mapping from question id to a dict that has a
            "clusters" entry (passed to compute_cluster_log_probabilities)
            and a "question" entry (copied to the result unchanged).

    Returns:
        Dict keyed by question id. Each value holds the question, its
        semantic entropy, the number of clusters and the per-cluster
        probabilities.
    """
    def summarize(entry):
        # Cluster probabilities first, then the entropy derived from them.
        cluster_list = entry["clusters"]
        cluster_probs = compute_cluster_log_probabilities(cluster_list)
        entropy_value = compute_semantic_entropy(cluster_probs)
        return {
            "question": entry["question"],
            "semantic_entropy": entropy_value,
            "num_clusters": len(cluster_list),
            "cluster_probabilities": cluster_probs,
        }

    return {qid: summarize(entry) for qid, entry in clustered_data.items()}


In [None]:
# Load the clustered answers from a JSON file in the Colab /content directory.
# The loaded object is iterated with .items() by process_clustered_data, so it
# is expected to be a dict keyed by question id.
# NOTE(review): the input file name refers to "xor_tydiqa" while the output
# file name below refers to "xquadAll" — confirm the output is labeled with
# the intended dataset.
with open("/content/entailment_clusters_Llama3.1-8b-xor_tydiqa.json", "r", encoding="utf-8") as f:
    clustered_data = json.load(f)
print(f"Loaded {len(clustered_data)} questions with clustered answers.")

# Compute cluster probabilities and semantic entropy for every question.
entropy_results = process_clustered_data(clustered_data)

# Write the results as indented JSON; ensure_ascii=False keeps non-ASCII
# question text readable instead of escaping it.
output_file = "semantic_entropy_Llama3.1-8b_xquadAll_results.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(entropy_results, f, ensure_ascii=False, indent=4)

print("Semantic entropy calculation completed successfully!")
# Hand the written file to google.colab's download helper (Colab only).
files.download(output_file)
print(f"Semantic entropy results saved to {output_file}")

Loaded 708 questions with clustered answers.
Semantic entropy calculation completed successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Semantic entropy results saved to semantic_entropy_Llama3.1-8b_xquadAll_results.json


In [None]:
def compute_cluster_log_probabilitiesnew(clusters):
    """
    Compute normalized log-probabilities log p(c|x) for each cluster.

    Normalization is done entirely in log space with the log-sum-exp trick,
    so large summed negative log-likelihoods do not underflow.

    Args:
        clusters: Sequence of dicts; each must provide the pre-summed negative
            log-likelihood of the cluster under the key
            'total_neg_log_likelihoods_for_cluster'.

    Returns:
        A list with one normalized log-probability per cluster, in input
        order. An empty input yields an empty list. If every cluster has an
        infinite negative log-likelihood, every entry is -inf.
    """
    # Convert negative log-likelihoods to log probabilities
    log_probs = [-cluster['total_neg_log_likelihoods_for_cluster'] for cluster in clusters]

    if not log_probs:
        # max() would raise ValueError on an empty sequence.
        return []

    max_log_prob = max(log_probs)
    if np.isneginf(max_log_prob):
        # Every cluster has zero likelihood; lp - max_log_prob would be
        # (-inf) - (-inf) = nan, so report zero probability explicitly.
        return [-np.inf] * len(log_probs)

    # Log-normalize using log-sum-exp trick for numerical stability
    log_sum = max_log_prob + np.log(sum(np.exp(lp - max_log_prob) for lp in log_probs))
    return [lp - log_sum for lp in log_probs]

def compute_semantic_entropynew(log_probabilities):
    """
    Entropy computed directly from log-probabilities.

    Evaluates -sum(exp(log_p) * log_p), skipping entries equal to -inf
    (zero-probability clusters).
    """
    running_total = 0
    for log_p in log_probabilities:
        if np.isneginf(log_p):
            continue
        running_total += np.exp(log_p) * log_p
    return -running_total