<a href="https://colab.research.google.com/github/Reemaalt/Detection-of-Hallucination-in-Arabic/blob/main/mysemantic_entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Estimate the probability of each cluster.
- Use Monte Carlo integration to compute semantic entropy.

In [None]:
import json
import os
import numpy as np
import torch
from google.colab import files

In [None]:
# Read the clustered responses (one entry per question) from the JSON file on disk
file_path = "entailment_clusters_Llama3.1-mlqa.json"
with open(file_path, mode="r", encoding="utf-8") as f:
    clustered_data = json.loads(f.read())

print(f"Loaded {len(clustered_data)} questions with clustered answers.")


Loaded 100 questions with clustered answers.


In [None]:
# Since we don't have log likelihoods, estimate cluster probabilities by giving
# every response the same weight: a cluster's probability is its share of responses.
def compute_cluster_uniform_probabilities(clusters):
    """
    Estimates each cluster's probability as the fraction of responses it contains.

    Parameters:
    - clusters: List of clusters, where each cluster is a sized collection of responses.

    Returns:
    - List of floats, one per cluster, in the same order as `clusters`.
      If there are no responses at all (no clusters, or only empty clusters),
      every cluster gets probability 0.0 instead of raising ZeroDivisionError.
    """
    cluster_sizes = [len(cluster) for cluster in clusters]
    total_responses = sum(cluster_sizes)

    if total_responses == 0:
        # Nothing to normalize by; return zeros rather than dividing by zero.
        return [0.0] * len(cluster_sizes)

    return [size / total_responses for size in cluster_sizes]


Instead of weighting every response equally, the function below:

1. Converts each response's log probability to a probability by exponentiating it (`np.exp(log_prob)`).
2. Sums the probabilities within each cluster.
3. Normalizes the cluster probabilities so they sum to 1.


In [None]:
def compute_cluster_log_probabilities(clusters, response_log_probs):
    """
    Computes normalized cluster probabilities from per-response log probabilities.

    Each cluster's weight is the sum of exp(log_prob) over its responses; the
    weights are then normalized so they sum to 1. Before exponentiating, the
    largest referenced log probability is subtracted from every value. This
    leaves the normalized result unchanged but prevents np.exp from underflowing
    to 0 for very negative log probabilities, which would otherwise make the
    normalizer 0 and yield all-zero "probabilities".

    Parameters:
    - clusters: List of clusters, where each cluster is a list of response indices.
    - response_log_probs: Log probabilities log(p(s|x)), indexable by response index.

    Returns:
    - List with one probability per cluster, in the same order as `clusters`.
      If no cluster references any response, or every referenced log probability
      is -inf, there is no probability mass to normalize and zeros are returned.
    """
    referenced_log_probs = [
        response_log_probs[idx] for cluster in clusters for idx in cluster
    ]
    if not referenced_log_probs:
        return [0.0 for _ in clusters]

    max_log_prob = max(referenced_log_probs)
    if max_log_prob == float("-inf"):
        # Every referenced response has probability exactly 0.
        return [0.0 for _ in clusters]

    # Shifted exponentials: the largest term becomes exp(0) == 1, so the total
    # below is always >= 1 and the division is safe.
    cluster_weights = [
        sum(np.exp(response_log_probs[idx] - max_log_prob) for idx in cluster)
        for cluster in clusters
    ]
    total_weight = sum(cluster_weights)

    return [weight / total_weight for weight in cluster_weights]


In [None]:
# Entropy over the cluster probabilities (the notebook's Monte Carlo approximation, Equation (3))
def compute_semantic_entropy(probabilities):
    """
    Computes the entropy -sum(p * log(p)) of the given cluster probabilities.

    Parameters:
    - probabilities: Sequence of cluster probabilities.

    Returns:
    - The entropy in nats (natural logarithm) as a NumPy float.
      Entries that are exactly 0 contribute nothing (convention 0 * log(0) = 0);
      evaluating them directly would give 0 * -inf = nan and poison the sum.
    """
    probabilities = np.asarray(probabilities)
    # Drop exact zeros only; any other value (including nan) is kept so that
    # invalid inputs still surface in the result instead of being hidden.
    nonzero = probabilities[probabilities != 0]
    entropy = -np.sum(nonzero * np.log(nonzero))
    return entropy


In [None]:
# Compute the semantic entropy (SE) of every question
entropy_results = {}

for question_id, data in clustered_data.items():
    clusters = data["clusters"]

    # Cluster probabilities first, then the entropy over them
    probabilities = compute_cluster_uniform_probabilities(clusters)
    entropy = compute_semantic_entropy(probabilities)

    # One record per question, keyed by its id
    entropy_results[question_id] = dict(
        question=data["question"],
        semantic_entropy=entropy,
        num_clusters=len(clusters),
        probabilities=probabilities,
    )

# Write the results to a JSON file, then trigger a Colab download of it
output_file = "semantic_entropy_Llama3.1-mlqa_results.json"
with open(output_file, mode="w", encoding="utf-8") as f:
    json.dump(entropy_results, f, ensure_ascii=False, indent=4)

files.download(output_file)
print(f"Semantic entropy results saved to {output_file}")



Semantic entropy results saved to semantic_entropy_Llama3.1-mlqa_results.json
