<a href="https://colab.research.google.com/github/Reemaalt/Detection-of-Hallucination-in-Arabic/blob/main/our%20framework%20code/semantic_entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Estimate the probability of each cluster.
- Use Monte Carlo integration to compute semantic entropy.

In [4]:
import json
import os
import numpy as np
import torch
from google.colab import files
import re

In [17]:
def compute_cluster_log_probabilities(clusters):
    """Return normalized log-probabilities, one per cluster.

    Each cluster's ``'total_neg_log_likelihoods_for_cluster'`` value is negated
    to get an unnormalized log-probability, then the log of the summed
    exponentials is subtracted so the exponentiated results sum to 1.

    Args:
        clusters: Sequence of mappings, each holding the key
            ``'total_neg_log_likelihoods_for_cluster'``.

    Returns:
        A list of normalized log-probabilities in the same order as ``clusters``.

    Raises:
        ValueError: If ``clusters`` is empty (raised by ``max`` on an empty list).
    """
    # Negate each negative log-likelihood to obtain a log-probability.
    cluster_log_likelihoods = []
    for entry in clusters:
        cluster_log_likelihoods.append(-entry['total_neg_log_likelihoods_for_cluster'])

    # Log-sum-exp trick: shift by the largest value before exponentiating so
    # the exponentials stay within floating-point range.
    peak = max(cluster_log_likelihoods)
    shifted_exponentials = [np.exp(value - peak) for value in cluster_log_likelihoods]
    log_normalizer = peak + np.log(sum(shifted_exponentials))

    return [value - log_normalizer for value in cluster_log_likelihoods]

def compute_semantic_entropy(log_probabilities):
    """Return the negated mean of the given log-probabilities.

    This is the sum of ``log_probabilities`` divided by their count, with the
    sign flipped. Note that it averages the log-probabilities uniformly; it is
    not the probability-weighted form ``-sum(p * log p)``.

    Args:
        log_probabilities: Sequence of per-cluster log-probabilities.

    Returns:
        The negative average log-probability as a NumPy scalar.
    """
    # Taken from the reference code this notebook follows:
    #   entropy = - torch.sum(aggregated_likelihoods, dim=0) / torch.tensor(aggregated_likelihoods.shape[0])
    cluster_count = len(log_probabilities)
    log_prob_total = np.sum(log_probabilities)
    return -log_prob_total / cluster_count


In [19]:
def process_clustered_data(clustered_data):
    """Compute semantic entropy for every question in ``clustered_data``.

    Args:
        clustered_data: Mapping from question id to a record containing the
            keys ``"question"`` and ``"clusters"``.

    Returns:
        A dict keyed by question id. Each value holds the question, its
        semantic entropy, the number of clusters, the clusters themselves
        (under ``"answers"``) and the normalized per-cluster log-probabilities
        (under ``"cluster_probabilities"``).
    """
    results_by_question = {}

    for qid, record in clustered_data.items():
        answer_clusters = record["clusters"]

        # Normalized log-probability of each cluster for this question.
        cluster_log_probs = compute_cluster_log_probabilities(answer_clusters)

        # Collapse the per-cluster values into a single entropy score.
        entropy_value = compute_semantic_entropy(cluster_log_probs)

        summary = {}
        summary["question"] = record["question"]
        summary["semantic_entropy"] = entropy_value
        summary["num_clusters"] = len(answer_clusters)
        summary["answers"] = answer_clusters
        summary["cluster_probabilities"] = cluster_log_probs
        results_by_question[qid] = summary

    return results_by_question


In [22]:
# Load input data
input_file = "/content/entailment_clusters_Qwen2-7B-xor_tydiqa.json"


with open(input_file, "r", encoding="utf-8") as f:
    clustered_data = json.load(f)
print(f"Loaded {len(clustered_data)} questions with clustered answers.")

# Extract model name and dataset name from a filename of the form
# "entailment_clusters_<model>_<dataset>.json".
# NOTE: the first group is non-greedy, so it ends at the first underscore after
# the prefix. For the file above this yields model_name="Qwen2-7B-xor" and
# dataset_name="tydiqa"; the output filename below rejoins them with "_" and
# is therefore unaffected by where the split falls.
input_filename = os.path.basename(input_file)
match = re.search(r'entailment_clusters_(.+?)_(.+?)\.json', input_filename)
if match is None:
    # re.search returns None on no match; fail with a clear message instead of
    # an AttributeError on match.group().
    raise ValueError(
        f"Input filename {input_filename!r} does not match the expected pattern "
        "'entailment_clusters_<model>_<dataset>.json'."
    )
model_name = match.group(1)
dataset_name = match.group(2)

# Calculate results
entropy_results = process_clustered_data(clustered_data)

# Save output (written relative to the current working directory)
output_file = f"semantic_entropy_{model_name}_{dataset_name}.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(entropy_results, f, ensure_ascii=False, indent=4)

print("Semantic entropy calculation completed successfully!")
# Trigger a browser download of the results file (Colab only).
files.download(output_file)
print(f"Semantic entropy results saved to {output_file}")

Loaded 708 questions with clustered answers.
Semantic entropy calculation completed successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Semantic entropy results saved to semantic_entropy_Qwen2-7B-xor_tydiqa.json
