In [14]:
# Imports
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import json


## Load the uncertainty results and hallucination labels

In [15]:
# Input files (absolute paths under /content). Both are JSON objects that the
# processing loop later joins on their top-level keys (the item ids).
SEMANTIC_ENTROPY_RESULTS_PATH = "/content/semantic_entropy_Llama3.1-8b_xor_tydiqa_results.json"
HALLUCINATION_LABELS_PATH = "/content/labeled_data_XORfull_rougel_isri.json"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which can mis-decode or reject non-ASCII text.
with open(SEMANTIC_ENTROPY_RESULTS_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

with open(HALLUCINATION_LABELS_PATH, 'r', encoding='utf-8') as f:
    hallucination_labels = json.load(f)


## Confidence measure functions

In [16]:
# Small constant added to every probability before the logarithm is taken in
# the entropy functions below, so that a zero probability never yields log(0).
epsilon = 1e-10

# "Mutual information" score = variance of the cluster probabilities
def compute_mutual_information(cluster_probabilities):
    """Return the unbiased sample variance of the cluster probabilities.

    NOTE(review): despite the name, the value computed here is a plain
    variance, not an information-theoretic mutual information.

    Args:
        cluster_probabilities: Iterable of values convertible to ``float``.

    Returns:
        A 0-dim ``float32`` tensor with the (n - 1)-normalised variance, or
        ``0.0`` when fewer than two probabilities are supplied.
    """
    cluster_probs = torch.tensor([float(x) for x in cluster_probabilities], dtype=torch.float32)

    # The unbiased variance of fewer than two values is undefined and torch.var
    # returns NaN for it. There is no spread to measure, so report 0 explicitly
    # rather than emitting a NaN that has to be patched up downstream.
    if cluster_probs.numel() < 2:
        return torch.tensor(0.0)

    # For 1-D input this is already a 0-dim tensor; no further reduction needed.
    return torch.var(cluster_probs, dim=0)

# Predictive Entropy
def compute_predictive_entropy(cluster_probabilities):
    """Entropy ``-sum(p * ln p)`` over the epsilon-shifted cluster probabilities.

    Args:
        cluster_probabilities: Iterable of values convertible to ``float``.

    Returns:
        A 0-dim ``float32`` tensor. The inputs are used as given (only shifted
        by ``epsilon``); they are not re-normalised.
    """
    shifted = torch.tensor(
        [float(p) for p in cluster_probabilities], dtype=torch.float32
    ) + epsilon  # shift keeps log() away from zero
    weighted_logs = shifted * torch.log(shifted)
    return -torch.sum(weighted_logs)

# Entropy over concepts (grouped entropy)
def compute_entropy_over_concepts(cluster_probabilities, semantic_set_ids):
    """Average the entropy terms of probabilities grouped by concept id.

    Probabilities are grouped by the id at the same position in
    ``semantic_set_ids``. For each distinct id the term ``-sum(p * ln p)`` is
    computed over that group (after shifting by ``epsilon``), and the mean of
    those per-concept terms is returned.

    Args:
        cluster_probabilities: Iterable of values convertible to ``float``.
        semantic_set_ids: Sequence of integer concept ids, one per probability.
            When empty, the plain predictive entropy is returned instead.

    Returns:
        A 0-dim ``float32`` tensor.
    """
    if not semantic_set_ids:
        return compute_predictive_entropy(cluster_probabilities)

    cluster_probs = torch.tensor([float(x) for x in cluster_probabilities], dtype=torch.float32) + epsilon
    # Built once, outside the loop: the id tensor does not change per concept.
    concept_ids = torch.tensor(semantic_set_ids, dtype=torch.int64)

    entropies = []
    for concept in torch.unique(concept_ids):
        concept_probs = cluster_probs[concept_ids == concept]
        entropies.append(-torch.sum(concept_probs * torch.log(concept_probs)))

    # semantic_set_ids is non-empty here, so at least one concept was processed.
    return torch.mean(torch.stack(entropies))

# Margin Probability = top1 - top2 cluster confidence
def compute_margin_probability(cluster_probabilities):
    """Gap between the largest and second-largest cluster probability.

    Args:
        cluster_probabilities: Iterable of values convertible to ``float``.

    Returns:
        A 0-dim ``float32`` tensor holding ``top1 - top2``; ``1.0`` when fewer
        than two probabilities are given.
    """
    probs = torch.tensor([float(p) for p in cluster_probabilities], dtype=torch.float32)
    ranked = torch.sort(probs, descending=True).values

    # Without a runner-up there is nothing to compare against.
    if ranked.shape[0] <= 1:
        return torch.tensor(1.0)

    best, runner_up = ranked[0], ranked[1]
    return best - runner_up


## AUROC Calculation Function

In [17]:
# Evaluate AUROC for each uncertainty metric
def evaluate_uncertainty_metrics(results_df):
    """Compute the AUROC of every score column for detecting incorrect answers.

    The positive class is ``1 - results_df['correct']``. Each score is passed
    to ``roc_auc_score`` as is, except ``margin_probability``, which is negated
    first.

    Args:
        results_df: DataFrame with a ``correct`` column and the score columns
            ``mutual_information``, ``predictive_entropy``,
            ``entropy_over_concepts``, ``margin_probability`` and
            ``semantic_entropy``.

    Returns:
        Dict mapping ``'<column>_auroc'`` to its AUROC. If ``roc_auc_score``
        raises ``ValueError``, the message is printed and only the entries
        computed before the failure are returned.
    """
    # (score column, negate before scoring?)
    score_specs = (
        ('mutual_information', False),
        ('predictive_entropy', False),
        ('entropy_over_concepts', False),
        ('margin_probability', True),
        ('semantic_entropy', False),
    )

    auroc_by_metric = {}
    try:
        for column, negate in score_specs:
            scores = -results_df[column] if negate else results_df[column]
            auroc_by_metric[f'{column}_auroc'] = roc_auc_score(1 - results_df['correct'], scores)
    except ValueError as e:
        print(f"Error computing AUROC: {e}")
    return auroc_by_metric


## Plotting Functions (ROC + Bar Chart)

In [18]:
# Plot ROC Curves for all metrics
def plot_roc_curves(results_df, output_path='roc_curves.png'):
    """Plot one ROC curve per uncertainty score and save the figure.

    The positive class is ``1 - results_df['correct']``. The
    ``margin_probability`` score is negated before the curve is computed; the
    other scores are used as is.

    Args:
        results_df: DataFrame with a ``correct`` column and the score columns
            ``mutual_information``, ``predictive_entropy``,
            ``entropy_over_concepts``, ``margin_probability`` and
            ``semantic_entropy``.
        output_path: File the figure is written to. Defaults to
            ``'roc_curves.png'``.
    """
    metrics = {
        'mutual_information': results_df['mutual_information'],
        'predictive_entropy': results_df['predictive_entropy'],
        'entropy_over_concepts': results_df['entropy_over_concepts'],
        'margin_probability': -results_df['margin_probability'],  # flip for ROC
        'semantic_entropy': results_df['semantic_entropy']
    }
    # Same target for every curve, so derive it once.
    positives = 1 - results_df['correct']

    # Hold explicit figure/axes handles so saving and closing act on this
    # figure, not on whatever pyplot currently considers "current".
    fig, ax = plt.subplots(figsize=(10, 8))
    for name, values in metrics.items():
        fpr, tpr, _ = roc_curve(positives, values)
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves for Hallucination Detection')
    ax.legend(loc="lower right")
    fig.savefig(output_path)
    plt.close(fig)
    print(f"✅ ROC curve saved as '{output_path}'.")

# Plot AUROC bar chart with matching ROC curve colors
def plot_auroc_bar_chart(auroc_dict, output_path='auroc_bar_chart.png'):
    """Draw a bar per AUROC entry, annotate each bar, and save the figure.

    Args:
        auroc_dict: Mapping from metric key (e.g. ``'semantic_entropy_auroc'``)
            to its AUROC value. Keys without a preset colour are drawn in
            ``'#333333'``.
        output_path: File the chart is written to.
    """
    palette = {
        'mutual_information_auroc': '#1f77b4',
        'predictive_entropy_auroc': '#ff7f0e',
        'entropy_over_concepts_auroc': '#2ca02c',
        'margin_probability_auroc': '#d62728',
        'semantic_entropy_auroc': '#9467bd'
    }

    metric_names = list(auroc_dict.keys())
    auroc_values = [auroc_dict[name] for name in metric_names]
    bar_colors = [palette.get(name, '#333333') for name in metric_names]

    plt.figure(figsize=(10, 6))
    rects = plt.bar(metric_names, auroc_values, color=bar_colors)
    plt.ylabel("AUROC Score")
    plt.title("Uncertainty Metrics - AUROC Comparison")
    plt.ylim(0, 1)

    # Write each value just above the top of its bar.
    for rect, value in zip(rects, auroc_values):
        x_center = rect.get_x() + rect.get_width() / 2
        y_above = rect.get_height() + 0.01
        plt.text(x_center, y_above, f"{value:.3f}", ha='center', va='bottom')

    plt.xticks(rotation=30, ha='right')
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"✅ AUROC bar chart saved as '{output_path}'.")


## Run Everything (Main Script)

In [19]:
# Process data and compute all metrics
results = []        # one record per item that has both scores and a label
matched_count = 0   # items that were found in hallucination_labels with a label

for item_id, item in data.items():
    cluster_probabilities = item.get('cluster_probabilities', [])
    if not cluster_probabilities or not all(isinstance(x, (int, float)) for x in cluster_probabilities):
        continue

    # Only labelled items end up in the results, so check the label first and
    # skip the metric computation for everything else.
    if item_id not in hallucination_labels:
        continue

    try:
        label = hallucination_labels[item_id].get('computed_question_label')
        if label is None:
            # An entry without a label must not be scored: any label other
            # than "Hallucinated" is counted as correct below, so a placeholder
            # value would silently inflate the "correct" class.
            print(f"Skipping {item_id}: missing 'computed_question_label'.")
            continue

        # Each cluster is treated as its own concept: ids 0 .. num_clusters - 1.
        num_clusters = item.get('num_clusters', len(cluster_probabilities))
        semantic_set_ids = list(range(num_clusters))

        mutual_info = compute_mutual_information(cluster_probabilities)
        predictive_entropy = compute_predictive_entropy(cluster_probabilities)
        entropy_over_concepts = compute_entropy_over_concepts(cluster_probabilities, semantic_set_ids)
        margin_probability = compute_margin_probability(cluster_probabilities)
        # Taken from the input file as is; NaN marks a missing value.
        semantic_entropy = item.get('semantic_entropy', np.nan)

        is_correct = 0 if label == "Hallucinated" else 1
        matched_count += 1

        results.append({
            'id': item_id,
            'question': item['question'],
            'mutual_information': mutual_info.item(),
            'predictive_entropy': predictive_entropy.item(),
            'entropy_over_concepts': entropy_over_concepts.item(),
            'margin_probability': margin_probability.item(),
            'semantic_entropy': semantic_entropy,
            'correct': is_correct,
            'hallucination_status': label
        })
    except Exception as e:
        # Best effort: report the failing item and continue with the rest.
        print(f"❌ Error processing {item_id}: {e}")


## Save + Visualize Results

In [20]:
# Build the results table; any missing value (e.g. a NaN semantic entropy) becomes 0
results_df = pd.DataFrame(results).fillna(0)

# AUROC of every score column
evaluation_metrics = evaluate_uncertainty_metrics(results_df)
print("🎯 Evaluation Metrics:")
for metric_name, auroc_value in evaluation_metrics.items():
    print(f"{metric_name}: {auroc_value:.4f}")

# Persist the per-item table and the AUROC summary
results_df.to_csv('uncertainty_hallucination_metrics.csv', index=False)
with open('uncertainty_evaluation_metrics.json', 'w') as metrics_file:
    json.dump(evaluation_metrics, metrics_file, indent=4)

print(f"\n✅ Matched {matched_count} out of {len(data)} items.")

# Figures
plot_roc_curves(results_df)
plot_auroc_bar_chart(evaluation_metrics)


🎯 Evaluation Metrics:
mutual_information_auroc: 0.1557
predictive_entropy_auroc: 0.6868
entropy_over_concepts_auroc: 0.6688
margin_probability_auroc: 0.6529
semantic_entropy_auroc: 0.6774

✅ Matched 708 out of 708 items.
✅ ROC curve saved as 'roc_curves.png'.
✅ AUROC bar chart saved as 'auroc_bar_chart.png'.


## Print Summary Stats

In [21]:
# Print mean values for each metric by hallucination label
metrics = ['mutual_information', 'predictive_entropy', 'entropy_over_concepts', 'margin_probability', 'semantic_entropy']

# Row selectors are the same for every metric, so build them once.
correct_rows = results_df['correct'] == 1
hallucinated_rows = results_df['correct'] == 0

print("\n📊 Summary Stats (Correct vs Hallucinated):")
for metric in metrics:
    non_hall = results_df.loc[correct_rows, metric].mean()
    hall = results_df.loc[hallucinated_rows, metric].mean()
    print(f"{metric}:")
    print(f"  ✅ Non-hallucinated: {non_hall:.4f}")
    print(f"  ❌ Hallucinated:     {hall:.4f}")
    # The relative difference is undefined when the baseline mean is zero, or
    # when either group is empty (its mean is NaN); report that explicitly
    # instead of printing inf/nan.
    if non_hall == 0 or not np.isfinite(non_hall) or not np.isfinite(hall):
        print("  📉 Diff: n/a\n")
    else:
        diff = (hall - non_hall) / non_hall * 100
        print(f"  📉 Diff: {diff:+.1f}%\n")



📊 Summary Stats (Correct vs Hallucinated):
mutual_information:
  ✅ Non-hallucinated: 0.2262
  ❌ Hallucinated:     0.1309
  📉 Diff: -42.1%

predictive_entropy:
  ✅ Non-hallucinated: 0.0172
  ❌ Hallucinated:     0.1431
  📉 Diff: +729.9%

entropy_over_concepts:
  ✅ Non-hallucinated: 0.0037
  ❌ Hallucinated:     0.0188
  📉 Diff: +405.0%

margin_probability:
  ✅ Non-hallucinated: 0.9920
  ❌ Hallucinated:     0.8857
  📉 Diff: -10.7%

semantic_entropy:
  ✅ Non-hallucinated: 0.0172
  ❌ Hallucinated:     0.1431
  📉 Diff: +729.9%

