In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import json

# Load the semantic-entropy results and the hallucination labels.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than left to the platform's locale default.
with open("/content/semantic_entropy_Llama3.1-8b_xor_tydiqa_results.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

with open("/content/labeled_data_XORfull_rougel_isri.json", 'r', encoding="utf-8") as f:
    hallucination_labels = json.load(f)

# Small constant added to probabilities before taking a log, to avoid log(0)
epsilon = 1e-10

def compute_mutual_information(cluster_probabilities):
    """Return the sample variance of the cluster probabilities.

    Despite its name, this is a variance-based score, not an
    information-theoretic mutual information.
    NOTE(review): the variance is largest when a single cluster holds most of
    the probability mass, so a high value looks like *confidence* rather than
    uncertainty -- confirm the intended sign before using it as a score.

    Args:
        cluster_probabilities: Flat sequence of values convertible with
            ``float``.

    Returns:
        A 0-d float32 tensor holding the unbiased variance, or ``0.0`` when
        fewer than two values are supplied.
    """
    cluster_probs = torch.tensor([float(x) for x in cluster_probabilities], dtype=torch.float32)
    # The unbiased variance of fewer than two values is undefined: torch.var
    # would return NaN (and warn about degrees of freedom). Report zero
    # spread instead of letting NaN leak into the results.
    if cluster_probs.numel() < 2:
        return torch.tensor(0.0)
    return torch.var(cluster_probs)

def compute_predictive_entropy(cluster_probabilities):
    """Entropy (natural log) of the cluster probabilities.

    Every probability is shifted by the module-level ``epsilon`` before the
    logarithm so that a zero entry does not produce ``log(0)``.

    Args:
        cluster_probabilities: Flat sequence of values convertible with
            ``float``.

    Returns:
        A 0-d float32 tensor with ``-sum(p * log(p))``.
    """
    values = [float(p) for p in cluster_probabilities]
    shifted = torch.tensor(values, dtype=torch.float32) + epsilon
    return -(shifted * shifted.log()).sum()

def compute_entropy_over_concepts(cluster_probabilities, semantic_set_ids):
    """Average the per-concept entropy contributions.

    For each distinct id in ``semantic_set_ids`` the terms ``-p * log(p)`` of
    the probabilities tagged with that id are summed; the result is the mean
    of those sums over all distinct ids. Probabilities are shifted by the
    module-level ``epsilon`` first to avoid ``log(0)``.

    Args:
        cluster_probabilities: Flat sequence of values convertible with
            ``float``.
        semantic_set_ids: Integer concept id for each probability, in the
            same order. Its length is expected to match
            ``cluster_probabilities``, because it is used as a boolean mask
            over the probabilities.

    Returns:
        A 0-d float32 tensor. When ``semantic_set_ids`` is empty, the plain
        predictive entropy is returned instead.
    """
    if not semantic_set_ids:
        return compute_predictive_entropy(cluster_probabilities)

    cluster_probs = torch.tensor([float(x) for x in cluster_probabilities], dtype=torch.float32) + epsilon
    # Build the id tensor once; it does not change between iterations.
    concept_ids = torch.tensor(semantic_set_ids, dtype=torch.int64)

    entropies = []
    for concept in torch.unique(concept_ids):
        concept_probs = cluster_probs[concept_ids == concept]
        entropies.append(-torch.sum(concept_probs * torch.log(concept_probs)))

    # concept_ids is non-empty here, so at least one term was collected.
    return torch.mean(torch.stack(entropies))

def compute_margin_probability(cluster_probabilities):
    """Gap between the largest and second-largest cluster probability.

    Args:
        cluster_probabilities: Flat sequence of values convertible with
            ``float``.

    Returns:
        A 0-d float32 tensor with the top-two difference, or ``1.0`` when
        fewer than two probabilities are given.
    """
    probs = torch.tensor([float(p) for p in cluster_probabilities], dtype=torch.float32)
    if probs.numel() <= 1:
        return torch.tensor(1.0)
    ranked = probs.sort(descending=True).values
    return ranked[0] - ranked[1]

def evaluate_uncertainty_metrics(results_df):
    """Compute the AUROC of each uncertainty score for detecting hallucinations.

    The positive class is ``1 - correct`` (hallucinated). ``margin_probability``
    is negated before scoring; all other columns are used as-is.

    Each metric is evaluated on its own, so a ``ValueError`` raised for one
    column is reported and skipped without discarding the other metrics.

    Args:
        results_df: DataFrame with a ``correct`` column (1 = correct,
            0 = hallucinated) and one column per uncertainty score.

    Returns:
        Dict mapping ``"<column>_auroc"`` to the AUROC value, containing only
        the metrics that could be computed.
    """
    # (column name, negate?) in the order the metrics are reported
    score_columns = (
        ('mutual_information', False),
        ('predictive_entropy', False),
        ('entropy_over_concepts', False),
        ('margin_probability', True),
        ('semantic_entropy', False),
    )
    is_hallucinated = 1 - results_df['correct']

    metrics = {}
    for column, negate in score_columns:
        scores = results_df[column]
        try:
            metrics[f'{column}_auroc'] = roc_auc_score(is_hallucinated, -scores if negate else scores)
        except ValueError as e:
            print(f"Error computing AUROC for {column}: {e}")
    return metrics

def plot_roc_curves(results_df, output_path='roc_curves.png'):
    """Plot one ROC curve per uncertainty score and save the figure.

    The positive class is ``1 - correct`` (hallucinated). ``margin_probability``
    is negated before the curve is computed; all other columns are used as-is.

    Args:
        results_df: DataFrame with a ``correct`` column and one column per
            uncertainty score.
        output_path: File the figure is written to. Defaults to
            ``'roc_curves.png'``; pass a different path to keep the plots of
            several runs instead of overwriting one file.
    """
    plt.figure(figsize=(10, 8))

    metrics = {
        'mutual_information': results_df['mutual_information'],
        'predictive_entropy': results_df['predictive_entropy'],
        'entropy_over_concepts': results_df['entropy_over_concepts'],
        'margin_probability': -results_df['margin_probability'],  # Inverted
        'semantic_entropy': results_df['semantic_entropy']
    }

    is_hallucinated = 1 - results_df['correct']
    for name, values in metrics.items():
        fpr, tpr, _ = roc_curve(is_hallucinated, values)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Hallucination Detection')
    plt.legend(loc="lower right")
    plt.savefig(output_path)
    plt.close()
    print(f"\nROC curve saved as '{output_path}'.")

# One record per labelled question, plus a count of how many were matched
results = []
matched_count = 0

# Score every question and attach its hallucination label
for item_id, item in data.items():
    question = item['question']
    cluster_probabilities = item.get('cluster_probabilities', [])

    # Require a non-empty collection whose entries are all plain numbers
    probabilities_ok = bool(cluster_probabilities) and all(
        isinstance(x, (int, float)) for x in cluster_probabilities
    )
    if not probabilities_ok:
        print(f"Skipping item {item_id} due to invalid cluster_probabilities.")
        continue

    # Set ids are always synthesized here: one distinct id per cluster
    num_clusters = item.get('num_clusters', len(cluster_probabilities))
    semantic_set_ids = list(range(num_clusters))

    try:
        # Uncertainty scores for this question
        mutual_info = compute_mutual_information(cluster_probabilities)
        predictive_entropy = compute_predictive_entropy(cluster_probabilities)
        entropy_over_concepts = compute_entropy_over_concepts(cluster_probabilities, semantic_set_ids)
        margin_probability = compute_margin_probability(cluster_probabilities)
        semantic_entropy = item.get('semantic_entropy', np.nan)

        # Questions without a label are scored but not recorded
        if item_id not in hallucination_labels:
            continue

        # NOTE(review): a missing label falls back to "Unknown", which is
        # then counted as correct (1) -- confirm this is intended.
        question_label = hallucination_labels[item_id].get('computed_question_label', "Unknown")
        is_correct = int(question_label != "Hallucinated")
        matched_count += 1

        record = {
            'id': item_id,
            'question': question,
            'mutual_information': mutual_info.item(),
            'predictive_entropy': predictive_entropy.item(),
            'entropy_over_concepts': entropy_over_concepts.item(),
            'margin_probability': margin_probability.item(),
            'semantic_entropy': semantic_entropy,
            'correct': is_correct,
            'hallucination_status': question_label,
        }
        results.append(record)
    except Exception as e:
        print(f"Error processing item {item_id}: {e}")

# Tabulate the records; any NaN entry is replaced with 0
results_df = pd.DataFrame(results).fillna(0)

# AUROC of each uncertainty score
evaluation_metrics = evaluate_uncertainty_metrics(results_df)
print("\nEvaluation Metrics:")
for metric_name, auroc_value in evaluation_metrics.items():
    print(f"{metric_name}: {auroc_value:.4f}")

# Persist the per-question table and the AUROC summary
results_df.to_csv('uncertainty_hallucination_metrics.csv', index=False)
with open('uncertainty_evaluation_metrics.json', 'w') as f:
    json.dump(evaluation_metrics, f, indent=4)

print(f"\nMatched {matched_count} out of {len(data)} questions with hallucination labels.")

# Write the ROC figure
plot_roc_curves(results_df)

# Mean of each score for correct vs hallucinated answers, with the relative
# difference expressed as a percentage of the non-hallucinated mean
print("\nSummary Statistics for Correct vs Hallucinated Responses:")
metrics = ['mutual_information', 'predictive_entropy', 'entropy_over_concepts', 'margin_probability', 'semantic_entropy']
correct_rows = results_df[results_df['correct'] == 1]
hallucinated_rows = results_df[results_df['correct'] == 0]
for metric in metrics:
    non_hallucinated_mean = correct_rows[metric].mean()
    hallucinated_mean = hallucinated_rows[metric].mean()
    diff_percent = (hallucinated_mean - non_hallucinated_mean) / non_hallucinated_mean * 100
    print(
        f"{metric}:",
        f"  Non-hallucinated mean: {non_hallucinated_mean:.4f}",
        f"  Hallucinated mean: {hallucinated_mean:.4f}",
        f"  Difference: {diff_percent:+.1f}%",
        sep="\n",
    )

print("\nAnalysis complete! Results saved to CSV and ROC curves saved as 'roc_curves.png'.")



Evaluation Metrics:
mutual_information_auroc: 0.1557
predictive_entropy_auroc: 0.6868
entropy_over_concepts_auroc: 0.6688
margin_probability_auroc: 0.6529
semantic_entropy_auroc: 0.6774

Matched 708 out of 708 questions with hallucination labels.

ROC curve saved as 'roc_curves.png'.

Summary Statistics for Correct vs Hallucinated Responses:
mutual_information:
  Non-hallucinated mean: 0.2262
  Hallucinated mean: 0.1309
  Difference: -42.1%
predictive_entropy:
  Non-hallucinated mean: 0.0172
  Hallucinated mean: 0.1431
  Difference: +729.9%
entropy_over_concepts:
  Non-hallucinated mean: 0.0037
  Hallucinated mean: 0.0188
  Difference: +405.0%
margin_probability:
  Non-hallucinated mean: 0.9920
  Hallucinated mean: 0.8857
  Difference: -10.7%
semantic_entropy:
  Non-hallucinated mean: 0.0172
  Hallucinated mean: 0.1431
  Difference: +729.9%

Analysis complete! Results saved to CSV and ROC curves saved as 'roc_curves.png'.


In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import math
import json


In [None]:
# Load the semantic-entropy results and the hallucination labels.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than left to the platform's locale default.
with open("/content/semantic_entropy_Llama3.1-8b_xquadAll_results (1).json", 'r', encoding="utf-8") as f:
    data = json.load(f)

with open("/content/labeled_data_rougel.json", 'r', encoding="utf-8") as f:
    hallucination_labels = json.load(f)


# Compute the uncertainty measures

In [None]:
# Small epsilon added to probabilities before taking a log, to avoid log(0)
epsilon = 1e-10



def compute_mutual_information(cluster_probabilities):
    """Return the sample variance of the cluster probabilities.

    Despite its name, this is a variance-based score, not an
    information-theoretic mutual information.
    NOTE(review): the variance is largest when a single cluster holds most of
    the probability mass, so a high value looks like *confidence* rather than
    uncertainty -- confirm the intended sign before using it as a score.

    Args:
        cluster_probabilities: Sequence of numeric probabilities. Integers
            are accepted; values are converted to float32 because
            ``torch.var`` does not support integer tensors.

    Returns:
        A 0-d float32 tensor: the unbiased variance along the first
        dimension, averaged, or ``0.0`` when fewer than two entries are
        supplied.
    """
    cluster_probs = torch.tensor(cluster_probabilities, dtype=torch.float32)
    # The unbiased variance of fewer than two entries is undefined: torch.var
    # would return NaN (and warn about degrees of freedom). Report zero
    # spread instead of letting NaN leak into the results.
    if cluster_probs.shape[0] < 2:
        return torch.tensor(0.0)
    variance = torch.var(cluster_probs, dim=0)
    return torch.mean(variance)


def compute_predictive_entropy(cluster_probabilities):
    """Entropy (natural log) of the cluster probabilities.

    Every probability is shifted by the module-level ``epsilon`` before the
    logarithm so that a zero entry does not produce ``log(0)``.

    Returns:
        A 0-d tensor with ``-sum(p * log(p))``.
    """
    shifted = torch.tensor(cluster_probabilities) + epsilon
    return -(shifted * shifted.log()).sum()



def compute_entropy_over_concepts(cluster_probabilities, semantic_set_ids):
    """Average the per-concept entropy contributions.

    For each distinct id in ``semantic_set_ids`` the terms ``-p * log(p)`` of
    the probabilities tagged with that id are summed; the result is the mean
    of those sums over all distinct ids. Probabilities are shifted by the
    module-level ``epsilon`` first to avoid ``log(0)``.

    Args:
        cluster_probabilities: Sequence of numeric probabilities.
        semantic_set_ids: Concept id for each probability, in the same order.
            Its length is expected to match ``cluster_probabilities``,
            because it is used as a boolean mask over the probabilities.

    Returns:
        A 0-d tensor. When no concept id is present, the plain predictive
        entropy is returned instead.
    """
    cluster_probs = torch.tensor(cluster_probabilities) + epsilon
    # Build the id tensor once; it does not change between iterations.
    concept_ids = torch.tensor(semantic_set_ids)

    entropies = []
    for concept in torch.unique(concept_ids):
        concept_probs = cluster_probs[concept_ids == concept]
        entropies.append(-torch.sum(concept_probs * torch.log(concept_probs)))

    if not entropies:
        return compute_predictive_entropy(cluster_probabilities)

    return torch.mean(torch.stack(entropies))


def compute_margin_probability(cluster_probabilities):
    """Gap between the largest and second-largest cluster probability.

    Returns:
        A 0-d tensor with the top-two difference, or ``1.0`` when exactly one
        probability is given.
    """
    ranked = torch.tensor(cluster_probabilities).sort(descending=True).values
    return torch.tensor(1.0) if len(ranked) == 1 else ranked[0] - ranked[1]



# Evaluate Uncertainty Metrics + Plot ROC Curves

In [None]:
def evaluate_uncertainty_metrics(results_df):
    """Compute the AUROC of each uncertainty score for detecting hallucinations.

    The positive class is ``1 - correct`` (hallucinated). ``margin_probability``
    is negated before scoring; all other columns are used as-is.

    Returns:
        Dict mapping ``"<column>_auroc"`` to the AUROC value.
    """
    # (column name, negate?) in the order the metrics are reported
    score_columns = (
        ('mutual_information', False),
        ('predictive_entropy', False),
        ('entropy_over_concepts', False),
        ('margin_probability', True),
        ('semantic_entropy', False),
    )

    auroc_by_metric = {}
    for column, negate in score_columns:
        is_hallucinated = 1 - results_df['correct']
        scores = results_df[column]
        auroc_by_metric[f'{column}_auroc'] = roc_auc_score(is_hallucinated, -scores if negate else scores)

    return auroc_by_metric


def plot_roc_curves(results_df, output_path='roc_curves.png'):
    """Plot one ROC curve per uncertainty score and save the figure.

    The positive class is ``1 - correct`` (hallucinated). ``margin_probability``
    is negated before the curve is computed; all other columns are used as-is.

    Args:
        results_df: DataFrame with a ``correct`` column and one column per
            uncertainty score.
        output_path: File the figure is written to. Defaults to
            ``'roc_curves.png'``; pass a different path to keep the plots of
            several runs instead of overwriting one file.
    """
    plt.figure(figsize=(10, 8))

    metrics = {
        'mutual_information': results_df['mutual_information'],
        'predictive_entropy': results_df['predictive_entropy'],
        'entropy_over_concepts': results_df['entropy_over_concepts'],
        'margin_probability': -results_df['margin_probability'],
        'semantic_entropy': results_df['semantic_entropy']
    }

    is_hallucinated = 1 - results_df['correct']
    for name, values in metrics.items():
        fpr, tpr, _ = roc_curve(is_hallucinated, values)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Hallucination Detection')
    plt.legend(loc="lower right")
    plt.savefig(output_path)
    plt.close()
    print(f"\nROC curve saved as '{output_path}'.")


# Match Hallucination Labels & Computed Uncertainty Metrics

In [None]:
# Records for labelled questions, plus the matching counters
results = []
matched_count = 0
total_questions = len(data)

# Score every question and attach its hallucination label
for item_id, item in data.items():
    question = item['question']
    cluster_probabilities = item['cluster_probabilities']

    # Set ids are synthesized, one distinct id per cluster
    # (the input file is noted as not containing them)
    num_clusters = item.get('num_clusters', len(cluster_probabilities))
    semantic_set_ids = list(range(num_clusters))

    # Uncertainty scores for this question
    mutual_info = compute_mutual_information(cluster_probabilities)
    predictive_entropy = compute_predictive_entropy(cluster_probabilities)
    entropy_over_concepts = compute_entropy_over_concepts(cluster_probabilities, semantic_set_ids)
    margin_probability = compute_margin_probability(cluster_probabilities)
    semantic_entropy = item.get('semantic_entropy', 0)

    # Questions without a label are scored but not recorded
    if item_id not in hallucination_labels:
        continue

    question_label = hallucination_labels[item_id]['question_label']
    is_correct = int(question_label != "Hallucinated")
    matched_count += 1

    results.append({
        'id': item_id,
        'question': question,
        'mutual_information': mutual_info.item(),
        'predictive_entropy': predictive_entropy.item(),
        'entropy_over_concepts': entropy_over_concepts.item(),
        'margin_probability': margin_probability.item(),
        'semantic_entropy': semantic_entropy,
        'correct': is_correct,
        'hallucination_status': question_label,
    })

# Replace missing or NaN score values with 0 before building the table
metric_names = ['mutual_information', 'predictive_entropy', 'entropy_over_concepts', 'margin_probability', 'semantic_entropy']
for record in results:
    for metric in metric_names:
        if metric not in record or math.isnan(record[metric]):
            record[metric] = 0

# Build the DataFrame from the cleaned records
results_df = pd.DataFrame(results)



  variance = torch.var(cluster_probs, dim=0)


# Results

In [None]:
# AUROC of each uncertainty score
evaluation_metrics = evaluate_uncertainty_metrics(results_df)
print("\nEvaluation of Uncertainty Metrics:")
for metric_name, auroc_value in evaluation_metrics.items():
    print(f"{metric_name}: {auroc_value:.4f}")

# How many questions had a hallucination label
print(f"Successfully matched {matched_count} out of {total_questions} questions with hallucination labels.")
print(f"Matching rate: {matched_count/total_questions:.2%}")

# Mean of each score for non-hallucinated vs hallucinated answers, with the
# relative difference as a percentage of the non-hallucinated mean
print("\nSummary Statistics for Non-hallucinated vs Hallucinated:")
metrics = ['mutual_information', 'predictive_entropy', 'entropy_over_concepts', 'margin_probability', 'semantic_entropy']
correct_rows = results_df[results_df['correct'] == 1]
hallucinated_rows = results_df[results_df['correct'] == 0]
for metric in metrics:
    non_hallucinated_mean = correct_rows[metric].mean()
    hallucinated_mean = hallucinated_rows[metric].mean()
    diff_percent = (hallucinated_mean - non_hallucinated_mean) / non_hallucinated_mean * 100
    print(
        f"{metric}:",
        f"  Non-hallucinated mean: {non_hallucinated_mean:.4f}",
        f"  Hallucinated mean: {hallucinated_mean:.4f}",
        f"  Difference: {diff_percent:+.1f}%",
        sep="\n",
    )

# Write the ROC figure
plot_roc_curves(results_df)

# Persist the per-question table and the AUROC summary
results_df.to_csv('uncertainty_hallucination_metrics.csv', index=False)
with open('uncertainty_evaluation_metrics.json', 'w') as f:
    json.dump(evaluation_metrics, f, indent=4)

print("\nAnalysis complete! Results saved.")




Evaluation of Uncertainty Metrics:
mutual_information_auroc: 0.2021
predictive_entropy_auroc: 0.7979
entropy_over_concepts_auroc: 0.5643
margin_probability_auroc: 0.6151
semantic_entropy_auroc: 0.7979
Successfully matched 1190 out of 1190 questions with hallucination labels.
Matching rate: 100.00%

Summary Statistics for Non-hallucinated vs Hallucinated:
mutual_information:
  Non-hallucinated mean: 0.1473
  Hallucinated mean: 0.0377
  Difference: -74.4%
predictive_entropy:
  Non-hallucinated mean: 0.6991
  Hallucinated mean: 1.2999
  Difference: +85.9%
entropy_over_concepts:
  Non-hallucinated mean: 0.1541
  Hallucinated mean: 0.1791
  Difference: +16.2%
margin_probability:
  Non-hallucinated mean: 0.4348
  Hallucinated mean: 0.2005
  Difference: -53.9%
semantic_entropy:
  Non-hallucinated mean: 0.6991
  Hallucinated mean: 1.2999
  Difference: +85.9%

ROC curve saved as 'roc_curves.png'.

Analysis complete! Results saved.
