In [1]:
import numpy as np
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Pick the device index handed to the Hugging Face pipeline below:
# 0 when CUDA is available, otherwise -1.
# (The recorded run of this cell logged "Device set to use cpu".)
device = 0 if torch.cuda.is_available() else -1

# Load the NLI checkpoint and its tokenizer by name. On first use these calls
# fetch the files from the Hugging Face Hub (the recorded output shows a
# 1.62G pytorch_model.bin download), so this cell is slow on a fresh runtime.
entailment_model_name = "microsoft/deberta-large-mnli"
model = AutoModelForSequenceClassification.from_pretrained(entailment_model_name)
tokenizer = AutoTokenizer.from_pretrained(entailment_model_name)

# Shared text-classification pipeline; check_entailment() reads this global
# and expects each result to carry a 'label' and a 'score'.
entailment_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    framework="pt"  # force the PyTorch backend
)

# Entailment check function between two sentences
def check_entailment(sent1, sent2, threshold=0.9):
    """Return whether the NLI pipeline confidently labels the pair as entailment.

    The pair is sent to the module-level ``entailment_pipe`` as a single
    ``{"text": sent1, "text_pair": sent2}`` item and only the first result is
    inspected.

    Args:
        sent1: String passed to the pipeline as ``text``.
        sent2: String passed to the pipeline as ``text_pair``.
        threshold: Score the prediction must strictly exceed to count.
            Defaults to 0.9, the value that used to be hard-coded.

    Returns:
        True when the predicted label is "ENTAILMENT" (compared
        case-insensitively) and its score is greater than ``threshold``;
        False otherwise.
    """
    result = entailment_pipe([{"text": sent1, "text_pair": sent2}])[0]
    # Compare case-insensitively so a checkpoint whose config spells the label
    # "entailment" is not silently treated as never entailing.
    is_entailment = str(result["label"]).upper() == "ENTAILMENT"
    return is_entailment and result["score"] > threshold

# Cluster output responses by bidirectional entailment
def cluster_by_bidirectional_entailment(outputs):
    """Greedily group responses that entail each other in both directions.

    Each response is compared, in order, against the first member of every
    existing group. It joins the first group whose representative it entails
    and is entailed by; if no group matches, it starts a new one.

    Args:
        outputs: Iterable of response strings, processed in order.

    Returns:
        A list of groups (lists of responses) in order of creation.
    """
    groups = []
    for candidate in outputs:
        for members in groups:
            representative = members[0]
            # The forward direction is tested first; the reverse direction is
            # only evaluated when the forward one holds.
            if not check_entailment(candidate, representative):
                continue
            if not check_entailment(representative, candidate):
                continue
            members.append(candidate)
            break
        else:
            # No existing group accepted the candidate.
            groups.append([candidate])
    return groups

# Compute semantic entropy in bits
def semantic_entropy(clusters, base=2):
    """Shannon entropy of the distribution of responses over clusters.

    Each non-empty cluster contributes its share of the total number of
    responses as a probability; the entropy of those shares is returned in the
    requested logarithm base (base 2, i.e. bits, by default).

    Args:
        clusters: Sequence of clusters, each a sized collection of responses.
            Empty clusters are ignored.
        base: Logarithm base for the entropy. Defaults to 2.

    Returns:
        A non-negative ``numpy.float64``. The result is exactly ``0.0`` (never
        ``-0.0``) when there is a single cluster or no responses at all.
    """
    sizes = np.array(
        [len(cluster) for cluster in clusters if len(cluster) > 0],
        dtype=float,
    )
    if sizes.size == 0:
        # No responses at all: define the entropy as zero rather than
        # negating an empty sum.
        return np.float64(0.0)

    proportions = sizes / sizes.sum()
    entropy = -np.sum(proportions * np.log(proportions) / np.log(base))
    # A single cluster yields -(0.0) == -0.0, which prints as "-0.000".
    # Adding +0.0 maps negative zero to positive zero and changes nothing else.
    return entropy + 0.0

# Check hallucination given response set
def detect_hallucination_from_responses(responses, entropy_threshold=0.8):
    """Flag a prompt as hallucinated when its responses disagree semantically.

    Blank and non-string responses are dropped, the remainder are clustered
    with ``cluster_by_bidirectional_entailment``, and the base-2 entropy of
    the cluster sizes is compared against ``entropy_threshold``.

    Args:
        responses: Iterable of sampled responses for one prompt. Entries that
            are not strings (for example the float NaN pandas yields for a
            blank CSV cell, or None) or that are whitespace-only are ignored.
        entropy_threshold: Entropy (in bits) strictly above which the
            response set is considered a hallucination. Defaults to 0.8.

    Returns:
        A tuple ``(hallucinated, ent, clusters)``: the boolean decision, the
        entropy value, and the list of clusters it was computed from.
    """
    # Guard with isinstance first: calling .strip() on NaN/None would raise
    # AttributeError and abort the whole evaluation loop.
    non_empty_resps = [r for r in responses if isinstance(r, str) and r.strip()]
    clusters = cluster_by_bidirectional_entailment(non_empty_resps)
    ent = semantic_entropy(clusters, base=2)
    hallucinated = ent > entropy_threshold
    return hallucinated, ent, clusters



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [6]:
import pandas as pd
from sklearn.metrics import classification_report

# Load the evaluation set. The loop below reads the columns 'prompt',
# 'response_1' .. 'response_3' and 'target' from each row.
df = pd.read_csv(r'/content/dataset.csv')

# Parallel label lists filled in row order: 1 = hallucination, 0 = no hallucination.
# A later cell reuses them for the confusion matrix.
y_true = []
y_pred = []

for idx, row in df.iterrows():
    prompt = row['prompt']
    # Collect responses based on new format: 3 responses instead of 10
    # (range(1, 4) selects response_1, response_2, response_3).
    responses = [row[f'response_{i}'] for i in range(1, 4)]

    # Uses the function's default entropy_threshold.
    hallucinated, entropy_value, clusters = detect_hallucination_from_responses(responses)

    # Per-prompt diagnostics, followed by a 50-character separator line.
    print(f"Prompt: {prompt}")
    print(f"Hallucination Detected: {hallucinated}")
    print(f"Semantic Entropy (bits): {entropy_value:.3f}")
    print(f"Number of Clusters: {len(clusters)}")
    print("="*50)

    # Gold label: 'yes' (ignoring case and surrounding whitespace) maps to 1,
    # any other string to 0.
    # NOTE(review): .strip() assumes every 'target' cell is a string; a blank
    # cell would presumably not be — confirm the CSV has no missing targets.
    y_true.append(1 if row['target'].strip().lower() == 'yes' else 0)
    y_pred.append(1 if hallucinated else 0)

# Summary metrics over all rows; target_names are listed in label order 0, 1.
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=['No Hallucination', 'Hallucination']))


Prompt: Who wrote 'Pride and Prejudice'?
Hallucination Detected: True
Semantic Entropy (bits): 0.918
Number of Clusters: 2
Prompt: What is the capital of France?
Hallucination Detected: False
Semantic Entropy (bits): -0.000
Number of Clusters: 1
Prompt: When did humans land on Mars?
Hallucination Detected: False
Semantic Entropy (bits): -0.000
Number of Clusters: 1
Prompt: What year did the Titanic sink?
Hallucination Detected: False
Semantic Entropy (bits): -0.000
Number of Clusters: 1
Prompt: Who invented the telephone?
Hallucination Detected: True
Semantic Entropy (bits): 0.918
Number of Clusters: 2
Prompt: Is Mount Everest the tallest mountain in the world?
Hallucination Detected: False
Semantic Entropy (bits): -0.000
Number of Clusters: 1
Prompt: Did World War I start in 1914?
Hallucination Detected: False
Semantic Entropy (bits): -0.000
Number of Clusters: 1
Prompt: What is the boiling point of water in Celsius?
Hallucination Detected: True
Semantic Entropy (bits): 1.585
Number o

In [12]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the y_true / y_pred lists built by the evaluation cell,
# with the label order pinned to [0, 1].
# NOTE(review): per scikit-learn's convention rows should be true labels and
# columns predicted labels — confirm against the confusion_matrix docs.
print(confusion_matrix(y_true, y_pred,labels=[0,1]))

[[5 5]
 [1 9]]
