In [5]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Load public NLI-based factual consistency model
model_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

# Label mapping: 0 = contradiction, 1 = neutral, 2 = entailment
labels = ["Contradiction", "Neutral", "Entailment"]

# === Load and Save Functions ===
def load_json(path):
    with open(path, 'r') as f:
        return json.load(f)

def save_json(data, path):
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)

# === Evaluation Function ===
def evaluate_factual_consistency(data):
    results = []
    for item in tqdm(data, desc="Evaluating answers"):
        hypothesis = item.get("answer", "").strip()
        premise = item.get("ground_truth", "").strip()

        if not hypothesis or not premise:
            item["factcc_label"] = "Skipped"
            item["factcc_confidence"] = 0.0
            results.append(item)
            continue

        inputs = tokenizer(hypothesis, premise, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)

        pred_label_idx = torch.argmax(probs).item()
        predicted_label = labels[pred_label_idx]
        confidence = round(probs[0, pred_label_idx].item(), 4)

        item["factcc_label"] = "Entailed" if predicted_label == "Entailment" else "Not Entailed"
        item["factcc_confidence"] = confidence
        results.append(item)

    return results


#  input/output file paths
input_path = "rag_responses_vanilla_evaluation_data.json"
output_path = "rag_responses_vanilla_evaluation_data_FACTCC.json"

data = load_json(input_path)
evaluated_data = evaluate_factual_consistency(data)
save_json(evaluated_data, output_path)

print(f"✅ Factual consistency results saved to: {output_path}")


Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Evaluating answers: 100%|██████████| 271/271 [06:36<00:00,  1.46s/it]

✅ Factual consistency results saved to: rag_responses_vanilla_evaluation_data_FACTCC.json





In [6]:
import json
from collections import Counter
import numpy as np

with open('rag_responses_vanilla_evaluation_data_FACTCC.json') as f:
    data = json.load(f)

labels = [item['factcc_label'] for item in data]
confidences = [item['factcc_confidence'] for item in data]

# Get counts
counter = Counter(labels)
total = sum(counter[label] for label in ["Entailed", "Not Entailed"])
fc_rate = counter["Entailed"] / total if total > 0 else 0

# Average confidences
def avg_conf(label):
    return np.mean([item['factcc_confidence'] for item in data if item['factcc_label'] == label]) if counter[label] else 0

print("Total samples:", len(data))
print("Entailed:", counter["Entailed"])
print("Not Entailed:", counter["Not Entailed"])
print("Skipped:", counter["Skipped"])
print("Factual Consistency Rate: {:.2%}".format(fc_rate))
print("Avg Conf Entailed: {:.3f}".format(avg_conf("Entailed")))
print("Avg Conf Not Entailed: {:.3f}".format(avg_conf("Not Entailed")))


Total samples: 271
Entailed: 54
Not Entailed: 217
Skipped: 0
Factual Consistency Rate: 19.93%
Avg Conf Entailed: 0.922
Avg Conf Not Entailed: 0.953
