## Setup and Imports

In [None]:
!pip install -q datasets transformers accelerate

import random
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW
from datasets import load_dataset

# Seed Python's and PyTorch's global RNGs up front. The notebook draws random
# negatives with `random`, and PyTorch's RNG drives weight initialisation and
# DataLoader shuffling, so without a seed every run produces different data
# and different metrics.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


## Data Loading and Preprocessing (Logic 1: Mismatched Summaries)

In [None]:
# Fetch the CNN/DailyMail corpus; config "3.0.0" is the one commonly used
# for summarization work.
cnn_dm = load_dataset("cnn_dailymail", "3.0.0")

# Subset sizes for a quick prototype; raise them for a fuller run.
N_TRAIN, N_VAL = 5000, 1000

# Shuffle each split with a fixed seed, then keep its first N rows.
train_raw = cnn_dm["train"].shuffle(seed=42).select(range(N_TRAIN))
val_raw = cnn_dm["validation"].shuffle(seed=42).select(range(N_VAL))

def build_pairs_from_split(split, seed=None):
    """
    Build positive and negative (article, summary, label) lists
    from a given split of the CNN/DM dataset.

    For every row i one positive pair (article_i, summary_i, 1) and one
    negative pair (article_i, summary_j, 0) with j != i are produced.

    Args:
        split: object indexable by "article" and "highlights"; both are
            read with the same integer positions, so they are assumed to
            have equal length.
        seed: optional seed. When given, the global `random` generator is
            re-seeded before sampling so the negatives are repeatable.

    Returns:
        (sources, summaries, labels): three parallel lists holding all
        positives first (in split order) followed by all negatives.

    Raises:
        ValueError: if the split has exactly one row, because no *other*
            summary exists to form a mismatched negative.
    """
    articles = split["article"]
    summaries = split["highlights"]
    n = len(articles)

    if n == 1:
        raise ValueError(
            "Need at least two rows to build a mismatched negative pair."
        )

    if seed is not None:
        random.seed(seed)

    pos_sources = [articles[i] for i in range(n)]
    pos_summaries = [summaries[i] for i in range(n)]

    neg_summaries = []
    for i in range(n):
        # Draw j uniformly from the n-1 indices other than i: sample from
        # [0, n-2] and skip over i. Unlike a retry loop this always ends.
        j = random.randrange(n - 1)
        if j >= i:
            j += 1
        neg_summaries.append(summaries[j])

    # concat pos + neg (negatives reuse the same articles, in the same order)
    sources = pos_sources + pos_sources
    summs   = pos_summaries + neg_summaries
    labels  = [1] * n + [0] * n

    return sources, summs, labels

# Negatives are sampled with the `random` module; seed it here so the
# generated pairs (and therefore the reported metrics) are repeatable.
random.seed(42)

train_sources, train_summaries, train_labels = build_pairs_from_split(train_raw)
val_sources,   val_summaries,   val_labels   = build_pairs_from_split(val_raw)

# The three lists are consumed in parallel downstream, so they must line up.
assert len(train_sources) == len(train_summaries) == len(train_labels)
assert len(val_sources) == len(val_summaries) == len(val_labels)

print("Train pairs:", len(train_sources))
print("Val pairs:", len(val_sources))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Train pairs: 10000
Val pairs: 2000


Epoch 1/3 | train_loss=0.1277 | val_acc=0.9995 <br>
Epoch 2/3 | train_loss=0.0091 | val_acc=1.0000 <br>
Epoch 3/3 | train_loss=0.0019 | val_acc=1.0000

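The near-perfect validation accuracy above suggests that mismatched summaries alone are too easy to tell apart from true ones. To address this problem, I introduced Corrupted Summaries alongside Mismatched Summaries: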

**Mismatched Summaries (Easy Negatives)**

Definition: Pair the article with a summary taken from a different article. <br>
Goal: Teach the model to detect when the summary is about the wrong topic entirely.

**Corrupted Summaries (Hard Negatives)**

Definition: Take the correct summary and make a small factual change (e.g., flip one number or slightly alter an entity; the implementation below only changes a number). <br>
Goal: Teach the model to detect subtle hallucinations and fact inconsistencies, not just topic mismatch.

## Data Loading and Preprocessing (Logic 2: Mismatched & Less Aggressive Corrupted Summaries)


In [None]:
import re
import random

# Fetch CNN/DailyMail (config "3.0.0").
cnn_dm = load_dataset("cnn_dailymail", "3.0.0")

# Larger subsets than in Logic 1; adjust as needed.
N_TRAIN, N_VAL = 10000, 2000

# Shuffle each split with a fixed seed, then keep its first N rows.
train_raw = cnn_dm["train"].shuffle(seed=42).select(range(N_TRAIN))
val_raw = cnn_dm["validation"].shuffle(seed=42).select(range(N_VAL))


# -------------------------------------------------------------
# 1. Gentle corruption (flip ONLY ONE number, sometimes)
# -------------------------------------------------------------
def corrupt_summary(summary: str, corrupt_prob: float = 0.3, max_delta: int = 5) -> str:
    """
    Mild corruption: shift ONE randomly chosen number by a small amount.

    Args:
        summary: text to corrupt.
        corrupt_prob: chance of corrupting a summary that contains at least
            one number (default 0.3); otherwise the text is returned as is.
        max_delta: largest absolute change applied to the chosen number
            (default 5; the change is drawn from 1..max_delta).

    Returns:
        The summary with exactly one number changed, or the unchanged
        summary when it has no digits or the random draw skips corruption.
    """
    # Keep match objects (not just the digit strings) so the chosen number
    # can be replaced at its own position in the text.
    matches = list(re.finditer(r"\d+", summary))
    if not matches:
        return summary

    # Only corrupt some of the time.
    if random.random() > corrupt_prob:
        return summary

    # choose ONE number to corrupt
    target = random.choice(matches)
    try:
        val = int(target.group())
    except ValueError:
        # Digit run that int() cannot parse: leave the text untouched.
        return summary

    delta = random.randint(1, max_delta)
    signed = random.choice([-delta, delta])
    # Never produce a negative number: "3" -> "-2" would render as e.g.
    # "24--2", which is not a realistic hallucination.
    if val + signed < 0:
        signed = delta
    new_val = str(val + signed)

    # Splice by span. str.replace(target, ..., 1) would edit the FIRST
    # occurrence of the digits, which may be inside another number
    # (e.g. the "3" in "13") rather than the number that was chosen.
    start, end = target.span()
    return summary[:start] + new_val + summary[end:]


# -------------------------------------------------------------
# 2. Updated dataset builder with mild corruption
# -------------------------------------------------------------
def build_pairs_from_split(
    split,
    neg_mismatch_ratio=1.0,     # always include mismatched negatives
    neg_corrupt_ratio=0.2,      # only 20% corrupted negatives
    seed=None,
):
    """
    Build (source, summary, label) for faithfulness classifier.

    Positives (label=1):
        - Article_i paired with its gold summary_i

    Negatives (label=0):
        - Mismatched (Article_i + Summary_j)
        - Mild corrupted (Article_i + corrupted summary_i)

    Args:
        split: object indexable by "article" and "highlights"; both are
            read with the same integer positions, so they are assumed to
            have equal length.
        neg_mismatch_ratio: probability that a row contributes a mismatched
            negative. Values >= 1 add one for every row, values <= 0 add
            none.
        neg_corrupt_ratio: probability that a corrupted negative is
            attempted for a row. It is only kept when `corrupt_summary`
            actually changed the text.
        seed: optional seed. When given, the global `random` generator is
            re-seeded before sampling so the output is repeatable.

    Returns:
        (sources, summaries, labels): three parallel lists with all
        positives first, followed by the negatives in row order.

    Raises:
        ValueError: if mismatched negatives are requested but the split has
            exactly one row (there is no other summary to pair with).
    """

    articles = split["article"]
    summaries = split["highlights"]
    n = len(articles)

    if neg_mismatch_ratio > 0 and n == 1:
        raise ValueError(
            "Need at least two rows to build a mismatched negative pair."
        )

    if seed is not None:
        random.seed(seed)

    pos_src, pos_sum, pos_lbl = [], [], []
    neg_src, neg_sum, neg_lbl = [], [], []

    for i in range(n):
        src = articles[i]
        summ = summaries[i]

        # ----------------
        # Positive example
        # ----------------
        pos_src.append(src)
        pos_sum.append(summ)
        pos_lbl.append(1)

        # ----------------------------------
        # Negative 1: mismatched summary
        # (sampled with probability neg_mismatch_ratio; no random draw is
        #  spent when the ratio is <= 0 or >= 1)
        # ----------------------------------
        if neg_mismatch_ratio > 0 and (
            neg_mismatch_ratio >= 1.0 or random.random() < neg_mismatch_ratio
        ):
            # Draw j uniformly from the indices other than i: sample from
            # [0, n-2] and skip over i. Unlike a retry loop this always ends.
            j = random.randrange(n - 1)
            if j >= i:
                j += 1

            neg_src.append(src)
            neg_sum.append(summaries[j])
            neg_lbl.append(0)

        # ----------------------------------
        # Negative 2: mildly corrupted summary
        # (only sometimes, controlled by neg_corrupt_ratio)
        # ----------------------------------
        if random.random() < neg_corrupt_ratio:
            corrupted = corrupt_summary(summ)
            if corrupted != summ:    # only if corruption changed something
                neg_src.append(src)
                neg_sum.append(corrupted)
                neg_lbl.append(0)

    # merge
    sources = pos_src + neg_src
    summs   = pos_sum + neg_sum
    labels  = pos_lbl + neg_lbl

    return sources, summs, labels


# -------------------------------------------------------------
# Build datasets
# -------------------------------------------------------------
# Both negative types are sampled with the `random` module; seed it here so
# the generated pairs (and therefore the reported metrics) are repeatable.
random.seed(42)

train_sources, train_summaries, train_labels = build_pairs_from_split(train_raw)
val_sources,   val_summaries,   val_labels   = build_pairs_from_split(val_raw)

# The three lists are consumed in parallel downstream, so they must line up.
assert len(train_sources) == len(train_summaries) == len(train_labels)
assert len(val_sources) == len(val_summaries) == len(val_labels)

print("Train pairs:", len(train_sources))
print("Val pairs:", len(val_sources))


Train pairs: 20348
Val pairs: 4095


## Define Custom Dataset Class

In [None]:
class FaithfulnessDataset(Dataset):
    """
    Dataset of (source, summary, label) triples for the faithfulness
    classifier. Each item is tokenized on access as a text pair.

    Args:
        sources: sequence of source texts.
        summaries: sequence of summary texts, parallel to `sources`.
        labels: sequence of integer class labels, parallel to `sources`.
        tokenizer: callable invoked as
            tokenizer(src, summ, truncation=True, max_length=...,
                      padding="max_length", return_tensors="pt")
            and expected to return a mapping of tensors with a leading
            size-1 batch dimension.
        max_length: length every encoded pair is truncated/padded to.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """

    def __init__(self, sources, summaries, labels, tokenizer, max_length=512):
        # Fail fast on misaligned inputs instead of hitting an IndexError
        # (or silently ignoring trailing rows) partway through training.
        if not (len(sources) == len(summaries) == len(labels)):
            raise ValueError(
                "sources, summaries and labels must have the same length "
                f"(got {len(sources)}, {len(summaries)}, {len(labels)})"
            )
        self.sources = sources
        self.summaries = summaries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of (source, summary, label) triples."""
        return len(self.sources)

    def __getitem__(self, idx):
        """Return the tokenizer's tensors for row `idx` plus a "labels" tensor."""
        src = self.sources[idx]
        summ = self.summaries[idx]
        label = self.labels[idx]

        # Encode article and summary together as one pair, fixed to
        # max_length so items can be stacked into a batch.
        enc = self.tokenizer(
            src,
            summ,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop the leading batch dimension of each returned tensor.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

## Initialize Tokenizer and Model

In [None]:
FAITH_MODEL_NAME = "roberta-base"   # pretrained backbone that gets fine-tuned

faith_tokenizer = AutoTokenizer.from_pretrained(FAITH_MODEL_NAME)

# Two output classes: 0 = unfaithful, 1 = faithful.
faith_model = AutoModelForSequenceClassification.from_pretrained(
    FAITH_MODEL_NAME, num_labels=2
)
faith_model = faith_model.to(device)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Training Function


In [None]:
def train_faithfulness_model(
    model,
    tokenizer,
    train_sources,
    train_summaries,
    train_labels,
    val_sources,
    val_summaries,
    val_labels,
    epochs=3,
    batch_size=8,
    lr=2e-5,
    max_length=512,
):
    """
    Fine-tune `model` as a binary faithfulness classifier and report
    validation accuracy after every epoch.

    Args:
        model: sequence-classification model whose forward pass returns an
            object with a `.logits` attribute.
        tokenizer: tokenizer handed to `FaithfulnessDataset`.
        train_sources, train_summaries, train_labels: parallel training lists.
        val_sources, val_summaries, val_labels: parallel validation lists.
        epochs: number of passes over the training data.
        batch_size: batch size for both data loaders.
        lr: peak learning rate for AdamW.
        max_length: token length each pair is truncated/padded to.

    Returns:
        The same `model` object after the final epoch (not the
        best-scoring epoch).

    Notes:
        Uses the notebook-level `device`. Prints one progress line per epoch.
    """
    train_ds = FaithfulnessDataset(train_sources, train_summaries, train_labels, tokenizer, max_length)
    val_ds   = FaithfulnessDataset(val_sources,   val_summaries,   val_labels,   tokenizer, max_length)

    # Only the training loader is shuffled; validation order is irrelevant.
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl   = DataLoader(val_ds,   batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The scheduler is stepped once per batch, so its horizon is the total
    # number of batches; the first 10% of them are warmup.
    total_steps = len(train_dl) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    loss_fn = nn.CrossEntropyLoss()

    model.to(device)

    for epoch in range(epochs):
        # ---- Training ----
        model.train()
        total_loss = 0.0

        for batch in train_dl:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Labels are removed from the batch so the loss is computed
            # here with loss_fn rather than inside the model call.
            labels = batch.pop("labels")

            outputs = model(**batch)
            logits = outputs.logits  # [B, 2]
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        # Guard against an empty training loader, mirroring the guard on
        # the validation accuracy below (previously a ZeroDivisionError).
        avg_train_loss = total_loss / len(train_dl) if len(train_dl) > 0 else 0.0

        # ---- Validation ----
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_dl:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop("labels")

                logits = model(**batch).logits
                preds = logits.argmax(dim=-1)

                correct += (preds == labels).sum().item()
                total += labels.size(0)

        val_acc = correct / total if total > 0 else 0.0

        print(
            f"Epoch {epoch+1}/{epochs} | "
            f"train_loss={avg_train_loss:.4f} | val_acc={val_acc:.4f}"
        )

    return model

## Train Faithfulness Model

• Skip this step if a trained model has already been saved to Google Drive; load it in the "Load Saved Model for Inference" section instead.

In [None]:
# Fine-tune the classifier; epochs and batch_size are the main knobs to tune.
faith_model = train_faithfulness_model(
    model=faith_model,
    tokenizer=faith_tokenizer,
    train_sources=train_sources,
    train_summaries=train_summaries,
    train_labels=train_labels,
    val_sources=val_sources,
    val_summaries=val_summaries,
    val_labels=val_labels,
    epochs=3,
    batch_size=8,
    lr=2e-5,
    max_length=512,
)

Epoch 1/3 | train_loss=0.1426 | val_acc=0.9699
Epoch 2/3 | train_loss=0.0830 | val_acc=0.9763
Epoch 3/3 | train_loss=0.0614 | val_acc=0.9797


Epoch 1/3 | train_loss=0.1426 | val_acc=0.9699 <br>
Epoch 2/3 | train_loss=0.0830 | val_acc=0.9763 <br>
Epoch 3/3 | train_loss=0.0614 | val_acc=0.9797

## Save Trained Model and Tokenizer

In [None]:
from google.colab import drive

# SAVE_DIR lives under the Drive mount point, so Drive must be mounted
# before saving. Without this, a fresh "Run All" writes the files to the
# runtime's local disk at that path instead of to Drive.
drive.mount('/content/gdrive')

SAVE_DIR = "/content/gdrive/MyDrive/faithfulness_model_cnn_dm"

# Store the weights/config and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained(SAVE_DIR).
faith_model.save_pretrained(SAVE_DIR)
faith_tokenizer.save_pretrained(SAVE_DIR)

print("Saved faithfulness model to:", SAVE_DIR)

Saved faithfulness model to: /content/gdrive/MyDrive/faithfulness_model_cnn_dm


## Load Saved Model for Inference

In [None]:
# Mount Google Drive at /content/gdrive; the saved model directory used in
# the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Recompute the device here so this cell does not depend on the setup cell
# having been run in the current session.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Directory the fine-tuned model and tokenizer were saved to.
LOAD_DIR = "/content/gdrive/MyDrive/faithfulness_model_cnn_dm"

factcc_tokenizer = AutoTokenizer.from_pretrained(LOAD_DIR)
factcc_model = AutoModelForSequenceClassification.from_pretrained(LOAD_DIR)
factcc_model = factcc_model.to(device)

## Define Faithfulness Check Function


In [None]:
def check_faithfulness(
    source_text: str,
    summary_text: str,
    max_length: int = 512,
) -> float:
    """
    Score how faithful `summary_text` is to `source_text`.

    The pair is encoded with the notebook-level `factcc_tokenizer`, run
    through `factcc_model` on `device`, and the softmax probability of
    class index 1 is returned.

    Class indices: 1 = faithful, 0 = unfaithful.

    Returns:
        P(faithful | source_text, summary_text) as a float in [0, 1].
    """
    # Inference only: put the model in eval mode.
    factcc_model.eval()

    encoded = factcc_tokenizer(
        source_text,
        summary_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        model_out = factcc_model(**encoded)

    # logits are [1, 2]; take row 0, column 1 (the "faithful" class).
    class_probs = F.softmax(model_out.logits, dim=-1)
    return class_probs[0, 1].item()

## Demonstrate Faithfulness Check

In [None]:
# Sanity check on rows taken from train_raw (the training subset): pair
# article 0 with its own highlights and with the highlights of article 1.
_first_row = train_raw[0]
src = _first_row["article"]
true_summ = _first_row["highlights"]
wrong_summ = train_raw[1]["highlights"]

true_score = check_faithfulness(src, true_summ)
print("True pair score: ", true_score)
wrong_score = check_faithfulness(src, wrong_summ)
print("Wrong pair score:", wrong_score)

True pair score:  0.99652498960495
Wrong pair score: 0.00022908105165697634


## Checking Unseen Source & Summary Faithfulness

In [None]:
source = "Lucas Matthysse won a majority decision against Ruslan Provodnikov in a 12-round super lightweight bout on Saturday night. Matthysse landed the majority of the punches in the first round and opening a cut near Provodnikov's left eye early in the second. Provodnikov (24-3) put Matthysse on the ropes late in the third round and landed two hard right hook-left hook combos in the fourth before Matthysse (37-3) regained control in the fifth. He continued to use his three-inch reach advantage to keep Provodnikov at bay, giving him room to dodge the Russian's powerful left hook. Lucas Matthysse celebrates after his win against Ruslan Provodnikov at the Turning Stone Resort Casino . Provodnikov (right) lands an uppercut to the head of Matthysse despite having a cut opened up early on . Provodnikov (24-3) put Matthysse on the ropes late in the third round and landed two hard right hook-left hook combos in the fourth before Matthysse (37-3) regained control in the fifth. He continued to use his three-inch reach advantage to keep Provodnikov at bay, giving him room to dodge the Russian's powerful left hook. Provodnikov landed several punches in the later rounds but couldn't knock down his Argentine opponent down, although he did stagger Matthysse in the 11th. 'He did, he hurt me,' Matthysse said through a translator. 'But I was able to withstand the onslaught. He's a very tough fighter. He's very strong. He just keeps coming forward.' Don Ackerman scored the fight as a draw 114-114, but Glenn Feldman and John McKaie both scored it 115-113 in favor of Matthysse. 'To me, he was better today,' Provodnikov said through a translator. 'He was the better man in the ring and, you know, it was a close fight but he won and I hope everybody enjoyed it.' The night's undercard was quickly decided when Patrick Teixeira (25-0, 21 KOs) won with a second-round knockout of Patrick Allotey (30-2) in their middleweight bout. 
Provodnikov admitted that Matthysse was better than him on the day in the majority decision win . Matthysse said he just wanted to rest but talked up a fight with Floyd Mayweather or Manny Pacquiao next . The sell-out crowd then had to wait more than an hour for the main event, watching the TV feed of Terence Crawford beating Thomas Dulorme for the vacant WBO junior welterweight title in Austin, Texas. WBO president Paco Valcarcel tweeted he would like to see Crawford defend his title against Matthysse, but the Argentinian had his sights set higher. 'For right now I just want to rest,' Matthysse said. 'I got my daughter, my family waiting for me back home. I want to rest. I want to go back there and see them so much and let's see what happens with (Manny) Pacquiao and (Floyd) Mayweather.' Matthysse and Provodnikov prepare to hug at the end of their fight on Saturday night in Verona, New York ."

In [None]:
# Summary of an unrelated football story (Hannover / Tayfun Korkut); used as
# a mismatched negative for the boxing article in `source`.
summary_wrong = "Tayfun Korkut has been sacked by Hannover after 13 games without a win . Michael Frontzek has signed a contract valid for the remaining five matches . Korkut is returning to the Bundesliga as coach for the first time since 2011 when he was fired by Borussia Moenchengladbach."


In [None]:
# Taken from Venkat's Doc File - Normal Summary for Item 3002
# The text repeats the opening sentences of `source` almost verbatim and is
# cut off mid-word at the end ("combo").

summary_right = "Lucas Matthysse won a majority decision against Ruslan Provodnikov in a 12-round super lightweight bout on Saturday night . Matthysse landed the majority of the punches in the first round and opening a cut near Provodnikov's left eye early in the second . Provodnikov (24-3) put Matthysse on the ropes late in the third round and landed two hard right hook-left hook combo"

In [None]:
# Shorter, reworded summary of `source`.
# NOTE(review): "beat ... 1-0" does not appear in `source` (the article
# reports a majority decision), so despite its name this summary contains
# an unsupported detail.
summary_right_simple = "Lucas Matthysse beat Ruslan Provodnikov 1-0 at the Turning Stone Resort Casino. Matthysse opened a cut near Provodnikov's left eye early in the second round. Patrick Teixeira (25-0, 21 KOs) won with a second-round knockout of Patrick Allotey (30-2)"

### Sensitive to syntax - GPT

In [None]:
# Summary of `source` written in its own words rather than copied sentences.
summary_right_reference = "Lucas Matthysse won a majority decision against Ruslan Provodnikov . Matthysse managed to open a cut early on, landing majority of punches . But he was on the ropes before regaining control in the fifth round . He outlined his plans to fight Floyd Mayweather or Manny Pacquiao ."

In [None]:
# Score the unrelated football summary against the boxing article.
score = check_faithfulness(source, summary_wrong)
print("Faithfulness score:", score)


Faithfulness score: 0.00023873784812167287


In [None]:
# Score the near-verbatim summary of the article.
score = check_faithfulness(source, summary_right)
print("Faithfulness score:", score)


Faithfulness score: 0.9959813356399536


In [None]:
# Score the shorter reworded summary (the one containing "1-0").
score = check_faithfulness(source, summary_right_simple)
print("Faithfulness score:", score)

Faithfulness score: 0.5919630527496338


In [None]:
# Score the reference-style summary written in its own words.
score = check_faithfulness(source, summary_right_reference)
print("Faithfulness score:", score)

Faithfulness score: 0.9957876801490784


In [None]:
# Single-sentence summary covering only the undercard result; the same
# sentence appears word for word in `source`.
su = "Patrick Teixeira (25-0, 21 KOs) won with a second-round knockout of Patrick Allotey (30-2)"

In [None]:
# Score the single-sentence undercard summary against the full article.
score = check_faithfulness(source, su)
print("Faithfulness score:", score)

Faithfulness score: 0.9957042336463928
