# Paper 1 – RFFR Deepfake Detection Benchmark

This notebook implements and evaluates the **Residual Feature Fusion for Robust Deepfake Detection (RFFR)**-style model on multiple datasets:

- **FF++ (CViT frames)** for in-dataset training and testing

- **DFDC** and **Celeb-DF** for cross-dataset generalization

- **JPEG compression robustness** experiments to test stability under varying quality levels

> Run the notebook top-to-bottom to train (or load) the model, evaluate it on different benchmarks, and finally see summary visualizations of the main metrics.

Paper link : https://arxiv.org/pdf/2303.08439 (2303.08439v1.pdf)


In [8]:
import os
import random
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import torchvision.transforms as T
import timm

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

IMG_SIZE = 224
BATCH_SIZE = 8        # safe for 3050
EPOCHS = 0
LR = 2e-5

FFPP_REAL_PATH = r""
FFPP_FAKE_PATH = r""

## 2. Dataset & Preprocessing


In this section we define the **FFPPDataset** wrapper around image folders of real and fake faces.


Key points:


- Images are read from disk using `PIL.Image`.
- Each image is resized to `IMG_SIZE × IMG_SIZE`.
- Images are normalized to the `[-1, 1]` range using mean/std of 0.5.


> The resulting dataset is used for FF++, DFDC, and Celeb-DF evaluations by simply changing the input folders.


In [5]:
class FFPPDataset(Dataset):
    def __init__(self, real_path, fake_path):
        self.samples = []

        for f in os.listdir(real_path):
            self.samples.append((os.path.join(real_path, f), 0))

        for f in os.listdir(fake_path):
            self.samples.append((os.path.join(fake_path, f), 1))

        self.transform = T.Compose([
            T.Resize((IMG_SIZE, IMG_SIZE)),
            T.ToTensor(),
            T.Normalize([0.5]*3, [0.5]*3)
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert("RGB")
        img = self.transform(img)
        return img, label


In [6]:
# %%
class FFPPDataset(Dataset):
    def __init__(self, real_path, fake_path):
        self.samples = []

        for f in os.listdir(real_path):
            self.samples.append((os.path.join(real_path, f), 0))

        for f in os.listdir(fake_path):
            self.samples.append((os.path.join(fake_path, f), 1))

        self.transform = T.Compose([
            T.Resize((IMG_SIZE, IMG_SIZE)),
            T.ToTensor(),
            T.Normalize([0.5]*3, [0.5]*3)
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert("RGB")
        img = self.transform(img)
        return img, label


In [7]:
class BlockMasker:
    def __init__(self, k=4):
        self.k = k

    def mask(self, x):
        B, C, H, W = x.shape
        bh, bw = H // self.k, W // self.k
        out = x.clone()

        for b in range(B):
            i = random.randint(0, self.k - 1)
            j = random.randint(0, self.k - 1)

            out[b, :, i*bh:(i+1)*bh, j*bw:(j+1)*bw] = 0

        return out


In [8]:
# %%
class ResidualGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.inpainter = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, 3, padding=1)
        )

    def forward(self, original, masked):
        recon = self.inpainter(masked)
        residual = 4.0 * (recon - original)
        return residual


## 3. Model Architecture – RFFR


The model follows a **dual-branch RFFR-style architecture**:


1. **BlockMasker** randomly masks out a spatial block of the input face.
2. **ResidualGenerator** inpaints the masked image and computes a residual map \(4 \times (\hat{x} - x)\).
3. **DualBranchClassifier** extracts features from both the original image and the residual using a shared ViT backbone and fuses them for classification.


> This design encourages the model to focus on subtle manipulation artifacts rather than only global appearance.


In [9]:
# %%
class DualBranchClassifier(nn.Module):
    def __init__(self):
        super().__init__()

        self.backbone = timm.create_model(
            "vit_base_patch16_224",
            pretrained=True,
            num_classes=0
        )

        self.fc = nn.Linear(768 * 2, 2)

    def forward(self, img, residual):
        f_img = self.backbone(img)
        f_res = self.backbone(residual)

        fused = torch.cat([f_img, f_res], dim=1)
        return self.fc(fused)


In [10]:
# %%
class RFFRModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.masker = BlockMasker(k=4)
        self.residual_gen = ResidualGenerator()
        self.classifier = DualBranchClassifier()

    def forward(self, x):
        masked = self.masker.mask(x)
        residual = self.residual_gen(x, masked)
        return self.classifier(x, residual)


In [11]:
# %%
class RFFRModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.masker = BlockMasker(k=4)
        self.residual_gen = ResidualGenerator()
        self.classifier = DualBranchClassifier()

    def forward(self, x):
        masked = self.masker.mask(x)
        residual = self.residual_gen(x, masked)
        return self.classifier(x, residual)


In [12]:
# %%
dataset = FFPPDataset(FFPP_REAL_PATH, FFPP_FAKE_PATH)

val_ratio = 0.2
val_size = int(len(dataset) * val_ratio)
train_size = len(dataset) - val_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

print("Train:", len(train_dataset), "Val:", len(val_dataset))


Train: 30602 Val: 7650


## 4. Training Setup


Here we:


- Instantiate the `FFPPDataset` on FF++ training frames.
- Split the data into **train** and **validation** subsets.
- Wrap them in PyTorch **DataLoader** objects with shuffling for training.


> Adjust `BATCH_SIZE`, `EPOCHS`, and paths above to match your GPU memory and dataset locations.


In [13]:
# %%
model = RFFRModel().to(DEVICE)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR
)

In [14]:
# %%
def run_epoch(loader, model, optimizer=None):
    is_train = optimizer is not None
    model.train() if is_train else model.eval()

    total_loss, correct, total = 0.0, 0, 0

    for imgs, labels in tqdm(loader, leave=False):
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)

        if is_train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):
            logits = model(imgs)
            loss = criterion(logits, labels)

            if is_train:
                loss.backward()
                optimizer.step()

        preds = logits.argmax(1)
        total_loss += loss.item()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(loader), correct / total


In [15]:
# %%
best_val = 0.0

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    train_loss, train_acc = run_epoch(train_loader, model, optimizer)
    val_loss, val_acc = run_epoch(val_loader, model)

    print(f"Train | loss: {train_loss:.4f} acc: {train_acc:.4f}")
    print(f"Val   | loss: {val_loss:.4f} acc: {val_acc:.4f}")

    if val_acc > best_val:
        best_val = val_acc
        torch.save(model.state_dict(), "best_rffr.pth")
        print("✔ Saved best model")


In [19]:
# %% =========================
# LOAD BEST MODEL FOR TESTING
# =========================

BEST_MODEL_PATH = "best_rffr.pth"

print("\nLoading best trained model from:", BEST_MODEL_PATH)

# Create fresh model instance
model = RFFRModel().to(DEVICE)

# Load weights
state_dict = torch.load(BEST_MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)

model.eval()

print("✔ Best model loaded successfully")



Loading best trained model from: best_rffr.pth


  state_dict = torch.load(BEST_MODEL_PATH, map_location=DEVICE)


✔ Best model loaded successfully


In [20]:
# %%
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for imgs, labels in val_loader:
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)

        preds = model(imgs).argmax(1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print("Final Val Accuracy:", correct / total)

Final Val Accuracy: 0.9887581699346405


In [5]:
# %% =========================
# Evaluation Utilities (Paper1)
# =============================

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
)
from tqdm import tqdm
import torch.nn.functional as F


@torch.no_grad()
def evaluate(loader, model):
    model.eval()

    all_probs = []
    all_preds = []
    all_labels = []

    for imgs, labels in tqdm(loader, desc="Evaluating", leave=False):
        imgs = imgs.to(DEVICE)

        logits = model(imgs)                  # (B,2)
        probs = F.softmax(logits, dim=1)[:,1]   # fake prob

        preds = (probs >= 0.5).long().cpu()

        all_probs.append(probs.cpu())
        all_preds.append(preds)
        all_labels.append(labels)

    probs = torch.cat(all_probs).numpy()
    preds = torch.cat(all_preds).numpy()
    labels = torch.cat(all_labels).numpy()

    return {
        "acc": accuracy_score(labels, preds),
        "auc": roc_auc_score(labels, probs),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }


NameError: name 'torch' is not defined

## 5. Evaluation & Metrics


We define a reusable `evaluate` helper that computes the main detection metrics:


- **Accuracy (ACC)**
- **Area under ROC curve (AUC)**
- **Precision / Recall**
- **F1 score**


> These metrics are used for FF++ test, JPEG compression robustness, and cross-dataset (DFDC, Celeb-DF) experiments.


In [None]:
# %% =========================
# FF++ TEST SET | 3-RUN AVG
# ============================

print("\n===== FF++ TEST (Paper1) | 3-RUN AVG =====")

FFPP_REAL_PATH = r""
FFPP_FAKE_PATH = r""

NUM_RUNS = 1
all_metrics = []

ffpp_test_dataset = FFPPDataset(FFPP_REAL_PATH, FFPP_FAKE_PATH)
ffpp_test_loader = DataLoader(
    ffpp_test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

for run in range(NUM_RUNS):
    print(f"\nRun {run+1}/{NUM_RUNS}")

    metrics = evaluate(ffpp_test_loader, model)
    all_metrics.append(metrics)
    print(metrics)

# Average
avg = {k: np.mean([m[k] for m in all_metrics]) for k in all_metrics[0]}
print("\nAVG:", avg)



===== FF++ TEST (Paper1) | 3-RUN AVG =====

Run 1/1


                                                               

{'acc': 0.8673898605268984, 'auc': 0.930978795863652, 'precision': 0.9507564806784234, 'recall': 0.8846843615494978, 'f1': 0.9165311904872497}

AVG: {'acc': np.float64(0.8673898605268984), 'auc': np.float64(0.930978795863652), 'precision': np.float64(0.9507564806784234), 'recall': np.float64(0.8846843615494978), 'f1': np.float64(0.9165311904872497)}




In [32]:
# %% =========================
# JPEG COMPRESSION TEST
# =========================

from PIL import Image
import io

class JPEGCompression:
    def __init__(self, quality):
        self.quality = quality

    def __call__(self, img_tensor):

        # UNNORMALIZE
        img = img_tensor.clone()
        img = img * 0.5 + 0.5     # [-1,1] -> [0,1]
        img = img.clamp(0,1)

        img = img.permute(1,2,0).cpu().numpy()
        img = (img * 255).astype(np.uint8)

        pil_img = Image.fromarray(img)
        buffer = io.BytesIO()
        pil_img.save(buffer, format="JPEG", quality=self.quality)
        buffer.seek(0)

        comp = Image.open(buffer).convert("RGB")
        comp = np.array(comp) / 255.0
        comp = torch.tensor(comp).permute(2,0,1).float()

        # RENORMALIZE
        comp = (comp - 0.5) / 0.5

        return comp



print("\n===== JPEG COMPRESSION TEST (Paper1) | 3-RUN AVG =====")

jpeg_qualities = [100, 90, 75, 50, 30]

for q in jpeg_qualities:
    print(f"\n--- JPEG Quality {q} ---")

    class JPEGWrapper(torch.utils.data.Dataset):
        def __init__(self, base_dataset, quality):
            self.base = base_dataset
            self.comp = JPEGCompression(quality)

        def __len__(self):
            return len(self.base)

        def __getitem__(self, idx):
            img, label = self.base[idx]
            img = self.comp(img)
            return img, label

    metrics_runs = []

    for run in range(NUM_RUNS):

        jpeg_dataset = JPEGWrapper(ffpp_test_dataset, q)
        jpeg_loader = DataLoader(
            jpeg_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=0,
        )

        metrics = evaluate(jpeg_loader, model)
        metrics_runs.append(metrics)

    avg = {k: np.mean([m[k] for m in metrics_runs]) for k in metrics_runs[0]}
    print("AVG:", avg)



===== JPEG COMPRESSION TEST (Paper1) | 3-RUN AVG =====

--- JPEG Quality 100 ---


Evaluating:   0%|          | 0/1694 [00:00<?, ?it/s]

                                                               

AVG: {'acc': np.float64(0.866135340565272), 'auc': np.float64(0.9300870109377233), 'precision': np.float64(0.9533009708737864), 'recall': np.float64(0.8804698708751794), 'f1': np.float64(0.9154391198955808)}

--- JPEG Quality 90 ---


                                                               

AVG: {'acc': np.float64(0.8589034019629548), 'auc': np.float64(0.9298905517483075), 'precision': np.float64(0.9602510460251046), 'recall': np.float64(0.864329268292683), 'f1': np.float64(0.9097687588485135)}

--- JPEG Quality 75 ---


                                                               

AVG: {'acc': np.float64(0.8479816987676186), 'auc': np.float64(0.9181844098419774), 'precision': np.float64(0.9525184152896675), 'recall': np.float64(0.8580523672883787), 'f1': np.float64(0.9028210208510237)}

--- JPEG Quality 50 ---


                                                               

AVG: {'acc': np.float64(0.8047376577374363), 'auc': np.float64(0.9022546196316853), 'precision': np.float64(0.9588908070781182), 'recall': np.float64(0.7968974175035868), 'f1': np.float64(0.8704211557296768)}

--- JPEG Quality 30 ---


                                                               

AVG: {'acc': np.float64(0.7569921039037709), 'auc': np.float64(0.8818722029982602), 'precision': np.float64(0.9643152546378353), 'recall': np.float64(0.731796987087518), 'f1': np.float64(0.8321182768289574)}




In [None]:
# %% =========================
# DFDC CROSS DATASET TEST
# ============================

DFDC_REAL_PATH = r""
DFDC_FAKE_PATH = r""
print("\n===== DFDC CROSS-DATASET (Paper1) | 3-RUN AVG =====")

dfdc_dataset = FFPPDataset(DFDC_REAL_PATH, DFDC_FAKE_PATH)
dfdc_loader = DataLoader(
    dfdc_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

metrics_runs = []

for run in range(NUM_RUNS):
    print(f"Run {run+1}/{NUM_RUNS}")
    metrics = evaluate(dfdc_loader, model)
    metrics_runs.append(metrics)

avg = {k: np.mean([m[k] for m in metrics_runs]) for k in metrics_runs[0]}
print("\nAVG:", avg)



===== DFDC CROSS-DATASET (Paper1) | 3-RUN AVG =====
Run 1/1


                                                                    


AVG: {'acc': np.float64(0.7222571468146995), 'auc': np.float64(0.6375245349462827), 'precision': np.float64(0.8043250649535307), 'recall': np.float64(0.8506028378489214), 'f1': np.float64(0.826816904402825)}


In [None]:
# %% =========================
# CELEB-DF CROSS DATASET TEST
# =========================

CELEB_REAL_PATH = r""
CELEB_FAKE_PATH = r""

print("\n===== CELEB-DF CROSS-DATASET (Paper1) | 3-RUN AVG =====")

celeb_dataset = FFPPDataset(CELEB_REAL_PATH, CELEB_FAKE_PATH)
celeb_loader = DataLoader(
    celeb_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

metrics_runs = []

for run in range(NUM_RUNS):
    print(f"Run {run+1}/{NUM_RUNS}")
    metrics = evaluate(celeb_loader, model)
    metrics_runs.append(metrics)

avg = {k: np.mean([m[k] for m in metrics_runs]) for k in metrics_runs[0]}
print("\nAVG:", avg)



===== CELEB-DF CROSS-DATASET (Paper1) | 3-RUN AVG =====
Run 1/1


                                                                


AVG: {'acc': np.float64(0.8357267369469578), 'auc': np.float64(0.7835831147047972), 'precision': np.float64(0.9361126150514347), 'recall': np.float64(0.8773006134969326), 'f1': np.float64(0.9057529288503667)}


