# Scientific Image Forgery Detection — Training ➜ Inference ➜ Submission (No EDA)

This notebook trains a U-Net++ segmentation model (EfficientNet-B5 encoder) to segment copy–move forgeries in scientific images, then runs inference on the test images and writes a submission file.
- **No EDA / visualization** to save runtime.
- Assumes the following layout: `train_images/authentic/`, `train_images/forged/`, `train_masks/` (a flat folder with exactly one `.npy` mask per forged image), and `test_images/`.


In [None]:
!uv pip install --system --no-index --find-links='/kaggle/input/sif-packages/whls/' \
  "numpy==1.26.4" "scipy==1.11.4" \
  "torch==2.4.1" "torchvision==0.19.1" \
  "timm==0.9.2" \
  "efficientnet-pytorch==0.7.1" "pretrainedmodels==0.7.4" "tqdm" \
  "opencv-python-headless>=4.10.0.84,<5.0.0" \
  "albumentations==1.4.6" \
  "segmentation-models-pytorch==0.3.3"

In [None]:
# ===== Imports & Paths =====
import os, warnings, random, math, gc
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import cv2

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import segmentation_models_pytorch as smp

import torch.optim as optim
from segmentation_models_pytorch import UnetPlusPlus
from segmentation_models_pytorch.losses import FocalLoss
from torch.cuda.amp import autocast, GradScaler
from albumentations import (
    GaussNoise, RandomResizedCrop, Affine, ShiftScaleRotate, CLAHE
)
import torch.nn.functional as F
from skimage.morphology import remove_small_objects



# Root of the competition dataset and the three sub-folders used below.
DATA_DIR = Path("/kaggle/input/recodai-luc-scientific-image-forgery-detection")
TRAIN_IMG_DIR, MASK_DIR, TEST_IMG_DIR = (
    DATA_DIR / sub_dir for sub_dir in ("train_images", "train_masks", "test_images")
)

# Fail fast if any folder required for training is missing.
for _required_dir, _missing_msg in (
    (TRAIN_IMG_DIR / "authentic", "Missing train_images/authentic"),
    (TRAIN_IMG_DIR / "forged", "Missing train_images/forged"),
    (MASK_DIR, "Missing train_masks"),
):
    assert _required_dir.exists(), _missing_msg

def read_image(path: str):
    """Read the image at ``path`` with OpenCV in colour mode.

    Raises:
        FileNotFoundError: if OpenCV returns ``None`` for the path
            (file missing or not decodable).
    """
    loaded = cv2.imread(path, cv2.IMREAD_COLOR)
    if loaded is not None:
        return loaded
    raise FileNotFoundError(path)

print("Ready. Folders checked.")


In [None]:
# ===== Indexing (attach masks only to forged) =====
def list_image_paths():
    """Collect the training files as three sorted lists of ``Path`` objects.

    Returns:
        ``(authentic_pngs, forged_pngs, mask_npys)`` where the images come from
        ``TRAIN_IMG_DIR/authentic`` and ``TRAIN_IMG_DIR/forged`` and the masks
        from ``MASK_DIR``.
    """
    def _sorted_glob(folder, pattern):
        return sorted(folder.glob(pattern))

    return (
        _sorted_glob(TRAIN_IMG_DIR / "authentic", "*.png"),
        _sorted_glob(TRAIN_IMG_DIR / "forged", "*.png"),
        _sorted_glob(MASK_DIR, "*.npy"),
    )

auth_paths, forg_paths, mask_paths = list_image_paths()

# One row per mask file, keyed by the file stem.
df_mask = pd.DataFrame({
    "case_id": [m.stem for m in mask_paths],
    "mask_path": [str(m) for m in mask_paths],
})

# Authentic images never carry a mask.
df_auth = pd.DataFrame({
    "case_id": [p.stem for p in auth_paths],
    "label": ["authentic"] * len(auth_paths),
    "img_path": [str(p) for p in auth_paths],
}).assign(mask_path=None, has_mask=False)

# Forged images get their mask attached by case_id (left join keeps every image).
df_forg = pd.DataFrame({
    "case_id": [p.stem for p in forg_paths],
    "label": ["forged"] * len(forg_paths),
    "img_path": [str(p) for p in forg_paths],
}).merge(df_mask, on="case_id", how="left")
df_forg["has_mask"] = df_forg["mask_path"].notna()

df = pd.concat([df_auth, df_forg], ignore_index=True)

# Basic assertions for integrity
num_masks = len(df_mask)
num_forged = df["label"].eq("forged").sum()
auth_with_mask = df_auth["has_mask"].sum()
forged_without_mask = (df_forg["label"].eq("forged") & ~df_forg["has_mask"]).sum()

print("Counts — authentic:", (df.label=="authentic").sum(), "| forged:", num_forged, "| masks:", num_masks)
assert num_masks == num_forged, "Number of masks must equal number of forged images."
assert auth_with_mask == 0, "Authentic images must have no masks."
assert forged_without_mask == 0, "All forged images must have exactly one mask."


In [None]:
# ===== Config =====
# Single source of hyper-parameters read by the dataset, model, training loop
# and inference cells below.
CFG = {
    "seed": 42,  # seeds random / numpy / torch and the StratifiedKFold split
    "img_size": 512,  # images are resized (longest side) and padded to img_size x img_size
    "epochs": 20,  # upper bound; early stopping may end training sooner
    "train_bs": 8,  # DataLoader batch size for training
    "valid_bs": 8,  # DataLoader batch size for validation
    "lr": 5e-4,  # AdamW learning rate
    "weight_decay": 1e-5,  # AdamW weight decay
    "encoder": "efficientnet-b5",  # encoder_name passed to the segmentation model
    "pretrained": "imagenet",  # encoder_weights passed to the segmentation model
    "loss_dice_weight": 0.7,  # weight of the Dice term in mix_loss; focal gets 1 - weight
    "focal_alpha": 0.25,  # alpha passed to FocalLoss
    "num_workers": 4,  # DataLoader worker processes
    "accum_steps": 2,  # gradient-accumulation window used by the training loop
    "patience": 5,  # epochs without val-Dice improvement before early stopping
    "save_dir": "/kaggle/working",  # where the best checkpoint and submission.csv are written
}
# Seed the three RNG sources used in this notebook.
random.seed(CFG["seed"]); np.random.seed(CFG["seed"]); torch.manual_seed(CFG["seed"])
print(CFG)


In [None]:
# ===== Dataset & Augmentations =====
def rle_encode(mask: np.ndarray):
    """Run-length encode a binary mask in column-major order.

    Returns a space-separated string of ``start length`` pairs, where starts
    are 1-based positions in the column-major flattened mask. An all-zero mask
    yields an empty string.
    """
    column_major = mask.ravel(order="F")
    # Zero sentinels at both ends so every run has a rising and a falling edge.
    padded = np.concatenate([[0], column_major, [0]])
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    starts = edges[::2]
    lengths = edges[1::2] - starts
    interleaved = np.stack([starts, lengths], axis=1).ravel()
    return " ".join(map(str, interleaved))

class ForgeryDataset(Dataset):
    """Image / binary-mask dataset for forgery segmentation.

    Each item is ``(image_tensor, mask_tensor, case_id)``. The mask tensor has a
    leading channel axis and is float. Rows whose ``label`` is ``"forged"`` load
    their mask from ``mask_path``; every other row gets an all-zero mask.

    Args:
        frame: DataFrame with at least ``img_path``, ``label``, ``case_id`` and
            (for forged rows) ``mask_path`` columns.
        img_size: side length of the square the image is resized/padded to.
        aug: use the augmenting training pipeline when True, otherwise the
            deterministic validation pipeline.
    """

    def __init__(self, frame, img_size=512, aug=True):
        self.frame = frame.reset_index(drop=True)
        self.img_size = img_size
        self.aug = aug
        self.tfm_train = A.Compose([
            A.LongestMaxSize(max_size=img_size),
            A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.RandomRotate90(p=0.5),
            A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=30, p=0.5),
            A.GaussNoise(var_limit=(10, 50), p=0.3),
            A.CLAHE(clip_limit=4.0, p=0.3),
            A.ColorJitter(p=0.5, brightness=0.2, contrast=0.2, saturation=0.15, hue=0.1),
            A.Normalize(),
            ToTensorV2(),
        ])
        self.tfm_valid = A.Compose([
            A.LongestMaxSize(max_size=img_size),
            A.PadIfNeeded(min_height=img_size, min_width=img_size, border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0),
            A.Normalize(),
            ToTensorV2(),
        ])

    def __len__(self): return len(self.frame)

    @staticmethod
    def _prepare_mask(raw, height, width):
        """Turn a loaded mask array into a ``(height, width)`` uint8 0/1 mask.

        * 3-D channel-last ``(height, width, C)``: the first channel is kept.
        * 3-D channel-first ``(N, height, width)``: the N masks are merged with
          a pixel-wise union, so no forged region is dropped.
        * Any other 3-D layout: first slice of the last axis, then the 2-D
          fallbacks below apply.
        * 2-D masks stored transposed are flipped back; anything else that
          still mismatches is nearest-neighbour resized to the image size.
        """
        m = (np.asarray(raw) > 0).astype(np.uint8)
        if m.ndim == 3:
            if m.shape[:2] == (height, width):
                m = m[..., 0]
            elif m.shape[1:] == (height, width):
                # ``any`` also copes with an empty stack (N == 0).
                m = m.any(axis=0).astype(np.uint8)
            else:
                m = m[..., 0]
        if m.shape != (height, width):
            if m.shape[::-1] == (height, width):
                m = m.T
            else:
                m = cv2.resize(m, (width, height), interpolation=cv2.INTER_NEAREST)
        return m

    def __getitem__(self, idx):
        r = self.frame.iloc[idx]
        # NOTE(review): cv2 loads channels in BGR order and the image is fed to
        # Normalize() as-is; the inference cell does the same, so keep both in sync.
        img = cv2.imread(r["img_path"], cv2.IMREAD_COLOR)
        if img is None: raise FileNotFoundError(r["img_path"])
        H, W = img.shape[:2]

        if r["label"] == "forged":
            m = self._prepare_mask(np.load(r["mask_path"]), H, W)
        else:
            m = np.zeros((H, W), dtype=np.uint8)

        tfm = self.tfm_train if self.aug else self.tfm_valid
        out = tfm(image=img, mask=m)
        img_t = out["image"]
        mask_t = out["mask"][None]  # add the channel axis expected by the loss
        return img_t, mask_t.float(), r["case_id"]


In [None]:
# ===== Train/Val Split & Loaders =====
from sklearn.model_selection import StratifiedKFold

# Stratify on the authentic/forged label so both splits keep the class ratio.
df["strat"] = df["label"].values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=CFG["seed"])

# Only the first of the five folds is used as the hold-out split.
fold = 0
tr_idx, va_idx = next(iter(skf.split(df, df["strat"])))
train_df = df.iloc[tr_idx].reset_index(drop=True)
valid_df = df.iloc[va_idx].reset_index(drop=True)
print(f"Using fold {fold} — train={len(train_df)} | valid={len(valid_df)}")

train_ds = ForgeryDataset(train_df, img_size=CFG["img_size"], aug=True)
valid_ds = ForgeryDataset(valid_df, img_size=CFG["img_size"], aug=False)

_loader_common = dict(num_workers=CFG["num_workers"], pin_memory=True)
train_loader = DataLoader(train_ds, batch_size=CFG["train_bs"], shuffle=True, **_loader_common)
valid_loader = DataLoader(valid_ds, batch_size=CFG["valid_bs"], shuffle=False, **_loader_common)

len(train_loader), len(valid_loader)


In [None]:
# ===== Model, Loss, Optimizer, Scheduler =====
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training model: U-Net++ with the encoder and pretrained weights named in CFG,
# 3 input channels and a single output channel of logits.
model = UnetPlusPlus(
    encoder_name=CFG["encoder"],
    encoder_weights=CFG["pretrained"],
    in_channels=3,
    classes=1
).to(device)

# Loss building blocks. mix_loss below combines `dice` and `focal`;
# `bce` is instantiated but not used by mix_loss.
bce = nn.BCEWithLogitsLoss()
dice = smp.losses.DiceLoss(mode="binary")
focal = FocalLoss(mode="binary", alpha=CFG["focal_alpha"])

def mix_loss(logits, targets):
    """Weighted sum of Dice and Focal loss on raw logits.

    The Dice term is weighted by ``CFG["loss_dice_weight"]`` and the Focal term
    by the complement of that weight.
    """
    dice_weight = CFG["loss_dice_weight"]
    dice_term = dice(logits, targets)
    focal_term = focal(logits, targets)
    return dice_weight * dice_term + (1 - dice_weight) * focal_term

# AdamW over all model parameters with the learning rate / weight decay from CFG.
optimizer = optim.AdamW(model.parameters(), lr=CFG["lr"], weight_decay=CFG["weight_decay"])
# mode='max' because the training loop steps this scheduler with validation Dice;
# the LR is halved after 3 epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, verbose=True)
# Loss scaling for mixed precision; a no-op when CUDA is unavailable.
scaler = GradScaler(enabled=torch.cuda.is_available())


# ===== Model (inference-only override) =====
# This section replaces `model` with a previously trained U-Net (EfficientNet-B3)
# loaded from a Kaggle input dataset and switches autograd off globally.
#
# It used to run unconditionally, which broke the train -> inference flow:
#   * the optimizer created above still pointed at the U-Net++ parameters,
#   * torch.set_grad_enabled(False) makes loss.backward() fail in the training loop.
# It is therefore gated behind INFERENCE_ONLY, which defaults to False so the
# notebook trains the U-Net++ defined above.
#
# When setting INFERENCE_ONLY = True, skip the training cell. NOTE(review): the
# inference cell later reloads CFG["save_dir"]/best_unet.pth into `model`, so
# point that path at weights matching this architecture — confirm before use.
import os
import torch
import torch.nn as nn
import segmentation_models_pytorch as smp
from collections import OrderedDict

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INFERENCE_ONLY = False

# Path to your best weights in Kaggle Input (update if your filename/folder differs)
BEST_PTH = "/kaggle/input/unet-v1/best_unet.pth"  # <- change if needed

if INFERENCE_ONLY:
    # Build the model WITHOUT downloading encoder weights
    model = smp.Unet(
        encoder_name="efficientnet-b3",
        encoder_weights=None,       # prevents a weight download (no internet on Kaggle)
        in_channels=3,
        classes=1
    ).to(device)

    # Load checkpoint robustly: accept a bare state_dict or a dict that wraps
    # it under "state_dict" / "model" (checked in that order).
    ckpt = torch.load(BEST_PTH, map_location=device)
    state = ckpt
    if isinstance(ckpt, dict):
        for wrapper_key in ("state_dict", "model"):
            if wrapper_key in ckpt:
                state = ckpt[wrapper_key]
                break

    # Strip a leading 'module.' (DataParallel) and then a leading 'model.' prefix.
    new_state = OrderedDict()
    for k, v in state.items():
        nk = k
        if k.startswith("module."):
            nk = k[len("module."):]
        if nk.startswith("model."):
            nk = nk[len("model."):]
        new_state[nk] = v

    # strict=False so a partially matching checkpoint still loads; report the gaps.
    missing, unexpected = model.load_state_dict(new_state, strict=False)
    if missing or unexpected:
        print("Loaded with mismatched keys:\n  missing:", missing, "\n  unexpected:", unexpected)

    model.eval()
    # Global switch: no gradients are tracked anywhere after this line.
    torch.set_grad_enabled(False)

# NOTE: Optimizer/scheduler/scaler are unnecessary for inference.

In [None]:
# ===== Training Loop =====
import numpy as np
def dice_coef_np(y_true, y_pred, eps=1e-7):
    """Dice coefficient between two arrays of equal size.

    Both inputs are cast to float32 and flattened. ``eps`` is added to the
    numerator and denominator, so two all-zero arrays score 1.
    """
    truth = y_true.astype(np.float32).ravel()
    guess = y_pred.astype(np.float32).ravel()
    overlap = np.sum(truth * guess)
    total = truth.sum() + guess.sum()
    return (2 * overlap + eps) / (total + eps)

def validate_epoch():
    """Evaluate ``model`` on ``valid_loader`` without gradients.

    Returns:
        ``(mean batch loss, mean per-image Dice)``, where predictions are the
        sigmoid probabilities thresholded at 0.5.
    """
    model.eval()
    batch_losses = []
    image_dices = []
    with torch.no_grad():
        for batch_imgs, batch_masks, _case_ids in valid_loader:
            batch_imgs = batch_imgs.to(device)
            batch_masks = batch_masks.to(device)
            batch_logits = model(batch_imgs)
            batch_losses.append(mix_loss(batch_logits, batch_masks).item())

            pred_bin = (torch.sigmoid(batch_logits).cpu().numpy() > 0.5).astype(np.uint8)
            true_bin = batch_masks.cpu().numpy().astype(np.uint8)
            image_dices.extend(
                dice_coef_np(truth, pred) for pred, truth in zip(pred_bin, true_bin)
            )
    return np.mean(batch_losses), np.mean(image_dices)

# Train with mixed precision and gradient accumulation, validate after every
# epoch, keep the checkpoint with the best validation Dice and stop early after
# CFG["patience"] epochs without improvement.
best_dice = -1.0
patience_counter = 0
num_batches = len(train_loader)
for epoch in range(1, CFG["epochs"] + 1):
    model.train()
    train_losses = []  # un-normalised loss of every micro-batch in this epoch

    # Gradients are cleared once here and then only after each optimizer step.
    # Clearing them at the top of every iteration would discard everything
    # accumulated so far and silently disable gradient accumulation.
    optimizer.zero_grad(set_to_none=True)
    for step, (imgs, masks, _) in enumerate(train_loader, start=1):
        imgs, masks = imgs.to(device), masks.to(device)
        with autocast(enabled=torch.cuda.is_available()):
            logits = model(imgs)
            batch_loss = mix_loss(logits, masks)
        # Divide by the window size so the summed gradients average over the
        # accumulated micro-batches (a shorter final window is still divided
        # by accum_steps, i.e. slightly down-weighted).
        scaler.scale(batch_loss / CFG["accum_steps"]).backward()
        train_losses.append(batch_loss.item())

        # Step at the end of every accumulation window and on the last batch.
        if step % CFG["accum_steps"] == 0 or step == num_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

    val_loss, val_dice = validate_epoch()
    scheduler.step(val_dice)  # Step based on Dice
    print(f"Epoch {epoch:02d}/{CFG['epochs']} | train_loss={np.mean(train_losses):.4f} | val_loss={val_loss:.4f} | val_dice={val_dice:.4f}")

    if val_dice > best_dice:
        best_dice = val_dice
        torch.save(model.state_dict(), f"{CFG['save_dir']}/best_unet.pth")
        print(f"  ↳ Saved new best (val_dice={best_dice:.4f})")
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= CFG["patience"]:
            print(f"Early stopping at epoch {epoch}")
            break

In [None]:
import json
import numba

@numba.jit(nopython=True)
def _rle_encode_jit(x: np.ndarray, fg_val: int = 1) -> list:
    """
    Official Numba-jitted RLE encoder from competition metric.
    Encodes in TRANSPOSED order (column-major).

    Args:
        x: 2-D mask array.
        fg_val: value that marks foreground pixels.

    Returns:
        Flat list ``[start1, length1, start2, length2, ...]`` where each start
        is the 1-based position of a run in the transposed, flattened mask.
        An array without foreground pixels yields an empty list.
    """
    # Flat indices of all foreground pixels, in column-major order.
    dots = np.where(x.T.flatten() == fg_val)[0]
    run_lengths = []
    # -2 guarantees that index 0 is treated as the start of a new run.
    prev = -2
    for b in dots:
        if b > prev + 1:
            # Gap since the previous foreground pixel: open a new (start, 0) run.
            run_lengths.extend((b + 1, 0))
        # Extend the length of the current run.
        run_lengths[-1] += 1
        prev = b
    return run_lengths

def rle_encode(masks: list, fg_val: int = 1) -> str:
    """
    Official RLE encoding function from competition metric.
    Adapted from contrails RLE https://www.kaggle.com/code/inversion/contrails-rle-submission

    Args:
        masks: list of numpy arrays of shape (height, width); ``fg_val`` marks
            the mask, anything else is background
        fg_val: foreground value (default 1)

    Returns:
        One JSON-encoded run-length list per mask, joined with semicolons.
        Each list has the form "[start1, length1, start2, length2, ...]".
    """
    encoded_masks = (json.dumps(_rle_encode_jit(single, fg_val)) for single in masks)
    return ';'.join(encoded_masks)


In [None]:
# ===== Inference + Submission =====
from skimage.morphology import remove_small_objects
import numpy as np
import cv2
import torch
# `A` is the conventional alias of the albumentations package (it is not a name
# inside it), and ToTensorV2 lives in albumentations.pytorch, not torch.utils.data.
import albumentations as A
from albumentations.pytorch import ToTensorV2
from pathlib import Path

# Index the test images: one row per PNG, keyed by the file stem.
TEST_IMG_DIR = Path("/kaggle/input/recodai-luc-scientific-image-forgery-detection/test_images")
test_imgs = sorted(TEST_IMG_DIR.glob("*.png"))
test_df = pd.DataFrame({"case_id": [p.stem for p in test_imgs], "img_path": [str(p) for p in test_imgs]})
print("Test images:", len(test_df))

# Deterministic preprocessing: resize the longest side to img_size, pad to a
# square with zeros, normalise, convert to a CHW tensor.
tfm_valid = A.Compose([
    A.LongestMaxSize(max_size=CFG["img_size"]),
    A.PadIfNeeded(min_height=CFG["img_size"], min_width=CFG["img_size"], border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0),
    A.Normalize(),
    ToTensorV2(),
])

# Reload the best checkpoint written by the training loop into `model`.
ckpt = Path(f"{CFG['save_dir']}/best_unet.pth")
#ckpt = Path("/kaggle/input/unet-v1/best_unet.pth")
assert ckpt.exists(), "No model checkpoint found. Train first."
model.load_state_dict(torch.load(ckpt, map_location=device))
model.eval()

def _unpad_prob(prob, src_h, src_w):
    """Crop the letterbox padding off a square probability map.

    Recomputes where an image of size ``(src_h, src_w)`` sits inside the padded
    square after LongestMaxSize + PadIfNeeded.
    NOTE(review): assumes the resized sides are ``round(side * scale)`` and the
    image is centred with the smaller half of the padding on the top/left —
    confirm against the installed albumentations version.
    """
    size_h, size_w = prob.shape[:2]
    scale = max(size_h, size_w) / float(max(src_h, src_w))
    new_h = min(size_h, max(1, int(round(src_h * scale))))
    new_w = min(size_w, max(1, int(round(src_w * scale))))
    top = (size_h - new_h) // 2
    left = (size_w - new_w) // 2
    return prob[top:top + new_h, left:left + new_w]


def predict_mask(img_bgr, tta=True):
    """Predict a forgery probability map at the image's original resolution.

    Args:
        img_bgr: image array of shape (H, W, 3) as loaded by ``cv2.imread``.
        tta: when True, average the prediction over the identity, a horizontal
            flip, a vertical flip and a single 90-degree rotation.

    Returns:
        Float array of shape (H, W) with the averaged sigmoid probabilities.

    Raises:
        ValueError: if ``img_bgr`` is None.
    """
    if img_bgr is None:
        raise ValueError("predict_mask received an empty image")
    H0, W0 = img_bgr.shape[:2]

    # Each view is (forward op on the image, exact inverse on the probability map).
    # The ops are deterministic so the inverse is always the right one; a random
    # rotation cannot be undone with a fixed k.
    views = [(lambda a: a, lambda p: p)]
    if tta:
        views += [
            (np.fliplr, np.fliplr),
            (np.flipud, np.flipud),
            (lambda a: np.rot90(a, k=1), lambda p: np.rot90(p, k=-1)),
        ]

    probs = []
    for forward, inverse in views:
        # Flips/rotations return strided views; make them contiguous for cv2.
        view = np.ascontiguousarray(forward(img_bgr))
        view_h, view_w = view.shape[:2]
        img_t = tfm_valid(image=view)["image"].unsqueeze(0).to(device)
        with torch.no_grad():
            prob = torch.sigmoid(model(img_t))[0, 0].float().cpu().numpy()
        # Remove the padding first; resizing the padded square straight back to
        # (W0, H0) would squash the mask of any non-square image.
        prob = _unpad_prob(prob, view_h, view_w)
        prob = np.ascontiguousarray(inverse(prob))
        probs.append(cv2.resize(prob, (W0, H0), interpolation=cv2.INTER_LINEAR))
    return np.mean(probs, axis=0)  # Average TTA

# Predict every test image, threshold, drop tiny components and write the
# submission: "authentic" when nothing substantial remains, otherwise the
# RLE-encoded mask.
SUB = []
thr = 0.45  # probability threshold for a forged pixel
min_area = 50  # minimum component size (pixels) kept in the mask
for case_id, img_path in zip(test_df["case_id"], test_df["img_path"]):
    # read_image raises FileNotFoundError for unreadable files instead of
    # letting a None image fail later with an opaque AttributeError.
    img = read_image(img_path)
    prob = predict_mask(img)
    mask = remove_small_objects(prob > thr, min_size=min_area, connectivity=1).astype(np.uint8)
    if mask.sum() < min_area:
        SUB.append((case_id, "authentic"))
    else:
        SUB.append((case_id, rle_encode([mask])))

sub_df = pd.DataFrame(SUB, columns=["case_id","annotation"])
sub_path = f"{CFG['save_dir']}/submission.csv"
sub_df.to_csv(sub_path, index=False)
print("Saved submission:", sub_path)
print(sub_df.head(5).to_string(index=False))