In [1]:
import os
import random
import shutil
from pathlib import Path
from PIL import Image, ImageDraw, ImageEnhance, ImageFont
from tqdm import tqdm
from Forgery import make_forgery

In [2]:
# Root folder of the RVL-CDIP source images. The RVLCDIP_ROOT environment
# variable overrides the default, so the notebook is not tied to one machine's
# drive layout; without it the original location is used unchanged.
RVLCDIP_ROOT = os.environ.get("RVLCDIP_ROOT", "E:/Thesis/rvl-cdip/images/")
OUTPUT_ROOT = "./dataset"  # destination folder for the generated dataset
SAMPLE_SIZE = 2000  # upper bound on the number of source images sampled
SEED = 42

# Seed Python's global RNG so the random sampling done later is repeatable.
random.seed(SEED)

In [3]:
# Collect every .tif file below the dataset root. The result is sorted because
# directory traversal order is filesystem-dependent: a seeded random.sample()
# only reproduces the same subset when the candidate list has a stable order.
all_images = sorted(Path(RVLCDIP_ROOT).rglob("*.tif"))
print(f"Found {len(all_images):,} images in dataset")

Found 400,000 images in dataset


In [4]:
# Draw the random subset; the size is capped at the number of images found so
# random.sample() is never asked for more items than exist.
n_to_sample = min(SAMPLE_SIZE, len(all_images))
sampled_images = random.sample(all_images, n_to_sample)

# Pre-create one output folder per (subset, label) pair; folders that already
# exist are left as they are.
for subset in ("train", "val", "test"):
    for label in ("authentic", "forged"):
        Path(OUTPUT_ROOT, subset, label).mkdir(parents=True, exist_ok=True)

In [5]:
# Split sizes: 70% train, 15% validation (both truncated to whole images);
# whatever remains after those two slices becomes the test set.
n_sampled = len(sampled_images)
train_split = int(0.7 * n_sampled)
val_split = int(0.15 * n_sampled)
val_end = train_split + val_split

# Consecutive, non-overlapping slices of the sampled list, keyed by subset name.
splits = dict(
    train=sampled_images[:train_split],
    val=sampled_images[train_split:val_end],
    test=sampled_images[val_end:],
)


In [6]:
def generate_forgery(img: Image.Image) -> Image.Image:
    """
    Create a very simple synthetic forgery of an image.

    Steps:
    - convert the input to RGB (``convert`` returns a copy, so the drawing
      below does not touch the caller's image)
    - draw a red "Edited" label at a random position
    - apply a random global brightness factor, then a random contrast factor

    NOTE(review): the processing loop in this notebook calls ``make_forgery``
    imported from ``Forgery``, not this function — confirm whether this helper
    is still needed.

    Args:
        img: Source image.

    Returns:
        A new RGB image with the overlay and the brightness/contrast changes.
    """
    img = img.convert("RGB")
    draw = ImageDraw.Draw(img)
    w, h = img.size

    # Try loading a scalable font sized relative to the image height. Fall back
    # to Pillow's built-in font when the font file cannot be loaded or the
    # computed size is rejected (e.g. it rounds down to 0 on a tiny image).
    # Only these expected failures are caught, so KeyboardInterrupt and
    # unrelated errors are no longer silently swallowed.
    try:
        font = ImageFont.truetype("arial.ttf", size=int(h * 0.03))
    except (OSError, ValueError):
        font = ImageFont.load_default()

    # Overlay "Edited" somewhere; the 100 x 30 pixel margin keeps the anchor
    # away from the right/bottom edges, and max(0, ...) keeps the range valid
    # for images smaller than that margin.
    text = "Edited"
    x = random.randint(0, max(0, w - 100))
    y = random.randint(0, max(0, h - 30))
    draw.text((x, y), text, fill=(255, 0, 0), font=font)

    # Add some global brightness/contrast noise (brightness first, then contrast).
    enhancer = ImageEnhance.Brightness(img)
    img = enhancer.enhance(random.uniform(0.8, 1.2))
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(random.uniform(0.8, 1.3))

    return img

In [7]:
# For every sampled image: write an RGB PNG copy to <subset>/authentic and the
# output of make_forgery() to <subset>/forged.
# NOTE(review): output names are built from path.stem only, so two source
# files with the same stem in one subset would overwrite each other — confirm
# stems are unique across the dataset folders.
for subset, images in splits.items():
    for path in tqdm(images, desc=f"Processing {subset}"):
        try:
            # The context manager closes the source file handle as soon as the
            # pixel data has been converted, instead of leaving it open.
            with Image.open(path) as src:
                img = src.convert("RGB")
        except Exception as e:
            # Best effort: unreadable inputs are reported and skipped.
            print(f"Skipping {path}: {e}")
            continue

        # Save original (real)
        base_name = path.stem + ".png"
        real_path = os.path.join(OUTPUT_ROOT, subset, "authentic", base_name)
        img.save(real_path)

        # Generate and save forged version
        forged_img = make_forgery(img)
        forged_path = os.path.join(OUTPUT_ROOT, subset, "forged", path.stem + "_forged.png")
        forged_img.save(forged_path)

print("✅ Dataset successfully created!")
print(f"Saved under: {OUTPUT_ROOT}")

Processing train:   0%|          | 0/1400 [00:00<?, ?it/s]

Processing train: 100%|██████████| 1400/1400 [08:21<00:00,  2.79it/s]
Processing val: 100%|██████████| 300/300 [01:38<00:00,  3.04it/s]
Processing test: 100%|██████████| 300/300 [01:33<00:00,  3.21it/s]

✅ Dataset successfully created!
Saved under: ./dataset



