In [5]:
from pathlib import Path
import numpy as np
from tqdm import tqdm
import cv2
import random       

# ---- Configuration ----
# Sampling limits
NUM_FRAMES = 32         # frames extracted from each video
MAX_PER_CLASS = 1000    # upper bound on videos used per class (real / fake)

# Locations
DATA_ROOT = Path("FaceForensics++_C23")   # dataset root; the cells below read its sub-folders
FRAMES_ROOT = Path("frames")              # extracted frames are written below this folder

# Make sure the output root exists before any extraction runs.
FRAMES_ROOT.mkdir(exist_ok=True, parents=True)

def extract_frames(video_path, out_dir, num_frames=32, img_size=224):
    """Sample frames from a video and save them as square JPEG files.

    The video is divided into ``num_frames`` consecutive segments that together
    cover every frame, and one frame index is drawn at random (global
    ``np.random`` state) from each segment. Each frame is resized to
    ``img_size`` x ``img_size`` and written to ``out_dir`` as ``000.jpg``,
    ``001.jpg``, ... A frame that cannot be decoded is replaced by a black
    image so that the output always contains ``num_frames`` files.

    Args:
        video_path: Path of the video file to read.
        out_dir: ``pathlib.Path`` of the output folder; created only once the
            video has been validated, so unreadable videos leave no empty folder.
        num_frames: Number of frames to sample.
        img_size: Side length, in pixels, of the saved frames.

    Returns:
        ``False`` if the video cannot be opened, reports no frames, or at least
        one frame could not be written; ``True`` otherwise.
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if not cap.isOpened() or total <= 0:
            return False

        out_dir.mkdir(parents=True, exist_ok=True)
        all_written = True

        for i in range(num_frames):
            # Segment i covers frames [start, end]. The integer arithmetic keeps
            # every index inside [0, total - 1], even when the video has fewer
            # frames than num_frames (frames are then repeated instead of
            # reading past the end of the video).
            start = (i * total) // num_frames
            end = ((i + 1) * total) // num_frames - 1
            idx = int(np.random.randint(start, end + 1)) if start < end else start

            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()

            if not ret or frame is None:
                # Decoding failed: keep the slot with a black placeholder.
                frame = np.zeros((img_size, img_size, 3), dtype=np.uint8)
            else:
                # Keep OpenCV's BGR channel order: cv2.imwrite expects BGR, so
                # converting to RGB here would swap red and blue in the file.
                frame = cv2.resize(frame, (img_size, img_size), interpolation=cv2.INTER_AREA)

            if not cv2.imwrite(str(out_dir / f"{i:03d}.jpg"), frame):
                all_written = False

        return all_written
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()


In [6]:
# ---- REAL VIDEOS ONLY ----
real_dir = DATA_ROOT / "original"
real_paths = sorted(real_dir.glob("*.mp4"))

print("Found REAL videos:", len(real_paths), "in", real_dir)

# limit per class (seeded, so the same subset is selected on every run)
if len(real_paths) > MAX_PER_CLASS:
    subset_rng = np.random.default_rng(42)
    idx = subset_rng.choice(len(real_paths), MAX_PER_CLASS, replace=False)
    real_paths = [real_paths[i] for i in sorted(idx)]
    print("Using REAL videos:", len(real_paths))

# Extract frames for REAL videos, remembering the ones that could not be decoded
real_failed = set()
for vp in tqdm(real_paths, desc="Extracting REAL frames"):
    out_dir = FRAMES_ROOT / "real" / vp.stem
    if not extract_frames(vp, out_dir, NUM_FRAMES):
        real_failed.add(vp)

# Drop failed videos so later cells never reference frame folders without frames.
if real_failed:
    print(f"⚠️ Skipped {len(real_failed)} REAL videos that could not be extracted")
    real_paths = [vp for vp in real_paths if vp not in real_failed]

print("✅ Finished extracting REAL frames")


Found REAL videos: 1000 in FaceForensics++_C23/original


Extracting REAL frames: 100%|██████████| 1000/1000 [38:58<00:00,  2.34s/it]

✅ Finished extracting REAL frames





In [7]:
FAKE_FOLDERS = [
    "DeepFakeDetection",
    "Deepfakes",
    "Face2Face",
    "FaceShifter",
    "FaceSwap",
    "NeuralTextures",
]

# collect ALL fake paths from all 6 folders
fake_paths_all = []

for folder_name in FAKE_FOLDERS:
    folder = DATA_ROOT / folder_name
    paths  = sorted(folder.glob("*.mp4"))
    print(f"{folder_name}: found {len(paths)} videos in {folder}")
    fake_paths_all.extend(paths)

print("\nTOTAL fake videos found:", len(fake_paths_all))

# Frames are written to FRAMES_ROOT / "fake" / <file stem>, so two videos with the
# same file stem coming from different folders would overwrite each other's frames.
# Visit the candidates in a seeded random order and keep the first video per stem.
# (Keeping every video would require putting the folder name into the output path,
# which every cell that rebuilds these paths would have to mirror.)
fake_rng = np.random.default_rng(42)
fake_by_stem = {}
for i in fake_rng.permutation(len(fake_paths_all)):
    candidate = fake_paths_all[i]
    fake_by_stem.setdefault(candidate.stem, candidate)
fake_candidates = list(fake_by_stem.values())   # unique stems, random order

n_shadowed = len(fake_paths_all) - len(fake_candidates)
if n_shadowed:
    print(f"Note: {n_shadowed} fake videos share a file name with another fake video; "
          "keeping one video per name")

# limit to MAX_PER_CLASS across all fake types
if len(fake_candidates) > MAX_PER_CLASS:
    fake_paths = fake_candidates[:MAX_PER_CLASS]
    print("Using FAKE videos:", len(fake_paths))
else:
    fake_paths = fake_candidates
    print("Using FAKE videos (no limit applied):", len(fake_paths))

# Extract frames for FAKE videos, remembering the ones that could not be decoded
fake_failed = set()
for vp in tqdm(fake_paths, desc="Extracting FAKE frames"):
    out_dir = FRAMES_ROOT / "fake" / vp.stem
    if not extract_frames(vp, out_dir, NUM_FRAMES):
        fake_failed.add(vp)

# Drop failed videos so later cells never reference frame folders without frames.
if fake_failed:
    print(f"⚠️ Skipped {len(fake_failed)} FAKE videos that could not be extracted")
    fake_paths = [vp for vp in fake_paths if vp not in fake_failed]

print("✅ Finished extracting FAKE frames")


DeepFakeDetection: found 1000 videos in FaceForensics++_C23/DeepFakeDetection
Deepfakes: found 1000 videos in FaceForensics++_C23/Deepfakes
Face2Face: found 1000 videos in FaceForensics++_C23/Face2Face
FaceShifter: found 1000 videos in FaceForensics++_C23/FaceShifter
FaceSwap: found 1000 videos in FaceForensics++_C23/FaceSwap
NeuralTextures: found 1000 videos in FaceForensics++_C23/NeuralTextures

TOTAL fake videos found: 6000
Using FAKE videos: 1000


Extracting FAKE frames: 100%|██████████| 1000/1000 [44:56<00:00,  2.70s/it] 

✅ Finished extracting FAKE frames





In [8]:
from pathlib import Path
import shutil

# Source: the per-class frame folders ("real" / "fake") read by the copy cells below.
SRC_ROOT = Path("frames")
# Destination: flat dataset with one "<video>_<label>" folder per video.
DST_ROOT = Path("frames_dataset")
DST_ROOT.mkdir(exist_ok=True, parents=True)


In [9]:
# Copy the frames of every REAL video into DST_ROOT / "<video name>_real".
REAL_SRC = SRC_ROOT / "real"

# Only directories are video folders; stray files in REAL_SRC are ignored.
for video_folder in (entry for entry in REAL_SRC.iterdir() if entry.is_dir()):
    dst = DST_ROOT / (video_folder.name + "_real")
    dst.mkdir(exist_ok=True)

    # Files are copied (not moved): the source folder keeps its frames.
    for fp in video_folder.glob("*.jpg"):
        shutil.copy(fp, dst / fp.name)

print("Done moving REAL frames.")


Done moving REAL frames.


In [10]:
# Copy the frames of every FAKE video into DST_ROOT / "<video name>_fake".
FAKE_SRC = SRC_ROOT / "fake"

# Only directories are video folders; stray files in FAKE_SRC are ignored.
for video_folder in (entry for entry in FAKE_SRC.iterdir() if entry.is_dir()):
    dst = DST_ROOT / (video_folder.name + "_fake")
    dst.mkdir(exist_ok=True)

    # Files are copied (not moved): the source folder keeps its frames.
    for fp in video_folder.glob("*.jpg"):
        shutil.copy(fp, dst / fp.name)

print("Done moving FAKE frames.")


Done moving FAKE frames.


In [None]:
# Build list of (frames_dir, label)
# A folder is listed once, and only if it actually contains extracted frames:
#  - extraction can fail for a video, leaving an empty or missing folder;
#  - two source videos with the same file stem map to the same folder.
all_videos = []
listed_dirs = set()

for class_name, label, class_paths in (
    ("real", 0, real_paths),   # 0 = real
    ("fake", 1, fake_paths),   # 1 = fake
):
    for vp in class_paths:
        frames_dir = FRAMES_ROOT / class_name / vp.stem
        if frames_dir in listed_dirs:
            continue
        if not (frames_dir.is_dir() and any(frames_dir.glob("*.jpg"))):
            continue
        listed_dirs.add(frames_dir)
        all_videos.append((frames_dir, label))

print("Videos with frames:", len(all_videos),
      "| real:", sum(1 for _, label in all_videos if label == 0),
      "| fake:", sum(1 for _, label in all_videos if label == 1))




In [None]:
import random

# Reproducible, idempotent shuffle: bring the list into a canonical order first,
# then shuffle with a dedicated seeded generator. Re-running this cell (or the
# whole notebook) yields the same order, and the global `random` state is untouched.
all_videos.sort(key=lambda item: (str(item[0]), item[1]))
random.Random(42).shuffle(all_videos)


In [None]:
# Face detector shared by the cropping code below.
# NOTE(review): the keyword arguments match facenet-pytorch's MTCNN, but the import
# is not visible in this part of the notebook — confirm. If so, post_process=True
# means returned crops are standardized floats rather than 0..255 pixel values.
mtcnn = MTCNN(
    image_size=224,
    margin=20,
    post_process=True,
    device=DEVICE,
)

In [None]:
def crop_faces_from_folder(in_folder, out_folder):
    """Crop the detected face from every JPEG in ``in_folder`` into ``out_folder``.

    Each ``*.jpg`` in ``in_folder`` is read, passed to the module-level ``mtcnn``
    detector as an RGB array, and the returned crop is saved under the same
    file name in ``out_folder``. When no face is detected, the whole frame is
    resized to the crop size and saved instead. Unreadable files are skipped.

    Args:
        in_folder: ``pathlib.Path`` of the folder holding the source frames.
        out_folder: ``pathlib.Path`` of the destination folder (created if missing).

    Returns:
        None.
    """
    out_folder.mkdir(parents=True, exist_ok=True)

    # NOTE(review): assumes `mtcnn` is facenet-pytorch's MTCNN (its constructor
    # arguments match) — confirm. That detector returns float crops in 0..255,
    # or (x - 127.5) / 128 when post_process is enabled; it never returns [0, 1],
    # so scaling by 255 and casting to uint8 would wrap around and corrupt the crop.
    standardized = bool(getattr(mtcnn, "post_process", True))
    crop_size = int(getattr(mtcnn, "image_size", 224))

    for fp in sorted(in_folder.glob("*.jpg")):
        img = cv2.imread(str(fp))
        if img is None:
            continue

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # detect face
        face = mtcnn(img_rgb)
        if face is None:
            # fallback: resized original frame, kept as uint8 (no float round trip)
            face_np = cv2.resize(img_rgb, (crop_size, crop_size))
        else:
            pixels = face.detach().cpu().float()
            if standardized:
                # undo the (x - 127.5) / 128 standardization
                pixels = pixels * 128.0 + 127.5
            pixels = pixels.round().clamp(0, 255).to(torch.uint8)
            # CHW tensor -> contiguous HWC array for OpenCV
            face_np = np.ascontiguousarray(pixels.permute(1, 2, 0).numpy())

        cv2.imwrite(str(out_folder / fp.name), cv2.cvtColor(face_np, cv2.COLOR_RGB2BGR))


In [None]:
# A video folder is any directory below FRAMES_ROOT that directly contains .jpg
# frames. Iterating only the direct children of FRAMES_ROOT would visit the class
# folders ("real" / "fake"), which hold sub-folders rather than frames, so nothing
# would be cropped.
folders = sorted({fp.parent for fp in FRAMES_ROOT.rglob("*.jpg")})

for f in tqdm(folders, desc="Cropping faces"):
    # Mirror the layout below FRAMES_ROOT (e.g. "real/<video>") under CROPPED_ROOT.
    out_f = CROPPED_ROOT / f.relative_to(FRAMES_ROOT)
    crop_faces_from_folder(f, out_f)
