In [None]:
# Colab setup: mount Google Drive, then make sure the output root exists.
import os

from google.colab import drive

# Every artefact produced by this notebook is written below this Drive folder.
DRIVE_BASE = '/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted'

drive.mount('/content/drive')
os.makedirs(DRIVE_BASE, exist_ok=True)
print("Saving everything to:", DRIVE_BASE)


Mounted at /content/drive
Saving everything to: /content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted


In [None]:
# Input folders on Drive; the extraction cell globs each one for '*.mp4' files.
REAL_VIDEOS = '/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2/Celeb-real'
FAKE_VIDEOS = '/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2/Celeb-synthesis'

# Passed as n_frames to extract_from_video_to_drive: the maximum number of
# frames saved from each video.
FRAMES_PER_VIDEO = 50

# Optional cap on how many videos of each class are processed (taken from the
# start of the sorted file list). None, or any falsy value, processes all.
LIMIT_REAL_VIDEOS = None
LIMIT_FAKE_VIDEOS = None


In [None]:
import cv2, glob
from tqdm import tqdm
from pathlib import Path

# Frames are written to <DRIVE_BASE>/frames/real and <DRIVE_BASE>/frames/fake;
# create both folders up front so the extraction loop can write immediately.
_frames_root = os.path.join(DRIVE_BASE, 'frames')
OUT_FRAMES_REAL = os.path.join(_frames_root, 'real')
OUT_FRAMES_FAKE = os.path.join(_frames_root, 'fake')
for _frames_dir in (OUT_FRAMES_REAL, OUT_FRAMES_FAKE):
    os.makedirs(_frames_dir, exist_ok=True)

def extract_from_video_to_drive(video_path, out_dir, n_frames=50):
    """Save up to ``n_frames`` evenly spaced frames of a video as JPEG files.

    Target frame positions are spread over the frame count reported by
    OpenCV for the file. When the video has no more than ``n_frames`` frames,
    every frame is a target. Each saved file is named
    ``<video stem>_f<frame index, 6 digits>.jpg`` inside ``out_dir``.

    Args:
        video_path: Path of the video file to read.
        out_dir: Directory that receives the JPEG files (created if missing).
        n_frames: Maximum number of frames to save. Values <= 0 save nothing.

    Returns:
        int: Number of frames actually written. This is 0 when the video
        cannot be opened or reports no frames, and may be lower than the
        number of targets when decoding stops early or a write fails.
    """
    # Nothing requested: bail out before touching the video at all. This also
    # avoids a division by zero in the step computation below.
    if n_frames <= 0:
        return 0

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            return 0

        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            return 0

        if total <= n_frames:
            wanted = set(range(total))
        else:
            # Fractional step so the targets cover the whole clip evenly.
            step = total / n_frames
            wanted = {int(i * step) for i in range(n_frames)}
        last_wanted = max(wanted)

        os.makedirs(out_dir, exist_ok=True)
        stem = Path(video_path).stem

        saved = 0
        idx = 0
        # Frames are decoded sequentially; stop as soon as the last target
        # has been passed instead of decoding the remainder of the file.
        while idx <= last_wanted:
            ret, frame = cap.read()
            if not ret:
                # Decoding ended before the reported frame count was reached.
                break
            if idx in wanted:
                out_path = os.path.join(out_dir, f"{stem}_f{idx:06d}.jpg")
                # imwrite reports failure through its return value rather
                # than raising, so only successful writes are counted.
                if cv2.imwrite(out_path, frame):
                    saved += 1
            idx += 1
        return saved
    finally:
        # Release the capture handle on every exit path, including errors.
        cap.release()


# ---- Real videos ----
real_files = sorted(glob.glob(os.path.join(REAL_VIDEOS, '*.mp4')))
if LIMIT_REAL_VIDEOS: real_files = real_files[:LIMIT_REAL_VIDEOS]

print("Real videos:", len(real_files))
if not real_files:
    print("WARNING: no .mp4 files found in", REAL_VIDEOS)

# Videos from which not a single frame could be saved; the extractor signals
# this by returning 0.
real_empty = []
for v in tqdm(real_files, desc='Real'):
    if extract_from_video_to_drive(v, OUT_FRAMES_REAL, FRAMES_PER_VIDEO) == 0:
        real_empty.append(v)


# ---- Fake videos ----
fake_files = sorted(glob.glob(os.path.join(FAKE_VIDEOS, '*.mp4')))
if LIMIT_FAKE_VIDEOS: fake_files = fake_files[:LIMIT_FAKE_VIDEOS]

print("Fake videos:", len(fake_files))
if not fake_files:
    print("WARNING: no .mp4 files found in", FAKE_VIDEOS)

fake_empty = []
for v in tqdm(fake_files, desc='Fake'):
    if extract_from_video_to_drive(v, OUT_FRAMES_FAKE, FRAMES_PER_VIDEO) == 0:
        fake_empty.append(v)

# Totals are read back from the output folders, so they also include frames
# left there by earlier runs.
print("DONE. Saved:",
      len(os.listdir(OUT_FRAMES_REAL)), "real frames |",
      len(os.listdir(OUT_FRAMES_FAKE)), "fake frames")

# Report videos that yielded nothing so the gap is visible right here rather
# than being discovered later. Only the first few paths are listed.
for _label, _empty in (("real", real_empty), ("fake", fake_empty)):
    if _empty:
        print(f"WARNING: {len(_empty)} {_label} video(s) produced no frames, e.g.:")
        for _path in _empty[:10]:
            print("   ", _path)


Real videos: 590


Real: 100%|██████████| 590/590 [14:26<00:00,  1.47s/it]


Fake videos: 5640


Fake: 100%|██████████| 5640/5640 [2:59:51<00:00,  1.91s/it]


DONE. Saved: 29451 real frames | 282000 fake frames


In [None]:
import cv2, glob, os
from tqdm import tqdm
from pathlib import Path


# Root folder on Drive that holds the extracted frames and receives the faces.
DRIVE_BASE = '/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted'

# Inputs: previously extracted frames, one folder per class.
_frames_root = os.path.join(DRIVE_BASE, 'frames')
FRAMES_REAL = os.path.join(_frames_root, 'real')
FRAMES_FAKE = os.path.join(_frames_root, 'fake')

# Outputs: cropped faces, one folder per class (created if missing).
_faces_root = os.path.join(DRIVE_BASE, 'faces')
FACES_REAL = os.path.join(_faces_root, 'real')
FACES_FAKE = os.path.join(_faces_root, 'fake')
for _faces_dir in (FACES_REAL, FACES_FAKE):
    os.makedirs(_faces_dir, exist_ok=True)

# Frontal-face Haar cascade loaded from the XML file bundled with OpenCV.
_cascade_xml = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(_cascade_xml)

# Size handed to cv2.resize for every saved face crop.
TARGET_FACE_SIZE = (224, 224)


def extract_face_from_frame(frame_path, out_dir, scale_factor=1.3,
                            min_neighbors=5, pad_ratio=0.1):
    """Detect the largest face in a frame image and save it as a resized crop.

    The frame is read from disk, converted to grayscale and passed to the
    module-level ``face_cascade``. Of all detections, the one with the largest
    bounding-box area is kept, padded on every side, clipped to the image
    bounds, resized to ``TARGET_FACE_SIZE`` and written to ``out_dir`` under
    the same file name as the input frame.

    Args:
        frame_path: Path of the frame image to read.
        out_dir: Directory that receives the face crop.
        scale_factor: ``scaleFactor`` passed to ``detectMultiScale``.
        min_neighbors: ``minNeighbors`` passed to ``detectMultiScale``.
        pad_ratio: Padding added around the detection, as a fraction of the
            longer side of the detected box.

    Returns:
        bool: True only if a face crop was written. False when the image
        cannot be read, no face is detected, or the write fails.
    """
    img = cv2.imread(frame_path)
    if img is None:
        return False

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(
        gray, scaleFactor=scale_factor, minNeighbors=min_neighbors
    )
    if len(faces) == 0:
        return False

    # Keep only the detection with the largest area; convert to plain ints so
    # the arithmetic below does not depend on the detector's integer dtype.
    x, y, w, h = (int(v) for v in max(faces, key=lambda b: b[2] * b[3]))

    # Pad the box, then clip it so the slice stays inside the image.
    pad = int(pad_ratio * max(w, h))
    img_h, img_w = img.shape[:2]
    x1 = max(0, x - pad)
    y1 = max(0, y - pad)
    x2 = min(img_w, x + w + pad)
    y2 = min(img_h, y + h + pad)

    face = cv2.resize(img[y1:y2, x1:x2], TARGET_FACE_SIZE)

    out_path = os.path.join(out_dir, Path(frame_path).name)
    # imwrite signals failure through its return value; pass that on instead
    # of unconditionally reporting success.
    return bool(cv2.imwrite(out_path, face))

def _crop_faces(frame_paths, faces_dir, desc):
    """Run face extraction over a list of frames, skipping finished ones.

    A frame counts as finished when a file with the same name already exists
    in ``faces_dir``, which lets an interrupted run be resumed.

    Returns:
        tuple: ``(saved, skipped, missed)`` - crops written in this run,
        frames skipped because their output already existed, and frames for
        which extract_face_from_frame returned False.
    """
    saved = skipped = missed = 0
    for frame_path in tqdm(frame_paths, desc=desc):
        if os.path.exists(os.path.join(faces_dir, Path(frame_path).name)):
            skipped += 1
        elif extract_face_from_frame(frame_path, faces_dir):
            saved += 1
        else:
            missed += 1
    return saved, skipped, missed


real_frames = sorted(glob.glob(os.path.join(FRAMES_REAL, '*.jpg')))
print("Real frames:", len(real_frames))
if not real_frames:
    # An empty listing means there is nothing to process; say so explicitly
    # instead of letting the cell finish as if it had succeeded.
    print("WARNING: no .jpg frames found in", FRAMES_REAL)
real_stats = _crop_faces(real_frames, FACES_REAL, 'faces-real')


fake_frames = sorted(glob.glob(os.path.join(FRAMES_FAKE, '*.jpg')))
print("Fake frames:", len(fake_frames))
if not fake_frames:
    print("WARNING: no .jpg frames found in", FRAMES_FAKE)
fake_stats = _crop_faces(fake_frames, FACES_FAKE, 'faces-fake')

print("Done. Faces saved in:")
print(FACES_REAL)
print(FACES_FAKE)

# Per-class summary, so the share of frames without a usable face is visible.
for _label, (_saved, _skipped, _missed) in (("real", real_stats), ("fake", fake_stats)):
    print(f"{_label}: {_saved} saved | {_skipped} already present | {_missed} without a saved face")

Real frames: 29451


faces-real: 100%|██████████| 29451/29451 [3:29:04<00:00,  2.35it/s]


Fake frames: 0


faces-fake: 0it [00:00, ?it/s]

Done. Faces saved in:
/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted/faces/real
/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted/faces/fake





In [None]:
import cv2, glob, os
from tqdm import tqdm
from pathlib import Path


# Root folder on Drive that holds the extracted frames and receives the faces.
DRIVE_BASE = '/content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted'

# This cell handles the fake class only: frames in, cropped faces out.
FRAMES_FAKE, FACES_FAKE = (
    os.path.join(DRIVE_BASE, _subdir, 'fake') for _subdir in ('frames', 'faces')
)
os.makedirs(FACES_FAKE, exist_ok=True)

print("Looking for fake frames in:", FRAMES_FAKE)

# Frontal-face Haar cascade loaded from the XML file bundled with OpenCV.
_cascade_xml = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(_cascade_xml)

# Size handed to cv2.resize for every saved face crop.
TARGET_FACE_SIZE = (224, 224)

def extract_face_from_frame(frame_path, out_dir, scale_factor=1.3,
                            min_neighbors=5, pad_ratio=0.1):
    """Detect the largest face in a frame image and save it as a resized crop.

    The frame is read from disk, converted to grayscale and passed to the
    module-level ``face_cascade``. Of all detections, the one with the largest
    bounding-box area is kept, padded on every side, clipped to the image
    bounds, resized to ``TARGET_FACE_SIZE`` and written to ``out_dir`` under
    the same file name as the input frame.

    Args:
        frame_path: Path of the frame image to read.
        out_dir: Directory that receives the face crop.
        scale_factor: ``scaleFactor`` passed to ``detectMultiScale``.
        min_neighbors: ``minNeighbors`` passed to ``detectMultiScale``.
        pad_ratio: Padding added around the detection, as a fraction of the
            longer side of the detected box.

    Returns:
        bool: True only if a face crop was written. False when the image
        cannot be read, no face is detected, or the write fails.
    """
    img = cv2.imread(frame_path)
    if img is None:
        return False

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(
        gray, scaleFactor=scale_factor, minNeighbors=min_neighbors
    )
    if len(faces) == 0:
        return False

    # Keep only the detection with the largest area; convert to plain ints so
    # the arithmetic below does not depend on the detector's integer dtype.
    x, y, w, h = (int(v) for v in max(faces, key=lambda b: b[2] * b[3]))

    # Pad the box, then clip it so the slice stays inside the image.
    pad = int(pad_ratio * max(w, h))
    img_h, img_w = img.shape[:2]
    x1 = max(0, x - pad)
    y1 = max(0, y - pad)
    x2 = min(img_w, x + w + pad)
    y2 = min(img_h, y + h + pad)

    face = cv2.resize(img[y1:y2, x1:x2], TARGET_FACE_SIZE)

    out_path = os.path.join(out_dir, Path(frame_path).name)
    # imwrite signals failure through its return value; pass that on instead
    # of unconditionally reporting success.
    return bool(cv2.imwrite(out_path, face))



fake_frames = sorted(glob.glob(os.path.join(FRAMES_FAKE, '*.jpg')))

if not fake_frames:
    # Do not report success when there was nothing to process.
    # NOTE(review): the frame-extraction cell's recorded output shows a large
    # number of fake frames written to this folder, so an empty listing here
    # presumably points at the Drive mount/listing rather than at missing
    # data - confirm before re-running the extraction.
    print("⚠️ No fake frames found in:", FRAMES_FAKE)
    print("   Folder exists:", os.path.isdir(FRAMES_FAKE))
else:
    print("✅ Fake frames found:", len(fake_frames))

    # Outcome counters, so the run reports what it actually did.
    n_saved = n_skipped = n_missed = 0
    for p in tqdm(fake_frames, desc='Extracting fake faces'):
        outp = os.path.join(FACES_FAKE, Path(p).name)
        if os.path.exists(outp):
            # Already produced by an earlier run; lets the cell be resumed.
            n_skipped += 1
            continue
        if extract_face_from_frame(p, FACES_FAKE):
            n_saved += 1
        else:
            n_missed += 1

    print("✅ DONE")
    print("Faces saved in:", FACES_FAKE)
    print(f"{n_saved} saved | {n_skipped} already present | {n_missed} without a saved face")


Looking for fake frames in: /content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted/frames/fake
✅ Fake frames found: 0


Extracting fake faces: 0it [00:00, ?it/s]

✅ DONE
Faces saved in: /content/drive/MyDrive/Deep_Fake_Detection/CelebDFv2_Extracted/faces/fake



