In [1]:
import os
import cv2
from tqdm import tqdm

In [2]:
# Load the pre-trained Haar Cascade classifier for face detection.
# cv2.CascadeClassifier does not raise when the XML file cannot be read; it
# silently yields an empty classifier. Check for that here so the notebook
# fails at load time instead of on the first detectMultiScale call.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
if face_cascade.empty():
    raise IOError(
        "Could not load Haar cascade: "
        + cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )

In [3]:
# Function to create the directory to store cropped frames
def create_output_dir():
    """Ensure ``<current working directory>/cropped_images`` exists.

    Returns:
        str: Absolute path of the ``cropped_images`` directory.
    """
    output_dir = os.path.join(os.getcwd(), 'cropped_images')

    # exist_ok=True makes the call idempotent and avoids the race between a
    # separate existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    return output_dir

In [4]:
# Function to crop faces from frames
def detect_and_crop_face(frame, scale_factor=1.4):  # Bounding box expansion by 40%
    """Crop the largest detected face out of ``frame`` with an expanded box.

    The frame is converted to grayscale and passed to the module-level
    ``face_cascade``. Of all detections, the one with the largest area is
    kept; its box is grown by ``scale_factor`` around its centre and then
    clipped to the frame borders.

    Args:
        frame: BGR image array as returned by ``cv2.VideoCapture.read``.
        scale_factor: Multiplier applied to the detected box's width and
            height (1.4 means 40% larger).

    Returns:
        The cropped region (a slice of ``frame``), or ``None`` when the frame
        is missing/empty or no face is detected.
    """
    if frame is None or frame.size == 0:
        return None

    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE)

    if len(faces) == 0:
        return None

    x, y, w, h = max(faces, key=lambda rect: rect[2] * rect[3])  # Largest face by area

    frame_h, frame_w = frame.shape[:2]

    # Compute both corners of the expanded box BEFORE clipping. Clipping the
    # origin first and then measuring width/height from the clipped origin
    # would push the right/bottom edge further out than intended whenever the
    # face sits near the left/top border.
    left = x - int(w * (scale_factor - 1) / 2)
    top = y - int(h * (scale_factor - 1) / 2)
    right = left + int(w * scale_factor)
    bottom = top + int(h * scale_factor)

    # Clip to the frame so the slice below never uses negative indices.
    left = max(0, left)
    top = max(0, top)
    right = min(frame_w, right)
    bottom = min(frame_h, bottom)

    cropped_face = frame[top:bottom, left:right]

    return cropped_face

In [17]:
# Function to capture and process frames from each video
def process_video_frames(video_path, output_dir, frame_skip=30):
    """Extract face crops from every ``frame_skip``-th frame of one video.

    Crops are written to ``<output_dir>/<video file name without extension>/``
    as ``frame_<n>.png``, where ``n`` counts saved crops starting at 0.
    Frames in which no face is detected are skipped without consuming an index.

    Args:
        video_path: Path to the video file.
        output_dir: Directory under which the per-video folder is created.
        frame_skip: Sampling interval; a frame is examined when its index is
            a multiple of this value. Must be non-zero.

    Returns:
        None. If the video cannot be opened the function returns without
        creating the per-video folder.
    """
    cap = cv2.VideoCapture(video_path)

    # try/finally guarantees the capture is released even if detection or
    # writing raises part-way through the video.
    try:
        if not cap.isOpened():
            return

        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video_output_dir = os.path.join(output_dir, video_name)
        os.makedirs(video_output_dir, exist_ok=True)

        frame_count = 0
        extracted_frame_count = 0

        # The reported frame count can be 0 or negative when it is unknown;
        # give tqdm no total in that case rather than a bogus one.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        progress_total = total_frames if total_frames > 0 else None

        with tqdm(total=progress_total, desc=f"Processing {video_name}") as pbar:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_skip == 0:
                    cropped_face = detect_and_crop_face(frame)

                    if cropped_face is not None:
                        frame_filename = os.path.join(video_output_dir, f"frame_{extracted_frame_count}.png")
                        cv2.imwrite(frame_filename, cropped_face)
                        extracted_frame_count += 1

                frame_count += 1
                pbar.update(1)
    finally:
        cap.release()


In [18]:
# Main function to process all videos in the dataset folder
def process_all_videos_in_dataset(dataset_dir, frame_skip=30):
    """Run ``process_video_frames`` on every video directly inside ``dataset_dir``.

    Files are selected by extension (``.mp4``, ``.avi``, ``.mov``), matched
    case-insensitively, and processed in sorted name order. Sub-directories
    are not searched.

    Args:
        dataset_dir: Folder containing the video files.
        frame_skip: Sampling interval forwarded to ``process_video_frames``.
    """
    output_dir = create_output_dir()

    # Lower-case the name before matching so files such as 'CLIP.MP4' are not
    # silently ignored; sort because os.listdir order is arbitrary.
    video_files = sorted(
        f for f in os.listdir(dataset_dir)
        if f.lower().endswith(('.mp4', '.avi', '.mov'))
    )

    for video_file in video_files:
        video_path = os.path.join(dataset_dir, video_file)
        process_video_frames(video_path, output_dir, frame_skip)

In [19]:
# Specify the dataset folder and frame skip interval
# dataset_dir is a relative path, so it is resolved against the kernel's
# current working directory when the videos are listed.
dataset_dir = './dataset'
# A frame is examined when its index is a multiple of frame_skip; this value
# is passed explicitly below and therefore replaces the functions' default of 30.
frame_skip = 3 # Change this according to your frame skipping requirements

In [25]:
# Process all videos in the dataset folder.
# Face crops are saved as PNG files under <cwd>/cropped_images/<video name>/.
process_all_videos_in_dataset(dataset_dir, frame_skip)

Processing id0_0000: 100%|██████████| 469/469 [00:05<00:00, 79.36it/s]
Processing id0_0001: 100%|██████████| 303/303 [00:03<00:00, 91.01it/s]
Processing id0_0002: 100%|██████████| 350/350 [00:05<00:00, 61.49it/s]
Processing id0_0003: 100%|██████████| 529/529 [00:07<00:00, 72.48it/s]
Processing id0_0004: 100%|██████████| 326/326 [00:05<00:00, 59.26it/s]
Processing id0_0005: 100%|██████████| 459/459 [00:06<00:00, 74.23it/s]
Processing id0_0006: 100%|██████████| 534/534 [00:11<00:00, 46.70it/s]
Processing id0_0007: 100%|██████████| 479/479 [00:08<00:00, 57.85it/s]
Processing id0_0008: 100%|██████████| 464/464 [00:07<00:00, 63.57it/s]
Processing id0_0009: 100%|██████████| 520/520 [00:07<00:00, 69.50it/s]
Processing id1_0000: 100%|██████████| 371/371 [00:02<00:00, 123.72it/s]
Processing id1_0001: 100%|██████████| 276/276 [00:01<00:00, 141.44it/s]


In [26]:
# Function to check if an image contains a face
def contains_face(image_path):
    """Report whether the Haar cascade finds at least one face in an image file.

    Args:
        image_path: Path of the image to test.

    Returns:
        bool: ``True`` if one or more faces are detected; ``False`` if none
        are detected or the file cannot be read as an image.
    """
    image = cv2.imread(image_path)

    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising; passing None on to cvtColor would raise cv2.error.
    if image is None:
        return False

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray_image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE)

    return len(faces) > 0

In [33]:
# Function to delete images without faces from the 'cropped_images' folder
def delete_non_face_images(cropped_images_dir):
    """Remove every image under ``cropped_images_dir`` in which no face is found.

    The directory is walked recursively. Files ending in ``.png``, ``.jpg`` or
    ``.jpeg`` (case-insensitive) are tested with ``contains_face`` and deleted
    from disk when it returns ``False``. Files for which detection raises
    ``cv2.error`` are left in place.

    WARNING: this permanently deletes files.

    Args:
        cropped_images_dir: Root folder holding the cropped face images.
    """
    # Collect all image files from the directory first, so that deleting
    # files never happens while os.walk is still iterating.
    image_files = []
    for root, _dirs, files in os.walk(cropped_images_dir):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_files.append(os.path.join(root, file))

    # Previously the function stopped after collecting paths, so nothing was
    # ever deleted despite its name.
    for image_path in tqdm(image_files, desc="Removing non-face images"):
        try:
            has_face = contains_face(image_path)
        except cv2.error:
            # Detection itself failed; keep the file rather than delete on error.
            continue

        if not has_face:
            os.remove(image_path)

In [34]:
# Specify the cropped_images folder
# NOTE(review): create_output_dir() builds its path from os.getcwd(), so this
# relative path points at the same folder only while the working directory
# has not changed since the extraction step ran.
cropped_images_dir = './cropped_images'

In [36]:
# Invoke delete_non_face_images on the cropped_images folder configured above;
# the function walks that folder recursively.
delete_non_face_images(cropped_images_dir)