In [11]:
import os
import cv2
from facenet_pytorch import MTCNN
import torch
from PIL import Image, ImageFile
import psutil
import time

In [12]:
# Pillow configuration so very large or partially written images can be processed.
# WARNING: setting MAX_IMAGE_PIXELS to None switches off Pillow's decompression-bomb
# guard for the whole kernel session; process_in_batches applies its own
# pixel-count cutoff before running face detection instead.
Image.MAX_IMAGE_PIXELS = None  # Disable the decompression bomb limit
ImageFile.LOAD_TRUNCATED_IMAGES = True  # Load truncated image files instead of raising

In [13]:
# Pre-filter files before processing (run this on newly added folders/files).
# WARNING: permanently deletes every file whose extension is not in the allowed list.
def clean_dataset(folder_path, valid_extensions=None):
    """
    Permanently delete every file under ``folder_path`` that does not have an
    allowed image extension.

    The folder is walked recursively. Matching is case-insensitive, and
    caller-supplied extensions are normalised (lower-cased, leading dot added)
    so that ``{"JPG"}`` or ``{".PNG"}`` cannot accidentally cause every image
    to be deleted.

    Args:
        folder_path (str): Root folder to clean.
        valid_extensions (iterable of str, optional): Extensions to keep.
            Defaults to .jpg, .jpeg, .png, .bmp, .gif and .tiff.

    Returns:
        None. A summary line is printed instead.
    """
    if valid_extensions is None:
        valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}

    # The file's extension is lower-cased below, so the allowed set must be
    # normalised the same way or nothing would ever match.
    allowed = set()
    for ext in valid_extensions:
        ext = ext.lower()
        if ext and not ext.startswith('.'):
            ext = '.' + ext
        allowed.add(ext)

    # os.walk silently yields nothing for a missing path, which would be
    # reported as a successful clean; make that case visible instead.
    if not os.path.isdir(folder_path):
        print(f"Folder not found, nothing cleaned: {folder_path}")
        return

    removed_files = 0
    for root, _, files in os.walk(folder_path):
        for file in files:
            if os.path.splitext(file)[1].lower() in allowed:
                continue
            file_path = os.path.join(root, file)
            try:
                os.remove(file_path)
            except OSError as e:
                # One undeletable file must not abort the rest of the clean-up.
                print(f"Could not remove {file_path}: {e}")
                continue
            removed_files += 1
    print(f"Cleaned dataset. Removed {removed_files} unsupported files.")

# Example usage. The path must be absolute, like the other dataset paths in this
# notebook; without the leading slash it is resolved against the current working
# directory and the walk finds nothing.
clean_dataset("/home/natalyagrokh/img_datasets/pexels_images_2")

Cleaned dataset. Removed 0 unsupported files.


In [14]:
# Validate Image Files
def is_valid_image(file_path):
    """
    Report whether ``file_path`` can be opened and verified as an image.

    Any exception raised while opening or verifying the file is printed
    together with the path, and the function then returns False.

    Args:
        file_path (str): Path of the file to check.

    Returns:
        bool: True when opening and ``verify()`` succeed, False otherwise.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except Exception as err:
        print(f"Invalid image {file_path}: {err}")
        return False
    else:
        return True

In [15]:
# Batch processing function
def process_in_batches(image_paths, batch_size, mtcnn, output_folder, max_pixels=89478485):
    """
    Process images in batches, detecting and cropping faces.

    For each path the image dimensions are read first; oversized images are
    skipped. The file is then checked with ``is_valid_image``, decoded to RGB
    and passed to ``mtcnn.detect``. Every detected box is clamped to the image
    bounds, cropped and saved to ``output_folder`` as
    ``<basename>_face<k>.jpg``. A failure on one file is printed, appended to
    ``error_log.txt`` and does not stop the run. After each batch, progress
    and system memory usage are printed; above 90% memory usage the function
    sleeps for 30 seconds.

    Args:
        image_paths (list): List of image file paths.
        batch_size (int): Number of images to process per batch (>= 1).
        mtcnn (MTCNN): MTCNN face detection model.
        output_folder (str): Path to the folder where cropped faces will be saved.
        max_pixels (int): Images with more than this many pixels
            (width * height) are skipped. The default, 89478485, equals
            Pillow's stock MAX_IMAGE_PIXELS limit.

    Returns:
        None

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        Output names are built from the input file's basename only, so
        same-named files from different sub-folders overwrite each other.
    """
    if batch_size < 1:
        # range() would raise a cryptic error for 0 and silently do nothing
        # for negative steps; fail loudly in both cases.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    processed_images = 0
    errors = 0

    for i in range(0, len(image_paths), batch_size):
        batch = image_paths[i:i + batch_size]
        for file_path in batch:
            try:
                # Read only the dimensions first so oversized images are
                # rejected before any pixel data is decoded.
                with Image.open(file_path) as img:
                    width, height = img.size
                if width * height > max_pixels:  # Check for oversized images
                    print(f"Skipping large image: {file_path}, size: {width}x{height}")
                    continue

                if not is_valid_image(file_path):
                    print(f"Skipping invalid file: {file_path}")
                    continue

                # convert() returns an independent in-memory image, so the
                # underlying file handle can be released right away.
                with Image.open(file_path) as img:
                    image = img.convert("RGB")
                img_width, img_height = image.size

                boxes, _ = mtcnn.detect(image)
                if boxes is not None:
                    for idx, box in enumerate(boxes):
                        left, top, right, bottom = map(int, box)
                        # Detector boxes may extend past the image border;
                        # clamp them so crops contain no black padding.
                        left, top = max(left, 0), max(top, 0)
                        right, bottom = min(right, img_width), min(bottom, img_height)
                        if right <= left or bottom <= top:
                            # Degenerate box: nothing to crop, and crop()/save()
                            # would fail and abort the remaining faces.
                            continue
                        face = image.crop((left, top, right, bottom))
                        output_path = os.path.join(output_folder, f"{os.path.basename(file_path)}_face{idx+1}.jpg")
                        face.save(output_path)
                processed_images += 1

            except Exception as e:
                # Best-effort run: record the failure and move on to the next file.
                errors += 1
                with open("error_log.txt", "a") as log_file:
                    log_file.write(f"Error processing {file_path}: {e}\n")
                print(f"Error processing file {file_path}: {e}")
                continue

        print(f"Processed {len(batch)} images in batch. Total processed so far: {processed_images}. Errors: {errors}")

        # Monitor memory usage
        memory_info = psutil.virtual_memory()
        print(f"Memory usage: {memory_info.percent}%")
        if memory_info.percent > 90:
            print("High memory usage detected. Pausing for 30 seconds.")
            time.sleep(30)

In [16]:
# Face Cropping Function
def crop_and_save_faces(input_folder, output_folder, batch_size=25):
    """
    Find every valid image under ``input_folder`` and save its cropped faces.

    An MTCNN detector is created (on CUDA when available, otherwise CPU), the
    output folder is created if needed, and the input tree is walked
    recursively. Files accepted by ``is_valid_image`` are handed to
    ``process_in_batches`` for detection and cropping.

    Args:
        input_folder (str): Path to the folder containing input images.
        output_folder (str): Path to the folder where cropped faces will be saved.
        batch_size (int): Number of images to process per batch.

    Returns:
        None
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    mtcnn = MTCNN(keep_all=True, device=device)
    os.makedirs(output_folder, exist_ok=True)

    # Collect the paths of all files that pass image validation.
    image_paths = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            candidate = os.path.join(root, file)
            if is_valid_image(candidate):
                image_paths.append(candidate)
    print(f"Total images found: {len(image_paths)}")

    process_in_batches(image_paths, batch_size, mtcnn, output_folder)
    print("Face cropping complete.")

In [17]:
# Resource Monitoring
def monitor_resources(interval=0.1):
    """
    Print the current system-wide CPU and memory utilisation.

    Args:
        interval (float): Seconds over which psutil samples CPU activity.
            psutil documents that a call without an interval compares against
            the previous call and returns a meaningless 0.0 the first time,
            so a short blocking sample is used by default.

    Returns:
        None
    """
    print(f"CPU usage: {psutil.cpu_percent(interval=interval)}%")
    print(f"Memory usage: {psutil.virtual_memory().percent}%")

In [None]:
# folder paths
if __name__ == "__main__":
    # Source images to scan for faces.
    INPUT_FOLDER = "/home/natalyagrokh/img_datasets/temp_scraped_images/flickr_images_1"
    # Destination for the cropped faces.
    # NOTE(review): the folder name ends in "curatedr" - possibly a typo for
    # "curated"; confirm before relying on this location.
    OUTPUT_FOLDER = "/home/natalyagrokh/img_datasets/curated_images/flickr_dataset_curatedr"
    # Number of images handled between progress/memory reports.
    BATCH_SIZE = 25

    crop_and_save_faces(
        input_folder=INPUT_FOLDER,
        output_folder=OUTPUT_FOLDER,
        batch_size=BATCH_SIZE,
    )
    
# input_folder = "/home/natalyagrokh/img_datasets/temp_scraped_images/flickr_images_1"
# output_folder = "/home/natalyagrokh/img_datasets/curated_images/flickr_dataset_curated"
# crop_and_save_faces(input_folder, output_folder, batch_size=50)

Total images found: 4051
Processed 25 images in batch. Total processed so far: 25. Errors: 0
Memory usage: 31.1%
Processed 25 images in batch. Total processed so far: 50. Errors: 0
Memory usage: 29.3%
Processed 25 images in batch. Total processed so far: 75. Errors: 0
Memory usage: 29.7%
Processed 25 images in batch. Total processed so far: 100. Errors: 0
Memory usage: 30.1%
Skipping large image: /home/natalyagrokh/img_datasets/temp_scraped_images/flickr_images_1/image_12843.jpg, size: 15370x10639
Processed 25 images in batch. Total processed so far: 124. Errors: 0
Memory usage: 29.2%
Processed 25 images in batch. Total processed so far: 149. Errors: 0
Memory usage: 29.4%
Processed 25 images in batch. Total processed so far: 174. Errors: 0
Memory usage: 29.2%
Processed 25 images in batch. Total processed so far: 199. Errors: 0
Memory usage: 29.4%
Skipping large image: /home/natalyagrokh/img_datasets/temp_scraped_images/flickr_images_1/image_15651.jpg, size: 15370x10639
Processed 25 ima

In [None]:
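# # Earlier, non-batched version of crop_and_save_faces (kept commented out for reference).
# # Works well on uncorrupted, genuine image files,
# # but runs into problems with corrupted or non-image files.
# # Function filters high-res images and crops out faces; saves the faces only.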
# def crop_and_save_faces(input_folder, output_folder):
#     """
#     Detects faces in images, crops them, and saves them as individual files incrementally.
#     Processes images one at a time to reduce memory usage.
    
#     Parameters:
#     - input_folder: Path to the folder containing input images.
#     - output_folder: Path to save cropped face images.
#     """
#     # Initialize MTCNN
#     mtcnn = MTCNN(keep_all=True, device="cpu")  # Use GPU if available

#     # Ensure output folder exists
#     os.makedirs(output_folder, exist_ok=True)

#     # Gather all image paths
#     image_paths = []
#     for root, _, files in os.walk(input_folder):
#         for file in files:
#             image_paths.append(os.path.join(root, file))

#     total_images = len(image_paths)
#     processed_images = 0
#     errors = 0

#     print(f"Total images found: {total_images}")

#     for file_path in image_paths:
#         try:
#             # Open image
#             image = Image.open(file_path).convert("RGB")
#             # Detect faces
#             boxes, _ = mtcnn.detect(image)

#             if boxes is not None:  # If faces are detected
#                 for idx, box in enumerate(boxes):
#                     # Crop and save each face
#                     left, top, right, bottom = map(int, box)
#                     face = image.crop((left, top, right, bottom))
#                     output_path = os.path.join(output_folder, f"{os.path.basename(file_path)}_face{idx+1}.jpg")
#                     face.save(output_path)
#             processed_images += 1

#             # Monitor memory usage
#             memory_info = psutil.virtual_memory()
#             if memory_info.percent > 90:  # If memory usage exceeds 90%, pause
#                 print("High memory usage detected. Pausing for 30 seconds.")
#                 time.sleep(30)

#         except Exception as e:
#             errors += 1
#             with open("error_log.txt", "a") as log_file:
#                 log_file.write(f"Error processing {file_path}: {e}\n")
#             print(f"Error processing file {file_path}: {e}")
#             continue

#         # Log progress every 100 images
#         if processed_images % 100 == 0:
#             print(f"Processed {processed_images}/{total_images} images. Errors so far: {errors}")

#     print(f"Processing complete. Total processed: {processed_images}. Total errors: {errors}.")