In [5]:
import os
from PIL import Image

# Purpose: walk every folder in `data_dirs`, find files that carry a .jpg/.jpeg
# extension but are stored in some other format (as reported by Pillow), and
# rewrite them in place as real JPEGs.

# --- Set your folder paths here ---
data_dirs = ["images/environment", "images/studio"]
# ---

print("--- Starting Image Verification and Conversion ---")
files_converted = 0
files_scanned = 0

for dir_path in data_dirs:
    if not os.path.exists(dir_path):
        print(f"Warning: Directory not found, skipping: {dir_path}")
        continue

    for root, _, files in os.walk(dir_path):
        for file_name in files:
            if not file_name.lower().endswith(('.jpg', '.jpeg')):
                continue

            files_scanned += 1
            file_path = os.path.join(root, file_name)
            # The replacement is written next to the original and swapped in
            # only after a successful save, so a failed conversion can never
            # leave a half-written file where the original used to be.
            temp_path = file_path + ".converting.tmp"

            try:
                converted = None
                with Image.open(file_path) as img:
                    if img.format != 'JPEG':
                        print(f"-> Mismatch Found: '{file_name}' is a {img.format}. Converting...")

                        # Transparency can live in an alpha band (RGBA, LA) or in
                        # the image metadata (palette / tRNS transparency). All of
                        # these must be flattened, not only mode 'RGBA'.
                        has_transparency = (
                            img.mode in ('RGBA', 'LA')
                            or img.info.get('transparency') is not None
                        )
                        if has_transparency:
                            rgba = img.convert('RGBA')
                            # Composite onto a white background using the alpha
                            # channel as the paste mask.
                            converted = Image.new('RGB', rgba.size, (255, 255, 255))
                            converted.paste(rgba, mask=rgba.getchannel('A'))
                        else:
                            # No transparency: a plain mode conversion is enough.
                            converted = img.convert('RGB')

                # The source handle is closed at this point; pixel data has been
                # fully loaded by convert(), so it is safe to replace the file.
                if converted is not None:
                    converted.save(temp_path, 'JPEG', quality=95)
                    os.replace(temp_path, file_path)
                    files_converted += 1

            except Exception as e:
                print(f"-> Error: Could not process '{file_name}'. It may be corrupted. Error: {e}")
                # Best-effort cleanup of a partially written temp file.
                try:
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
                except OSError:
                    pass

print("\n--- Process Complete ---")
print(f"Scanned {files_scanned} JPG/JPEG files.")
print(f"Successfully converted {files_converted} mismatched files to the proper JPEG format.")

--- Starting Image Verification and Conversion ---

--- Process Complete ---
Scanned 5618 JPG/JPEG files.
Successfully converted 0 mismatched files to the proper JPEG format.


In [4]:
import os
from PIL import Image

# Purpose: fully decode every .jpg/.jpeg/.png under `data_dirs` and collect the
# paths of files that fail to load or whose .jpg/.jpeg extension does not match
# the format Pillow detects.

# --- IMPORTANT: list EVERY data folder here (train, validation, test, ...) ---
data_dirs = ["images/environment", "images/studio"]
# ---

problem_files = []
print("--- Starting Final, Deep Verification Scan ---")

for folder in data_dirs:
    if not os.path.exists(folder):
        print(f"Warning: Directory not found, skipping: {folder}")
        continue

    for parent, _, names in os.walk(folder):
        for name in names:
            lowered = name.lower()
            # Only the common image extensions are inspected.
            if not lowered.endswith(('.jpg', '.jpeg', '.png')):
                continue

            candidate = os.path.join(parent, name)
            try:
                with Image.open(candidate) as picture:
                    # load() forces a full decode, so truncated files raise here
                    # instead of passing a header-only check.
                    picture.load()

                    # A JPEG-named file must really be stored as JPEG.
                    named_as_jpeg = lowered.endswith(('.jpg', '.jpeg'))
                    if named_as_jpeg and picture.format != 'JPEG':
                        print(f"-> Mismatch Found: {candidate} is a {picture.format}")
                        problem_files.append(candidate)
            except Exception as err:
                # Any failure to open or decode is reported as corruption.
                print(f"-> Corruption Found: {candidate} (Error: {err})")
                problem_files.append(candidate)

print("\n--- Verification Complete ---")

if not problem_files:
    print("Success! All image files appear to be valid and fully readable.")
else:
    print(f"\nError: Found {len(problem_files)} problematic files.")
    print("You should DELETE these files from your dataset:")
    for bad_path in problem_files:
        print(f"-> {bad_path}")

--- Starting Final, Deep Verification Scan ---

--- Verification Complete ---
Success! All image files appear to be valid and fully readable.


In [3]:
import os
from PIL import Image

# Purpose: read the dimensions of every image under `data_dirs` and list the
# files whose total pixel count exceeds PIXEL_LIMIT.

# --- Set your folder paths here ---
# This list should contain the paths to all your class folders.
data_dirs = ["images/environment", "images/studio"]
# ---

# Pillow's default warning limit (in total pixels).
# You can lower this number to be more strict if you want.
PIXEL_LIMIT = 89478485

print("--- Scanning for large images (potential decompression bombs) ---")
large_files = []
unreadable_count = 0

for dir_path in data_dirs:
    if not os.path.exists(dir_path):
        print(f"Warning: Directory not found, skipping: {dir_path}")
        continue

    # Walk through all files in the directory
    for root, _, files in os.walk(dir_path):
        for file_name in files:
            # Check for common image extensions
            if not file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp')):
                continue

            file_path = os.path.join(root, file_name)

            try:
                with Image.open(file_path) as img:
                    # Only the header is needed for the dimensions.
                    width, height = img.size
                    total_pixels = width * height
            except Image.DecompressionBombError as bomb_error:
                # Pillow refuses to open images far above its own pixel limit and
                # raises instead of returning. These are exactly the files this
                # scan is looking for, so they must be recorded, not skipped.
                print(f"-> Found large file: '{file_name}' (Pillow refused to open it: {bomb_error})")
                large_files.append(file_path)
                continue
            except Exception:
                # Unreadable/corrupted files are out of scope for this scan;
                # count them so the skip is visible in the summary.
                unreadable_count += 1
                continue

            # Check if the image exceeds the limit
            if total_pixels > PIXEL_LIMIT:
                print(f"-> Found large file: '{file_name}' ({total_pixels:,} pixels)")
                large_files.append(file_path)

print("\n--- Scan Complete ---")

if unreadable_count:
    print(f"Skipped {unreadable_count} file(s) that could not be opened.")

if large_files:
    print(f"Found {len(large_files)} image(s) exceeding the pixel limit.")
    print("You may want to resize these files to speed up training:")
    for f in large_files:
        print(f"-> {f}")
else:
    print("No images exceeding the pixel limit were found.")

--- Scanning for large images (potential decompression bombs) ---

--- Scan Complete ---
No images exceeding the pixel limit were found.
