# Image Corruption Identification

In [1]:
from PIL import Image
import os
import pandas as pd

# Identify Corrupted Images
def identify_corrupted_images(split):
    """Scan ``{split}_images`` for unreadable images and record the subsampled ones.

    Every regular file in the directory is opened with Pillow, verified and
    fully decoded. Files that fail are counted as corrupted. Of those, the
    ones whose numeric file stem appears in the first column of
    ``cleaned_data/{split}_5k.csv`` are written, one per line, to
    ``{split}_5k_corrupted_indices.txt`` (the numbers) and
    ``{split}_5k_corrupted_filenames.txt`` (the file names). A one-line
    summary covering *all* scanned files is printed at the end.

    Args:
        split: Dataset split name used to build the image directory, CSV
            and output file names (e.g. ``"validation"``).

    Raises:
        FileNotFoundError: If the image directory or the CSV is missing
            (propagated from ``os.listdir`` / ``pandas.read_csv``).
    """
    directory = f"{split}_images"

    # The first CSV column is treated as the image index of the subsample
    # (assumed to match the numeric file stems -- TODO confirm against the
    # CSV). A set keeps each membership test O(1).
    subsample_indices = set(
        pd.read_csv(f"cleaned_data/{split}_5k.csv").iloc[:, 0].tolist()
    )

    count = 0       # regular files scanned
    corrupted = 0   # files that failed to open/decode (whole directory)
    corrupted_file_names = []   # subsample only
    corrupted_numbers = []      # subsample only

    # os.listdir returns entries in arbitrary order; sorting makes the
    # output files reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)

        # Sub-directories (e.g. notebook checkpoint folders) are not images
        # and must not be reported as corrupted ones.
        if not os.path.isfile(path):
            continue

        count += 1
        if not _is_corrupted_image(path):
            continue
        corrupted += 1

        # The index is the part of the file name before the first ".".
        try:
            num = int(filename.split(".")[0])
        except ValueError:
            # A non-numeric name cannot belong to the subsample.
            continue

        if num in subsample_indices:
            corrupted_numbers.append(num)
            corrupted_file_names.append(filename)

    # Save corrupted numbers
    num_txt = f"{split}_5k_corrupted_indices.txt"
    with open(num_txt, "w", encoding="utf-8") as f:
        f.writelines(f"{n}\n" for n in corrupted_numbers)

    # Save corrupted filenames
    file_txt = f"{split}_5k_corrupted_filenames.txt"
    with open(file_txt, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in corrupted_file_names)

    # Guard against an empty directory, which would otherwise divide by zero.
    percent = 100 * corrupted / count if count else 0.0
    print(f"Split: {split}, Corrupted: {corrupted}, Total: {count}, %: {percent}%")


def _is_corrupted_image(path):
    """Return True if Pillow cannot verify and fully decode the file at *path*."""
    try:
        # verify() must run directly on a freshly opened file; it leaves the
        # image object unusable, so the file is reopened for the decode.
        with Image.open(path) as img:
            img.verify()
        # Converting to RGB forces a full decode, which catches truncated
        # pixel data that a header-level check can miss.
        with Image.open(path) as img:
            img.convert("RGB")
    except Exception:
        # Deliberately broad: Pillow signals bad files with several
        # unrelated exception types (OSError, SyntaxError, ValueError, ...),
        # and any failure to read means the image is unusable.
        return True
    return False

In [2]:
# Scan validation_images/ and write the validation_5k_corrupted_*.txt reports.
identify_corrupted_images("validation")



Split: validation, Corrupted: 20, Total: 33316, %: 0.06003121623244087%


In [3]:
# Scan test_images/ and write the test_5k_corrupted_*.txt reports.
identify_corrupted_images("test")



Split: test, Corrupted: 20, Total: 33519, %: 0.05966765118291118%
