# Image Dataset Pre-Processing

The following code takes our various color images from different online dataset sources (one folder with nested folders of different images inside), combines them, renames them uniformly, and resizes them all to 128x128 resolution. I'm using offline directories (on my local machine) outside of this repo because Github causes issues if I make more than 10000 changes at once.

Further rescaling for compressed size datasets at the end.

In [6]:
import glob
import shutil
import os
from PIL import Image
import random

# Source directory with subfolders
raw_images_source = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/original"
# temp directory to store all color images before uniform scaling
color_raw_dir = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/color_raw"
# final dataset color directory
color_dir = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/largeset_multisize_v2/color"
# final dataset grayscale black & white directory
bw_dir = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/largeset_multisize_v2/gray"

bw_half_dir = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/largeset_multisize_v2/gray64"

bw_quarter_dir = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/largeset_multisize_v2/gray32"

bw_eighth_dir = "C:/Users/ziven/OneDrive/School/UBC/Fourth Year/CPSC 440/Final Project/Offline/largeset_multisize_v2/gray16"

The following code block extracts images from a sub-directory structure (as they are in the aerial images dataset) and places them uniformly in one directory with unique numbers 1 thru whatever max is necessary. 

In [7]:
# Get a list of all image files in nested folders
files = glob.glob(raw_images_source + '/**/*.jpg', recursive=True)

# Randomly shuffle the images, so we can partition them into training and testing based on number alone
random.shuffle(files)

counter = 1

# Copy each image to the destination folder
for file in files:
    filename = os.path.basename(file)
    new_filename = f"{counter}.jpg"
    destination_path = os.path.join(color_raw_dir, new_filename)

    # Ensure unique filenames
    while os.path.exists(destination_path):
        counter += 1
        new_filename = f"{counter}.jpg"
        destination_path = os.path.join(color_raw_dir, new_filename)

    # Open the image to check its aspect ratio
    img = Image.open(file)
    width, height = img.size

    # Check if the aspect ratio is square
    if width == height and width >= 128:
        # Copy the image to the destination folder
        shutil.copy(file, destination_path)
        # Increment the counter
        counter += 1
    else:
        print(f"Discarded {filename} because it is not square or has a dimension that is too small.")

print("All files copied with unique names.")

Discarded 4014.jpg because it is not square or has a dimension that is too small.
Discarded 4462.jpg because it is not square or has a dimension that is too small.
Discarded 3255.jpg because it is not square or has a dimension that is too small.
Discarded 1093.jpg because it is not square or has a dimension that is too small.
Discarded 4467.jpg because it is not square or has a dimension that is too small.
Discarded 3105.jpg because it is not square or has a dimension that is too small.
Discarded 6241.jpg because it is not square or has a dimension that is too small.
Discarded 3664.jpg because it is not square or has a dimension that is too small.
Discarded 619.jpg because it is not square or has a dimension that is too small.
Discarded 6307.jpg because it is not square or has a dimension that is too small.
Discarded 1804.jpg because it is not square or has a dimension that is too small.
Discarded 3416.jpg because it is not square or has a dimension that is too small.
Discarded 3128.jp

This code rescales all the images in the color directory to 128x128 (technically this is losing a bit more resolution than we need to in order to get uniform sizes, hoping this shouldn't be too much of a quality hit, as it would be good to get everything to a round power of 2 for our compressed images later).

In [8]:
target_size = (128, 128)

# Iterate through all files in the source directory
for filename in os.listdir(color_raw_dir):
    if filename.lower().endswith(".jpg"):
        try:
            # Open the image
            img_path = os.path.join(color_raw_dir, filename)
            img = Image.open(img_path)

            # Resize the image while preserving aspect ratio
            img.thumbnail(target_size, Image.ANTIALIAS)

            # Save the resized image to the destination folder
            new_filename = os.path.splitext(filename)[0] + ".jpg"
            img.save(os.path.join(color_dir, new_filename), "JPEG")

            # print(f"Resized {filename} to {target_size[0]}x{target_size[1]}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print("All images resized")

  img.thumbnail(target_size, Image.ANTIALIAS)


All images resized


Now, this code creates identically-named grayscale versions of the images in the "gray" directory of the dataset.

In [9]:
# Iterate through all files in the source directory
for filename in os.listdir(color_dir):
    if filename.lower().endswith(".jpg"):
        try:
            # Open the color image
            img_path = os.path.join(color_dir, filename)
            img = Image.open(img_path)

            # Convert to grayscale
            gray_img = img.convert("L")

            # Save the grayscale image to the destination folder
            new_filename = os.path.splitext(filename)[0] + ".jpg"
            gray_img.save(os.path.join(bw_dir, new_filename), "JPEG")

            # print(f"Converted {filename} to grayscale")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print("All color images have been converted to grayscale.")

All color images have been converted to grayscale.


The following code blocks create the successively smaller versions of the grayscale images.

In [10]:
target_size = (64, 64)

# Iterate through all files in the source directory
for filename in os.listdir(bw_dir):
    if filename.lower().endswith(".jpg"):
        try:
            # Open the image
            img_path = os.path.join(bw_dir, filename)
            img = Image.open(img_path)

            # Resize the image while preserving aspect ratio
            img.thumbnail(target_size, Image.ANTIALIAS)

            # Save the resized image to the destination folder
            new_filename = os.path.splitext(filename)[0] + ".jpg"
            img.save(os.path.join(bw_half_dir, new_filename), "JPEG")

            # print(f"Resized {filename} to {target_size[0]}x{target_size[1]}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print("All images resized")

  img.thumbnail(target_size, Image.ANTIALIAS)


All images resized


In [11]:
target_size = (32, 32)

# Iterate through all files in the source directory
for filename in os.listdir(bw_dir):
    if filename.lower().endswith(".jpg"):
        try:
            # Open the image
            img_path = os.path.join(bw_dir, filename)
            img = Image.open(img_path)

            # Resize the image while preserving aspect ratio
            img.thumbnail(target_size, Image.ANTIALIAS)

            # Save the resized image to the destination folder
            new_filename = os.path.splitext(filename)[0] + ".jpg"
            img.save(os.path.join(bw_quarter_dir, new_filename), "JPEG")

            # print(f"Resized {filename} to {target_size[0]}x{target_size[1]}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print("All images resized")

  img.thumbnail(target_size, Image.ANTIALIAS)


All images resized


In [12]:
target_size = (16, 16)

# Iterate through all files in the source directory
for filename in os.listdir(bw_dir):
    if filename.lower().endswith(".jpg"):
        try:
            # Open the image
            img_path = os.path.join(bw_dir, filename)
            img = Image.open(img_path)

            # Resize the image while preserving aspect ratio
            img.thumbnail(target_size, Image.ANTIALIAS)

            # Save the resized image to the destination folder
            new_filename = os.path.splitext(filename)[0] + ".jpg"
            img.save(os.path.join(bw_eighth_dir, new_filename), "JPEG")

            # print(f"Resized {filename} to {target_size[0]}x{target_size[1]}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print("All images resized")

  img.thumbnail(target_size, Image.ANTIALIAS)


All images resized
