<a href="https://colab.research.google.com/github/RudyMartin/dsai-2024/blob/main/data_prep_v3_for_zipped_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Prep for Zipped Files

In [1]:
### Name Project folder
import os
from google.colab import drive
import datetime

# Record the start time for performance evaluation
start_time = datetime.datetime.now()


drive.flush_and_unmount()
!rm -rf /tmp/*

drive.mount("/content/gdrive", force_remount=True)
root_dir = "/content/gdrive/My Drive/dsai-2024/MVPS"
proj_dir = os.path.join(root_dir, 'Camp-Rock-Paper-Scissors')
os.chdir('/content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors')

rps_dir = os.path.join(proj_dir, 'rps_660')
train_dir = os.path.join(rps_dir, 'train')
test_dir = os.path.join(rps_dir, 'test')
model_dir = os.path.join(root_dir, 'model')

# Define and create directories for rps
for dir in ['train', 'test', 'model']:
    os.makedirs(os.path.join(rps_dir, dir), exist_ok=True)

# Define and create subdirectories for train and test
for sub_dir in ['rock', 'paper', 'scissors']:
    os.makedirs(os.path.join(train_dir, sub_dir), exist_ok=True)
    os.makedirs(os.path.join(test_dir, sub_dir), exist_ok=True)

## Look at the current directory
%ls

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/gdrive
[0m[01;34mcolab_setup[0m/  [01;34mmodel_init[0m/  README.MD    [01;34mrps_60[0m/     [01;34mrps_660[0m/
[01;34mdata_prep[0m/    [01;34mmodel_tune[0m/  rps_114.zip  rps_60.zip  rps_660.zip


In [2]:
# Fail fast if the dataset folder is missing; otherwise show what it contains
if os.path.exists(rps_dir):
    print(f"'rps' directory contents: {os.listdir(rps_dir)}")
else:
    raise FileNotFoundError(f"Directory {rps_dir} does not exist.")

'rps' directory contents: ['.DS_Store', 'train', 'test', 'model']


To uphold the principle of fairness and simplify the machine learning process, all images need to be converted into the same file format (.jpg) and the same resolution. Please run the cells below to convert the files to a common format.

Participants are also encouraged to manually edit the pictures, using the tips above, to improve the accuracy of their learning algorithm.

In [3]:
# Load training data
import glob

print(f"Train directory: {train_dir}")
print(f"Test directory: {test_dir}")

# Count the .jpg files per split and per class. Each label globs its own
# folder, so the scissors count is not a copy of the paper count.
for split_name, split_dir in (('train', train_dir), ('test', test_dir)):
    for label in ('scissors', 'rock', 'paper'):
        n_images = len(glob.glob(f'{split_dir}/{label}/*.jpg'))
        print(f"Number of {split_name} {label} images: {n_images}")

# Original photos that have no file extension are not matched by '*.jpg', so they
# are not counted here (a zero count is OK at this stage) - but check the folders visually.

Train directory: /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train
Test directory: /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/test
Number of train scissors images: 200
Number of train rock images: 200
Number of train paper images: 200
Number of test scissors images: 20
Number of test rock images: 20
Number of test paper images: 20


Resizes the images and saves them as .jpg; files that cannot be processed are renamed with a 'z_' prefix.

In [4]:
from os import listdir, rename
from os.path import isfile, join
from PIL import Image
from os.path import splitext

def process_images(proj_path):
    """Resize every image under ``proj_path`` to 640x480 and save it as a JPEG.

    For each of the ``rock``, ``paper`` and ``scissors`` sub-folders, every
    regular file is opened with PIL, resized, and written into the same folder
    as ``<name>.jpg``. The source file is not deleted; when it is already
    named ``<name>.jpg`` it is overwritten by the resized copy. Any file that
    raises while being processed is renamed with a ``z_`` prefix so it can be
    reviewed by hand.

    Args:
        proj_path: Directory that contains the three class sub-folders.

    Raises:
        OSError: If a class sub-folder cannot be listed, or if renaming a
            problematic file fails.
    """
    # Pixel modes Pillow's JPEG writer can store directly. Anything else
    # (RGBA, palette 'P', 'LA', ...) has to be converted to RGB first,
    # whatever the file's extension says.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    for label in ['rock', 'paper', 'scissors']:
        label_dir = join(proj_path, label)
        for i in [f for f in listdir(label_dir) if isfile(join(label_dir, f))]:
            file_path = join(label_dir, i)
            file_name, file_ext = splitext(i)

            try:
                # The context manager closes the source file handle as soon as
                # the resized in-memory copy exists, so the handle is not leaked
                # and the file can safely be overwritten or renamed afterwards.
                with Image.open(file_path) as source:
                    im = source.resize((640, 480))

                if im.mode not in jpeg_modes:
                    im = im.convert('RGB')

                new_file_path = join(label_dir, f"{file_name}.jpg")
                im.save(new_file_path, 'jpeg')

                if file_ext.lower() in ['.png', '.jpeg']:
                    print(f"Successfully Converted {i} to {file_name}.jpg")
                else:
                    print(f"Successfully Resized {i} and saved as {file_name}.jpg")

            except Exception as e:
                print(f"Error processing file {i} at {file_path}: {e}")
                # Rename the problematic file with 'z_' prefix
                new_file_path = join(label_dir, f"z_{i}")
                rename(file_path, new_file_path)
                print(f"Renamed problematic file {i} to z_{i}")

# Convert the test split first, then the training split
for split_dir in (test_dir, train_dir):
    process_images(split_dir)

Successfully Resized rock_1.jpg and saved as rock_1.jpg
Successfully Resized rock_2.jpg and saved as rock_2.jpg
Successfully Resized rock_10.jpg and saved as rock_10.jpg
Successfully Resized rock_5.jpg and saved as rock_5.jpg
Successfully Resized rock_4.jpg and saved as rock_4.jpg
Successfully Resized rock_12.jpg and saved as rock_12.jpg
Successfully Resized rock_3.jpg and saved as rock_3.jpg
Successfully Resized rock_11.jpg and saved as rock_11.jpg
Successfully Resized rock_18.jpg and saved as rock_18.jpg
Successfully Resized rock_6.jpg and saved as rock_6.jpg
Successfully Resized rock_8.jpg and saved as rock_8.jpg
Successfully Resized rock_13.jpg and saved as rock_13.jpg
Successfully Resized rock_7.jpg and saved as rock_7.jpg
Successfully Resized rock_14.jpg and saved as rock_14.jpg
Successfully Resized rock_15.jpg and saved as rock_15.jpg
Successfully Resized rock_9.jpg and saved as rock_9.jpg
Successfully Resized rock_16.jpg and saved as rock_16.jpg
Successfully Resized rock_20.jpg

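Run this step ONLY AFTER reviewing the folders and deleting unwanted files (including any files renamed with a 'z_' prefix). It then renames and renumbers the remaining files as .jpg. Wait 1-2 minutes between running the previous section and this one to be sure the 'z_' files are no longer in the folder.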

In [5]:
import os

def renumber_images(base_path):
    """Renumber the images of every class in both dataset splits.

    For each class (rock, paper, scissors) the ``train`` folder is handled
    first and then the ``test`` folder, each via ``renumber_folder``.

    Args:
        base_path: Directory that contains the ``train`` and ``test`` folders.
    """
    for category in ('rock', 'paper', 'scissors'):
        for split in ('train', 'test'):
            renumber_folder(os.path.join(base_path, split, category), category)

def renumber_folder(folder_path, class_name):
    """Rename the files in ``folder_path`` to ``<class_name>_<n>.jpg`` (n = 1, 2, ...).

    Files are numbered in natural order (``rock_2`` before ``rock_10``), so
    running this on an already-numbered folder keeps every file's number.
    Hidden files (names starting with ``.``, e.g. ``.DS_Store``) and files
    carrying the ``z_`` prefix are left untouched rather than being renamed
    into the dataset as ``.jpg`` images.

    Renaming happens in two passes through temporary names so that a final
    name never overwrites a file that has not been moved yet.

    Args:
        folder_path: Folder whose files are renumbered. If it does not exist,
            a message is printed and nothing else happens.
        class_name: Prefix used for the new file names.
    """
    import re  # local import: only the natural-sort key below needs it

    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        return

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so the
        # odd positions are always the digit runs; comparing them as ints makes
        # "rock_2" sort before "rock_10". The raw name breaks any remaining tie.
        parts = re.split(r'(\d+)', name)
        return [int(p) if pos % 2 else p for pos, p in enumerate(parts)], name

    entries = os.listdir(folder_path)
    files = []
    for entry in entries:
        if not os.path.isfile(os.path.join(folder_path, entry)):
            continue
        if entry.startswith(('.', 'z_')):
            print(f"Skipped {entry} (hidden or 'z_'-prefixed file)")
            continue
        files.append(entry)
    files.sort(key=natural_key)  # Sort to maintain a consistent order

    # Pick a temporary prefix whose names are not already present in the folder
    # (e.g. left over from an interrupted run); os.rename could otherwise
    # silently overwrite such a file.
    taken = set(entries)
    temp_prefix = "temp"
    while any(
        f"{temp_prefix}_{class_name}_{idx:05d}.jpg" in taken
        for idx in range(1, len(files) + 1)
    ):
        temp_prefix = "_" + temp_prefix

    # Rename files to a temporary name to avoid conflicts
    temp_files = []
    for idx, filename in enumerate(files, start=1):
        file_path = os.path.join(folder_path, filename)
        temp_filename = f"{temp_prefix}_{class_name}_{idx:05d}.jpg"
        temp_file_path = os.path.join(folder_path, temp_filename)
        os.rename(file_path, temp_file_path)
        temp_files.append(temp_filename)

    # Then, rename temporary files to the final name
    for idx, temp_filename in enumerate(temp_files, start=1):
        temp_file_path = os.path.join(folder_path, temp_filename)
        new_filename = f"{class_name}_{idx}.jpg"
        new_file_path = os.path.join(folder_path, new_filename)
        os.rename(temp_file_path, new_file_path)
        print(f"Renamed {temp_file_path} to {new_file_path}")

# Renumber every class folder in both the train and test splits under rps_dir
renumber_images(rps_dir)

Renamed /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/temp_rock_00001.jpg to /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/rock_1.jpg
Renamed /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/temp_rock_00002.jpg to /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/rock_2.jpg
Renamed /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/temp_rock_00003.jpg to /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/rock_3.jpg
Renamed /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/temp_rock_00004.jpg to /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/rock_4.jpg
Renamed /content/gdrive/My Drive/dsai-2024/MVPS/Camp-Rock-Paper-Scissors/rps_660/train/rock/temp_rock_00005.jpg to /content/gdrive/My Drive/dsai-2024/MVPS/Camp-

**Test manipulating images** - Preview of what should happen later in experiments

In [6]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Define a custom preprocessing function to ensure uniform image sizes and shapes
def make_square_and_resize(image):
    """Pad an image to make it square and resize it to (224, 224).

    The shorter side is padded evenly on both ends with a constant value of
    255 per channel until the image is square; the result is then resized to
    224x224 with OpenCV.

    Args:
        image: Array whose first two dimensions are (height, width).
            NOTE(review): presumably a 3-channel image array with values in
            the 0-255 range as handed over by ImageDataGenerator - confirm.

    Returns:
        The padded and resized image array.
    """
    # OpenCV is not imported anywhere else in this notebook; import it here so
    # the function does not raise NameError the first time a batch is drawn.
    import cv2

    target_size = (224, 224)
    height, width = image.shape[:2]
    # Only the shorter dimension gets padding; the other delta is clamped to 0.
    delta_w = max(height - width, 0)
    delta_h = max(width - height, 0)
    # Split the padding so any odd pixel goes to the bottom / right edge.
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)
    color = [255, 255, 255]  # white background for padding
    new_img = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    new_img = cv2.resize(new_img, target_size)
    return new_img

# Random augmentations applied on the fly; used for the training generator only
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Both generators rescale pixel values by 1/255 and share the custom preprocessing step
train_datagen = ImageDataGenerator(
    rescale=1./255,
    preprocessing_function=make_square_and_resize,
    **augmentation_options,
)
test_datagen = ImageDataGenerator(
    rescale=1./255,
    preprocessing_function=make_square_and_resize,
)

# Read images from the class sub-folders with identical batching settings for both splits
flow_options = dict(target_size=(224, 224), batch_size=32, class_mode='categorical')
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)
test_generator = test_datagen.flow_from_directory(test_dir, **flow_options)



Found 600 images belonging to 3 classes.
Found 60 images belonging to 3 classes.


In [7]:
# Report how many images each generator found on disk
num_train_images, num_test_images = train_generator.samples, test_generator.samples
print(f"Number of training images: {num_train_images}")
print(f"Number of testing images: {num_test_images}")

Number of training images: 600
Number of testing images: 60
