This notebook is the dataset preprocessing part of the project

In [7]:
import os

In [16]:
def rename_files_in_folder(folder_path):
    """
    Renames files in a folder with the folder name plus their position.

    Args:
        folder_path (str): The path to the folder.
    """
    files = os.listdir(folder_path)
    for i, file in enumerate(files):
        if os.path.isfile(os.path.join(folder_path, file)):
            new_name = f"{os.path.basename(folder_path)}_{i+1}{os.path.splitext(file)[1]}"
            try:
                os.rename(os.path.join(folder_path, file), os.path.join(folder_path, new_name))
            except FileExistsError:
                print(f"Error: File '{new_name}' already exists in the folder. Skipping.")

# Example usage:
folder_to_rename = 'Dataset_getty/1989'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Acoustic'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Fearless'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Folkmore'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Lover'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Midnights'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Reputation'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Speak_Now'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/Red'
rename_files_in_folder(folder_to_rename)
folder_to_rename = 'Dataset_getty/TTPD'
rename_files_in_folder(folder_to_rename)


Error: File '1989_2.jpeg' already exists in the folder. Skipping.
Error: File '1989_3.jpg' already exists in the folder. Skipping.
Error: File '1989_4.jpg' already exists in the folder. Skipping.
Error: File '1989_10.jpeg' already exists in the folder. Skipping.
Error: File '1989_14.jpg' already exists in the folder. Skipping.
Error: File '1989_15.jpeg' already exists in the folder. Skipping.
Error: File '1989_21.jpg' already exists in the folder. Skipping.
Error: File '1989_25.jpeg' already exists in the folder. Skipping.
Error: File '1989_27.jpeg' already exists in the folder. Skipping.
Error: File '1989_28.jpg' already exists in the folder. Skipping.
Error: File '1989_40.jpeg' already exists in the folder. Skipping.
Error: File '1989_41.jpg' already exists in the folder. Skipping.
Error: File '1989_46.jpg' already exists in the folder. Skipping.
Error: File '1989_47.jpeg' already exists in the folder. Skipping.
Error: File '1989_48.jpg' already exists in the folder. Skipping.
Error:

In [24]:
import os
import shutil
import random

def split_dataset(folder_path, output_path, train_split=0.8, test_split=0.1, 
                   val_split=0.1, shuffle=True):
    """
    Splits the images in the given folder into train, test, and validation sets,
    and moves them to the specified output path, ensuring no duplicate moves.

    Args:
        folder_path (str): Path to the folder containing the images.
        output_path (str): Path to the directory where the split datasets 
                          will be saved.
        train_split (float, optional): Ratio of images for training (0.0 to 1.0). 
                                     Defaults to 0.8.
        test_split (float, optional): Ratio of images for testing (0.0 to 1.0). 
                                    Defaults to 0.1.
        val_split (float, optional): Ratio of images for validation (0.0 to 1.0). 
                                    Defaults to 0.1.
        shuffle (bool, optional): Whether to shuffle the images. Defaults to True.
        shuffle_size (int, optional): Size of the shuffle buffer. Defaults to 1000.
    """

    if train_split + test_split + val_split != 1.0:
        raise ValueError("Train, test, and validation splits must add up to 1.0")

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Create subfolders for train, test, and validation in the output directory
    for subfolder in ["train", "test", "val"]:
        os.makedirs(os.path.join(output_path, subfolder), exist_ok=True)

    # Get list of image files in the folder
    image_files = [
        f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))
    ]

    # Shuffle the image files
    if shuffle:
        random.shuffle(image_files)

    # Calculate the number of images for each set
    num_images = len(image_files)
    num_train = int(num_images * train_split)
    num_test = int(num_images * test_split)

    # Split the image files into sets
    train_files = image_files[:num_train]
    test_files = image_files[num_train : num_train + num_test]
    val_files = image_files[num_train + num_test :]

    # Move images to corresponding subfolders, preventing duplicates
    for file_list, subfolder in zip(
        [train_files, test_files, val_files], ["train", "test", "val"]
    ):
        for image_file in file_list:
            src_path = os.path.join(folder_path, image_file)
            dest_path = os.path.join(output_path, subfolder, image_file)
            if not os.path.exists(dest_path):  # Check if file already exists
                shutil.move(src_path, dest_path) 
 

In [25]:
def organize_images(directory):
  """Organizes images in a directory into subfolders based on the first word of their filenames.

  Args:
    directory: The path to the directory containing the images.
  """

  for filename in os.listdir(directory):
    if filename.endswith(('.jpg', '.jpeg', '.png', '.gif')):  # Adjust file extensions if needed
      file_path = os.path.join(directory, filename)
      folder_name = filename.split('_')[0]  # Extract the first word before "_"
      folder_path = os.path.join(directory, folder_name)

      # Create the folder if it doesn't exist
      if not os.path.exists(folder_path):
        os.makedirs(folder_path)

      # Move the image to the corresponding folder
      shutil.move(file_path, os.path.join(folder_path, filename))

In [26]:
output_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\"

# 1989
_1989_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\1989"
split_dataset(_1989_path, output_path) 

# Acoustic
acoustic_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Acoustic"
split_dataset(acoustic_path, output_path) 

# Fearless
fearless_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Fearless"
split_dataset(fearless_path, output_path)

# Folkmore
folkmore_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Folkmore"
split_dataset(folkmore_path, output_path)

# Lover
lover_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Lover"
split_dataset(lover_path, output_path)

# Midnights
midnights_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Midnights"
split_dataset(midnights_path, output_path)

# Red
red_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Red"
split_dataset(red_path, output_path)

# Reputation
reputation_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Reputation"
split_dataset(reputation_path, output_path)

# Speak Now
speak_now_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\Speak_Now"
split_dataset(speak_now_path, output_path)

# TTPD
ttpd_path = "C:\\Users\\Ronan\\Documents\\ML\\Taylor_Swift_Projects\\CNN\\Dataset_getty\\TTPD"
split_dataset(ttpd_path, output_path)

In [27]:
# Train
train_directory = "C:/Users/Ronan/Documents/ML/Taylor_Swift_Projects/CNN/train"
organize_images(train_directory)

# Val
val_directory = "C:/Users/Ronan/Documents/ML/Taylor_Swift_Projects/CNN/val"
organize_images(val_directory)

# Test
test_directory = "C:/Users/Ronan/Documents/ML/Taylor_Swift_Projects/CNN/test"
organize_images(test_directory)