In [9]:
import numpy as np
import pandas as pd

from pathlib import Path

import os
import shutil

import matplotlib.pyplot as plt
import torch
import torchvision

from torch import nn
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

In [10]:
def _count_jpgs(directory):
    """Return the number of entries in `directory` whose name ends in ".jpg" (case-insensitive)."""
    jpg_names = [name for name in os.listdir(directory) if name.lower().endswith(".jpg")]
    return len(jpg_names)


# Root of the filtered dataset and the two image folders that are counted below.
image_path = Path("data/Dataset_filtered")
trainval_image_path = image_path / "TrainVal" / "color"
test_image_path = image_path / "Test" / "color"

# Count and report each folder in turn (the recorded run printed 3673 and 3694).
trainval_jpg_count = _count_jpgs(trainval_image_path)
print(f"Number of training and validation images: {trainval_jpg_count}")

test_jpg_count = _count_jpgs(test_image_path)
print(f"Number of test images: {test_jpg_count}")

Number of training and validation images: 3673
Number of test images: 3694


### Split Training and Validation Set 

In [11]:
import os
import shutil
from pathlib import Path
import numpy as np

def train_val_split(
    split_ratio=0.2,
    seed=None,
    dataset_dir="Dataset_filtered",
    annotations_dir="annotations",
):
    """
    Split the TrainVal images into separate training and validation folders.

    Reads '<annotations_dir>/trainval.txt', randomly assigns a fraction of the
    listed images to the validation set, writes '<annotations_dir>/train.txt'
    and '<annotations_dir>/val.txt', and MOVES each image (and its label, when
    one exists) from '<dataset_dir>/TrainVal/{color,label}' into
    '<dataset_dir>/Train/...' or '<dataset_dir>/Val/...'.

    Nothing is written or moved unless every listed image is present, so a
    failed precondition does not leave a half-split dataset behind.

    Parameters:
        split_ratio (float): Fraction of images used for validation, in [0, 1]
            (default: 0.2). The validation count is int(total * split_ratio).
        seed (int | None): Seed for a private random generator, making the split
            reproducible. With None (default) NumPy's global random state is
            used, exactly as before.
        dataset_dir (str | Path): Directory containing the 'TrainVal' folder.
        annotations_dir (str | Path): Directory containing 'trainval.txt'.

    Raises:
        ValueError: If split_ratio is outside [0, 1].
        FileNotFoundError: If trainval.txt is missing, or if any listed image
            is not found in '<dataset_dir>/TrainVal/color'.

    NOTE(review): other cells in this notebook read from "data/Dataset_filtered"
    and "data/annotations"; the defaults here are kept unchanged for backward
    compatibility - confirm which location is intended and pass it explicitly.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    # Define paths
    dataset_dir = Path(dataset_dir)
    annotations_dir = Path(annotations_dir)

    base_dir = dataset_dir / "TrainVal"
    color_dir = base_dir / "color"
    label_dir = base_dir / "label"

    train_dir = dataset_dir / "Train"
    val_dir = dataset_dir / "Val"

    trainval_txt = annotations_dir / "trainval.txt"
    train_txt = annotations_dir / "train.txt"
    val_txt = annotations_dir / "val.txt"

    # Check the input list before creating any directory or touching any file.
    if not trainval_txt.exists():
        raise FileNotFoundError(f"trainval.txt not found in {annotations_dir}")

    with open(trainval_txt, "r") as f:
        raw_lines = f.readlines()

    # Collect (image_name, original_line) pairs. The image name is the first
    # whitespace-separated field. Blank lines and '#' comment lines carry no
    # image, and a repeated name could only be moved once, so both are skipped.
    entries = []
    seen_names = set()
    for line in raw_lines:
        fields = line.split()
        if not fields or fields[0].startswith("#"):
            continue
        name = fields[0]
        if name in seen_names:
            continue
        seen_names.add(name)
        entries.append((name, line))

    # Fail early, before any side effect, if an image cannot be moved.
    missing = [name for name, _ in entries if not (color_dir / f"{name}.jpg").exists()]
    if missing:
        preview = ", ".join(missing[:5])
        raise FileNotFoundError(
            f"{len(missing)} image(s) listed in {trainval_txt} not found in {color_dir} "
            f"(e.g. {preview})"
        )

    total_images = len(entries)
    val_count = int(total_images * split_ratio)

    # Randomly select validation images (by index, without replacement).
    if val_count > 0:
        rng = np.random if seed is None else np.random.RandomState(seed)
        chosen = rng.choice(total_images, size=val_count, replace=False)
        val_images = {entries[i][0] for i in chosen}
    else:
        val_images = set()

    # Create subdirectories if they don't exist
    for subfolder in ["color", "label"]:
        (train_dir / subfolder).mkdir(parents=True, exist_ok=True)
        (val_dir / subfolder).mkdir(parents=True, exist_ok=True)

    # Write new train.txt and val.txt, preserving the original line content
    with open(train_txt, "w") as f_train, open(val_txt, "w") as f_val:
        for name, line in entries:
            (f_val if name in val_images else f_train).write(line)

    # Move images and labels to train/val folders
    for name, _ in entries:
        target_dir = val_dir if name in val_images else train_dir
        img_path = color_dir / f"{name}.jpg"
        label_path = label_dir / f"{name}.png"  # Assuming labels are .png format

        shutil.move(str(img_path), str(target_dir / "color" / img_path.name))
        if label_path.exists():  # labels are optional; images without one are still moved
            shutil.move(str(label_path), str(target_dir / "label" / label_path.name))

    print(f"Dataset split completed: {total_images - val_count} training images, {val_count} validation images.")

### Resizing Images to Make Them Consistent

In [12]:
# Default number of DataLoader worker processes: one per CPU reported by the OS.
# os.cpu_count() returns None when the count cannot be determined; fall back to 0
# so the value is always an integer.
NUM_WORKERS = os.cpu_count() or 0
print(NUM_WORKERS)

def create_dataloaders(
    train_dir: str,
    val_dir: str, 
    test_dir: str, 
    transform: transforms.Compose, 
    batch_size: int, 
    num_workers: int=NUM_WORKERS
):
  """Creates training, validation and testing DataLoaders.

  Takes in training, validation and testing directory paths, turns each into
  a torchvision ImageFolder dataset (all with the same transform) and wraps
  each dataset in a PyTorch DataLoader. Only the training loader shuffles.

  Args:
    train_dir: Path to training directory.
    val_dir: Path to validation directory.
    test_dir: Path to testing directory.
    transform: torchvision transforms to perform on all three datasets.
    batch_size: Number of samples per batch in each of the DataLoaders.
    num_workers: An integer for number of workers per DataLoader.
      None is treated as 0.

  Returns:
    A tuple of (train_dataloader, val_dataloader, test_dataloader, class_names).
    Where class_names is the list of target classes of the training dataset.
    Example usage:
      train_dataloader, val_dataloader, test_dataloader, class_names = \
        create_dataloaders(train_dir=path/to/train_dir,
                           val_dir=path/to/val_dir,
                           test_dir=path/to/test_dir,
                           transform=some_transform,
                           batch_size=32,
                           num_workers=4)
  """
  import warnings  # local import: only needed for the class-mapping check below

  # A None worker count (e.g. an undetermined CPU count) is not a valid
  # DataLoader argument; use 0 workers instead.
  if num_workers is None:
    num_workers = 0

  # Use ImageFolder to create dataset(s)
  train_data = datasets.ImageFolder(train_dir, transform=transform)
  val_data = datasets.ImageFolder(val_dir, transform=transform)
  test_data = datasets.ImageFolder(test_dir, transform=transform)

  # Get class names (taken from the training set only)
  class_names = train_data.classes

  # The returned class_names describe the training set. Warn, without failing,
  # when another split maps classes to different indices, since its labels
  # would then not line up with class_names.
  for split_name, split_data in (("val", val_data), ("test", test_data)):
    if split_data.class_to_idx != train_data.class_to_idx:
      warnings.warn(
          f"{split_name} class-to-index mapping differs from the training set; "
          "labels may not be comparable across splits.",
          stacklevel=2,
      )

  def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader using the settings shared by all splits."""
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=True,
    )

  # Turn images into data loaders
  train_dataloader = _make_loader(train_data, shuffle=True)
  val_dataloader = _make_loader(val_data, shuffle=False)  # don't need to shuffle validation data
  test_dataloader = _make_loader(test_data, shuffle=False)  # don't need to shuffle test data

  return train_dataloader, val_dataloader, test_dataloader, class_names

8


In [13]:
# Parameters of the preprocessing pipeline, named so they can be tuned in one place.
_TARGET_SIZE = (224, 224)          # output (height, width); some models may require different sizes
_NORM_MEAN = [0.485, 0.456, 0.406]  # per-colour-channel mean (presumably the ImageNet statistics - confirm)
_NORM_STD = [0.229, 0.224, 0.225]   # per-colour-channel standard deviation

# Steps applied to every image, in order:
#   1. resize to _TARGET_SIZE
#   2. convert to a tensor with values between 0 & 1
#   3. normalise each colour channel with the mean/std above
resize_transform = transforms.Compose(
    [
        transforms.Resize(_TARGET_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

In [14]:
# trainval_output_path = Path("data/Dataset_filtered/TrainVal/resized")
# test_output_path = Path("data/Dataset_filtered/Test/resized")


# def resize_image(original_path, output_path, resize_height=256, resize_width=256):

#     # Create output directory if not exists
#     output_path.mkdir(parents=True, exist_ok=True)

#     # Define the transformation
#     resize_transform = transforms.Compose([
#         transforms.Resize((resize_height, resize_width)), # 1. Resize all images to (resize_height, resize_width), 256x256 by default (some models may require different sizes)
#         transforms.ToTensor(), # 2. Turn image values to between 0 & 1 
#         transforms.Normalize(mean=[0.485, 0.456, 0.406], # 3. A mean of [0.485, 0.456, 0.406] (across each colour channel)
#                             std=[0.229, 0.224, 0.225]) # 4. A standard deviation of [0.229, 0.224, 0.225] (across each colour channel),
#     ])

#     # Loop through all .jpg images and resize them
#     for img_path in original_path.glob("*.jpg"):  # Iterate over .jpg images
#         with Image.open(img_path) as img:
#             resized_img = resize_transform(img)  # Apply resizing
            
#             # Convert back to PIL image for saving
#             resized_pil = transforms.ToPILImage()(resized_img)

#             # Save resized image
#             resized_pil.save(output_path / img_path.name)


In [15]:
def read_data(filename):
    """Load a space-separated, header-less annotation file into a DataFrame.

    Args:
        filename: Path (or file-like object) handed straight to `pd.read_csv`.

    Returns:
        A DataFrame whose columns are named, in order,
        "Image", "CLASS-ID", "SPECIES" and "BREED ID".
    """
    return pd.read_csv(
        filename,
        sep=" ",
        header=None,  # the file has no header row; column names are supplied here
        names=["Image", "CLASS-ID", "SPECIES", "BREED ID"],
    )

# Load both annotation tables (trainval first, then test) with the same reader.
trainval_data, test_data = map(
    read_data,
    ("data/annotations/trainval.txt", "data/annotations/test.txt"),
)

# Cell output: the image names listed in the trainval annotations.
trainval_data["Image"].values


array(['Abyssinian_100', 'Abyssinian_101', 'Abyssinian_102', ...,
       'yorkshire_terrier_189', 'yorkshire_terrier_18',
       'yorkshire_terrier_190'], dtype=object)

In [16]:
# Per-species output folders, created inside the flat image directory.
cat_dir = Path('data/images/cat')
dog_dir = Path('data/images/dog')

for species_dir in (cat_dir, dog_dir):
    species_dir.mkdir(parents=True, exist_ok=True)

# Every image name that appears in either annotation table. Building the set once
# replaces two linear scans of the name arrays for each file in the directory.
annotated_names = set(trainval_data['Image']) | set(test_data['Image'])

# Stems of the .jpg files directly under data/images that have an annotation entry.
# NOTE(review): the recorded output of this cell is an empty list, i.e. no .jpg
# directly under data/images matched an annotated name - confirm the path is correct.
valid_images = [
    img.stem
    for img in Path('data/images').iterdir()
    if img.suffix.lower() == '.jpg' and img.stem in annotated_names
]
sorted(valid_images)

# cat_dir_trainval = cat_dir / 'trainval'
# cat_dir_trainval = cat_dir / 'trainval'
# dog_dir_trainval = dog_dir / 'trainval'
# dog_dir_trainval = dog_dir / 'trainval'
# cat_dir_test = cat_dir / 'test'
# cat_dir_test = cat_dir / 'test'
# dog_dir_test = dog_dir / 'test'
# dog_dir_test = dog_dir / 'test'



[]