In [3]:
import numpy as np
import pandas as pd

from pathlib import Path

import os
import shutil

import matplotlib.pyplot as plt
import torch
import torchvision

from torch import nn
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

In [4]:
# Root of the filtered dataset and the two image folders counted below.
# NOTE(review): other cells in this notebook spell the root "Data/..." (capital D);
# on a case-sensitive filesystem that is a different directory — confirm which is intended.
image_path = Path("data/Dataset_filtered")

trainval_image_path = image_path / "TrainVal/color"
test_image_path = image_path / "Test/color"

# Count the directory entries whose name ends in ".jpg" (case-insensitive).
trainval_jpg_count = len(
    [name for name in os.listdir(trainval_image_path) if name.lower().endswith(".jpg")]
)
print(f"Number of training and validation images: {trainval_jpg_count}") #3673

test_jpg_count = len(
    [name for name in os.listdir(test_image_path) if name.lower().endswith(".jpg")]
)
print(f"Number of test images: {test_jpg_count}") #3694

Number of training and validation images: 3673
Number of test images: 3694


### Split Training and Validation Set 

In [5]:
from pathlib import Path

def train_val_split(split_ratio=0.2, annotations_dir="Data/annotations"):
    """
    Split ``trainval.txt`` into ``train.txt`` and ``val.txt``, stratified per species.

    Every line of ``trainval.txt`` is expected to look like::

        image_name species [other_info...]

    Lines are grouped by their second whitespace-separated token (the "species"
    key). Within each group, ``int(n * split_ratio)`` lines are drawn at random
    for validation; when that rounds down to zero, one line is still sent to
    validation, so a group with a single sample ends up entirely in ``val.txt``.
    Lines with fewer than two tokens are dropped from both outputs.

    The draw uses NumPy's global random state (``np.random.choice``); call
    ``np.random.seed(...)`` beforehand for a repeatable split.

    Parameters:
        split_ratio (float): Fraction of each group assigned to validation
            (default: 0.2).
        annotations_dir (str | Path): Directory that contains ``trainval.txt``
            and receives ``train.txt`` / ``val.txt`` (default: "Data/annotations").
            Existing ``train.txt`` / ``val.txt`` files are overwritten.

    Raises:
        FileNotFoundError: If ``trainval.txt`` is missing from ``annotations_dir``.
    """
    annotations_dir = Path(annotations_dir)
    trainval_txt = annotations_dir / "trainval.txt"
    train_txt = annotations_dir / "train.txt"
    val_txt = annotations_dir / "val.txt"

    if not trainval_txt.exists():
        raise FileNotFoundError(f"trainval.txt not found in {annotations_dir}")

    with open(trainval_txt, "r") as f:
        lines = f.readlines()

    # Group lines by species (second token), keeping the file's original order.
    species_dict = {}
    for line in lines:
        tokens = line.split()
        if len(tokens) < 2:
            continue  # Skip blank or malformed lines
        # The last line of the file may lack a trailing newline. It can end up in
        # the middle of an output file, so terminate it to avoid fusing two records.
        if not line.endswith("\n"):
            line += "\n"
        species_dict.setdefault(tokens[1], []).append(line)

    train_lines = []
    val_lines = []
    for species_lines in species_dict.values():
        n_samples = len(species_lines)
        n_val = int(n_samples * split_ratio)
        # Groups are never empty, so always send at least one sample to validation.
        if n_val == 0:
            n_val = 1
        # Same np.random.choice call as before, so a given seed yields the same
        # split; the result is turned into a set for O(1) membership tests.
        chosen = np.random.choice(np.arange(n_samples), size=n_val, replace=False)
        val_indices = set(chosen.tolist())
        for idx, line in enumerate(species_lines):
            if idx in val_indices:
                val_lines.append(line)
            else:
                train_lines.append(line)

    # Write the new train.txt and val.txt files
    with open(train_txt, "w") as f_train:
        f_train.writelines(train_lines)
    with open(val_txt, "w") as f_val:
        f_val.writelines(val_lines)





# Seed NumPy's global RNG before splitting: train_val_split() draws its validation
# indices with np.random, so the seed must be set first for a repeatable split.
np.random.seed(42)
train_val_split()

In [None]:
def _copy_split_files(annotation_file, source_color, source_label, dest_color, dest_label):
    """Copy the image/mask pair of every sample listed in ``annotation_file``."""
    if not annotation_file.exists():
        raise FileNotFoundError(f"{annotation_file} not found.")
    with open(annotation_file, "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue  # Skip empty lines
            image_name = tokens[0]
            # Image and mask are handled independently: a missing file only
            # produces a warning and never blocks the other copy.
            for src, dest_dir in (
                (source_color / f"{image_name}.jpg", dest_color),
                (source_label / f"{image_name}.png", dest_label),
            ):
                if src.exists():
                    shutil.copy(src, dest_dir / src.name)
                else:
                    print(f"Warning: {src} not found.")


def copy_images_to_train_val(
    base_dir="Data/Dataset_filtered",
    annotations_dir="Data/annotations",
    clear_existing=False,
):
    """
    Copy images and masks into separate Train/ and Val/ folders.

    For each line of ``train.txt`` and ``val.txt`` the first token (image name)
    is used to locate:
        - the image file: <base_dir>/TrainVal/color/<image_name>.jpg
        - the mask file:  <base_dir>/TrainVal/label/<image_name>.png

    The files are copied (the sources are left in place) to:
        - Training set:   <base_dir>/Train/color and <base_dir>/Train/label
        - Validation set: <base_dir>/Val/color and <base_dir>/Val/label

    Destination folders are created when missing. A source file that does not
    exist is reported with a printed warning and skipped.

    Parameters:
        base_dir (str | Path): Dataset root containing ``TrainVal/``
            (default: "Data/Dataset_filtered").
        annotations_dir (str | Path): Directory holding ``train.txt`` and
            ``val.txt`` (default: "Data/annotations").
        clear_existing (bool): When True, delete the four destination folders
            before copying. Without this, files left over from an earlier,
            different split stay in place, so the same sample can end up in
            both Train/ and Val/. Defaults to False (nothing is deleted).

    Raises:
        FileNotFoundError: If ``train.txt`` or ``val.txt`` does not exist. The
            training files are processed first, so they may already have been
            copied when a missing ``val.txt`` is reported.
    """
    # Source directories
    base_dir = Path(base_dir)
    source_color = base_dir / "TrainVal" / "color"
    source_label = base_dir / "TrainVal" / "label"

    # Destination directories for training and validation
    train_color = base_dir / "Train" / "color"
    train_label = base_dir / "Train" / "label"
    val_color = base_dir / "Val" / "color"
    val_label = base_dir / "Val" / "label"

    for folder in (train_color, train_label, val_color, val_label):
        if clear_existing and folder.exists():
            shutil.rmtree(folder)  # opt-in: drop leftovers from a previous split
        folder.mkdir(parents=True, exist_ok=True)

    annotations_dir = Path(annotations_dir)
    _copy_split_files(
        annotations_dir / "train.txt", source_color, source_label, train_color, train_label
    )
    _copy_split_files(
        annotations_dir / "val.txt", source_color, source_label, val_color, val_label
    )

# Populate the Train/ and Val/ folders from the samples listed in train.txt and val.txt.
copy_images_to_train_val()

### Resizing Images to Make Them Consistent

In [12]:
# Default number of DataLoader worker processes: one per CPU reported by the OS.
# os.cpu_count() returns None when the count cannot be determined; fall back to 0
# (load data in the main process) so the value is always an int.
NUM_WORKERS = os.cpu_count() or 0
print(NUM_WORKERS)

def create_dataloaders(
    train_dir: str,
    val_dir: str, 
    test_dir: str, 
    transform: transforms.Compose, 
    batch_size: int, 
    num_workers: int=NUM_WORKERS
):
  """Creates training, validation and testing DataLoaders.

  Wraps each of the three directories in a torchvision ImageFolder dataset
  (all sharing the same transform) and then in a PyTorch DataLoader.

  NOTE(review): ImageFolder derives its classes from the sub-directories of the
  root it is given — presumably the directories passed here are organised one
  folder per class; confirm against the caller.

  Args:
    train_dir: Path to training directory.
    val_dir: Path to validation directory.
    test_dir: Path to testing directory.
    transform: torchvision transforms to perform on all three datasets.
    batch_size: Number of samples per batch in each of the DataLoaders.
    num_workers: An integer for number of workers per DataLoader.

  Returns:
    A tuple of (train_dataloader, val_dataloader, test_dataloader, class_names),
    where class_names is the `classes` list of the training dataset.
    Only the training DataLoader shuffles its data.

    Example usage:
      train_dataloader, val_dataloader, test_dataloader, class_names = create_dataloaders(
          train_dir=path/to/train_dir,
          val_dir=path/to/val_dir,
          test_dir=path/to/test_dir,
          transform=some_transform,
          batch_size=32,
          num_workers=4)
  """
  # Build the three datasets first (train, val, test — in that order)
  train_data, val_data, test_data = (
      datasets.ImageFolder(root, transform=transform)
      for root in (train_dir, val_dir, test_dir)
  )

  # Settings shared by every loader
  loader_kwargs = dict(
      batch_size=batch_size,
      num_workers=num_workers,
      pin_memory=True,
  )

  # Only the training data is shuffled; validation and test keep dataset order
  train_dataloader = DataLoader(train_data, shuffle=True, **loader_kwargs)
  val_dataloader = DataLoader(val_data, shuffle=False, **loader_kwargs)
  test_dataloader = DataLoader(test_data, shuffle=False, **loader_kwargs)

  return train_dataloader, val_dataloader, test_dataloader, train_data.classes

8


In [13]:
# Preprocessing pipeline: resize, convert to tensor, then normalise each colour channel.
resize_transform = transforms.Compose(
    [
        # 1. Reshape all images to 224x224 (though some models may require different sizes)
        transforms.Resize((224, 224)),
        # 2. Turn image values to between 0 & 1
        transforms.ToTensor(),
        # 3. Normalise per colour channel with the given mean and standard deviation
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [14]:
# trainval_output_path = Path("data/Dataset_filtered/TrainVal/resized")
# test_output_path = Path("data/Dataset_filtered/Test/resized")


# def resize_image(original_path, output_path, resize_height=256, resize_width=256):

#     # Create output directory if not exists
#     output_path.mkdir(parents=True, exist_ok=True)

#     # Define the transformation
#     resize_transform = transforms.Compose([
#         transforms.Resize((resize_height, resize_width)), # 1. Resize all images to resize_height x resize_width (256x256 by default; some models may require different sizes)
#         transforms.ToTensor(), # 2. Turn image values to between 0 & 1 
#         transforms.Normalize(mean=[0.485, 0.456, 0.406], # 3. A mean of [0.485, 0.456, 0.406] (across each colour channel)
#                             std=[0.229, 0.224, 0.225]) # 4. A standard deviation of [0.229, 0.224, 0.225] (across each colour channel),
#     ])

#     # Loop through all .jpg images and resize them
#     for img_path in original_path.glob("*.jpg"):  # Iterate over .jpg images
#         with Image.open(img_path) as img:
#             resized_img = resize_transform(img)  # Apply resizing
            
#             # Convert back to PIL image for saving
#             resized_pil = transforms.ToPILImage()(resized_img)

#             # Save resized image
#             resized_pil.save(output_path / img_path.name)
