<a href="https://colab.research.google.com/github/TalhaAhmed2000/DeepLearning/blob/main/custom_data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Importing Relevant Libraries
import requests
from pathlib import Path
import os
import re
import random
import shutil
import zipfile

# Mounting Drive to access root directory
# NOTE(review): none of the cells visible in this notebook read from or write to
# /content/drive -- confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Downloading & Processing of Data for Task 1 - Image Classification and Task 2 - Person Segmentation




In [3]:
# Clone Github Repo to access data
!git clone https://github.com/TalhaAhmed2000/DeepLearning.git

Cloning into 'DeepLearning'...
remote: Enumerating objects: 15619, done.[K
remote: Total 15619 (delta 0), reused 0 (delta 0), pack-reused 15619[K
Receiving objects: 100% (15619/15619), 2.26 GiB | 19.96 MiB/s, done.
Resolving deltas: 100% (266/266), done.
Updating files: 100% (19431/19431), done.


In [5]:
# Changing `Pet Breeds` Dataset to have the following structure:

  # data/ <- overall dataset folder
  #   train/ <- training images
  #       siamese cat/ <- class name as folder name
  #           image01.jpg
  #           image02.jpg
  #           ...
  #       ragdoll cat/
  #           image24.jpg
  #           image25.jpg
  #           ...
  #       sphynx/
  #           image37.jpg
  #           ...
  #   test/ <- testing images
  #       siamese cat/ <- class name as folder name
  #           image101.jpg
  #           image102.jpg
  #           ...
  #       ragdoll cat/
  #           image124.jpg
  #           image125.jpg
  #           ...
  #       sphynx/
  #           image137.jpg
  #           ...

In [14]:
# Location of the pet_breeds data inside the cloned repository
root_path = Path('/content/DeepLearning/Task 1')
image_path = root_path.joinpath('Pet_Breeds')

# Walk the dataset and report how many folders / files each directory holds.
# Every directory visited after the root is recorded by its folder name (aka breed name) for later use.
breeds = []
for i, (dirpath, dirnames, filenames) in enumerate(os.walk(image_path)):
  print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
  if i == 0:
    # The first entry yielded by os.walk is the root itself, not a breed folder
    continue
  breeds.append(os.path.basename(dirpath))

There are 23 directories and 0 images in '/content/DeepLearning/Task 1/Pet_Breeds'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/rottwiler'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/beagle'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/yorkshire terrier'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/husky'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/chihuahua'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/maine coon'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/corgi'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/abyssinian'.
There are 0 directories and 170 images in '/content/DeepLearning/Task 1/Pet_Breeds/german shepherd'.
There are 0 directories and 170 images in '/content/Deep

In [17]:
# Display the collected breed names together with how many there are
(breeds, len(breeds))

(['rottwiler',
  'beagle',
  'yorkshire terrier',
  'husky',
  'chihuahua',
  'maine coon',
  'corgi',
  'abyssinian',
  'german shepherd',
  'shiba inu',
  'pomeranian',
  'ragdoll cat',
  'mumbai cat',
  'bulldog',
  'sphynx',
  'pug',
  'persian cat',
  'labrador',
  'golden retriever',
  'dachshund',
  'american shorthair',
  'siamese cat',
  'boxer'],
 23)

In [24]:
def shuffle_and_split_images(source_folder: Path,
                             destination_folder_1: Path,
                             destination_folder_2: Path,
                             split_ratio: float = 0.8):

  """
  Randomly splits the files of one class folder into a train and a test folder.

  Calling this once per class (breed) folder produces a structure like:
    train/ <- training images
        siamese cat/ <- class name as folder name
            image01.jpg
            image02.jpg
            ...
        ragdoll cat/
            image24.jpg
            ...
        sphynx/
            image37.jpg
            ...
    test/ <- testing images
        siamese cat/ <- class name as folder name
            image101.jpg
            ...
        ragdoll cat/
            image124.jpg
            ...
        sphynx/
            image137.jpg
            ...

  Files are copied (not moved), so the source folder is left untouched. Only regular
  files directly inside `source_folder` are considered; sub-directories are skipped.
  The shuffle uses the global `random` state, so call `random.seed(...)` beforehand
  to get a reproducible split.

  Args:
  source_folder (Path): The directory where the data is stored initially
  destination_folder_1 (Path): The train directory (created if missing)
  destination_folder_2 (Path): The test directory (created if missing)
  split_ratio (float): Fraction of the files, between 0 and 1, copied to the train directory;
    int(split_ratio * total_images) files go to train and the rest go to test. Defaults to 0.8

  Raises:
  ValueError: If split_ratio is not between 0 and 1.
  """
  if not 0.0 <= split_ratio <= 1.0:
    raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

  # os.listdir returns entries in arbitrary order, so sort first: the shuffle result
  # then depends only on the random state. Directories are skipped because
  # shutil.copy2 cannot copy them.
  image_files = sorted(
      name for name in os.listdir(source_folder)
      if os.path.isfile(os.path.join(source_folder, name))
  )

  # Randomly shuffle the image files
  random.shuffle(image_files)

  # Index separating the train portion from the test portion
  split_index = int(len(image_files) * split_ratio)

  # Create the destination folders if they don't exist
  os.makedirs(destination_folder_1, exist_ok = True)
  os.makedirs(destination_folder_2, exist_ok = True)

  # Copy the first split_index files to destination_folder_1 (train)
  for file_name in image_files[:split_index]:
    shutil.copy2(os.path.join(source_folder, file_name),
                 os.path.join(destination_folder_1, file_name))

  # Copy the remaining files to destination_folder_2 (test)
  for file_name in image_files[split_index:]:
    shutil.copy2(os.path.join(source_folder, file_name),
                 os.path.join(destination_folder_2, file_name))

In [19]:
# Root folder that receives every processed dataset
data_path = Path('data/')
data_path.mkdir(parents = True, exist_ok = True)

# Train / test destinations for the pet breeds dataset
breed_train_dir = data_path / 'pet_breeds' / 'train'
breed_test_dir = data_path / 'pet_breeds' / 'test'

# Create both destinations (no error if they already exist)
os.makedirs(breed_train_dir, exist_ok = True)
os.makedirs(breed_test_dir, exist_ok = True)

In [26]:
# Apply the split function to every breed folder so that each breed gets its own
# sub-folder under both the train and the test directory

for breed in breeds:
  shuffle_and_split_images(source_folder = image_path / breed,
                           destination_folder_1 = breed_train_dir / breed,
                           destination_folder_2 = breed_test_dir / breed,
                           split_ratio = 0.8)

In [27]:
# Sanity check: report the folder / file counts of everything under the output directory
for dirpath, dirnames, filenames in os.walk('/content/data/'):
  dir_count, image_count = len(dirnames), len(filenames)
  print(f"There are {dir_count} directories and {image_count} images in '{dirpath}'.")

There are 1 directories and 0 images in '/content/data/'.
There are 2 directories and 0 images in '/content/data/pet_breeds'.
There are 23 directories and 0 images in '/content/data/pet_breeds/train'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/rottwiler'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/beagle'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/yorkshire terrier'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/husky'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/chihuahua'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/maine coon'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/corgi'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/abyssinian'.
There are 0 directories and 136 images in '/content/data/pet_breeds/train/german shepherd'.
There are 0 director

136 images for train and 34 for test per breed (an 80/20 split of the 170 images in each breed folder)

In [28]:
# Standard procedure to push the results back to GitHub. Omitted here because it contains sensitive info

In [31]:
# Moving to do the same for the mask segmentation dataset. Very similar to how we did the pet breeds dataset except now we have additional two sub-folders in each train and test
# specifying the ground truth masks for train and test images respectively. Hence the function will be slightly altered but the main idea remains the same

# We want to convert this into a structure like the following:

### data --> images --> train
### data --> images --> test
### data --> mask --> train
### data --> mask --> test

# Note: data represents the root directory and it contains two subfolders, images and mask with each of them having two further subfolders train and test.

# Example: data/images/train/img.png has its corresponding ground truth mask in data/mask/train/img.png

In [29]:
# Location of the segmentation data: photos and their ground truth masks live in sibling folders
root_path = Path('/content/DeepLearning/Task 2/segmentation_mask_image')
image_path, mask_path = root_path.joinpath('images'), root_path.joinpath('masks')

# Report how many folders / files each directory of the dataset holds
for dirpath, dirnames, filenames in os.walk(root_path):
  dir_count, image_count = len(dirnames), len(filenames)
  print(f"There are {dir_count} directories and {image_count} images in '{dirpath}'.")

There are 3 directories and 0 images in '/content/DeepLearning/Task 2/segmentation_mask_image'.
There are 0 directories and 1192 images in '/content/DeepLearning/Task 2/segmentation_mask_image/masks'.
There are 0 directories and 1192 images in '/content/DeepLearning/Task 2/segmentation_mask_image/collages'.
There are 0 directories and 1192 images in '/content/DeepLearning/Task 2/segmentation_mask_image/images'.


In [32]:
# Root folder that receives every processed dataset
data_path = Path('data/')
data_path.mkdir(parents = True, exist_ok = True)

# Destination folders (as strings) for the images and for their masks
img_train_dir, img_test_dir = 'data/images/train', 'data/images/test'
mask_train_dir, mask_test_dir = 'data/mask/train', 'data/mask/test'

# The same destinations as Path objects
img_target_train, img_target_test = Path(img_train_dir), Path(img_test_dir)
mask_target_train, mask_target_test = Path(mask_train_dir), Path(mask_test_dir)

# Create all four destinations (no error if they already exist)
for target_dir in (img_target_train, mask_target_train, mask_target_test, img_target_test):
  target_dir.mkdir(parents = True, exist_ok = True)


In [33]:
random.seed(42)
def shuffle_and_split_images(image_path, mask_path, destination_folder_1, destination_folder_2, destination_folder_3, destination_folder_4, split_ratio, random_list):
    """
    Splits paired images and masks into train / test folders using one shared file order.

    The function itself does not shuffle: `random_list` is expected to already be in the
    desired (shuffled) order. The first int(len(random_list) * split_ratio) names go to the
    train folders and the rest go to the test folders. An image and its mask share the same
    file name, so they always land in the same split. Files are copied, not moved.

    Note: this definition reuses the name of the earlier per-breed split function, so once
    this cell has run the earlier four-argument version is no longer callable.

    Args:
    image_path: Folder holding the source images
    mask_path: Folder holding the source masks (same file names as the images)
    destination_folder_1: Train folder for images
    destination_folder_2: Test folder for images
    destination_folder_3: Train folder for masks
    destination_folder_4: Test folder for masks
    split_ratio (float): Fraction of the files, between 0 and 1, that goes to train
    random_list: File names to split, already in shuffled order

    Raises:
    ValueError: If split_ratio is not between 0 and 1.
    FileNotFoundError: If any listed name is missing from the image or the mask folder.
      This is checked before anything is copied, so a failure never leaves a partial copy.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    file_names = list(random_list)

    # Verify every image has its mask (and vice versa) up front, so a missing file
    # cannot leave images copied without their masks.
    unpaired = [name for name in file_names
                if not (os.path.isfile(os.path.join(image_path, name))
                        and os.path.isfile(os.path.join(mask_path, name)))]
    if unpaired:
        raise FileNotFoundError(
            f"{len(unpaired)} file(s) are missing from the image or mask folder, e.g. '{unpaired[0]}'")

    # Index separating the train portion from the test portion
    split_index = int(len(file_names) * split_ratio)
    train_names, test_names = file_names[:split_index], file_names[split_index:]

    # (source folder, destination folder, file names) for each of the four copies
    copy_plan = (
        (image_path, destination_folder_1, train_names),  # img_train
        (image_path, destination_folder_2, test_names),   # img_test
        (mask_path, destination_folder_3, train_names),   # mask_train
        (mask_path, destination_folder_4, test_names),    # mask_test
    )

    for source_dir, target_dir, names in copy_plan:
        # Create the destination folder if it doesn't exist
        os.makedirs(target_dir, exist_ok = True)
        for name in names:
            shutil.copy2(os.path.join(source_dir, name), os.path.join(target_dir, name))

In [34]:
# Proceed with the processing

split_ratio = 0.8

# os.listdir returns names in arbitrary order, so sort before shuffling: the
# shuffled order then depends only on the random state and not on the filesystem.
list_image_mask = sorted(os.listdir(image_path))
# Sampling the whole list without replacement yields a shuffled copy
list_random = random.sample(list_image_mask, len(list_image_mask))

source_folder = root_path  # not passed to the call below
destination_folder_1 = img_target_train
destination_folder_2 = img_target_test
destination_folder_3 = mask_target_train
destination_folder_4 = mask_target_test

# Split the shuffled file names and copy each image together with its mask
shuffle_and_split_images(image_path, mask_path, destination_folder_1, destination_folder_2, destination_folder_3, destination_folder_4, split_ratio, list_random)

In [38]:
# Sanity check for the segmentation split: report folder / file counts under the
# output directory, leaving out the pet_breeds tree
for dirpath, dirnames, filenames in os.walk('/content/data/'):
  if '/content/data/pet_breeds' in dirpath:
    continue
  print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")

There are 3 directories and 0 images in '/content/data/'.
There are 2 directories and 0 images in '/content/data/images'.
There are 0 directories and 953 images in '/content/data/images/train'.
There are 0 directories and 239 images in '/content/data/images/test'.
There are 2 directories and 0 images in '/content/data/mask'.
There are 0 directories and 953 images in '/content/data/mask/train'.
There are 0 directories and 239 images in '/content/data/mask/test'.


953 images and masks for train and 239 for test

In [39]:
# Standard procedure to push the results back to GitHub. Omitted here because it contains sensitive info