# **Data Collection**
---

## Objective

- Clean and prepare the provided image dataset

## Inputs

- A folder containing one correctly labelled subfolder for each pet to distinguish (the number of subfolders must match the number of pets)
- Several hundred preprocessed PNG images (450x450 px) in each subfolder, each showing a close-up of the pet's face

## Outputs

- A cleaned dataset in the folder `inputs/datasets/pets` split into `train`/`validation`/`test` subfolders

---

### Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from matplotlib.image import imread
from tensorflow.keras.preprocessing import image
from PIL import Image
import shutil
import random

# Select seaborn's "white" style for plots drawn later in the notebook
sns.set_style("white")


### Set the correct working directory

In [None]:
# Store the current working directory and display it.
# The next cell changes to its parent directory.
cwd = os.getcwd()
cwd

In [None]:
# Move up to the parent directory, which is expected to hold the `inputs` folder
# used for the dataset path below. The guard keeps this cell safe to re-run:
# once `inputs` is visible from the working directory, no further chdir happens.
# If `inputs` does not exist anywhere yet, every run still moves up one level.
if not os.path.isdir(os.path.join(cwd, 'inputs')):
    os.chdir(os.path.dirname(cwd))
cwd = os.getcwd()
cwd

### Set the path to the full dataset

In [None]:
# Normalised path of the dataset folder below the current working directory
# (normpath also converts the forward slashes to the OS-specific separator)
full_dataset_path = os.path.normpath(os.path.join(cwd, 'inputs/datasets/pets'))
full_dataset_path

### Create style for highlighting errors and warnings

In [46]:
from colorama import Fore, Style

# Error notification style
def print_err(message):
    """Print `message` on a new line below a red "ERROR:" heading."""
    heading = f'{Fore.RED}ERROR:{Style.RESET_ALL}'
    print(f'{heading}\n{message}')

# Warning notification style
def print_warn(message):
    """Print `message` on a new line below a yellow "WARNING:" heading."""
    heading = f'{Fore.YELLOW}WARNING:{Style.RESET_ALL}'
    print(f'{heading}\n{message}')

### Check the directories and clean the images

In [53]:
# !! Only run with a new image dataset !!
from PIL import UnidentifiedImageError
from PIL import ImageOps
import sys


def _confirm():
    """Read one line from stdin and return True only if the answer is 'y'.

    Standard output is flushed first (presumably so that the question printed
    just before is shown before the input field appears). Surrounding
    whitespace and letter case of the answer are ignored.
    """
    sys.stdout.flush()
    return input().strip().lower() == 'y'


def check_and_clean_images(path, expected_labels_num):
    """Interactively validate and clean a folder of labelled image subfolders.

    Every direct subfolder of `path` is treated as one label. Files lying
    directly in `path` and folders nested inside a label are ignored.

    The function asks the user (via stdin) before it
      * continues with an unexpected number of labels or with empty labels,
      * deletes a file whose extension is not png/jpg/jpeg,
      * pads a non-square image to 450x450 on a white background and
        overwrites the file,
      * deletes a file that could not be opened or saved.

    Args:
        path: Directory that contains the labelled subfolders.
        expected_labels_num: Number of label subfolders that should be present.

    Returns:
        A dict mapping each non-empty label to the number of its images that
        passed the checks (non-square images the user chose not to pad are
        not counted). Returns None if `path` is missing or empty, or if the
        user cancels the operation.
    """
    # Check if the directory with the dataset exists
    if not os.path.isdir(path):
        print_err(f"The path {path} that should contain the labelled subfolders doesn't exist.")
        return
    if len(os.listdir(path)) == 0:
        print_err(f"The path {path} that should contain the labelled subfolders is empty.")
        return

    # Only directories are labels. Stray files in the dataset root (e.g. OS
    # metadata files) would otherwise make os.listdir() on them fail.
    # Sorting gives a deterministic processing and output order.
    labels = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )

    # Check if all labels are present
    print(f'Checking the provided directory {path} ...')
    if len(labels) != expected_labels_num:
        print_warn(f"The labelled subfolders {labels} seem to be incomplete: there are {len(labels)} instead of {expected_labels_num} labels.\n"
                f"Continue cleaning and preparing the existing subfolders anyway? (y/n)")
        if not _confirm():
            print('Cancelling the operation.')
            return

    # Check if all labels contain files
    files_by_label = {}
    for label in labels:
        label_path = os.path.join(path, label)
        print(f'Checking the provided directory {label_path} ...')
        # Regular files only: a nested folder is neither an image nor removable
        # with os.remove(), so it is left out of the cleaning step.
        files_by_label[label] = [
            entry for entry in os.listdir(label_path)
            if os.path.isfile(os.path.join(label_path, entry))
        ]
        print(f'Amount of files with the label "{label}": \n{len(files_by_label[label])}')

    if any(len(label_files) == 0 for label_files in files_by_label.values()):
        print_warn(f"There are labelled subfolders not containing any files.\n"
                f"Continue cleaning and preparing the other subfolders anyway? (y/n)")
        if not _confirm():
            print('Cancelling the operation.')
            return

    # Valid image extensions
    img_ext = ('.png', '.jpg', '.jpeg')
    img_stats = {}

    for label, label_files in files_by_label.items():
        label_path = os.path.join(path, label)

        if len(label_files) == 0:
            print_warn(f"The path {label_path} that should contain the images of the pet '{label}' is empty. Skipping to the next label.")
            continue

        valid_count = 0
        for file in label_files:
            file_path = os.path.join(label_path, file)

            if not file.lower().endswith(img_ext):
                print_err(f'{file} has an invalid file extension. Valid extensions are png, jpg, jpeg. Delete file (y) or cancel operation (n)?')
                if not _confirm():
                    print('Cancelling the image cleaning process. Please make sure all input files have valid extensions and restart the process.')
                    return
                print(f'Deleting invalid file {file} ...')
                os.remove(file_path)
                continue

            padded = None
            try:
                with Image.open(file_path) as img:
                    if img.width != img.height:
                        print_warn(f'{file_path}:\nThe image aspect ratio is not 1:1. Pad and overwrite file? (y/n)')
                        if not _confirm():
                            print('Skipping ...')
                            # Declined images stay on disk but are not counted
                            continue
                        padded = ImageOps.pad(img, (450, 450), method=Image.LANCZOS, color='white')
                # Write only after the source file handle has been closed, so the
                # file is never overwritten while it is still open for reading.
                if padded is not None:
                    padded.save(file_path)
                    print(f'Padded and overwrote {file_path}')
            except (OSError, UnidentifiedImageError):
                print_err(f"Couldn't open {file_path}.\nDelete invalid file {file_path}? (y/n)")
                if _confirm():
                    os.remove(file_path)
                    print(f'Deleted {file_path}')
                continue

            valid_count += 1

        img_stats[label] = valid_count

    return img_stats

# Run the interactive check on the full dataset and display the result.
# The result is None when the path is missing/empty or the user cancels.
img_stats = check_and_clean_images(full_dataset_path, expected_labels_num=3)
img_stats

Checking the provided directory d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets ...
The labelled subfolders ['fin', 'iris', 'nn', 'smilla'] seem to be incomplete: there are 4 instead of 3 labels.
Continue cleaning and preparing the existing subfolders anyway? (y/n)


Checking the provided directory d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\fin ...
Amount of files with the label "fin": 
259
Checking the provided directory d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\iris ...
Amount of files with the label "iris": 
138
Checking the provided directory d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\nn ...
Amount of files with the label "nn": 
0
Checking the provided directory d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\smilla ...
Amount of files with the label "smilla": 
100
There are labelled subfolders not containing any files.
Continue cleaning and preparing the other subfolders anyway? (y/n)
The path d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\nn that should contain the images of the pet 'nn' is empty. Skipping to the next label.


{'fin': 259, 'iris': 138, 'smilla': 100}