# **Data Collection**
---

## Objective

- Clean and prepare the provided image dataset

## Inputs

- A folder containing one correctly labeled subfolder for each pet to distinguish
- Several hundred preprocessed PNG images (450x450 pixels) in each subfolder, each showing a close-up of the pet's face

## Outputs

- A cleaned dataset in the folder `inputs/datasets/pets` split into `train`/`validation`/`test` subfolders

---

### Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from matplotlib.image import imread
from tensorflow.keras.preprocessing import image
from PIL import Image
import shutil
import random

sns.set_style("white")


### Set the correct working directory

In [2]:
# Store and display the current working directory.
# If it is not the directory the following cells expect, change it with os.chdir first.
cwd = os.getcwd()
cwd

'd:\\Projekte\\Coding\\CodeInstitute\\PP5\\jupyter_notebooks'

In [5]:
# Switch to the parent of the directory stored in `cwd`, then refresh and display `cwd`.
# NOTE(review): every re-run of this cell climbs one more level, so it should
# presumably be executed only once per session - confirm against the displayed path.
os.chdir(os.path.dirname(cwd))
cwd = os.getcwd()
cwd

'd:\\Projekte\\Coding\\CodeInstitute\\PP5'

### Set the path to the full dataset

In [17]:
# Join the dataset folder onto `cwd` and display the result.
# os.path.normpath collapses redundant separators and, on Windows, converts the
# forward slashes of the relative part to backslashes.
full_dataset_path = os.path.normpath(os.path.join(cwd, 'inputs/datasets/pets'))
full_dataset_path

'd:\\Projekte\\Coding\\CodeInstitute\\PP5\\inputs\\datasets\\pets'

### Create style for highlighting errors

In [15]:
from colorama import Fore, Style

# Error notification style
def print_err(message):
    """Print ``message`` on its own line beneath a red ``ERROR:`` heading."""
    # Reset the colour right after the heading so only "ERROR:" is printed in red.
    heading = f'{Fore.RED}ERROR:{Style.RESET_ALL}'
    print(f'{heading}\n{message}')

### Check the directories and clean the images

In [24]:
# !! Only run with a new image dataset !!
from PIL import UnidentifiedImageError


def check_and_clean_images(path):
    """Validate the labelled image folders below ``path`` and clean them interactively.

    Every subfolder of ``path`` is treated as one label. The files inside each
    label folder are checked one by one:

    * A file without a ``.png``/``.jpg``/``.jpeg`` extension is reported and
      the user chooses between deleting it and cancelling the whole run.
    * An image whose width differs from its height can, on confirmation, be
      resized to 450x450 and overwritten; if declined it is skipped and not
      counted.
    * A file that cannot be opened as an image can be deleted on confirmation
      and is not counted.

    Entries of ``path`` that are not directories, and directories nested
    inside a label folder, are skipped with a notice instead of raising.

    Args:
        path: Directory that contains one subfolder per label.

    Returns:
        A dict mapping each label to the number of images that passed the
        checks, or ``None`` when ``path`` is missing or empty, a label folder
        is empty, or the user cancels the run.
    """
    if not os.path.isdir(path):
        print_err(f"The path {path} that should contain the labelled subfolders doesn't exist.")
        return None
    labels = os.listdir(path)
    if not labels:
        print_err(f"The path {path} that should contain the labelled subfolders is empty.")
        return None

    # Valid image extensions
    img_ext = ('.png', '.jpg', '.jpeg')
    img_stats = {}
    for label in labels:
        label_path = os.path.join(path, label)
        if not os.path.isdir(label_path):
            # A stray file next to the label folders would make the
            # os.listdir() call below raise, so it is skipped up front.
            print(f'Skipping {label_path}: not a label subfolder.')
            continue

        label_files = os.listdir(label_path)
        if not label_files:
            print_err(f"The path {label_path} that should contain the images of the pet '{label}' is empty. Cancelling the operation.")
            return None

        valid_count = 0
        for file in label_files:
            file_path = os.path.join(label_path, file)
            if os.path.isdir(file_path):
                # Neither os.remove() nor Image.open() can handle a directory.
                print(f'Skipping {file_path}: nested directories are not checked.')
                continue

            if not file.lower().endswith(img_ext):
                print_err(f'{file} has an invalid file extension. Valid extensions are png, jpg, jpeg. Delete file (y) or cancel operation (n)?')
                user_prompt = input()
                if user_prompt.strip().lower() != 'y':
                    print('Canceling the image cleaning process. Please make sure all input files have valid extensions and restart the process.')
                    return None
                print(f'Deleting invalid file {file} ...')
                os.remove(file_path)
                continue

            resized = None
            try:
                with Image.open(file_path) as img:
                    if img.width != img.height:
                        user_prompt = input(f"Resize and overwrite {file_path}? (y/n)")
                        if user_prompt.strip().lower() != 'y':
                            print('skipping ...')
                            continue
                        resized = img.resize((450, 450), resample=Image.LANCZOS)
                # Save only after the `with` block has closed the source file,
                # so the file is never overwritten while it may still be open.
                if resized is not None:
                    resized.save(file_path)
                    print(f'Resized and overwrote {file_path}')
            except (OSError, UnidentifiedImageError):
                # IOError is an alias of OSError, so it needs no separate entry.
                print_err(f"Couldn't open {file_path}.\nDelete invalid file {file_path}? (y/n)")
                user_prompt = input()
                if user_prompt.strip().lower() == 'y':
                    os.remove(file_path)
                    print(f'Deleted {file_path}')
                continue

            valid_count += 1

        img_stats[label] = valid_count

    return img_stats

# Run the interactive check on the full dataset (it may prompt via input()).
# The result maps each label to its number of accepted images, or is None when
# the run was cancelled or the dataset folder is missing or empty.
img_stats = check_and_clean_images(full_dataset_path)
# Uncomment to display the per-label counts:
#img_stats

[31mERROR:[0m
Couldn't open d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\iris\Neue Bitmap.png.
Delete invalid file d:\Projekte\Coding\CodeInstitute\PP5\inputs\datasets\pets\iris\Neue Bitmap.png? (y/n)
