In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Capture the directory the kernel was started in and tell the reader how to
# proceed if the notebook lives in a sub-folder of the project.
working_dir = os.getcwd()
print(
    f"You are now working in {working_dir}",
    "If you need to change to the parent directory, run the cell below",
    sep="\n",
)

In [None]:
# Switch the process working directory to the parent of `working_dir`, so the
# relative paths used further down ('inputs/...', 'outputs/...') resolve from
# there. `working_dir` is the value captured by an earlier cell, not a fresh
# os.getcwd(), so the target depends on which cells were run before this one.
os.chdir(os.path.dirname(working_dir))

In [None]:
# Re-read the working directory so `working_dir` reflects the location the
# kernel is in now, and print it as a confirmation for the reader.
working_dir = os.getcwd()
print(f"You have now changed your working directory to {working_dir}")

In [None]:
# Versioned output folder: artefacts produced by this notebook are written
# below outputs/<version>. Bump `version` to keep earlier results untouched.
version = 'v1'
file_path = f'outputs/{version}'

# Test the exact path that os.makedirs would create (resolved against the
# current working directory) rather than listing `working_dir`, which can be
# stale when the directory cells above were skipped or re-run. That mismatch
# could either raise FileExistsError or report a version that is not there.
if os.path.exists(file_path):
    print("This version already exists, create a new version if you are working on a new version")
else:
    os.makedirs(name=file_path)

<hr>

### Search for non-image files


In [None]:
"""
This function will search through the raw dataset
for files that don't have one of the extensions
listed in 'image_extension'.

While going through all the files, the lists
image_files and non_image_files are filled with
the results; their totals are printed at the end
and both lists are returned.
"""


def search_non_image_files(raw_dir):
    """
    Sort every file of the raw dataset into image / non-image.

    Each sub-folder of ``raw_dir`` is scanned one level deep. A name counts
    as an image when it ends (case-insensitively) with one of the extensions
    in ``image_extension``; anything else found inside a sub-folder, nested
    directories included, is reported as a non-image entry.

    Plain files lying directly in ``raw_dir`` are classified as well instead
    of being passed to ``os.listdir`` (which raises ``NotADirectoryError``).

    Parameters
    ----------
    raw_dir : str
        Directory that contains one sub-folder per class.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(image_files, non_image_files)`` as paths prefixed with ``raw_dir``.
        Entries are visited in sorted order so the result is reproducible.
    """
    image_extension = ('.png', '.jpg', '.jpeg')  # file extensions to search for

    non_image_files = []  # paths whose name lacks an image extension
    image_files = []  # paths whose name carries an image extension
    folders = []  # sub-directories that were actually scanned

    def _classify(name, location):
        # Route one directory entry into the matching result list.
        if name.lower().endswith(image_extension):
            image_files.append(location)
        else:
            non_image_files.append(location)

    # os.listdir returns entries in arbitrary order; sorting keeps runs comparable.
    for entry in sorted(os.listdir(raw_dir)):
        entry_path = os.path.join(raw_dir, entry)
        if not os.path.isdir(entry_path):
            # A stray top-level file (licence, README, ...) is itself a finding.
            _classify(entry, entry_path)
            continue

        folders.append(entry)
        for file in sorted(os.listdir(entry_path)):
            _classify(file, os.path.join(raw_dir, entry, file))

    print("Total amount of folders searched:", len(folders))
    print("Total image files found:", len(image_files))
    print("Total non image files found:", len(non_image_files))

    return image_files, non_image_files

In [None]:
# Keep both lists instead of letting the cell echo the returned tuple, which
# would print the path of every image in the dataset. Only the non-image
# entries (the ones that need attention) are displayed.
image_files, non_image_files = search_non_image_files(raw_dir='inputs/dataset/raw/flower_photos')
non_image_files

<hr>

### Image distribution

In [None]:
def images_distribution(image_dirs, save_path=None):
    """
    Plot and return the share of images held by each flower category.

    Every sub-directory of ``image_dirs`` is treated as one category; plain
    files lying directly in ``image_dirs`` are ignored. A file is counted
    when its name ends, case-insensitively, with ``.png``, ``.jpg`` or
    ``.jpeg``.

    Parameters
    ----------
    image_dirs : str
        Directory containing one sub-folder per flower category.
    save_path : str, optional
        When given, the bar plot is also written to this path as a PNG.

    Returns
    -------
    pandas.DataFrame
        One row per category with the columns ``Flowers``, ``Count`` and
        ``Percentage``, ordered by category name.

    Raises
    ------
    ValueError
        If no category folder holds any image, since percentages would be
        undefined.
    """
    # Dotted and matched case-insensitively: without the dot, names such as
    # "notajpg" were counted, and ".JPG" files were missed.
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Maps each flower category to its number of images. Sorted because
    # os.listdir returns entries in arbitrary order.
    images_per_flower = {}
    for flowers in sorted(os.listdir(image_dirs)):
        flowers_path = os.path.join(image_dirs, flowers)
        if os.path.isdir(flowers_path):
            images_per_flower[flowers] = sum(
                1 for img in os.listdir(flowers_path)
                if img.lower().endswith(image_extensions)
            )

    # Convert the mapping to a dataframe
    df = pd.DataFrame(list(images_per_flower.items()), columns=['Flowers', 'Count'])

    # Fail with a clear message instead of dividing by zero further down
    total_images = df['Count'].sum()
    if total_images == 0:
        raise ValueError(f"No image files found in the sub-folders of {image_dirs!r}")

    # Calculate the percentage of images per flower
    df['Percentage'] = (df['Count'] / total_images) * 100

    # Calculate the highest, the lowest and mean value of the distribution
    highest_perc = df.loc[df['Percentage'].idxmax()]
    lowest_perc = df.loc[df['Percentage'].idxmin()]
    mean_percentage = df['Percentage'].mean()
    mean_count = df['Count'].mean()

    # Create a barplot to visualize this distribution of images per flower
    plt.figure(figsize=(15, 8))
    bars = plt.bar(df['Flowers'], df['Percentage'], color='skyblue')
    plt.axhline(y=mean_percentage, color='r', linestyle='--')
    plt.xlabel('Flowers')
    plt.ylabel('Percentage (%)')
    plt.title('Distribution of images per flower')
    plt.xticks(rotation=90)

    # Adds extra text to the top of the barplot
    plt.gcf().text(0.45, 0.85, f"Mean percentage: {mean_percentage:.2f}% ({mean_count:.0f} images)", fontsize=10)
    plt.gcf().text(0.45, 0.82, f"Highest percentage: {highest_perc['Flowers']} - {highest_perc['Percentage']:.2f}%", fontsize=10)
    plt.gcf().text(0.45, 0.79, f"Lowest percentage: {lowest_perc['Flowers']} - {lowest_perc['Percentage']:.2f}%", fontsize=10)

    # Adds extra text over each bar to show amount of images
    for bar, count in zip(bars, df['Count']):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.3,
            f'{count}',
            ha='center', va='bottom', fontsize=9, color='black'
        )

    # Finalise the layout first so the saved file matches what is shown
    plt.tight_layout()

    # If save_path is given it will save an image of the barplot at the given path
    if save_path:
        plt.savefig(save_path, format='png', bbox_inches='tight')

    # Shows the barplot
    plt.show()

    return df

In [None]:
# Plot the class balance of the raw dataset, keep the summary table and store
# the figure with the other artefacts of this version.
all_flowers = images_distribution(
    image_dirs='inputs/dataset/raw/flower_photos',
    save_path=f'outputs/{version}/raw_image_distribution.png',
)

### Split into train/validation/test sets

In [None]:
import shutil
import random


def split_train_validation_test_images(image_dir, train_set_ratio, validation_set_ratio, test_set_ratio, seed=None):
    """
    Move the images of every class folder into train / validation / test sub-folders.

    For each class directory ``<image_dir>/<flower>`` the entries are
    shuffled and moved to ``<image_dir>/train/<flower>``,
    ``<image_dir>/validation/<flower>`` and ``<image_dir>/test/<flower>``;
    the emptied class directory is then removed. The train and validation
    sizes are ``int(n * ratio)``; the test set receives the remainder.

    Plain files in ``image_dir`` and already existing ``train`` /
    ``validation`` / ``test`` folders are not treated as classes, so a stray
    file does not abort the split and running the function a second time is
    a no-op.

    Parameters
    ----------
    image_dir : str
        Directory containing one sub-folder per class.
    train_set_ratio, validation_set_ratio, test_set_ratio : float
        Fractions of each class assigned to the three sets; must sum to 1.0
        (compared with a small tolerance).
    seed : int, optional
        When given, shuffling uses a dedicated ``random.Random(seed)`` so the
        split is reproducible. With ``None`` the module-level ``random``
        state is used.

    Returns
    -------
    None
        If the ratios do not sum to 1.0 a message is printed and nothing is
        moved.
    """
    # Compare with a tolerance: an exact `!= 1.0` rejects valid inputs whose
    # float sum is off by one ulp (e.g. 0.6 + 0.3 + 0.1 == 0.9999999999999999).
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_total - 1.0) > 1e-9:
        print("The total of train_set_ratio, validation_set_ratio and test_set_ratio should be 1.0")
        return

    split_folders = ('train', 'validation', 'test')

    # Only real class directories take part in the split
    flowers = sorted(
        entry for entry in os.listdir(image_dir)
        if entry not in split_folders and os.path.isdir(os.path.join(image_dir, entry))
    )

    # Creates train, validation and test folders (one sub-folder per class) if they don't exist
    for folder in split_folders:
        os.makedirs(os.path.join(image_dir, folder), exist_ok=True)
        for flower in flowers:
            os.makedirs(os.path.join(image_dir, folder, flower), exist_ok=True)

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # Moves images to the right sub-folder
    for flower in flowers:
        flower_dir = os.path.join(image_dir, flower)

        # Sort first: os.listdir order is arbitrary, which would defeat the seed
        files = sorted(os.listdir(flower_dir))
        shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)
        validation_end = train_set_files_qty + validation_set_files_qty

        assignments = (
            ('train', files[:train_set_files_qty]),
            ('validation', files[train_set_files_qty:validation_end]),
            ('test', files[validation_end:]),
        )
        for folder, image_names in assignments:
            for image_name in image_names:
                shutil.move(
                    os.path.join(flower_dir, image_name),
                    os.path.join(image_dir, folder, flower, image_name),
                )

        # Deletes the input folder, which is empty once everything was moved
        os.rmdir(flower_dir)

In [None]:
# Split the raw dataset in place: 70 % train, 10 % validation, 20 % test.
split_train_validation_test_images(
    image_dir="inputs/dataset/raw/flower_photos",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

In [None]:
# The class labels are the sub-folder names of the train set. os.listdir
# returns entries in arbitrary order and may include stray files, so keep
# directories only and sort them: the label order (and the legend order of
# the plot built from it) is then the same on every run.
labels = sorted(
    name for name in os.listdir("inputs/dataset/raw/flower_photos/train")
    if os.path.isdir(os.path.join("inputs/dataset/raw/flower_photos/train", name))
)

print(f"Flower labels: {labels}")

In [None]:
image_dirs = 'inputs/dataset/raw/flower_photos'

# One record per (set, label) pair; converted to a DataFrame in a single step below
data = []

for folder in ['train', 'validation', 'test']:
    for label in labels:
        # List each directory once and reuse the count for the record and the summary line
        frequency = len(os.listdir(os.path.join(image_dirs, folder, label)))

        data.append({
            'Set': folder,
            'Label': label,
            'Frequency': frequency
        })

        print(f"* {folder} - {label}: {frequency} images")

df_freq = pd.DataFrame(data)

# Grouped bar chart: one group per set, one bar per label
print("\n")
sns.set_style("whitegrid")
plt.figure(figsize=(8, 5))
sns.barplot(data=df_freq, x='Set', y='Frequency', hue='Label')
plt.title('Number of images per label in each set')
plt.savefig(f'{file_path}/labels_distribution.png', bbox_inches='tight', dpi=150)
plt.show()