Data description

In [10]:
from JoinDatasets import *
import shutil
import os
labels = ["Healthy", "Powdery", "Rust"]
sets = ["Test", "Train", "Validation"]

def join_data_sets(main_dir, output_dir):
    """
    Merge the per-split class folders of the dataset into one folder per class.

    For every label in ``labels`` and every split name in ``sets``, the files
    found in ``<main_dir>/<split>/<split>/<label>`` are copied with
    ``shutil.copy2`` into ``<output_dir>/<label>``. Files that share a name
    across splits overwrite each other in the destination.

    :param main_dir: root directory containing the split folders
    :param output_dir: directory receiving one sub-folder per label; it is
        created (including missing parents) when it does not exist yet
    :raises FileNotFoundError: if an expected ``<split>/<split>/<label>``
        folder is missing under ``main_dir``
    """
    for label in labels:
        dst = os.path.join(output_dir, label)
        # makedirs (unlike mkdir) also creates output_dir itself, and doing it
        # once per label avoids an existence check for every copied file.
        os.makedirs(dst, exist_ok=True)
        # `split`, not `set`: the loop variable must not shadow the builtin.
        for split in sets:
            dir_path = os.path.join(main_dir, split, split, label)
            for file in os.listdir(dir_path):
                shutil.copy2(os.path.join(dir_path, file), dst)

# Dataset root and the destination of the merged per-class folders; both are
# relative to the directory the notebook is executed from.
main_dir = "./data"
output_dir = "./merged-data"
# Copy the images of every split into one folder per label under output_dir.
join_data_sets(main_dir, output_dir)

Dataset content

In [None]:
import os
import matplotlib.pyplot as plt

# Function to count the number of files in a directory
def count_files(directory):
    """
    Tell how many regular files sit directly inside a directory.

    Entries that are not files (for example sub-directories) are ignored and
    are not descended into.

    :param directory: directory to analyse
    :return: number of files in the given directory
    """
    entry_paths = (os.path.join(directory, name) for name in os.listdir(directory))
    return sum(1 for entry_path in entry_paths if os.path.isfile(entry_path))

plots_saving_dir = "saved-plots/"
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(plots_saving_dir, exist_ok=True)

# Count images in each folder.
# NOTE: `output_dir` must still be the merged-data folder set in the
# "Data description" cell; the "Data augmentation" cell rebinds the same name
# to 'augmented-data/', so run this cell before that one.
healthy_count = count_files(output_dir + "/Healthy")
powdery_count = count_files(output_dir + "/Powdery")
rust_count = count_files(output_dir + "/Rust")

# Shared by the bar chart and the pie chart below.
labels = ['Healthy', 'Powdery', 'Rust']
counts = [healthy_count, powdery_count, rust_count]

# Create a bar chart
plt.bar(labels, counts)
plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('Number of Images in Each Category')
plt.savefig(plots_saving_dir+'Number_of_Images_in_Each_Category.png', bbox_inches='tight')
plt.show()

# Calculate total count
total_count = healthy_count + powdery_count + rust_count
if total_count == 0:
    # Fail with a readable message instead of a bare ZeroDivisionError below.
    raise ValueError(f"No images found under {output_dir!r}; cannot compute category percentages")

# Calculate percentages
healthy_percentage = (healthy_count / total_count) * 100
powdery_percentage = (powdery_count / total_count) * 100
rust_percentage = (rust_count / total_count) * 100

# Create a pie chart
sizes = [healthy_percentage, powdery_percentage, rust_percentage]
colors = ['lightgreen', 'lightblue', 'lightcoral']

plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.axis('equal')
plt.title('Percentage of Images of Each Category in the Dataset')
plt.savefig(plots_saving_dir+'Percentage_of_Images_of_Each_Category_in_the_Dataset.png', bbox_inches='tight')

plt.show()


Data relations

In [18]:
from rgbMean import *

# `multiprocessing` is not imported explicitly anywhere above in the notebook
# (presumably it leaked in through a star import), so import it here.
import multiprocessing

# Build the paths with os.path.join instead of hard-coded "\\" separators so
# the cell also works on non-Windows systems.
healthy_dir = os.path.join("merged-data", "Healthy")
powdery_dir = os.path.join("merged-data", "Powdery")
rusty_dir = os.path.join("merged-data", "Rust")
directories = [healthy_dir, powdery_dir, rusty_dir]

# One entry per directory, in the order of `directories`.
directories_rgb_count = []
rgbMean = RgbMean()
for directory in directories:
    fileNames = os.listdir(directory)
    filePaths = [os.path.join(directory, fileName) for fileName in fileNames]
    # The per-image work is spread over 4 worker processes; pool.map blocks
    # until every image has been processed.
    with multiprocessing.Pool(4) as pool:
        result = pool.map(rgbMean.get_single_image_rgb_mean, filePaths)
    # Aggregation happens in the parent process, after the pool is released.
    rgb_counts = rgbMean.get_rgb_count(result)
    directories_rgb_count.append(rgb_counts)

In [None]:
# One figure per image category. The entries of `directories_rgb_count` follow
# the order in which the folders were processed: Healthy, Powdery, Rust.
category_names = ("Healthy", "Powdery", "Rust")
for index, directory_rgb_count in enumerate(directories_rgb_count):
    plt.plot(directory_rgb_count[0], label="R", color='r')
    plt.plot(directory_rgb_count[1], label="G", color='g')
    plt.plot(directory_rgb_count[2], label="B", color='b')
    plt.xlabel('RGB Value')
    plt.ylabel('Count')
    if index < len(category_names):
        # Without a title the three figures cannot be told apart.
        plt.title(f"RGB value counts - {category_names[index]}")
    # The label= arguments above are only drawn once a legend is requested.
    plt.legend()
    plt.show()

Data difficulty

Data representation

Data normalisation

In [None]:
from JoinDatasets import join_datasets_with_shape_normalisation
import os

# Base path for the dataset folders: the directory the notebook runs in.
# `path` is reused by the cropping and resizing cells below.
path = os.getcwd()
# "unchanged" mode, called without a target size; presumably the images are
# merged as they are into merged-data/unchanged - TODO confirm in JoinDatasets.
join_datasets_with_shape_normalisation(os.path.join(path,"data"), os.path.join(path,"merged-data","unchanged"), "unchanged")
print("Finished copying")

In [None]:
# "crop" mode with target size (2421, 1728), written to merged-data/cropped.
# The trailing comment keeps 2592,1728 as an alternative size; the axis order
# of the tuple is not visible here - TODO confirm in JoinDatasets.
# Requires `path` from the previous cell.
join_datasets_with_shape_normalisation(os.path.join(path,"data"), os.path.join(path,"merged-data","cropped"), "crop", (2421,1728)) # 2592,1728
print("Finished cropping")

In [None]:
# "resize" mode with target size (225, 225), written to merged-data/resized,
# which is the folder the augmentation and split cells read from.
# Alternative kept for reference: the same call with target size (3982,2700).
# Requires `path` from the "unchanged" cell above.
join_datasets_with_shape_normalisation(os.path.join(path,"data"), os.path.join(path,"merged-data","resized"), "resize", (225,225))
print("Finished resizing")

Data augmentation

In [25]:
import numpy as np
from skimage import exposure, io
import os
from scipy import ndimage
from multiprocessing import Process

def better_contrast(original_image, saving_dir, new_name):
    """
    Stretch the image intensities and save the result.

    The 0.2 and 99.8 percentiles of the image values are used as the input
    range handed to ``exposure.rescale_intensity``.

    :param original_image: image array to transform
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    low, high = np.percentile(original_image, (0.2, 99.8))
    stretched = exposure.rescale_intensity(original_image, in_range=(low, high))
    save_image(stretched, saving_dir, new_name)

def gamma_correction(original_image, saving_dir, new_name):
    """
    Apply ``exposure.adjust_gamma`` (gamma 0.4, gain 0.9) and save the result.

    :param original_image: image array to transform
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    save_image(
        exposure.adjust_gamma(original_image, gamma=0.4, gain=0.9),
        saving_dir,
        new_name,
    )

def log_correction(original_image, saving_dir, new_name):
    """
    Apply ``exposure.adjust_log`` with its default parameters and save the result.

    :param original_image: image array to transform
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    save_image(exposure.adjust_log(original_image), saving_dir, new_name)

def sigmoid_correction(original_image, saving_dir, new_name):
    """
    Apply ``exposure.adjust_sigmoid`` with its default parameters and save the result.

    :param original_image: image array to transform
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    save_image(exposure.adjust_sigmoid(original_image), saving_dir, new_name)

def horizontal_flip(original_image, saving_dir, new_name):
    """
    Save a copy of the image with its second axis reversed.

    :param original_image: image array to transform
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    # Reversing the second axis mirrors the columns.
    save_image(original_image[:, ::-1], saving_dir, new_name)

def vertical_flip(original_image, saving_dir, new_name):
    """
    Save a copy of the image with its first axis reversed.

    :param original_image: image array to transform
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    # Reversing the first axis mirrors the rows.
    save_image(original_image[::-1, :], saving_dir, new_name)

def blured(original_image, saving_dir, new_name):
    """
    Blur the image with an 11x11 uniform (box) filter and save the result.

    The filter window covers the first two axes only; every further axis gets
    a window of 1, so a three-axis image is filtered exactly as with the
    previous fixed ``(11, 11, 1)`` window. Two-axis (grayscale) images, which
    the fixed three-element window rejected, are now handled as well.

    :param original_image: image array with at least two axes; the first two
        are assumed to be the spatial ones, as returned by ``io.imread``
    :param saving_dir: path prefix of the output file (see ``save_image``)
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    # uniform_filter needs one window size per axis of the input.
    window = (11, 11) + (1,) * (original_image.ndim - 2)
    blured_image = ndimage.uniform_filter(original_image, size=window)
    save_image(blured_image, saving_dir, new_name)

def save_image(image, saving_dir, new_name):
    """
    Write ``image`` to ``saving_dir + new_name + '.jpg'``.

    ``saving_dir`` is used as a plain string prefix, not as a directory that
    gets joined: a prefix of ``out/Healthy/img1`` with the name ``_gamma``
    produces ``out/Healthy/img1_gamma.jpg``. The folder part of the resulting
    path is created when it does not exist yet.

    :param image: image array to store
    :param saving_dir: path prefix of the output file
    :param new_name: text appended to the prefix before the ``.jpg`` extension
    """
    new_filename = saving_dir + new_name + '.jpg'
    parent_dir = os.path.dirname(new_filename)
    if parent_dir:
        # The writer cannot create missing folders itself; exist_ok keeps this
        # safe when several worker processes save into the same folder.
        os.makedirs(parent_dir, exist_ok=True)
    io.imsave(fname=new_filename, arr=image)

def process_image(image_path, saving_dir):
    """
    Read one image and write the original plus seven augmented variants.

    Each variant is saved under ``saving_dir`` followed by the variant suffix
    and ``.jpg``.

    :param image_path: path of the image to read
    :param saving_dir: path prefix shared by all output files
    """
    original_image = io.imread(image_path)
    # (transformation, file name suffix) pairs, applied in this order.
    variants = (
        (save_original, '_original'),
        (better_contrast, '_contrast'),
        (gamma_correction, '_gamma'),
        (log_correction, '_log'),
        (sigmoid_correction, '_sigmoid'),
        (horizontal_flip, '_horizontal'),
        (vertical_flip, '_vertical'),
        (blured, '_blured'),
    )
    for make_variant, suffix in variants:
        make_variant(original_image, saving_dir, suffix)

def save_original(original_image, saving_dir, new_name):
    """
    Save the untouched image through ``save_image``.

    :param original_image: image array to store
    :param saving_dir: path prefix of the output file
    :param new_name: suffix appended to ``saving_dir`` before ``.jpg``
    """
    save_image(image=original_image, saving_dir=saving_dir, new_name=new_name)

In [None]:
import os
from joblib import Parallel, delayed

input_folder = 'merged-data/resized'
# NOTE: this rebinds `output_dir`, which the "Data description" cell set to
# "./merged-data" and the "Dataset content" cell reads; re-run those cells
# before this one if they are needed again.
output_dir = 'augmented-data/'

def process_image_par(subfolder, filename):
    """
    Augment one image of ``input_folder/subfolder``.

    The original and seven augmented variants are written to
    ``output_dir/subfolder/<name><suffix>.jpg``, where ``<name>`` is the file
    name without its extension.

    :param subfolder: class folder name below ``input_folder``
    :param filename: name of the image file inside that folder
    """
    image_path = os.path.join(input_folder, subfolder, filename)
    original_image = io.imread(image_path)
    # splitext drops only the final extension; split('.')[0] cut the name at
    # its first dot, so "a.1.jpg" and "a.2.jpg" wrote to the same output files.
    image_name = os.path.splitext(filename)[0]
    saving_dir = os.path.join(output_dir, subfolder, image_name)

    save_original(original_image, saving_dir, '_original')
    better_contrast(original_image, saving_dir, '_contrast')
    gamma_correction(original_image, saving_dir, '_gamma')
    log_correction(original_image, saving_dir, '_log')
    sigmoid_correction(original_image, saving_dir, '_sigmoid')
    horizontal_flip(original_image, saving_dir, '_horizontal')
    vertical_flip(original_image, saving_dir, '_vertical')
    # Suffix differs from process_image, which writes '_blured'.
    blured(original_image, saving_dir, '_blurred')

if __name__ == "__main__":
    # Create output directories if they don't exist
    os.makedirs(output_dir, exist_ok=True)

    # File name endings that are treated as images (case-sensitive).
    image_extensions = ('.jpg', '.jpeg', '.png')

    for subfolder in os.listdir(input_folder):
        subfolder_path = os.path.join(input_folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # process_image_par writes into output_dir/<subfolder>/, so that
        # folder has to exist before the workers start saving files.
        os.makedirs(os.path.join(output_dir, subfolder), exist_ok=True)
        print(f"Processing images in '{subfolder}' folder...")

        # Parallelize the processing of images within each subfolder
        Parallel(n_jobs=8)(
            delayed(process_image_par)(subfolder, filename)
            for filename in os.listdir(subfolder_path)
            if filename.endswith(image_extensions)
        )


Data splits

In [1]:
import os
import shutil
from copytools import *

# PROPORTIONS
TRAIN = 0.8
VAL = 0.1
TEST = 0.1

merged_data_dir = "merged-data/resized"
augmented_data_dir = "augmented-data"

# Class folder names. NOTE: `healthy_dir` and `powdery_dir` were bound to
# "merged-data\\..." paths by the "Data relations" cell; from here on they
# hold bare folder names.
healthy_dir = "Healthy"
powdery_dir = "Powdery"
rust_dir = "Rust"
images_directories = [healthy_dir, powdery_dir, rust_dir]
all_splits_dir = "splits"
train_dir = "train"
val_dir = "val"
test_dir = "test"

# split 1 - resized images copied straight into train / val / test
for directory in images_directories:
    directory_path = f"{merged_data_dir}/{directory}"
    # Sorted because os.listdir returns entries in arbitrary order; the split
    # is therefore the same on every run. Only image files take part.
    images = sorted(
        filename for filename in os.listdir(directory_path)
        if filename.endswith(('.jpg', '.jpeg', '.png'))
    )
    total = len(images)
    # Cumulative boundaries instead of three independently rounded counts:
    # rounding each share on its own could leave files unassigned
    # (e.g. 5 files -> 4 / 0 / 0).
    train_end = min(total, round(total * TRAIN))
    val_end = min(total, round(total * (TRAIN + VAL)))
    test_end = min(total, round(total * (TRAIN + VAL + TEST)))
    boundaries = (
        (train_dir, 0, train_end),
        (val_dir, train_end, val_end),
        (test_dir, val_end, test_end),
    )
    for split_dir, start, stop in boundaries:
        selected = images[start:stop]
        if not selected:
            continue
        target_dir = f"{all_splits_dir}/split1/{split_dir}/{directory}"
        os.makedirs(target_dir, exist_ok=True)
        for filename in selected:
            image_path = os.path.join(directory_path, filename)
            shutil.copyfile(image_path, f"{target_dir}/{filename}")

In [12]:
# split 2 - resized -> augmented -> split
# NOTE(review): the split is made per file. The augmented variants of one
# source image (<name>_original.jpg, <name>_contrast.jpg, ...) can therefore
# land in different subsets - confirm that this is intended.
for directory in images_directories:
    directory_path = f"{augmented_data_dir}/{directory}"
    # Sorted because os.listdir returns entries in arbitrary order; the split
    # is therefore the same on every run. Only image files take part.
    images = sorted(
        filename for filename in os.listdir(directory_path)
        if filename.endswith(('.jpg', '.jpeg', '.png'))
    )
    total = len(images)
    # Cumulative boundaries instead of three independently rounded counts:
    # rounding each share on its own could leave files unassigned
    # (e.g. 5 files -> 4 / 0 / 0).
    train_end = min(total, round(total * TRAIN))
    val_end = min(total, round(total * (TRAIN + VAL)))
    test_end = min(total, round(total * (TRAIN + VAL + TEST)))
    boundaries = (
        (train_dir, 0, train_end),
        (val_dir, train_end, val_end),
        (test_dir, val_end, test_end),
    )
    for split_dir, start, stop in boundaries:
        selected = images[start:stop]
        if not selected:
            continue
        target_dir = f"{all_splits_dir}/split2/{split_dir}/{directory}"
        os.makedirs(target_dir, exist_ok=True)
        for filename in selected:
            image_path = os.path.join(directory_path, filename)
            shutil.copyfile(image_path, f"{target_dir}/{filename}")

In [2]:
# split 3 - copy split 2 without VAL set -> create VAL set from TRAIN subset
import random


# Fixed seed so that re-running the cell selects the same validation files.
SPLIT3_SEED = 42

destination_directory = f"{all_splits_dir}/split3"
for directory in [train_dir, test_dir]:
    source_directory = f"{all_splits_dir}/split2/{directory}"
    target_directory = f"{destination_directory}/{directory}"
    # Skip only when the target really exists. Any other copy failure (for
    # example a missing split 2) now surfaces instead of being reported as
    # "directory already exists" by a bare except.
    if os.path.isdir(target_directory):
        print("directory already exists")
    else:
        copytree(source_directory, target_directory)

# Build the validation set from a random sample of each class's train images.
# NOTE(review): the files are copied, not moved, so every validation image
# also stays in split3/train - confirm that this overlap is intended.
for directory in images_directories:
    directory_path = f"{destination_directory}/{train_dir}/{directory}"
    source_path = f"{all_splits_dir}/split2/{train_dir}/{directory}"
    # Sorted before shuffling: os.listdir order is arbitrary, and a seeded
    # shuffle is only repeatable when it starts from the same order.
    images = sorted(
        filename for filename in os.listdir(source_path)
        if filename.endswith(('.jpg', '.jpeg', '.png'))
    )
    random.Random(SPLIT3_SEED).shuffle(images)
    val_count = round(len(images) * VAL)
    selected = images[:val_count]
    if not selected:
        continue
    target_dir = f"{all_splits_dir}/split3/{val_dir}/{directory}"
    os.makedirs(target_dir, exist_ok=True)
    for filename in selected:
        image_path = os.path.join(directory_path, filename)
        shutil.copyfile(image_path, f"{target_dir}/{filename}")
