## Plot Class Distribution

In [None]:
import os
import matplotlib.pyplot as plt

def plot_class_counts(dataset_dir):
    """Plot a bar chart of how many files each class folder contains.

    Every immediate subdirectory of ``dataset_dir`` whose name parses as an
    integer is treated as one class, and the number of entries inside it is
    that class's count. The chart is displayed with ``plt.show()``; nothing
    is returned.

    Args:
        dataset_dir: Directory holding one subfolder per class.
    """
    class_counts = {}

    for class_folder in os.listdir(dataset_dir):
        class_path = os.path.join(dataset_dir, class_folder)
        if not os.path.isdir(class_path):
            continue
        try:
            class_number = int(class_folder)
        except ValueError:
            # Not a numbered class folder; skip it rather than abort the
            # whole plot.
            continue
        class_counts[class_number] = len(os.listdir(class_path))

    # Bars are placed at their numeric class id, so the ordering below does
    # not change the layout; it only makes the drawing order deterministic.
    class_numbers = sorted(class_counts)
    file_counts = [class_counts[number] for number in class_numbers]

    plt.figure(figsize=(8, 4))
    plt.bar(class_numbers, file_counts, color='turquoise')
    plt.xlabel('Class')
    plt.ylabel('Number of Files')
    plt.title('Number of Files in Each Class')
    plt.show()

# Root of the original training set (one subfolder per class). Built with
# os.path.join so the path is valid on any OS, not only on Windows.
trainset_dir = os.path.join(".", "Train")
plot_class_counts(trainset_dir)

## Image Augmentation to Even Out the Class Distribution

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

# Destination for the augmented images (one subfolder per class). Built with
# os.path.join so the path is valid on any OS, not only on Windows.
output_dir = os.path.join(".", "Train_Augmented")

os.makedirs(output_dir, exist_ok=True)

# Shared generator that applies a random combination of the transforms below
# to every image passed through it.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    brightness_range=[0.5, 1.5],
)

def augment_and_save_image(image_path, output_dir, prefix, num_augmented):
    """Write randomly augmented JPEG variants of one image to ``output_dir``.

    The image is loaded, wrapped as a batch of one and pushed through the
    module-level ``datagen``; the generator is asked to save every batch it
    yields into ``output_dir`` using ``prefix`` as the file-name prefix.

    Note that a batch is drawn before the counter is checked, so at least one
    batch is always generated, even when ``num_augmented`` is 0 or negative.

    Args:
        image_path: Path of the source image.
        output_dir: Directory that receives the generated files.
        prefix: File-name prefix handed to the generator.
        num_augmented: Number of batches to draw (effectively at least 1).
    """
    pixels = img_to_array(load_img(image_path))
    # The generator expects a leading batch axis.
    single_image_batch = pixels.reshape((1, *pixels.shape))

    augmented_stream = datagen.flow(
        single_image_batch,
        batch_size=1,
        save_to_dir=output_dir,
        save_prefix=prefix,
        save_format='jpg',
    )
    # The stream never ends on its own; stop once enough batches were drawn.
    for produced, _ in enumerate(augmented_stream, start=1):
        if produced >= num_augmented:
            break

# Create augmented images for every class folder of the training set, aiming
# for roughly 2000 generated images per class regardless of its original size.
for class_folder in os.listdir(trainset_dir):
    class_path = os.path.join(trainset_dir, class_folder)
    if not os.path.isdir(class_path):
        continue

    output_class_dir = os.path.join(output_dir, class_folder)
    os.makedirs(output_class_dir, exist_ok=True)

    # A class whose output folder already holds 1000+ files is left alone,
    # so re-running the cell does not keep piling up images.
    if len(os.listdir(output_class_dir)) >= 1000:
        continue

    # List the class folder once; the count is the same for every file in it.
    filenames = os.listdir(class_path)
    if not filenames:
        continue  # nothing to augment, and avoids dividing by zero below

    # Large classes (more than 2000 files) still get one variant per image.
    num_augmented = max(1, 2000 // len(filenames))

    for filename in filenames:
        if not filename.endswith((".jpg", ".jpeg", ".png")):
            continue
        image_path = os.path.join(class_path, filename)
        # Strip only the extension so names containing extra dots keep a
        # distinct prefix.
        prefix = os.path.splitext(filename)[0]
        augment_and_save_image(image_path, output_class_dir, prefix, num_augmented)

## Plot New Class Distribution

In [None]:
# Folder containing the augmented images (one subfolder per class). Built
# with os.path.join so the path is valid on any OS, not only on Windows.
output_dir = os.path.join(".", "Train_Augmented")
plot_class_counts(output_dir)