In [6]:
import os
import cv2
import numpy as np
import random
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Dataset locations: the label table (labels.csv) sits directly inside the
# training directory.
base_dir = r"D:\Major Project\normalized\train"
labels_file = os.path.join(base_dir, "labels.csv")

# Random augmentation settings. Every range below is an upper bound for a
# value that is sampled anew for each generated image, not a fixed amount.
_augmentation_settings = {
    "rotation_range": 20,            # random rotation, up to 20 degrees
    "width_shift_range": 0.2,        # horizontal shift, up to 20% of the width
    "height_shift_range": 0.2,       # vertical shift, up to 20% of the height
    "shear_range": 0.15,             # shear intensity
    "zoom_range": 0.2,               # random zoom within [0.8, 1.2]
    "horizontal_flip": True,         # randomly mirror left/right
    "brightness_range": [0.8, 1.2],  # random brightness factor
    "fill_mode": "nearest",          # pad exposed pixels with the nearest value
}
datagen = ImageDataGenerator(**_augmentation_settings)

# Load the label table; everything below relies on its "class" column.
df = pd.read_csv(labels_file)

# Drop rows without a class label: a missing value (NaN) is not a string and
# would make os.path.join() fail when the class folder path is built below.
df = df.dropna(subset=["class"]).copy()

# Normalise class names (string type, trimmed, lowercase) so spelling variants
# of the same class collapse into a single folder and a single count.
df["class"] = df["class"].astype(str).str.strip().str.lower()

# Distinct class names present in labels.csv
unique_classes = df["class"].unique()

# Lower-cased names of the sub-folders currently present in base_dir.
# Not used further in this cell; kept because other cells may rely on it.
existing_folders = [
    folder.lower()
    for folder in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, folder))
]

# Make sure every labelled class has a folder, creating missing ones
for category in unique_classes:
    class_folder = os.path.join(base_dir, category)
    if not os.path.exists(class_folder):
        print(f"⚠ Warning: {class_folder} does not exist! Creating it now...")
        os.makedirs(class_folder, exist_ok=True)

# Count label rows per class; the most populated class defines the target size.
# (value_counts() ignores rows whose class is missing.)
class_counts = df["class"].value_counts()
target_count = class_counts.max()

# File extensions treated as images (matched case-insensitively) and the
# filename prefix that marks files produced by this script.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")
AUG_PREFIX = "aug_"

# Augment images for underrepresented classes
for category, count in class_counts.items():
    if count >= target_count:
        continue

    class_folder = os.path.join(base_dir, category)

    # A missing folder is handled like an empty one instead of crashing.
    try:
        folder_images = [
            f for f in os.listdir(class_folder)
            if f.lower().endswith(IMAGE_EXTENSIONS)
        ]
    except FileNotFoundError:
        folder_images = []

    # Only original images are used as augmentation sources. Files written by
    # an earlier run are excluded so augmentations are never re-augmented.
    images = [f for f in folder_images if not f.startswith(AUG_PREFIX)]
    already_augmented = len(folder_images) - len(images)

    if not images:
        print(f"⚠ Skipping {category}, no images found in {class_folder}")
        continue

    # Files from an earlier run count towards the target, which keeps the
    # cell safe to re-run without overshooting the balance.
    augment_needed = int(target_count - count) - already_augmented
    if augment_needed <= 0:
        print(f"✅ {category} already has enough augmented images, nothing to do.")
        continue

    print(f"🔄 Augmenting {category}: Need {augment_needed} more images.")

    created = 0
    next_index = already_augmented  # continue numbering after earlier outputs
    while created < augment_needed and images:
        img_file = random.choice(images)  # Pick a random source image
        img_path = os.path.join(class_folder, img_file)

        # Never overwrite an existing file; move on to the next index instead.
        aug_filename = f"{AUG_PREFIX}{next_index}_{img_file}"
        aug_path = os.path.join(class_folder, aug_filename)
        next_index += 1
        if os.path.exists(aug_path):
            continue

        # cv2.imread returns None (no exception) when a file cannot be decoded.
        img = cv2.imread(img_path)
        if img is None:
            print(f"⚠ Could not read {img_path}, removing it from the source pool.")
            images.remove(img_file)
            continue

        # OpenCV loads BGR; convert to RGB and add the batch axis that
        # datagen.flow expects.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.expand_dims(img, axis=0)

        # Generate one randomly transformed copy. The generator yields floats,
        # so clip to the valid 8-bit range before casting to avoid wrap-around.
        augmented = datagen.flow(img, batch_size=1)
        aug_img = np.clip(next(augmented)[0], 0, 255).astype(np.uint8)

        # cv2.imwrite reports failure through its return value; fail loudly
        # rather than counting an image that was never written.
        if not cv2.imwrite(aug_path, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR)):
            raise OSError(f"Failed to write augmented image: {aug_path}")

        created += 1

    if created < augment_needed:
        print(
            f"⚠ {category}: only {created} of {augment_needed} images created, "
            "no readable source images left."
        )

print("✅ Data augmentation complete! The dataset is now more balanced.")


🔄 Augmenting implant: Need 3458 more images.
🔄 Augmenting cavity: Need 4666 more images.
🔄 Augmenting impacted tooth: Need 4814 more images.
✅ Data augmentation complete! The dataset is now more balanced.
