## Create the datasets used in this project with this notebook (.ipynb file)

In [1]:
# Smoke-test cell: prints a fixed greeting.
print("hello world")

hello world


In [None]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

# Builds the 299x299 dataset: reads every image under RAW_DATA_DIR/<class>/,
# converts it to RGB, resizes and scales it, splits the data into
# train/validation/test sets and writes each split to PROCESSED_DIR as .npy.

# --- 1. Configuration ---
# Input/output locations and preprocessing parameters.
RAW_DATA_DIR = os.path.join('raw_data', 'BrinjalFruitX')
PROCESSED_DIR = os.path.join('processed_data', 'BrinjalFruitX_299x299')
IMG_SIZE = (299, 299)  # handed to cv2.resize, which expects (width, height)
TEST_SPLIT_SIZE = 0.20  # 20% for the final test set
VALIDATION_SPLIT_SIZE = 0.125  # 10% of the original data (0.125 * 0.8 = 0.1)
RANDOM_STATE = 42  # Seed for both train_test_split calls

# --- 2. Create Processed Data Directory ---
# Make sure the output folder exists before anything is saved into it.
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Directory '{PROCESSED_DIR}' is ready.")

# --- 3. Load Images and Labels ---
images = []
labels = []

# Every sub-directory of the raw data folder is one class; sorting keeps the
# class-name -> integer index assignment stable.
class_names = sorted([d for d in os.listdir(RAW_DATA_DIR) if os.path.isdir(os.path.join(RAW_DATA_DIR, d))])
label_map = {name: i for i, name in enumerate(class_names)}

print("Starting image loading and preprocessing...")
for class_name in tqdm(class_names, desc="Processing classes"):
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    class_label = label_map[class_name]

    # os.listdir returns entries in arbitrary order. Sort them so the sample
    # order -- and therefore the seeded splits below -- is identical on every
    # run and every machine.
    for image_file in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_file)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded, so unreadable entries are reported and skipped.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMG_SIZE)

        images.append(image)
        labels.append(class_label)

print("Image loading complete.")

# --- 4. Convert to NumPy Arrays and Normalize ---
images_np = np.array(images)
labels_np = np.array(labels)

# Scale pixel values from the [0, 255] range to the [0.0, 1.0] range.
images_np = images_np / 255.0

print(f"Converted to NumPy arrays. Image data shape: {images_np.shape}, Labels shape: {labels_np.shape}")

# --- 5. Split the Data ---
# First split: hold out the test set. Stratifying keeps the class
# distribution similar across the splits.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    images_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,
)

# Second split: carve the validation set out of the remaining data
# (0.125 of the remaining 80% is 10% of the original data).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_train_val,
)

print("Data splitting complete:")
print(f"  Training set:   {X_train.shape[0]} samples")
print(f"  Validation set: {X_val.shape[0]} samples")
print(f"  Test set:       {X_test.shape[0]} samples")

# --- 6. Save the Processed Data ---
print("Saving processed data to .npy files...")

np.save(os.path.join(PROCESSED_DIR, 'X_train.npy'), X_train)
np.save(os.path.join(PROCESSED_DIR, 'y_train.npy'), y_train)

np.save(os.path.join(PROCESSED_DIR, 'X_val.npy'), X_val)
np.save(os.path.join(PROCESSED_DIR, 'y_val.npy'), y_val)

np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Save the ordered class names; a label's integer value is its index in this
# list, which is what is needed to decode predictions later.
with open(os.path.join(PROCESSED_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)

print("All data has been processed and saved successfully! ✅")


Directory 'processed_data\BrinjalFruitX_299x299' is ready.
Starting image loading and preprocessing...


Processing classes: 100%|██████████| 5/5 [00:47<00:00,  9.57s/it]


Image loading complete.
Converted to NumPy arrays. Image data shape: (1802, 299, 299, 3), Labels shape: (1802,)
Data splitting complete:
  Training set:   1260 samples
  Validation set: 181 samples
  Test set:       361 samples
Saving processed data to .npy files...
All data has been processed and saved successfully! ✅


In [4]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

# Builds the 600x600 dataset: reads every image under RAW_DATA_DIR/<class>/,
# converts it to RGB, resizes and scales it, splits the data into
# train/validation/test sets and writes each split to PROCESSED_DIR as .npy.

# --- 1. Configuration ---
# Input/output locations and preprocessing parameters.
RAW_DATA_DIR = os.path.join('raw_data', 'BrinjalFruitX')
PROCESSED_DIR = os.path.join('processed_data', 'BrinjalFruitX_600x600')
IMG_SIZE = (600, 600)  # handed to cv2.resize, which expects (width, height)
TEST_SPLIT_SIZE = 0.20  # 20% for the final test set
VALIDATION_SPLIT_SIZE = 0.125  # 10% of the original data (0.125 * 0.8 = 0.1)
RANDOM_STATE = 42  # Seed for both train_test_split calls

# --- 2. Create Processed Data Directory ---
# Make sure the output folder exists before anything is saved into it.
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Directory '{PROCESSED_DIR}' is ready.")

# --- 3. Load Images and Labels ---
images = []
labels = []

# Every sub-directory of the raw data folder is one class; sorting keeps the
# class-name -> integer index assignment stable.
class_names = sorted([d for d in os.listdir(RAW_DATA_DIR) if os.path.isdir(os.path.join(RAW_DATA_DIR, d))])
label_map = {name: i for i, name in enumerate(class_names)}

print("Starting image loading and preprocessing...")
for class_name in tqdm(class_names, desc="Processing classes"):
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    class_label = label_map[class_name]

    # os.listdir returns entries in arbitrary order. Sort them so the sample
    # order -- and therefore the seeded splits below -- is identical on every
    # run and every machine.
    for image_file in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_file)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded, so unreadable entries are reported and skipped.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMG_SIZE)

        images.append(image)
        labels.append(class_label)

print("Image loading complete.")

# --- 4. Convert to NumPy Arrays and Normalize ---
images_np = np.array(images)
labels_np = np.array(labels)

# Scale pixel values from the [0, 255] range to the [0.0, 1.0] range.
images_np = images_np / 255.0

print(f"Converted to NumPy arrays. Image data shape: {images_np.shape}, Labels shape: {labels_np.shape}")

# --- 5. Split the Data ---
# First split: hold out the test set. Stratifying keeps the class
# distribution similar across the splits.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    images_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,
)

# Second split: carve the validation set out of the remaining data
# (0.125 of the remaining 80% is 10% of the original data).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_train_val,
)

print("Data splitting complete:")
print(f"  Training set:   {X_train.shape[0]} samples")
print(f"  Validation set: {X_val.shape[0]} samples")
print(f"  Test set:       {X_test.shape[0]} samples")

# --- 6. Save the Processed Data ---
print("Saving processed data to .npy files...")

np.save(os.path.join(PROCESSED_DIR, 'X_train.npy'), X_train)
np.save(os.path.join(PROCESSED_DIR, 'y_train.npy'), y_train)

np.save(os.path.join(PROCESSED_DIR, 'X_val.npy'), X_val)
np.save(os.path.join(PROCESSED_DIR, 'y_val.npy'), y_val)

np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Save the ordered class names; a label's integer value is its index in this
# list, which is what is needed to decode predictions later.
with open(os.path.join(PROCESSED_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)

print("All data has been processed and saved successfully! ✅")


Directory 'processed_data\BrinjalFruitX_600x600' is ready.
Starting image loading and preprocessing...


Processing classes: 100%|██████████| 5/5 [00:44<00:00,  8.93s/it]


Image loading complete.
Converted to NumPy arrays. Image data shape: (1802, 600, 600, 3), Labels shape: (1802,)
Data splitting complete:
  Training set:   1260 samples
  Validation set: 181 samples
  Test set:       361 samples
Saving processed data to .npy files...
All data has been processed and saved successfully! ✅


In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import pandas as pd
# --- ADDED ---
from imblearn.over_sampling import RandomOverSampler

# Builds the class-balanced 224x224 dataset: loads and preprocesses every
# image under RAW_DATA_DIR/<class>/, splits into train/validation/test, then
# randomly oversamples the training split only and saves everything as .npy.

# --- 1. Configuration ---
# Input/output locations and preprocessing parameters.
RAW_DATA_DIR = os.path.join('raw_data', 'BrinjalFruitX')
PROCESSED_DIR = os.path.join('processed_data', 'BrinjalFruitX_balanced')
IMG_SIZE = (224, 224)  # handed to cv2.resize, which expects (width, height)
TEST_SPLIT_SIZE = 0.20  # 20% for the final test set
VALIDATION_SPLIT_SIZE = 0.125  # 10% of the original data (0.125 * 0.8 = 0.1)
RANDOM_STATE = 42  # Seed for the splits and the oversampler

# --- 2. Create Processed Data Directory ---
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Directory '{PROCESSED_DIR}' is ready.")

# --- 3. Load Images and Labels ---
images = []
labels = []

# Every sub-directory of the raw data folder is one class; sorting keeps the
# class-name -> integer index assignment stable.
class_names = sorted([d for d in os.listdir(RAW_DATA_DIR) if os.path.isdir(os.path.join(RAW_DATA_DIR, d))])
label_map = {name: i for i, name in enumerate(class_names)}

print("Starting image loading and preprocessing...")
for class_name in tqdm(class_names, desc="Processing classes"):
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    class_label = label_map[class_name]

    # os.listdir returns entries in arbitrary order. Sort them so the sample
    # order -- and therefore the seeded splits and oversampling below -- is
    # identical on every run and every machine.
    for image_file in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_file)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded, so unreadable entries are reported and skipped.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMG_SIZE)
        images.append(image)
        labels.append(class_label)

print("Image loading complete.")

# --- 4. Convert to NumPy Arrays and Normalize ---
images_np = np.array(images)
labels_np = np.array(labels)
# Scale pixel values from the [0, 255] range to the [0.0, 1.0] range.
images_np = images_np / 255.0

print(f"Converted to NumPy arrays. Image data shape: {images_np.shape}, Labels shape: {labels_np.shape}")

# --- 5. Split the Data ---
# First split: hold out the test set, stratified by class.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    images_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,
)

# Second split: carve the validation set out of the remaining data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_train_val,
)

print("\nData splitting complete:")
print(f"  Original Training set:   {X_train.shape[0]} samples")
print(f"  Validation set:          {X_val.shape[0]} samples")
print(f"  Test set:                {X_test.shape[0]} samples")

# --- Handle Class Imbalance using Oversampling on the Training Set ONLY ---
# Validation and test data are left untouched so evaluation reflects the
# original class distribution.
print("\nHandling class imbalance on the training set...")

# The oversampler works on 2-D input, so flatten each image to one row.
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)

ros = RandomOverSampler(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_reshaped, y_train)

# Restore the image dimensions from the array itself. cv2.resize takes
# (width, height) but yields arrays shaped (height, width, channels), so
# rebuilding the shape from IMG_SIZE would scramble non-square images.
X_train_balanced = X_train_resampled.reshape(-1, *X_train.shape[1:])
y_train_balanced = y_train_resampled

print("\n--- Class Distribution After Oversampling (Training Set) ---")
unique, counts = np.unique(y_train_balanced, return_counts=True)
print(dict(zip(unique, counts)))
print(f"  Balanced Training set:   {X_train_balanced.shape[0]} samples")

# --- 6. Save the Processed Data ---
print("\nSaving processed data to .npy files...")

np.save(os.path.join(PROCESSED_DIR, 'X_train.npy'), X_train_balanced)
np.save(os.path.join(PROCESSED_DIR, 'y_train.npy'), y_train_balanced)

# Save the original, untouched validation and test sets
np.save(os.path.join(PROCESSED_DIR, 'X_val.npy'), X_val)
np.save(os.path.join(PROCESSED_DIR, 'y_val.npy'), y_val)

np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Save the ordered class names; a label's integer value is its index in this
# list.
with open(os.path.join(PROCESSED_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)

print("\nAll data has been processed and saved successfully! ✅")

Directory 'processed_data\BrinjalFruitX_balanced' is ready.
Starting image loading and preprocessing...


Processing classes: 100%|██████████| 5/5 [00:44<00:00,  8.91s/it]


Image loading complete.
Converted to NumPy arrays. Image data shape: (1802, 224, 224, 3), Labels shape: (1802,)

Data splitting complete:
  Original Training set:   1260 samples
  Validation set:          181 samples
  Test set:                361 samples

Handling class imbalance on the training set...

--- Class Distribution After Oversampling (Training Set) ---
{0: 507, 1: 507, 2: 507, 3: 507, 4: 507}
  Balanced Training set:   2535 samples

Saving processed data to .npy files...

All data has been processed and saved successfully! ✅


In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import pandas as pd
# --- ADDED ---
from imblearn.over_sampling import RandomOverSampler

# Builds the class-balanced 224x224 "classless" dataset: loads and
# preprocesses every image under RAW_DATA_DIR/<class>/, splits into
# train/validation/test, then randomly oversamples the training split only
# and saves everything as .npy.

# --- 1. Configuration ---
# Input/output locations and preprocessing parameters.
RAW_DATA_DIR = os.path.join('raw_data', 'BrinjalFruitX')
PROCESSED_DIR = os.path.join('processed_data', 'BrinjalFruitX_balanced_classless')
IMG_SIZE = (224, 224)  # handed to cv2.resize, which expects (width, height)
TEST_SPLIT_SIZE = 0.20  # 20% for the final test set
VALIDATION_SPLIT_SIZE = 0.125  # 10% of the original data (0.125 * 0.8 = 0.1)
RANDOM_STATE = 42  # Seed for the splits and the oversampler

# --- 2. Create Processed Data Directory ---
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Directory '{PROCESSED_DIR}' is ready.")

# --- 3. Load Images and Labels ---
images = []
labels = []

# Every sub-directory of the raw data folder is one class; sorting keeps the
# class-name -> integer index assignment stable.
class_names = sorted([d for d in os.listdir(RAW_DATA_DIR) if os.path.isdir(os.path.join(RAW_DATA_DIR, d))])
label_map = {name: i for i, name in enumerate(class_names)}

print("Starting image loading and preprocessing...")
for class_name in tqdm(class_names, desc="Processing classes"):
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    class_label = label_map[class_name]

    # os.listdir returns entries in arbitrary order. Sort them so the sample
    # order -- and therefore the seeded splits and oversampling below -- is
    # identical on every run and every machine.
    for image_file in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_file)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded, so unreadable entries are reported and skipped.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMG_SIZE)
        images.append(image)
        labels.append(class_label)

print("Image loading complete.")

# --- 4. Convert to NumPy Arrays and Normalize ---
images_np = np.array(images)
labels_np = np.array(labels)
# Scale pixel values from the [0, 255] range to the [0.0, 1.0] range.
images_np = images_np / 255.0

print(f"Converted to NumPy arrays. Image data shape: {images_np.shape}, Labels shape: {labels_np.shape}")

# --- 5. Split the Data ---
# First split: hold out the test set, stratified by class.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    images_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,
)

# Second split: carve the validation set out of the remaining data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_train_val,
)

print("\nData splitting complete:")
print(f"  Original Training set:   {X_train.shape[0]} samples")
print(f"  Validation set:          {X_val.shape[0]} samples")
print(f"  Test set:                {X_test.shape[0]} samples")

# --- Handle Class Imbalance using Oversampling on the Training Set ONLY ---
# Validation and test data are left untouched so evaluation reflects the
# original class distribution.
print("\nHandling class imbalance on the training set...")

# The oversampler works on 2-D input, so flatten each image to one row.
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)

ros = RandomOverSampler(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_reshaped, y_train)

# Restore the image dimensions from the array itself. cv2.resize takes
# (width, height) but yields arrays shaped (height, width, channels), so
# rebuilding the shape from IMG_SIZE would scramble non-square images.
X_train_balanced = X_train_resampled.reshape(-1, *X_train.shape[1:])
y_train_balanced = y_train_resampled

print("\n--- Class Distribution After Oversampling (Training Set) ---")
unique, counts = np.unique(y_train_balanced, return_counts=True)
print(dict(zip(unique, counts)))
print(f"  Balanced Training set:   {X_train_balanced.shape[0]} samples")

# --- 6. Save the Processed Data ---
print("\nSaving processed data to .npy files...")

np.save(os.path.join(PROCESSED_DIR, 'X_train.npy'), X_train_balanced)
np.save(os.path.join(PROCESSED_DIR, 'y_train.npy'), y_train_balanced)

# Save the original, untouched validation and test sets
np.save(os.path.join(PROCESSED_DIR, 'X_val.npy'), X_val)
np.save(os.path.join(PROCESSED_DIR, 'y_val.npy'), y_val)

np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Save the ordered class names; a label's integer value is its index in this
# list.
with open(os.path.join(PROCESSED_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)

print("\nAll data has been processed and saved successfully! ✅")

Directory 'processed_data\BrinjalFruitX_balanced_classless' is ready.
Starting image loading and preprocessing...


Processing classes: 100%|██████████| 2/2 [01:16<00:00, 38.49s/it]


Image loading complete.
Converted to NumPy arrays. Image data shape: (1239, 224, 224, 3), Labels shape: (1239,)

Data splitting complete:
  Original Training set:   867 samples
  Validation set:          124 samples
  Test set:                248 samples

Handling class imbalance on the training set...

--- Class Distribution After Oversampling (Training Set) ---
{0: 507, 1: 507}
  Balanced Training set:   1014 samples

Saving processed data to .npy files...

All data has been processed and saved successfully! ✅


In [2]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import pandas as pd
# --- ADDED ---
from imblearn.over_sampling import RandomOverSampler

# Builds the class-balanced 299x299 "classless" dataset: loads and
# preprocesses every image under RAW_DATA_DIR/<class>/, splits into
# train/validation/test, then randomly oversamples the training split only
# and saves everything as .npy.

# --- 1. Configuration ---
# Input/output locations and preprocessing parameters.
RAW_DATA_DIR = os.path.join('raw_data', 'BrinjalFruitX')
PROCESSED_DIR = os.path.join('processed_data', 'BrinjalFruitX_299x299_balanced_classless')
IMG_SIZE = (299, 299)  # handed to cv2.resize, which expects (width, height)
TEST_SPLIT_SIZE = 0.20  # 20% for the final test set
VALIDATION_SPLIT_SIZE = 0.125  # 10% of the original data (0.125 * 0.8 = 0.1)
RANDOM_STATE = 42  # Seed for the splits and the oversampler

# --- 2. Create Processed Data Directory ---
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Directory '{PROCESSED_DIR}' is ready.")

# --- 3. Load Images and Labels ---
images = []
labels = []

# Every sub-directory of the raw data folder is one class; sorting keeps the
# class-name -> integer index assignment stable.
class_names = sorted([d for d in os.listdir(RAW_DATA_DIR) if os.path.isdir(os.path.join(RAW_DATA_DIR, d))])
label_map = {name: i for i, name in enumerate(class_names)}

print("Starting image loading and preprocessing...")
for class_name in tqdm(class_names, desc="Processing classes"):
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    class_label = label_map[class_name]

    # os.listdir returns entries in arbitrary order. Sort them so the sample
    # order -- and therefore the seeded splits and oversampling below -- is
    # identical on every run and every machine.
    for image_file in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_file)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded, so unreadable entries are reported and skipped.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMG_SIZE)
        images.append(image)
        labels.append(class_label)

print("Image loading complete.")

# --- 4. Convert to NumPy Arrays and Normalize ---
images_np = np.array(images)
labels_np = np.array(labels)
# Scale pixel values from the [0, 255] range to the [0.0, 1.0] range.
images_np = images_np / 255.0

print(f"Converted to NumPy arrays. Image data shape: {images_np.shape}, Labels shape: {labels_np.shape}")

# --- 5. Split the Data ---
# First split: hold out the test set, stratified by class.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    images_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,
)

# Second split: carve the validation set out of the remaining data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_train_val,
)

print("\nData splitting complete:")
print(f"  Original Training set:   {X_train.shape[0]} samples")
print(f"  Validation set:          {X_val.shape[0]} samples")
print(f"  Test set:                {X_test.shape[0]} samples")

# --- Handle Class Imbalance using Oversampling on the Training Set ONLY ---
# Validation and test data are left untouched so evaluation reflects the
# original class distribution.
print("\nHandling class imbalance on the training set...")

# The oversampler works on 2-D input, so flatten each image to one row.
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)

ros = RandomOverSampler(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_reshaped, y_train)

# Restore the image dimensions from the array itself. cv2.resize takes
# (width, height) but yields arrays shaped (height, width, channels), so
# rebuilding the shape from IMG_SIZE would scramble non-square images.
X_train_balanced = X_train_resampled.reshape(-1, *X_train.shape[1:])
y_train_balanced = y_train_resampled

print("\n--- Class Distribution After Oversampling (Training Set) ---")
unique, counts = np.unique(y_train_balanced, return_counts=True)
print(dict(zip(unique, counts)))
print(f"  Balanced Training set:   {X_train_balanced.shape[0]} samples")

# --- 6. Save the Processed Data ---
print("\nSaving processed data to .npy files...")

np.save(os.path.join(PROCESSED_DIR, 'X_train.npy'), X_train_balanced)
np.save(os.path.join(PROCESSED_DIR, 'y_train.npy'), y_train_balanced)

# Save the original, untouched validation and test sets
np.save(os.path.join(PROCESSED_DIR, 'X_val.npy'), X_val)
np.save(os.path.join(PROCESSED_DIR, 'y_val.npy'), y_val)

np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Save the ordered class names; a label's integer value is its index in this
# list.
with open(os.path.join(PROCESSED_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)

print("\nAll data has been processed and saved successfully! ✅")

Directory 'processed_data\BrinjalFruitX_299x299_balanced_classless' is ready.
Starting image loading and preprocessing...


Processing classes: 100%|██████████| 2/2 [00:35<00:00, 17.82s/it]


Image loading complete.
Converted to NumPy arrays. Image data shape: (1239, 299, 299, 3), Labels shape: (1239,)

Data splitting complete:
  Original Training set:   867 samples
  Validation set:          124 samples
  Test set:                248 samples

Handling class imbalance on the training set...

--- Class Distribution After Oversampling (Training Set) ---
{0: 507, 1: 507}
  Balanced Training set:   1014 samples

Saving processed data to .npy files...

All data has been processed and saved successfully! ✅


In [3]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import pandas as pd
# --- ADDED ---
from imblearn.over_sampling import RandomOverSampler

# Builds the class-balanced 600x600 "classless" dataset: loads and
# preprocesses every image under RAW_DATA_DIR/<class>/, splits into
# train/validation/test, then randomly oversamples the training split only
# and saves everything as .npy.

# --- 1. Configuration ---
# Input/output locations and preprocessing parameters.
RAW_DATA_DIR = os.path.join('raw_data', 'BrinjalFruitX')
PROCESSED_DIR = os.path.join('processed_data', 'BrinjalFruitX_600x600_balanced_classless')
IMG_SIZE = (600, 600)  # handed to cv2.resize, which expects (width, height)
TEST_SPLIT_SIZE = 0.20  # 20% for the final test set
VALIDATION_SPLIT_SIZE = 0.125  # 10% of the original data (0.125 * 0.8 = 0.1)
RANDOM_STATE = 42  # Seed for the splits and the oversampler

# --- 2. Create Processed Data Directory ---
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Directory '{PROCESSED_DIR}' is ready.")

# --- 3. Load Images and Labels ---
images = []
labels = []

# Every sub-directory of the raw data folder is one class; sorting keeps the
# class-name -> integer index assignment stable.
class_names = sorted([d for d in os.listdir(RAW_DATA_DIR) if os.path.isdir(os.path.join(RAW_DATA_DIR, d))])
label_map = {name: i for i, name in enumerate(class_names)}

print("Starting image loading and preprocessing...")
for class_name in tqdm(class_names, desc="Processing classes"):
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    class_label = label_map[class_name]

    # os.listdir returns entries in arbitrary order. Sort them so the sample
    # order -- and therefore the seeded splits and oversampling below -- is
    # identical on every run and every machine.
    for image_file in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_file)

        # cv2.imread returns None instead of raising when a file cannot be
        # decoded, so unreadable entries are reported and skipped.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMG_SIZE)
        images.append(image)
        labels.append(class_label)

print("Image loading complete.")

# --- 4. Convert to NumPy Arrays and Normalize ---
images_np = np.array(images)
labels_np = np.array(labels)
# Scale pixel values from the [0, 255] range to the [0.0, 1.0] range.
images_np = images_np / 255.0

print(f"Converted to NumPy arrays. Image data shape: {images_np.shape}, Labels shape: {labels_np.shape}")

# --- 5. Split the Data ---
# First split: hold out the test set, stratified by class.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    images_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,
)

# Second split: carve the validation set out of the remaining data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=VALIDATION_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_train_val,
)

print("\nData splitting complete:")
print(f"  Original Training set:   {X_train.shape[0]} samples")
print(f"  Validation set:          {X_val.shape[0]} samples")
print(f"  Test set:                {X_test.shape[0]} samples")

# --- Handle Class Imbalance using Oversampling on the Training Set ONLY ---
# Validation and test data are left untouched so evaluation reflects the
# original class distribution.
print("\nHandling class imbalance on the training set...")

# The oversampler works on 2-D input, so flatten each image to one row.
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)

ros = RandomOverSampler(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_reshaped, y_train)

# Restore the image dimensions from the array itself. cv2.resize takes
# (width, height) but yields arrays shaped (height, width, channels), so
# rebuilding the shape from IMG_SIZE would scramble non-square images.
X_train_balanced = X_train_resampled.reshape(-1, *X_train.shape[1:])
y_train_balanced = y_train_resampled

print("\n--- Class Distribution After Oversampling (Training Set) ---")
unique, counts = np.unique(y_train_balanced, return_counts=True)
print(dict(zip(unique, counts)))
print(f"  Balanced Training set:   {X_train_balanced.shape[0]} samples")

# --- 6. Save the Processed Data ---
print("\nSaving processed data to .npy files...")

np.save(os.path.join(PROCESSED_DIR, 'X_train.npy'), X_train_balanced)
np.save(os.path.join(PROCESSED_DIR, 'y_train.npy'), y_train_balanced)

# Save the original, untouched validation and test sets
np.save(os.path.join(PROCESSED_DIR, 'X_val.npy'), X_val)
np.save(os.path.join(PROCESSED_DIR, 'y_val.npy'), y_val)

np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Save the ordered class names; a label's integer value is its index in this
# list.
with open(os.path.join(PROCESSED_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)

print("\nAll data has been processed and saved successfully! ✅")

Directory 'processed_data\BrinjalFruitX_600x600_balanced_classless' is ready.
Starting image loading and preprocessing...


Processing classes: 100%|██████████| 2/2 [00:35<00:00, 17.56s/it]


Image loading complete.
Converted to NumPy arrays. Image data shape: (1239, 600, 600, 3), Labels shape: (1239,)

Data splitting complete:
  Original Training set:   867 samples
  Validation set:          124 samples
  Test set:                248 samples

Handling class imbalance on the training set...

--- Class Distribution After Oversampling (Training Set) ---
{0: 507, 1: 507}
  Balanced Training set:   1014 samples

Saving processed data to .npy files...

All data has been processed and saved successfully! ✅
