In [2]:
"""
Notebook 03: Export Combined (Original + Augmented) Images to NPY
==================================================================
Combines original images with augmented minority class samples
"""

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import json
import os

# Opening banner for the export run.
_RULE = "=" * 70
print(_RULE)
print("EXPORTING BALANCED DATASET TO NPY FORMAT")
print(_RULE)

# ==========================================
# 0) Configuration
# ==========================================
# Edge lengths, in pixels, that every original image is resized to.
# The target is square, so the width/height ordering does not matter here.
FINAL_SIZE = (256, 256)
print("\nTarget image size: {}×{}".format(*FINAL_SIZE))

# ==========================================
# 1) Load Original Split Information
# ==========================================
print("\n[1/5] Loading original split information...")

# Per-split manifests plus the full cleaned manifest. Later steps read the
# "path" column of all four frames and the "label" column of the full one.
train_df = pd.read_csv("train_manifest.csv")
val_df = pd.read_csv("val_manifest.csv")
test_df = pd.read_csv("test_manifest.csv")
df = pd.read_csv("manifest_clean.csv")

# JSON text is UTF-8 by specification. Naming the encoding explicitly makes
# class names with non-ASCII characters decode identically on every platform
# instead of depending on the locale's default encoding.
with open("classes.json", encoding="utf-8") as f:
    class_names = json.load(f)

print("✓ Original data:")
print(f"  Total: {len(df)}")
print(f"  Train: {len(train_df)}")
print(f"  Val: {len(val_df)}")
print(f"  Test: {len(test_df)}")

# ==========================================
# 2) Load Augmented Data (if exists)
# ==========================================
print("\n[2/5] Loading augmented data...")

_AUG_IMAGES_FILE = "augmented_train_images.npy"
_AUG_LABELS_FILE = "augmented_train_labels.npy"

# Augmentation is optional: its presence is signalled by the image array file.
has_augmented = os.path.exists(_AUG_IMAGES_FILE)

if has_augmented:
    # The two arrays are only meaningful as a pair, so fail with an explicit
    # message when the label file is absent rather than deep inside np.load.
    if not os.path.exists(_AUG_LABELS_FILE):
        raise FileNotFoundError(
            f"Found {_AUG_IMAGES_FILE} but not {_AUG_LABELS_FILE}; "
            "both files are required to use augmented data"
        )

    augmented_images = np.load(_AUG_IMAGES_FILE)
    # NOTE(review): allow_pickle=True executes pickled payloads on load, so
    # this file must come from a trusted source (presumably it is needed
    # because the labels were saved as an object array - confirm).
    augmented_labels = np.load(_AUG_LABELS_FILE, allow_pickle=True)

    # A length mismatch would silently misalign images and labels once the
    # arrays are appended to the original data, so reject it here.
    if len(augmented_images) != len(augmented_labels):
        raise ValueError(
            f"Augmented data is inconsistent: {len(augmented_images)} images "
            f"but {len(augmented_labels)} labels"
        )

    print(f"✓ Loaded {len(augmented_images)} augmented images")
else:
    augmented_images = np.array([])
    augmented_labels = np.array([])
    print("⚠ No augmented data found - using original data only")

# ==========================================
# 3) Load and Resize Original Images
# ==========================================
print("\n[3/5] Loading and resizing ORIGINAL images...")

from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns integer ids following the sorted order of the fitted
# class names; that order is exposed as le.classes_.
le = LabelEncoder()
le.fit(class_names)

paths = df["path"].values
labels_encoded = le.transform(df["label"].values)

# PIL sizes are (width, height) whereas NumPy image arrays are
# (height, width, channels); unpack once so both conventions stay consistent
# even if FINAL_SIZE is ever made non-square.
_target_w, _target_h = FINAL_SIZE

# Pre-allocate the output instead of stacking a list afterwards: this avoids
# holding two full copies of the dataset in memory, yields a well-formed
# (0, H, W, 3) array for an empty manifest, and leaves an all-zero (black)
# placeholder in every slot whose image fails to load.
original_images = np.zeros((len(paths), _target_h, _target_w, 3), dtype=np.uint8)
_failed_paths = []

for _i, path in enumerate(tqdm(paths, desc="Processing original images")):
    try:
        # The context manager closes the underlying file handle promptly;
        # convert() fully loads the pixel data before the file is closed.
        with Image.open(path) as img:
            img_resized = img.convert('RGB').resize(FINAL_SIZE, resample=Image.BILINEAR)
        original_images[_i] = np.asarray(img_resized, dtype=np.uint8)
    except Exception as e:
        # Deliberate best effort: keep the row (so indices stay aligned with
        # the manifest) but leave it as a black placeholder and report it.
        print(f"\n⚠ Error loading {path}: {e}")
        _failed_paths.append(path)

if _failed_paths:
    print(f"⚠ {len(_failed_paths)} image(s) could not be loaded and were "
          "replaced by all-black placeholders")
print(f"✓ Original images shape: {original_images.shape}")

# ==========================================
# 4) Create Balanced Training Split
# ==========================================
print("\n[4/5] Creating balanced training split...")

# Map every manifest path to its row position, which is also its position in
# original_images / labels_encoded.
path_to_idx = {path: idx for idx, path in enumerate(df["path"].values)}


def _split_indices(split_df, split_name):
    """Return the manifest row indices of the paths listed in *split_df*.

    The result is always an int64 array, so an empty split still produces a
    valid index array (a bare np.array([]) would be float64 and unusable for
    indexing).

    Raises:
        KeyError: if a split path does not occur in the full manifest.
    """
    split_paths = split_df["path"].values
    missing = [p for p in split_paths if p not in path_to_idx]
    if missing:
        raise KeyError(
            f"{len(missing)} path(s) of the {split_name} split are not in "
            f"manifest_clean.csv, e.g. {missing[0]!r}"
        )
    return np.array([path_to_idx[p] for p in split_paths], dtype=np.int64)


# Original train indices
train_indices_original = _split_indices(train_df, "train")

# Val and test indices (unchanged)
val_indices = _split_indices(val_df, "val")
test_indices = _split_indices(test_df, "test")

# Samples shared between splits would leak evaluation data into training;
# report them, but leave the decision to the user.
for _name_a, _idx_a, _name_b, _idx_b in (
    ("train", train_indices_original, "val", val_indices),
    ("train", train_indices_original, "test", test_indices),
    ("val", val_indices, "test", test_indices),
):
    _shared = len(set(_idx_a.tolist()) & set(_idx_b.tolist()))
    if _shared:
        print(f"⚠ {_shared} image(s) appear in both {_name_a} and {_name_b} "
              "splits - possible data leakage")

if has_augmented:
    # Concatenation needs identical per-image shapes; check up front so the
    # error names the actual cause.
    if augmented_images.shape[1:] != original_images.shape[1:]:
        raise ValueError(
            f"Augmented images have shape {augmented_images.shape[1:]} but "
            f"original images have shape {original_images.shape[1:]}"
        )
    if augmented_images.dtype != original_images.dtype:
        # np.concatenate silently upcasts to a common dtype, which changes
        # the dtype (and memory footprint) of the whole combined array.
        print(f"⚠ Augmented images are {augmented_images.dtype} but original "
              f"images are {original_images.dtype}; the combined array will be upcast")

    # Combine original images with augmented ones
    all_images = np.concatenate([original_images, augmented_images], axis=0)

    # Encode augmented labels with the same encoder as the original labels
    augmented_labels_encoded = le.transform(augmented_labels)
    all_labels = np.concatenate([labels_encoded, augmented_labels_encoded], axis=0)

    # Augmented samples are appended after the originals, so their indices
    # start at len(original_images). They are added to the train split only.
    augmented_start_idx = len(original_images)
    augmented_indices = np.arange(augmented_start_idx, augmented_start_idx + len(augmented_images))
    train_indices = np.concatenate([train_indices_original, augmented_indices])

    print("✓ Combined dataset:")
    print(f"  Original train: {len(train_indices_original)} images")
    print(f"  Augmented: {len(augmented_images)} images")
    print(f"  Total train: {len(train_indices)} images")
else:
    all_images = original_images
    all_labels = labels_encoded
    train_indices = train_indices_original
    print("✓ Using original data only (no augmentation)")

print(f"  Val: {len(val_indices)} images")
print(f"  Test: {len(test_indices)} images")
print(f"  Total: {len(all_images)} images")

# ==========================================
# 5) Save Everything
# ==========================================
print("\n[5/5] Saving final NPY files...")

np.save("images.npy", all_images)
np.save("labels.npy", all_labels.astype(np.int64))
np.save("split_train.npy", train_indices)
np.save("split_val.npy", val_indices)
np.save("split_test.npy", test_indices)

# The integer labels were produced by the LabelEncoder, so label i refers to
# le.classes_[i] (sorted order). Saving the encoder's own ordering keeps
# class_names.npy[i] correct even if classes.json is not sorted.
np.save("class_names.npy", np.asarray(le.classes_))

# Save paths as a plain string array (an object array would be pickled and
# need allow_pickle=True to load) and pad it with synthetic tags for the
# appended augmented samples so that filepaths.npy is index-aligned with
# images.npy and labels.npy. The leading entries are the original paths.
_n_appended = len(all_images) - len(paths)
_all_paths = [str(p) for p in paths]
_all_paths.extend(f"<augmented:{k}>" for k in range(_n_appended))
np.save("filepaths.npy", np.array(_all_paths, dtype=str))

print("✓ Saved files:")
print("  - images.npy (all images including augmented)")
print("  - labels.npy (all labels)")
print("  - split_train.npy (train indices - includes augmented)")
print("  - split_val.npy")
print("  - split_test.npy")
print("  - class_names.npy")
print("  - filepaths.npy")

# ==========================================
# 6) Verification
# ==========================================
print("\n" + "="*70)
print("VERIFICATION")
print("="*70)

train_labels = all_labels[train_indices]
val_labels = all_labels[val_indices]
test_labels = all_labels[test_indices]

# Integer label i corresponds to le.classes_[i] (the encoder's sorted order),
# which is not necessarily the order of classes.json; label the rows with the
# encoder's ordering so each count is attributed to the right class.
_encoder_classes = list(le.classes_)
_train_counts = [int(np.sum(train_labels == i)) for i in range(len(_encoder_classes))]
_val_counts = [int(np.sum(val_labels == i)) for i in range(len(_encoder_classes))]
_test_counts = [int(np.sum(test_labels == i)) for i in range(len(_encoder_classes))]

# Only claim "balanced" when every class has the same number of train samples.
_is_balanced = bool(_train_counts) and min(_train_counts) == max(_train_counts)

if _is_balanced:
    print("\n✓ Balanced class distribution:")
else:
    print("\n⚠ Class distribution (training classes are NOT balanced):")
print(f"\n{'Class':<20} {'Train':>8} {'Val':>8} {'Test':>8}")
print("-" * 60)

for cls, train_count, val_count, test_count in zip(
    _encoder_classes, _train_counts, _val_counts, _test_counts
):
    print(f"{cls:<20} {train_count:>8} {val_count:>8} {test_count:>8}")

print("-" * 60)
print(f"{'TOTAL':<20} {len(train_indices):>8} {len(val_indices):>8} {len(test_indices):>8}")

if _is_balanced:
    print("\n✓ Dataset is now balanced and ready for training!")
else:
    print("\n⚠ Training classes are not evenly balanced - review the augmentation step before training.")
print("="*70)

EXPORTING BALANCED DATASET TO NPY FORMAT

Target image size: 256×256

[1/5] Loading original split information...
✓ Original data:
  Total: 4752
  Train: 3326
  Val: 713
  Test: 713

[2/5] Loading augmented data...
✓ Loaded 2479 augmented images

[3/5] Loading and resizing ORIGINAL images...


Processing original images: 100%|██████████| 4752/4752 [00:25<00:00, 184.70it/s]


✓ Original images shape: (4752, 256, 256, 3)

[4/5] Creating balanced training split...
✓ Combined dataset:
  Original train: 3326 images
  Augmented: 2479 images
  Total train: 5805 images
  Val: 713 images
  Test: 713 images
  Total: 7231 images

[5/5] Saving final NPY files...
✓ Saved files:
  - images.npy (all images including augmented)
  - labels.npy (all labels)
  - split_train.npy (train indices - includes augmented)
  - split_val.npy
  - split_test.npy
  - class_names.npy
  - filepaths.npy

VERIFICATION

✓ Balanced class distribution:

Class                   Train      Val     Test
------------------------------------------------------------
Cardboard                 645       69       69
Food Organics             645       61       62
Glass                     645       63       63
Metal                     645      119      118
Miscellaneous Trash       645       74       75
Paper                     645       75       75
Plastic                   645      138      138
Text