## Data Preprocessing

In [8]:
import os
import numpy as np
from PIL import Image
import cv2
from hashlib import md5
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import random


# Config and storage

In [12]:
# --- Paths (relative to the notebook's working directory) ---
INPUT_DIR = "../../data/raw/fer2013/train"
OUTPUT_FILE = "../../data/processed/FC211003_Suneth/train.npz"

# --- Image / sampling settings ---
TARGET_SIZE = (48, 48)            # size passed to PIL's resize() before saving
MAX_IMAGES_PER_CLASS = 4000       # per-class cap, applied to every mapped class

# --- Face detector settings ---
# NOTE: despite its name, MIN_FACE_CONFIDENCE is passed as `scaleFactor` to
# detectMultiScale() in the processing cell; it is not a confidence score.
MIN_FACE_CONFIDENCE = 1.1
MIN_NEIGHBORS = 4

# --- Final class map: original folder name -> output label ---
# A value of None means the class is dropped from the dataset.
MERGE_MAP = dict(
    happy="happy",
    sad="sad",
    angry="stressed",   # angry is relabelled as stressed
    disgust=None,       # removed
    fear="fear",
    neutral="neutral",
    surprise=None,      # removed
)

# MD5 digests of files already processed (used for duplicate detection)
seen_hashes = set()

## PreProcessing

### Step 1 - Dataset Scanning Plan

In [10]:
# --- Step 1: build the list of (file path, mapped label) pairs to process ---
# Number of files queued so far per *mapped* class, so the cap is applied to the
# label that ends up in the dataset rather than to the original folder name.
class_image_counts = {}

# (file path, mapped class) tuples consumed by the loading/filtering step
all_files = []

# os.listdir() returns entries in arbitrary order, so both listings are sorted to
# make the subset selected under MAX_IMAGES_PER_CLASS reproducible across runs.
for original_class in tqdm(sorted(os.listdir(INPUT_DIR)), desc="Classes"):
    class_path = os.path.join(INPUT_DIR, original_class)
    if not os.path.isdir(class_path):
        continue

    # MERGE_MAP yields None both for classes marked for removal and for folder
    # names it does not know about; either way the folder is skipped.
    mapped_class = MERGE_MAP.get(original_class.lower())
    if mapped_class is None:
        print(f"[SKIP] Class '{original_class}' removed as per EDA.")
        continue

    # Only regular files are candidates; a nested directory would otherwise use
    # up a slot under the cap and then fail when it is opened as an image.
    candidate_paths = [
        os.path.join(class_path, file_name)
        for file_name in sorted(os.listdir(class_path))
    ]
    candidate_paths = [path for path in candidate_paths if os.path.isfile(path)]

    # Take only as many files as the mapped class still has room for.
    already_queued = class_image_counts.get(mapped_class, 0)
    remaining_slots = max(MAX_IMAGES_PER_CLASS - already_queued, 0)
    selected_paths = candidate_paths[:remaining_slots]

    all_files.extend((path, mapped_class) for path in selected_paths)
    class_image_counts[mapped_class] = already_queued + len(selected_paths)


Classes:   0%|          | 0/7 [00:00<?, ?it/s]

angry:   0%|          | 0/3995 [00:00<?, ?it/s]

[SKIP] Class 'disgust' removed as per EDA.


fear:   0%|          | 0/4097 [00:00<?, ?it/s]

happy:   0%|          | 0/7215 [00:00<?, ?it/s]

neutral:   0%|          | 0/4965 [00:00<?, ?it/s]

sad:   0%|          | 0/4830 [00:00<?, ?it/s]

[SKIP] Class 'surprise' removed as per EDA.


### Step 2 - Load Images, Remove Duplicates, and Filter Bad Samples

In [13]:
print("📦 Starting image loading and filtering...")

# Start from empty accumulators on every execution so the cell is idempotent:
# with a leftover `seen_hashes`, a second run would treat every file as a
# duplicate, and with leftover `images`/`labels` the arrays could be doubled.
images = []
labels = []
seen_hashes = set()

# Face detector used below.
# NOTE(review): the notebook never shows where `face_cascade` was created; this
# assumes OpenCV's stock frontal-face Haar cascade — confirm it matches the one
# originally used.
face_cascade = cv2.CascadeClassifier(
    os.path.join(cv2.data.haarcascades, "haarcascade_frontalface_default.xml")
)
if face_cascade.empty():
    raise RuntimeError("Failed to load the Haar cascade used for face detection.")

# Why each rejected file was dropped, reported once the loop finishes
skip_counts = {"duplicate": 0, "blank_or_small": 0, "no_face": 0, "error": 0}

for fpath, mapped_class in tqdm(all_files, desc="Processing Images"):
    try:
        # Step 1: Remove duplicate images (byte-identical files share an MD5 digest)
        with open(fpath, 'rb') as f:
            img_hash = md5(f.read()).hexdigest()
        if img_hash in seen_hashes:
            skip_counts["duplicate"] += 1
            continue
        seen_hashes.add(img_hash)

        # Step 2: Load image and convert to grayscale
        with Image.open(fpath) as img:
            img_np = np.array(img.convert("L"))

        # Step 3: Skip blank or too small images
        if img_np.shape[0] < 32 or img_np.shape[1] < 32 or np.mean(img_np) < 5:
            skip_counts["blank_or_small"] += 1
            continue

        # Step 4: Face detection
        # MIN_FACE_CONFIDENCE is used as the detector's scaleFactor here.
        faces = face_cascade.detectMultiScale(
            img_np, scaleFactor=MIN_FACE_CONFIDENCE, minNeighbors=MIN_NEIGHBORS
        )
        if len(faces) == 0:
            skip_counts["no_face"] += 1
            continue  # Skip if no face detected

        # Step 5: Resize image
        img_resized = Image.fromarray(img_np).resize(TARGET_SIZE, Image.BILINEAR)

        # Step 6: Normalize pixels to [0, 1]
        img_array = np.array(img_resized, dtype=np.float32) / 255.0
        img_array = np.expand_dims(img_array, axis=-1)  # append a channel axis

        # Step 7: Save to final arrays
        images.append(img_array)
        labels.append(mapped_class)

    except Exception as e:
        # Best effort: report the file and carry on with the rest
        skip_counts["error"] += 1
        print(f"[ERROR] Failed on {fpath}: {e}")

print(f"Kept {len(images)} of {len(all_files)} files; skipped: {skip_counts}")


📦 Starting image loading and filtering...


Processing Images:   0%|          | 0/19995 [00:00<?, ?it/s]

### Step 3 - Encode Labels and Save the Dataset
The output is saved as a single `.npz` file.

In [14]:
# LabelEncoder and to_categorical are already imported in the notebook's first
# cell, so they are not imported again here.

print("🔤 Encoding labels...")

# Fail early with a clear message instead of an obscure error further down
if not labels:
    raise ValueError("No samples survived filtering; nothing to encode or save.")

le = LabelEncoder()
# Integer ids follow the sorted order of the class names (see le.classes_)
y_encoded = le.fit_transform(labels)
# One column per encoded class; num_classes is passed explicitly rather than
# being inferred from the largest id present.
y_onehot = to_categorical(y_encoded, num_classes=len(le.classes_))

# Convert to numpy arrays
X = np.array(images, dtype=np.float32)
y = np.array(y_onehot, dtype=np.float32)
assert X.shape[0] == y.shape[0], "images and labels are out of sync"

# Create the output directory if needed; savez_compressed does not create it
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save to disk; label_names maps each one-hot column index back to its class name
print(f"💾 Saving {X.shape[0]} samples to {OUTPUT_FILE}...")
np.savez_compressed(OUTPUT_FILE, X=X, y=y, label_names=le.classes_)
print("✅ Dataset saved successfully!")


🔤 Encoding labels...
💾 Saving 5692 samples to ../../data/processed/FC211003_Suneth/train.npz...
✅ Dataset saved successfully!
