## Data Preprocessing

In [4]:
import os
import numpy as np
from PIL import Image
import cv2
from hashlib import md5
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


## Config and storage

In [5]:

# --- Config ---
# Root folder holding one sub-directory per original emotion class.
INPUT_DIR = "../../data/raw/fer2013/train"
# Output archive path. np.savez_compressed appends ".npz" when it is missing,
# so the file written to disk is ".../train.npz".
OUTPUT_FILE = "../../data/processed/FC211003_Suneth/train"
TARGET_SIZE = (48, 48)  # (width, height) passed to PIL's Image.resize
# NOTE: despite its name, this value is passed to detectMultiScale as
# `scaleFactor` (the image-pyramid step), not as a confidence threshold.
MIN_FACE_CONFIDENCE = 1.1
MIN_NEIGHBORS = 4  # passed to detectMultiScale as `minNeighbors`
# Original class folder name (lower-cased) -> merged training label.
# Class folders that are not listed here are skipped by the preprocessing loop.
MERGE_MAP = {
    "angry": "stressed",
    "disgust": "stressed",
    "happy": "happy",
    "sad": "sad",
    "neutral": "neutral"
}


# --- Storage ---
# Re-created on every run of this cell so a re-run starts from a clean state.
images = []
labels = []
seen_hashes = set()  # MD5 digests of files already processed (duplicate filter)

# CascadeClassifier does not raise when the XML file cannot be loaded; it
# returns an empty classifier instead. Fail fast here, otherwise every image
# would error out later inside the preprocessing loop's try/except.
cascade_path = os.path.join(cv2.data.haarcascades, "haarcascade_frontalface_default.xml")
face_cascade = cv2.CascadeClassifier(cascade_path)
if face_cascade.empty():
    raise RuntimeError(f"Could not load Haar cascade from {cascade_path}")


## Preprocessing

In [6]:
# Walk INPUT_DIR class by class and, for every image file:
#   1. skip it if an identical file (same MD5 of the raw bytes) was already seen,
#   2. convert it to 8-bit grayscale,
#   3. skip it if the Haar cascade finds no face,
#   4. resize to TARGET_SIZE, scale pixels to [0, 1] and add a channel axis,
#   5. append the array to `images` and the merged class name to `labels`.
print("Starting preprocessing...")

# sorted() makes the traversal order deterministic. os.listdir returns entries
# in arbitrary order, which would make both the sample order and the choice of
# which duplicate (and therefore which label) is kept vary between runs.
for original_class in tqdm(sorted(os.listdir(INPUT_DIR)), desc="Classes"):
    class_path = os.path.join(INPUT_DIR, original_class)
    if not os.path.isdir(class_path):
        continue

    merged_class = MERGE_MAP.get(original_class.lower())
    if merged_class is None:
        continue  # class has no entry in MERGE_MAP: drop it entirely

    for file_name in tqdm(sorted(os.listdir(class_path)), desc=original_class, leave=False):
        fpath = os.path.join(class_path, file_name)

        try:
            # Hash the raw file bytes to detect exact duplicates
            with open(fpath, 'rb') as f:
                img_hash = md5(f.read()).hexdigest()
            if img_hash in seen_hashes:
                continue  # skip duplicates
            seen_hashes.add(img_hash)

            # Load and convert to grayscale; the context manager releases the
            # underlying file handle as soon as the pixel data has been copied.
            with Image.open(fpath) as img:
                img_gray = img.convert("L")
            img_np = np.array(img_gray)

            # Face detection (only used as a keep/skip filter; the image is not cropped)
            faces = face_cascade.detectMultiScale(img_np, scaleFactor=MIN_FACE_CONFIDENCE, minNeighbors=MIN_NEIGHBORS)
            if len(faces) == 0:
                continue  # skip if no face detected

            # Resize and normalize. The division is done in float64 exactly as
            # before and then cast to float32 per image, so the final array is
            # unchanged while the in-memory list is half the size.
            img_resized = img_gray.resize(TARGET_SIZE)
            img_array = (np.array(img_resized) / 255.0).astype(np.float32)
            img_array = np.expand_dims(img_array, axis=-1)  # shape: (48, 48, 1)

            images.append(img_array)
            labels.append(merged_class)

        except Exception as e:
            # Best effort: report unreadable/corrupt files and keep going
            print(f"[ERROR] {fpath}: {e}")


Starting preprocessing...


Classes:   0%|          | 0/7 [00:00<?, ?it/s]

angry:   0%|          | 0/3995 [00:00<?, ?it/s]

disgust:   0%|          | 0/436 [00:00<?, ?it/s]

happy:   0%|          | 0/7215 [00:00<?, ?it/s]

neutral:   0%|          | 0/4965 [00:00<?, ?it/s]

sad:   0%|          | 0/4830 [00:00<?, ?it/s]

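## Encode labels and save

The merged class names are integer-encoded and then one-hot encoded. The images and labels are saved together as a single compressed `.npz` file.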

In [7]:
print("Encoding labels...")
# Without any samples to_categorical fails with an unrelated-looking error,
# so stop with an explicit message instead.
if not labels:
    raise RuntimeError(
        "No samples were collected - run the preprocessing cell first and check INPUT_DIR."
    )

le = LabelEncoder()
y_encoded = le.fit_transform(labels)  # class name -> integer index (sorted class order)
y_onehot = to_categorical(y_encoded, num_classes=len(le.classes_))

# --- Convert and Save ---
X = np.array(images, dtype=np.float32)    # (n_samples, 48, 48, 1), values in [0, 1]
y = np.array(y_onehot, dtype=np.float32)  # (n_samples, n_classes) one-hot rows

# np.savez_compressed does not create missing parent directories.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

print(f"Saving {X.shape[0]} samples to {OUTPUT_FILE}...")
# `classes` stores the class name for each one-hot column so the labels can be
# decoded later without re-fitting the encoder. It is kept as a plain string
# array so the archive loads without allow_pickle. Existing readers that only
# use X and y are unaffected by the extra key.
np.savez_compressed(OUTPUT_FILE, X=X, y=y, classes=np.asarray(le.classes_, dtype=str))
print("Done.")


Encoding labels...
Saving 6551 samples to ../../data/processed/FC211003_Suneth/train...
Done.
