                      Data Preprocessing


        
        Optional augmentations:
            - Rotation: ±15 degrees (faces can be slightly tilted)
            - Horizontal flip: ONLY for emotions whose expressions are not asymmetric
            - Brightness adjustment: ±10% (lighting variations)
            - Small zoom: ±5% (distance variations)
            - NO vertical flip (it would create upside-down faces!)

        Steps:
    
        
       

Total corrupted files found: 0

In [1]:
import os
import cv2
import numpy as np
#from tqdm import tqdm
from pathlib import Path
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2025-07-13 20:22:01.861061: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-13 20:22:01.866729: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-13 20:22:01.932325: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-13 20:22:01.934068: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Set paths.
# RAW_DIR: root of the raw FER2013 images; the pipeline reads RAW_DIR/<split>/<class>/.
# PROCESSED_DIR: root of the processed output; images are written to
# PROCESSED_DIR/<split>/<class>/.
RAW_DIR = Path("/app/data/raw/fer2013")
PROCESSED_DIR = Path("/app/data/processed/FC211002")

In [3]:
# FER2013 provides 7 emotion labels; the project works with 5.
# 'disgust' and 'fear' are folded into 'stressed', 'surprise' into 'happy'.
TARGET_CLASSES = ['angry', 'happy', 'sad', 'stressed', 'neutral']

# (FER2013 label, project label) pairs, in the original label order
CLASS_MAPPING = dict([
    ('angry', 'angry'),
    ('disgust', 'stressed'),
    ('fear', 'stressed'),
    ('happy', 'happy'),
    ('sad', 'sad'),
    ('surprise', 'happy'),
    ('neutral', 'neutral'),
])

In [4]:
#Create folder structure for processed data
def prepare_folder_structure():
    """Create PROCESSED_DIR/<split>/<class> for both splits and every target class.

    Directories that already exist are left as they are.
    """
    wanted_dirs = [
        os.path.join(PROCESSED_DIR, split_name, class_name)
        for split_name in ('train', 'test')
        for class_name in TARGET_CLASSES
    ]
    for directory in wanted_dirs:
        os.makedirs(directory, exist_ok=True)

In [None]:
# # Detect duplicates using image hashing (disabled: its call in preprocess_image_with_checks is commented out)
# def is_duplicate(img_array, seen_hashes):
#     img_hash = hash(img_array.tobytes())
#     if img_hash in seen_hashes:
#         return True
#     seen_hashes.add(img_hash)
#     return False

In [None]:
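# # Detect outlier images by mean brightness and contrast (disabled: its call in preprocess_image_with_checks is commented out)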
# def is_outlier(img_array):
#     brightness = np.mean(img_array)
#     contrast = np.std(img_array)
#     return brightness < 10 or brightness > 245 or contrast < 5

In [5]:
# Image Preprocessing with Validation
def preprocess_image_with_checks(img_path, seen_hashes):
    """Load one image as grayscale, validate it, and scale it to [0, 1].

    Args:
        img_path: Location of the image file (str or os.PathLike).
        seen_hashes: Set reserved for the duplicate check. That check is
            currently disabled, so the set is accepted but not used.

    Returns:
        An ``(image, issue)`` pair. On success ``image`` is a float32 array of
        shape (48, 48, 1) with values in [0, 1] and ``issue`` is None. On
        failure ``image`` is None and ``issue`` is a short reason string:
        "Missing or unreadable", "Incorrect size", or "Corrupt (<error>)".
    """
    try:
        # os.fspath lets callers pass pathlib.Path objects as well as strings
        img = cv2.imread(os.fspath(img_path), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            return None, "Missing or unreadable"
        if img.shape != (48, 48):
            return None, "Incorrect size"
        # if is_duplicate(img, seen_hashes):
        #     return None, "Duplicate"
        # if is_outlier(img):
        #     return None, "Outlier"

        # Normalize to [0, 1] and add a trailing channel axis -> (48, 48, 1)
        img = img.astype(np.float32) / 255.0
        img = np.expand_dims(img, axis=-1)
        return img, None
    except Exception as exc:
        # Catch Exception, not everything: KeyboardInterrupt / SystemExit must
        # still be able to stop a long run. Keep the cause so a skipped file
        # can be diagnosed from the log.
        return None, f"Corrupt ({type(exc).__name__}: {exc})"

In [6]:
# Data Augmentation Configuration
# One shared Keras generator; preprocess_and_save draws randomly transformed
# copies of each training image from it.
# NOTE(review): the notebook intro lists horizontal flips only for non-asymmetric
# emotions and does not mention shifts; this generator is applied to every class
# with flips and shifts enabled — confirm that is intended.
augmentor = ImageDataGenerator(
    rotation_range=15,  # random rotation, degrees
    width_shift_range=0.1,  # random horizontal shift, fraction of width
    height_shift_range=0.1,  # random vertical shift, fraction of height
    brightness_range=[0.9, 1.1],  # random brightness factor drawn from this range
    zoom_range=0.05,  # random zoom of about +/-5%
    horizontal_flip=True,  # random left-right mirroring
    fill_mode='nearest'  # pixels exposed by a transform copy the nearest edge value
)


In [7]:
# Main Preprocessing Pipeline
def preprocess_and_save(split='train', augment=False, augment_count=2):
    """Preprocess one raw split and write the result into the processed tree.

    Reads RAW_DIR/<split>/<original class>/, maps each original class through
    CLASS_MAPPING, validates every image with preprocess_image_with_checks and
    writes it as ``<name>_pre.png`` to PROCESSED_DIR/<split>/<mapped class>/.
    Images that fail validation are reported with print() and skipped.

    When an original class is renamed by the mapping (for example two source
    classes merged into one target class), the original class name is added as
    a filename prefix so that equal filenames from different source folders
    cannot overwrite each other in the shared output folder.

    Args:
        split: Name of the sub-folder of RAW_DIR to process.
        augment: If True, also write augmented copies of every kept image.
        augment_count: Number of augmented copies per image, written as
            ``<name>_aug<i>.png``.

    Returns:
        None. The function works through its file-system side effects.
    """
    input_split_dir = os.path.join(RAW_DIR, split)
    seen_hashes = set()  # shared by all images of the split; passed to the per-image check

    # sorted() makes the processing order independent of the file system
    for orig_class in sorted(os.listdir(input_split_dir)):
        orig_path = os.path.join(input_split_dir, orig_class)
        if not os.path.isdir(orig_path):
            continue

        # Unknown folders map to None, which is never a target class
        mapped_class = CLASS_MAPPING.get(orig_class)
        if mapped_class not in TARGET_CLASSES:
            continue

        out_class_dir = os.path.join(PROCESSED_DIR, split, mapped_class)
        os.makedirs(out_class_dir, exist_ok=True)

        # Only renamed/merged classes get a prefix; unchanged classes keep their names
        name_prefix = "" if orig_class == mapped_class else f"{orig_class}_"

        image_files = sorted(
            f for f in os.listdir(orig_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )

        for img_file in image_files:
            img_path = os.path.join(orig_path, img_file)
            img, issue = preprocess_image_with_checks(img_path, seen_hashes)

            if img is None:
                print(f"Skipped {img_file} — {issue}")
                continue

            stem = name_prefix + os.path.splitext(img_file)[0]

            # Save preprocessed image. Round before the uint8 cast so float
            # error from the /255 -> *255 round-trip cannot truncate a value.
            pre_pixels = np.clip(np.rint(img.squeeze() * 255.0), 0, 255).astype(np.uint8)
            cv2.imwrite(os.path.join(out_class_dir, f"{stem}_pre.png"), pre_pixels)

            # Augmentation
            if augment:
                # Feed 0-255 pixel values, not [0, 1] floats: Keras applies
                # brightness_range through an 8-bit PIL image, and normalized
                # floats do not survive that conversion (depending on the Keras
                # version they come back rescaled to 0-255 or crushed to 0/1).
                img_batch = (img * 255.0)[np.newaxis, ...]
                aug_iter = augmentor.flow(img_batch, batch_size=1)
                for i in range(augment_count):
                    aug_img = next(aug_iter)[0]
                    # Output is already on the 0-255 scale; clip so the uint8
                    # cast cannot wrap around.
                    aug_pixels = np.clip(np.rint(aug_img.squeeze()), 0, 255).astype(np.uint8)
                    cv2.imwrite(os.path.join(out_class_dir, f"{stem}_aug{i}.png"), aug_pixels)


In [8]:
# Run Everything: build the output tree, then process both splits
prepare_folder_structure()
preprocess_and_save(split='train', augment=True, augment_count=3)   # With augmentation
preprocess_and_save(split='test', augment=False)                    # No augmentation

# f-string so the real output directory is printed rather than the literal placeholder
print(f"All preprocessing complete. Data saved in '{PROCESSED_DIR}'")

All preprocessing complete. Data saved in '{PROCESSED_DIR}'
