# Data Cleaning & Split  - EfficientNet B0

----------------------------------------------------------------------------

    Data Cleaning

        - Load raw data directories for train and test splits.
        - For each image in the dataset:
            - Verify the cropped image and read it in grayscale.
            - Detect and skip corrupted, unreadable, and duplicate images (duplicates are found via MD5 hashing).
            - Verify image size is exactly 48×48 pixels.
            - Quality metrics: dark/bright/low-contrast/blur thresholds taken from the EDA.
            - Robust filtering: reject images with 2+ quality issues (applied to the training set only, after the split).
----------------------------------------

    Data Split


        - Split the training data into training and validation sets, stratified by class to maintain class proportions.
        - Keep the test split untouched.
----------------------------------------        
    
        - Save processed datasets (images and labels) as compressed .npz files (train.npz, val.npz, test.npz).
        
---------------------------------------

In [1]:
import os
import cv2
import numpy as np
import hashlib
from pathlib import Path
import pandas as pd
from collections import Counter
import shutil
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


## 1. Configuration

In [2]:
# Set paths.
# RAW_DIR: root of the raw FER2013 data; process_data() reads RAW_DIR / <split name>.
# PROCESSED_DIR: output directory for the processed datasets.
RAW_DIR= Path("/app/data/raw/fer2013")
PROCESSED_DIR = Path("/app/data/processed/FC211002_Nethmi")
# Create the output directory (and any missing parents); no error if it already exists.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)


In [3]:
# Data quality thresholds (from EDA), consumed by check_image_quality().
# Every threshold an image violates counts as one quality issue.
DARK_THRESHOLD = 50  # mean pixel value below this counts as an issue (too dark)
BRIGHT_THRESHOLD = 200  # mean pixel value above this counts as an issue (too bright)
LOW_CONTRAST_THRESHOLD = 15  # pixel standard deviation below this counts as an issue
BLUR_THRESHOLD = 100  # variance of the Laplacian below this counts as an issue (blurry)

In [4]:
# Echo the configured locations so they are recorded in the notebook output.
print(f"Raw data directory: {RAW_DIR}")
print(f"Processed data directory: {PROCESSED_DIR}")

Raw data directory: /app/data/raw/fer2013
Processed data directory: /app/data/processed/FC211002_Nethmi


## 2.Class Mapping

In [5]:
## Mapping FER2013 classes to 5 project classes
# The seven original FER2013 emotion labels (not referenced elsewhere in the visible code).
emotion_labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

# Original class-folder name -> project class.
# 'disgust' is merged into 'angry'; 'fear' and 'surprise' are merged into 'stressed'.
CLASS_MAPPING = {
    'angry': 'angry',
    'disgust': 'angry',
    'fear': 'stressed',
    'surprise': 'stressed',
    'happy': 'happy',
    'neutral': 'neutral',
    'sad': 'sad'
}
# The position of a class in this list is the integer label stored by process_data(),
# so reordering this list changes the meaning of every saved label.
TARGET_CLASSES = ['angry', 'happy', 'sad', 'stressed', 'neutral']

## 3.Data Cleaning

In [7]:

# MD5 fingerprint of a file, used for duplicate detection
def get_image_hash(image_path):
    """Return the hexadecimal MD5 digest of the raw bytes stored at ``image_path``.

    The digest is computed over the file contents as stored on disk (not the
    decoded pixels), so two files compare equal only when they are
    byte-for-byte identical.
    """
    digest = hashlib.md5()
    with open(image_path, mode='rb') as stream:
        digest.update(stream.read())
    return digest.hexdigest()

# Image quality screening.
# An image collects one issue for every check it fails:
#     brightness (mean too dark or too bright)
#     contrast (standard deviation too low)
#     blur (variance of the Laplacian too low)
def check_image_quality(image):
    """Return True when ``image`` should be rejected as poor quality.

    The mean pixel value is compared with DARK_THRESHOLD and BRIGHT_THRESHOLD,
    the pixel standard deviation with LOW_CONTRAST_THRESHOLD, and the variance
    of the Laplacian with BLUR_THRESHOLD. The image is rejected when at least
    two of these checks fail; a single failed check is tolerated.

    Note the polarity: despite the name, True means "poor quality".
    ``image`` is presumably a single-channel pixel array as returned by
    ``cv2.imread(..., cv2.IMREAD_GRAYSCALE)`` -- confirm against callers.
    """
    brightness = np.mean(image)
    spread = np.std(image)
    sharpness = cv2.Laplacian(image, cv2.CV_64F).var()

    failed_checks = (
        brightness < DARK_THRESHOLD,
        brightness > BRIGHT_THRESHOLD,
        spread < LOW_CONTRAST_THRESHOLD,
        sharpness < BLUR_THRESHOLD,
    )
    issue_count = sum(1 for failed in failed_checks if failed)

    return issue_count >= 2  # two or more issues => reject



In [9]:
# General cleaning function for train/test sets
def process_data(split_name, remove_quality=False):
    """Clean one raw dataset split and return its images and integer labels.

    Walks ``RAW_DIR / split_name``, maps every class folder whose name appears
    in CLASS_MAPPING onto its project class, and keeps each ``*.jpg`` / ``*.png``
    file that passes the following checks, applied in this order:

    1. Duplicate: the MD5 of the file bytes has not been seen earlier in
       this call.
    2. Readable: ``cv2.imread`` in grayscale mode returns an image.
    3. Size: the image shape is exactly (48, 48).
    4. Quality (only when ``remove_quality`` is True): the image is not
       flagged by ``check_image_quality``.

    A summary of how many files were kept or rejected is printed.

    Args:
        split_name: Name of the sub-directory of RAW_DIR to process
            (the notebook uses 'train' and 'test').
        remove_quality: When True, also drop images flagged as poor quality.
            Defaults to False (basic cleaning only).

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays. Each label is the index of
        the image's project class in TARGET_CLASSES.

    Notes:
        - Duplicates are tracked per call, so a file that occurs in two
          different splits is not detected.
        - Any exception raised while handling a single file is counted under
          "Read errors" and the file is skipped.
    """
    print(f"\nProcessing {split_name.upper()} set...")

    split_dir = RAW_DIR / split_name
    cleaned_images, cleaned_labels = [], []  # valid images and their labels
    stats = {k: 0 for k in ['total_files', 'valid_images', 'duplicates', 'corrupted',
                            'wrong_size', 'poor_quality', 'read_errors']}
    seen_hashes = set()  # MD5 digests of every file examined so far

    # Map original folders to TARGET_CLASSES.
    # iterdir()/glob() yield entries in arbitrary, filesystem-dependent order;
    # sorting makes the order of the returned arrays (and therefore any seeded
    # split performed on them later) reproducible across machines.
    mapped_dirs = {cls: [] for cls in TARGET_CLASSES}
    for orig_dir in sorted(split_dir.iterdir()):
        if orig_dir.is_dir() and orig_dir.name in CLASS_MAPPING:
            mapped_dirs[CLASS_MAPPING[orig_dir.name]].append(orig_dir)

    # Iterate over each mapped class; its position in TARGET_CLASSES is the label
    for label, cls in enumerate(TARGET_CLASSES):
        for folder in mapped_dirs[cls]:
            image_files = sorted(folder.glob("*.jpg")) + sorted(folder.glob("*.png"))
            for img_path in image_files:
                stats['total_files'] += 1
                try:
                    # Duplicate check
                    img_hash = get_image_hash(img_path)
                    if img_hash in seen_hashes:
                        stats['duplicates'] += 1
                        continue
                    seen_hashes.add(img_hash)

                    # Read grayscale image
                    img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
                    if img is None:  # corrupted/unreadable
                        stats['corrupted'] += 1
                        continue

                    # Size check
                    if img.shape != (48, 48):
                        stats['wrong_size'] += 1
                        continue

                    # Poor-quality check, only applied if remove_quality=True
                    if remove_quality and check_image_quality(img):
                        stats['poor_quality'] += 1
                        continue

                    # Passed all checks: store image and label
                    cleaned_images.append(img)
                    cleaned_labels.append(label)
                    stats['valid_images'] += 1

                except Exception:
                    # Best effort: one bad file must not abort the whole run.
                    # `Exception` (rather than a bare except) lets
                    # KeyboardInterrupt / SystemExit still stop the loop.
                    stats['read_errors'] += 1

    # Summary of cleaning
    total_files = stats['total_files']
    # An empty or missing class tree yields zero files; avoid dividing by zero.
    valid_pct = stats['valid_images'] / total_files * 100 if total_files else 0.0

    print(f"\n{split_name.upper()} CLEANING SUMMARY:")
    print(f"  Total files processed: {total_files}")
    print(f"  Valid images: {stats['valid_images']} ({valid_pct:.1f}%)")
    print(f"  Issues removed:")
    print(f"    - Duplicates:        {stats['duplicates']}")
    print(f"    - Corrupted:         {stats['corrupted']}")
    print(f"    - Wrong size:        {stats['wrong_size']}")
    print(f"    - Poor quality:      {stats['poor_quality']}")
    print(f"    - Read errors:       {stats['read_errors']}")

    return np.array(cleaned_images), np.array(cleaned_labels)



In [10]:
# Basic cleaning pass over both raw splits (duplicates, unreadable files,
# wrong-size images). Quality filtering is left off at this stage.
train_images, train_labels = process_data(split_name='train', remove_quality=False)
test_images, test_labels = process_data(split_name='test', remove_quality=False)



Processing TRAIN set...

TRAIN CLEANING SUMMARY:
  Total files processed: 28709
  Valid images: 27473 (95.7%)
  Issues removed:
    - Duplicates:        1236
    - Corrupted:         0
    - Wrong size:        0
    - Poor quality:      0
    - Read errors:       0

Processing TEST set...

TEST CLEANING SUMMARY:
  Total files processed: 7178
  Valid images: 7092 (98.8%)
  Issues removed:
    - Duplicates:        86
    - Corrupted:         0
    - Wrong size:        0
    - Poor quality:      0
    - Read errors:       0


## 4.Data Split

In [12]:
# Hold out 20% of the cleaned training images as the validation set
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_images,
    train_labels,
    test_size=0.2,
    random_state=42,  # fixed seed so the partition can be reproduced
    stratify=train_labels,  # keep class proportions the same in both parts
)


## 5. Remove Poor Quality Images from Train set

In [13]:
# Quality filter (the notebook applies it to the training set only)
def remove_poor_quality(images, labels):
    """Drop every image flagged by ``check_image_quality`` together with its label.

    ``images`` and ``labels`` are walked in lockstep; the pairs whose image is
    not flagged are returned, in their original order, as two NumPy arrays
    ``(images, labels)``.
    """
    survivors = [
        (picture, tag)
        for picture, tag in zip(images, labels)
        if not check_image_quality(picture)  # flagged == poor quality
    ]
    kept_pictures = [picture for picture, _ in survivors]
    kept_tags = [tag for _, tag in survivors]
    return np.array(kept_pictures), np.array(kept_tags)

# Only the training portion is quality-filtered; validation and test keep
# every image that survived the basic cleaning.
X_train, y_train = remove_poor_quality(X_train, y_train)

# Report the final size of each dataset
print(
    "\nFINAL DATASETS:",
    f"  Training:   {len(X_train)} images (poor-quality images removed)",
    f"  Validation: {len(X_val)} images (unchanged)",
    f"  Test:       {len(test_images)} images (unchanged)",
    sep="\n",
)



FINAL DATASETS:
  Training:   21970 images (poor-quality images removed)
  Validation: 5495 images (unchanged)
  Test:       7092 images (unchanged)


## 6.Class Distribution

In [15]:
# Class Distribution Display 
from collections import Counter

def display_class_distribution(X_train, y_train, X_val, y_val, X_test, y_test):
    """Print a per-class count and percentage table for the three splits.

    Args:
        X_train, X_val, X_test: Image collections of each split. They are
            accepted for a symmetric call signature but are not read.
        y_train, y_val, y_test: Integer labels of each split, where a label
            is an index into TARGET_CLASSES.

    One row is printed per entry of TARGET_CLASSES, followed by a totals row.
    A split with no labels is reported with 0.0% for every class instead of
    raising ZeroDivisionError.
    """
    def _percent(count, total):
        # An empty split has no meaningful class share; report 0.0 rather than divide by zero.
        return count / total * 100 if total else 0.0

    print("\nCLASS DISTRIBUTION AFTER CLEANING:")

    train_dist, val_dist, test_dist = Counter(y_train), Counter(y_val), Counter(y_test)
    total_train, total_val, total_test = len(y_train), len(y_val), len(y_test)

    print(f"{'Class':<10} {'Train':<12} {'Val':<12} {'Test':<12}")

    for i, cls in enumerate(TARGET_CLASSES):
        # Counter returns 0 for a class that does not occur in a split
        tr_count = train_dist[i]
        val_count = val_dist[i]
        te_count = test_dist[i]
        print(f"{cls:<10} {tr_count} ({_percent(tr_count, total_train):5.1f}%) "
              f"{val_count} ({_percent(val_count, total_val):5.1f}%) "
              f"{te_count} ({_percent(te_count, total_test):5.1f}%)")

    # Print totals
    print(f"{'-'*65}")
    print(f"{'Total':<10} {total_train} ({100:5.1f}%) "
          f"{total_val} ({100:5.1f}%) "
          f"{total_test} ({100:5.1f}%)")

# Report the class balance of the final train / validation / test sets
display_class_distribution(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=test_images,
    y_test=test_labels,
)



CLASS DISTRIBUTION AFTER CLEANING:
Class      Train        Val          Test        
angry      3383 ( 15.4%) 846 ( 15.4%) 1055 ( 14.9%)
happy      5668 ( 25.8%) 1417 ( 25.8%) 1767 ( 24.9%)
sad        3779 ( 17.2%) 945 ( 17.2%) 1241 ( 17.5%)
stressed   5250 ( 23.9%) 1313 ( 23.9%) 1804 ( 25.4%)
neutral    3890 ( 17.7%) 974 ( 17.7%) 1225 ( 17.3%)
-----------------------------------------------------------------
Total      21970 (100.0%) 5495 (100.0%) 7092 (100.0%)


## 7.Save Cleaned Dataset

In [16]:
# Make sure the output directory is present before anything is written
PROCESSED_DIR = Path("/app/data/processed/FC211002_Nethmi")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# One compressed archive per split; each archive stores two arrays under the
# keys "images" and "labels".

# Training set (quality-filtered)
np.savez_compressed(
    PROCESSED_DIR.joinpath("train.npz"),
    images=X_train,
    labels=y_train,
)

# Validation set
np.savez_compressed(
    PROCESSED_DIR.joinpath("val.npz"),
    images=X_val,
    labels=y_val,
)

# Test set (basic cleaning only)
np.savez_compressed(
    PROCESSED_DIR.joinpath("test.npz"),
    images=test_images,
    labels=test_labels,
)

print("Processed datasets saved as train.npz, val.npz, test.npz")

Processed datasets saved as train.npz, val.npz, test.npz
