# 01_prepare_dataset.ipynb

Prepare the **Facial Emotion Recognition** dataset for the Emotion Recognition CNN model I am building.

This notebook:
1. Checks the environment and dependencies.
2. Defines paths for the raw YOLO dataset and output folders.
3. Validates class mapping for **9 emotions** (standardizes *Natural* → *Neutral*).
4. Converts YOLO labels → **cropped grayscale face images** by class.
5. Runs **basic quality checks** and summarizes class counts.

## Setup Environment and Paths


In [1]:
import sys, os, platform
from pathlib import Path
import numpy as np, cv2, matplotlib
import matplotlib.pyplot as plt

# Report interpreter, OS and library versions alongside the run.
for label, value in (
    ("Python:", sys.version),
    ("OS:", platform.platform()),
    ("NumPy:", np.__version__),
    ("OpenCV:", cv2.__version__),
    ("Matplotlib:", matplotlib.__version__),
):
    print(label, value)

# Input location (relative paths; .resolve() below shows where they actually point)
DATA_ROOT = Path("../data")
YOLO_ROOT = DATA_ROOT.joinpath("raw_yolo")   # expected: train/ valid/ test/ each with images/ & labels/

# Output folders
POOL_ROOT = DATA_ROOT.joinpath("cls_pool")   # output: class-organized pool for Model Notebook
ART_OUT = Path("../artifacts", "outputs")

# Only the output folders are created; the YOLO root must already exist.
POOL_ROOT.mkdir(parents=True, exist_ok=True)
ART_OUT.mkdir(parents=True, exist_ok=True)

for label, folder in (
    ("YOLO root:", YOLO_ROOT),
    ("Pool root:", POOL_ROOT),
    ("Artifacts:", ART_OUT),
):
    print(label, folder.resolve())


Python: 3.13.9 (tags/v3.13.9:8183fa5, Oct 14 2025, 14:09:13) [MSC v.1944 64 bit (AMD64)]
OS: Windows-11-10.0.26200-SP0
NumPy: 2.2.6
OpenCV: 4.11.0
Matplotlib: 3.9.4
YOLO root: C:\Code\Emotion-Recognition-CNN\data\raw_yolo
Pool root: C:\Code\Emotion-Recognition-CNN\data\cls_pool
Artifacts: C:\Code\Emotion-Recognition-CNN\artifacts\outputs


## Define Class map (9 emotions)

Standardize “Natural” → **Neutral** for consistency.

In [2]:
# Emotion names listed in class-id order: the position in the tuple is the YOLO class id.
ID2NAME = dict(enumerate((
    "Angry",
    "Contempt",
    "Disgust",
    "Fear",
    "Happy",
    "Neutral",  # called "Natural" in source
    "Sad",
    "Sleepy",
    "Surprised",
)))
# Reverse lookup (name -> id) and the total number of classes.
NAME2ID = dict(zip(ID2NAME.values(), ID2NAME.keys()))
NUM_CLASSES = len(ID2NAME)
ID2NAME


{0: 'Angry',
 1: 'Contempt',
 2: 'Disgust',
 3: 'Fear',
 4: 'Happy',
 5: 'Neutral',
 6: 'Sad',
 7: 'Sleepy',
 8: 'Surprised'}

## Scan YOLO split structure

Verify label files exist under each split:
- `raw_yolo/train/labels/`
- `raw_yolo/valid/labels/`
- `raw_yolo/test/labels/`

In [3]:
from collections import Counter

splits = ["train", "valid", "test"]
label_index = {}

# For each split, remember its images folder together with the sorted list of
# label files found (an empty list when the labels folder does not exist).
for split_name in splits:
    labels_path = YOLO_ROOT / split_name / "labels"
    images_path = YOLO_ROOT / split_name / "images"
    has_labels = labels_path.exists()
    found = sorted(labels_path.glob("*.txt")) if has_labels else []
    label_index[split_name] = (images_path, found)
    print(f"{split_name:5s}: labels={len(found):5d} | labels dir: {has_labels} | images dir: {images_path.exists()}")

total_lbl = sum(len(files) for _, files in label_index.values())
print("Total label files across splits:", total_lbl)


train: labels=64866 | labels dir: True | images dir: True
valid: labels= 1720 | labels dir: True | images dir: True
test : labels= 1700 | labels dir: True | images dir: True
Total label files across splits: 68286


## Convert YOLO labels (across all splits) → unified class pool

For each `*.txt`:
1) read lines: `class x_center y_center width height` (normalized 0..1)  
2) find paired image (same stem under that split's `images/`)  
3) crop with a small margin (~8%), convert to **grayscale**  
4) dedupe by **SHA1 hash of the PNG-encoded grayscale crop** (skips exact duplicates across splits and classes)  
5) save to: `data/cls_pool/<ClassName>/<stem>_<hash7>.jpg`  
6) record a manifest entry


In [4]:
import hashlib, json, cv2
from pathlib import Path

SUPPORTED_EXTS = (".jpg", ".png", ".jpeg", ".JPG", ".PNG", ".JPEG")

def _clip(v, lo, hi): return max(lo, min(hi, v))

def find_image(imgs_dir: Path, stem: str):
    """Locate the image that belongs to a label file.

    Tries ``imgs_dir / (stem + ext)`` for each extension in SUPPORTED_EXTS, in
    order, and returns the first path that exists, or None when none does.
    """
    candidates = (imgs_dir / f"{stem}{ext}" for ext in SUPPORTED_EXTS)
    return next((path for path in candidates if path.exists()), None)

# Manifest describing this conversion run (dumped to JSON by the summary cell).
manifest = {
    "source": str(YOLO_ROOT.resolve()),
    "pool_root": str(POOL_ROOT.resolve()),
    "files": [],                  # entries: {split, src_img, src_lbl, dst, class, sha1}
    "duplicates_skipped": 0,
    "unknown_classes": []
}

seen_hashes = set()          # SHA1 digests of crops already written (shared by all splits and classes)
saved = 0
unknown_class_ids = set()
# Tally of inputs that could not be used, so dropped data is reported instead of vanishing silently.
skipped = {
    "missing_image": 0,      # label file without a matching image
    "unreadable_image": 0,   # cv2.imread returned None
    "malformed_line": 0,     # non-blank label line that is not "<int> <float> <float> <float> <float>"
    "empty_crop": 0,         # box collapsed to zero area after clipping
    "write_failed": 0,       # PNG encoding or cv2.imwrite reported failure
}

for sp in splits:
    imgs_dir, label_files = label_index[sp]
    if not label_files:
        continue

    for lbl in label_files:
        stem = lbl.stem
        img_path = find_image(imgs_dir, stem)
        if img_path is None:
            skipped["missing_image"] += 1
            continue

        img = cv2.imread(str(img_path))
        if img is None:
            skipped["unreadable_image"] += 1
            continue

        H, W = img.shape[:2]

        with lbl.open("r") as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue  # blank line, nothing to report
                if len(parts) != 5:
                    skipped["malformed_line"] += 1
                    continue
                try:
                    cid = int(parts[0])
                    xc, yc, w, h = map(float, parts[1:])
                except ValueError:
                    # Only parse errors are expected here; a bare except would also
                    # swallow KeyboardInterrupt during this long-running loop.
                    skipped["malformed_line"] += 1
                    continue

                if cid not in ID2NAME:
                    unknown_class_ids.add(cid)
                    continue

                # Normalized centre/size -> pixel corner coordinates.
                x1 = int((xc - w/2) * W); y1 = int((yc - h/2) * H)
                x2 = int((xc + w/2) * W); y2 = int((yc + h/2) * H)

                # add ~8% margin
                mx = int(0.08 * (x2 - x1 + 1))
                my = int(0.08 * (y2 - y1 + 1))
                # x2/y2 are used as exclusive slice ends below, so they may reach W/H.
                # Clipping them to W-1/H-1 dropped the last column/row of any box
                # touching the right/bottom image edge.
                x1 = _clip(x1 - mx, 0, W-1); y1 = _clip(y1 - my, 0, H-1)
                x2 = _clip(x2 + mx, 0, W);   y2 = _clip(y2 + my, 0, H)

                crop = img[y1:y2, x1:x2]
                if crop.size == 0:
                    skipped["empty_crop"] += 1
                    continue

                # grayscale
                gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)

                # stable hash of the crop (PNG-encoded bytes)
                ok, buf = cv2.imencode(".png", gray)
                if not ok:
                    skipped["write_failed"] += 1
                    continue
                sha = hashlib.sha1(buf.tobytes()).hexdigest()
                if sha in seen_hashes:
                    manifest["duplicates_skipped"] += 1
                    continue

                cls_name = ID2NAME[cid]
                dst_dir = POOL_ROOT / cls_name
                dst_dir.mkdir(parents=True, exist_ok=True)
                dst = dst_dir / f"{stem}_{sha[:7]}.jpg"

                # cv2.imwrite signals failure through its return value; only record
                # the crop (and mark its hash as seen) once it is actually on disk.
                if not cv2.imwrite(str(dst), gray):
                    skipped["write_failed"] += 1
                    continue
                seen_hashes.add(sha)

                manifest["files"].append({
                    "split": sp,
                    "src_img": str(img_path.resolve()),
                    "src_lbl": str(lbl.resolve()),
                    "dst": str(dst.resolve()),
                    "class": cls_name,
                    "sha1": sha
                })
                saved += 1

print(f"Saved {saved} unique crops into pool. Skipped {manifest['duplicates_skipped']} duplicates.")
if any(skipped.values()):
    print("Warning: skipped inputs:", {reason: n for reason, n in skipped.items() if n})
if unknown_class_ids:
    manifest["unknown_classes"] = sorted(int(x) for x in unknown_class_ids)
    print("Warning: unknown class IDs encountered:", sorted(unknown_class_ids))


Saved 67915 unique crops into pool. Skipped 366 duplicates.


## Pool summary & manifests


In [5]:
from collections import Counter
import json

# Count pooled images per class; each image's parent folder name is its class.
pool_counts = Counter()
for suffix in (".jpg", ".jpeg", ".png"):
    for img_file in POOL_ROOT.rglob(f"*{suffix}"):
        pool_counts[img_file.parent.name] += 1

summary = {
    "pool_root": str(POOL_ROOT.resolve()),
    "counts": dict(pool_counts),
    "total": int(sum(pool_counts.values()))
}

# Artifacts to write: file name -> JSON payload.
artifacts = {
    "class_names.json": list(ID2NAME.values()),
    "pool_summary.json": summary,
    "pool_manifest.json": manifest,
}

for fname, payload in artifacts.items():
    with open(ART_OUT / fname, "w") as fh:
        json.dump(payload, fh, indent=2)

for fname in artifacts:
    print("Saved:", (ART_OUT / fname).resolve())

# Per-class counts in alphabetical order, followed by the grand total.
for label, n_images in sorted(pool_counts.items()):
    print(f"{label:12s} : {n_images}")
print("Total in pool:", summary["total"])


Saved: C:\Code\Emotion-Recognition-CNN\artifacts\outputs\class_names.json
Saved: C:\Code\Emotion-Recognition-CNN\artifacts\outputs\pool_summary.json
Saved: C:\Code\Emotion-Recognition-CNN\artifacts\outputs\pool_manifest.json
Angry        : 11699
Contempt     : 2693
Disgust      : 4502
Fear         : 5424
Happy        : 14582
Neutral      : 5966
Sad          : 12545
Sleepy       : 1120
Surprised    : 9384
Total in pool: 67915
