## Imports

In [18]:
from pathlib import Path
import pandas as pd, csv
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import shutil
import re

In [7]:
# ---- Locations (absolute paths on the author's machine) ----
# Folder that holds the source images.
IMAGES_DIR = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\images_renamed")
# Label sheet. NOTE: despite the variable name, this points at an .xlsx workbook.
CSV_PATH = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\TIF_labels.xlsx")
# Destination for the split CSVs (and the optional per-class folders).
OUTPUT_DIR = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions")

# ---- Split fractions (intended to sum to 1) ----
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# ---- File matching ----
# Extensions tried when an image ID in the label sheet has no extension.
ALLOWED_EXTS = [
    ".jpg", ".jpeg", ".png", ".bmp",
    ".tif", ".tiff", ".webp",
]

# When True, an ID such as "...-4317" may match a file named "...-4317HA"
# (prefix match) if no exact match exists.
ALLOW_PREFIX_MATCH = True

# When True, files are also copied into data/{split}/{label}/ folders.
MAKE_CLASS_FOLDERS = False

# ---- Label vocabulary ----
# Lower-cased raw label -> one of the 7 canonical class names.
CANONICAL = dict(
    happy="happy",
    sad="sad",
    anger="anger",
    angry="anger",
    neutral="neutral",
    disgust="disgust",
    surprise="surprise",
    surprised="surprise",
    fear="fear",
    scared="fear",
)


In [8]:
def normalize_label(x: str) -> str:
    """Map a raw label to its canonical class name.

    The value is stringified, trimmed, lower-cased and stripped of every
    whitespace run before being looked up in ``CANONICAL``.

    Raises:
        ValueError: if the cleaned label has no entry in ``CANONICAL``.
    """
    cleaned = str(x).strip().lower()
    key = re.sub(r"\s+", "", cleaned)
    if key in CANONICAL:
        return CANONICAL[key]
    raise ValueError(f"Unknown label '{x}'. Add a mapping in CANONICAL.")

def resolve_image_path(img_id: str) -> Path | None:
    """Locate the file in ``IMAGES_DIR`` that corresponds to ``img_id``.

    Lookup order:
      1. ``img_id`` used verbatim as a file name, when it carries a suffix.
      2. ``img_id`` plus each extension in ``ALLOWED_EXTS``.
      3. If ``ALLOW_PREFIX_MATCH`` is true, a file whose stem starts with
         ``img_id`` -- accepted only when the match is unambiguous.

    Args:
        img_id: Image identifier from the label sheet, with or without an
            extension. Non-string values are stringified; ``None``, NaN and
            blank values resolve to ``None``.

    Returns:
        The matching ``Path``, or ``None`` when nothing (or nothing unique)
        matches.
    """
    # Missing cells must never reach prefix matching: "" is a prefix of every
    # stem, so a blank ID would "match" whenever the folder holds one file.
    if img_id is None or (isinstance(img_id, float) and img_id != img_id):
        return None
    img_id = str(img_id).strip()
    if not img_id:
        return None

    cand = IMAGES_DIR / img_id
    if cand.suffix:
        if cand.exists():
            return cand
        if cand.suffix.lower() in ALLOWED_EXTS:
            # A real image extension was given and that file is absent.
            return None
        # Otherwise the "suffix" is just a dot inside the ID; keep searching.

    # Exact stem combined with each allowed extension.
    for ext in ALLOWED_EXTS:
        p = IMAGES_DIR / f"{img_id}{ext}"
        if p.exists():
            return p

    if ALLOW_PREFIX_MATCH:
        matches = [
            p for p in IMAGES_DIR.iterdir()
            if p.is_file() and p.stem.startswith(img_id)
        ]
        if len(matches) == 1:
            return matches[0]
        # Several hits: accept only if exactly one has an allowed extension.
        pruned = [m for m in matches if m.suffix.lower() in ALLOWED_EXTS]
        if len(pruned) == 1:
            return pruned[0]

    return None


In [19]:
# Fixed class order; a class's integer ID is its position in this list.
ORDERED_CLASSES = [
    "happy",
    "sad",
    "anger",
    "neutral",
    "disgust",
    "surprise",
    "fear",
]
CLASS_TO_ID = dict(zip(ORDERED_CLASSES, range(len(ORDERED_CLASSES))))

In [None]:
from __future__ import annotations
from pathlib import Path
import pandas as pd
import zipfile
import csv
import re
import shutil
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Class names in the order that defines their integer IDs.
ORDERED_CLASSES = [
    "happy", "sad", "anger", "neutral",
    "disgust", "surprise", "fear",
]

# Lower-cased raw label -> canonical class name.
CANONICAL = {
    "happy": "happy",
    "sad": "sad",
    "anger": "anger",
    "angry": "anger",
    "neutral": "neutral",
    "disgust": "disgust",
    "surprise": "surprise",
    "surprised": "surprise",
    "fear": "fear",
    "scared": "fear",
}

CLASS_TO_ID = {name: idx for idx, name in enumerate(ORDERED_CLASSES)}

def normalize_label(x: str) -> str:
    """Return the canonical class name for a raw label.

    ``x`` is stringified, trimmed, lower-cased and has all whitespace
    removed; the result is looked up in ``CANONICAL``.

    Raises:
        ValueError: if the cleaned label is not a key of ``CANONICAL``.
    """
    key = re.sub(r"\s+", "", str(x).strip().lower())
    canonical = CANONICAL.get(key)
    if canonical is None:
        raise ValueError(f"Unknown label '{x}'")
    return canonical

def read_labels_safely(path):
    """Load the first two columns of a label file as ``image`` / ``label``.

    Excel workbooks (by extension, or because the file is a ZIP archive) are
    read with ``pd.read_excel``. Anything else is parsed as delimited text,
    trying several encodings and quoting modes until one succeeds. Both
    columns are returned as stripped strings with BOM characters removed and
    non-breaking spaces turned into plain spaces.

    Raises:
        RuntimeError: if no encoding/quoting combination can parse the file.
    """
    path = Path(path)
    looks_like_excel = (
        path.suffix.lower() in {".xlsx", ".xls"} or zipfile.is_zipfile(path)
    )

    if looks_like_excel:
        df = pd.read_excel(path, usecols=[0, 1])
        df.columns = ["image", "label"]
    else:
        # Encoding is the outer choice, quoting the inner one.
        attempts = [
            (enc, quoting)
            for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1")
            for quoting in (csv.QUOTE_MINIMAL, csv.QUOTE_NONE)
        ]
        df = None
        tried = []
        for enc, quoting in attempts:
            try:
                frame = pd.read_csv(
                    path,
                    engine="python",
                    sep=None,  # let pandas sniff the delimiter
                    encoding=enc,
                    quoting=quoting,
                    usecols=[0, 1],
                    on_bad_lines="skip",
                    skip_blank_lines=True,
                    header=0,
                )
                frame.columns = ["image", "label"]
            except Exception as e:
                tried.append((enc, quoting, str(e)))
                continue
            df = frame
            break
        if df is None:
            raise RuntimeError(f"Could not parse labels file. Tries: {tried}")

    def _scrub(series):
        # Drop BOMs, turn NBSPs into spaces, then trim.
        text = series.astype(str)
        text = text.str.replace("\ufeff", "", regex=False)
        text = text.str.replace("\xa0", " ", regex=False)
        return text.str.strip()

    df["image"] = _scrub(df["image"])
    df["label"] = _scrub(df["label"])

    # ZIP/workbook markers inside a label mean the text parse read an Excel
    # file; in that case parse it again as Excel.
    workbook_bytes_seen = df["label"].str.contains(r"PK\x03|\b_rels/\.rels\b").any()
    if workbook_bytes_seen:
        df = pd.read_excel(path, usecols=[0, 1])
        df.columns = ["image", "label"]
        df["image"] = df["image"].astype(str).str.strip()
        df["label"] = df["label"].astype(str).str.strip()

    return df

# ------- Load the label sheet and encode the classes -------
df = read_labels_safely(CSV_PATH)

# normalize_label raises on any label it does not know, so a clean run of
# this cell doubles as label validation.
df["label_norm"] = df["label"].map(normalize_label)
df["label_id"] = df["label_norm"].map(CLASS_TO_ID)


In [24]:
# ---- Locations (absolute paths on the author's machine) ----
IMAGES_DIR = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\images_renamed")
CSV_PATH = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\TIF_labels.xlsx")
# Splits and CSV reports are written below this folder; create it up front.
OUTPUT_DIR = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# How unlabeled images are collected: False copies (safe), True moves (destructive).
MOVE_UNLABELED = False

# File extensions that count as images.
ALLOWED_EXTS = [
    ".jpg", ".jpeg", ".png", ".bmp",
    ".tif", ".tiff", ".webp",
]

# Class order defines the integer IDs; keep it stable across runs.
ORDERED_CLASSES = [
    "happy", "sad", "anger", "neutral",
    "disgust", "surprise", "fear",
]
CLASS_TO_ID = dict(zip(ORDERED_CLASSES, range(len(ORDERED_CLASSES))))

# Lower-cased raw label (including common variants) -> canonical class name.
CANONICAL = dict(
    happy="happy",
    sad="sad",
    anger="anger",
    angry="anger",
    neutral="neutral",
    disgust="disgust",
    surprise="surprise",
    surprised="surprise",
    fear="fear",
    scared="fear",
)
# ============================================


def normalize_label(x: str) -> str:
    """Translate a raw label into its canonical class name.

    The input is stringified, trimmed, lower-cased and stripped of all
    whitespace, then looked up in ``CANONICAL``.

    Raises:
        ValueError: if the cleaned label is unknown; the message lists the
            canonical class names that are allowed.
    """
    key = re.sub(r"\s+", "", str(x).strip().lower())
    if key in CANONICAL:
        return CANONICAL[key]
    allowed = sorted(set(CANONICAL.values()))
    raise ValueError(f"Unknown label '{x}' (normalized='{key}'). Allowed: {allowed}")


def _clean_label_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return a two-column frame as clean ``image`` / ``label`` strings."""
    df.columns = ["image", "label"]
    # Rows with neither an image ID nor a label carry no information; drop
    # them before astype(str) would turn them into literal "nan" strings.
    # The original row index is kept so warnings can point at source rows.
    df = df.dropna(how="all")
    for col in ("image", "label"):
        df[col] = (
            df[col].astype(str)
            .str.replace("\ufeff", "", regex=False)  # BOM
            .str.replace("\xa0", " ", regex=False)   # NBSP
            .str.strip()
        )
    return df


def _read_excel_labels(path: Path) -> pd.DataFrame:
    """Read the first two columns of an Excel workbook and clean them."""
    try:
        return _clean_label_frame(pd.read_excel(path, usecols=[0, 1]))
    except Exception as e:
        raise RuntimeError(f"Failed to read Excel file: {path}\n{e}") from e


def read_labels_safely(path: Path) -> pd.DataFrame:
    """Load the first two columns of a CSV or Excel file as ``image`` / ``label``.

    Excel workbooks are detected by extension or by being a ZIP archive.
    Other files are parsed as delimited text, trying several encodings and
    quoting modes in turn. Whichever reader is used, both columns come back
    as stripped strings (BOM removed, NBSP turned into a space) and rows that
    are empty in both columns are dropped.

    Args:
        path: Location of the label file (``str`` or ``Path``).

    Returns:
        A DataFrame with exactly the columns ``image`` and ``label``.

    Raises:
        RuntimeError: if the workbook cannot be read, or if no
            encoding/quoting combination can parse the text file.
    """
    path = Path(path)

    # Excel? (xlsx files are ZIP archives, so a mislabelled extension is caught too)
    if path.suffix.lower() in {".xlsx", ".xls"} or zipfile.is_zipfile(path):
        return _read_excel_labels(path)

    tried = []
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        for quoting in (csv.QUOTE_MINIMAL, csv.QUOTE_NONE):
            try:
                df = _clean_label_frame(
                    pd.read_csv(
                        path,
                        engine="python",   # tolerant parser
                        sep=None,          # auto-detect delimiter
                        encoding=enc,
                        quoting=quoting,
                        usecols=[0, 1],
                        on_bad_lines="skip",
                        skip_blank_lines=True,
                        header=0,
                    )
                )
            except Exception as e:
                tried.append((enc, quoting, str(e)))
                continue

            # Workbook bytes in a label mean this is an Excel file saved with
            # a text extension. Another encoding cannot fix that, so switch
            # readers instead of retrying.
            if df["label"].str.contains(r"PK\x03|_rels/\.rels", regex=True).any():
                return _read_excel_labels(path)
            return df

    raise RuntimeError(f"Could not parse labels file: {path}\nTried: {tried}")


def build_file_indices(images_dir: Path, allowed_exts=None):
    """Index the image files directly inside ``images_dir`` (non-recursive).

    Args:
        images_dir: Folder to scan.
        allowed_exts: Optional iterable of extensions (with leading dot) that
            count as images. Defaults to the module-level ``ALLOWED_EXTS``.

    Returns:
        A dict with these entries:
          * ``files``: sorted list of image paths.
          * ``by_name`` / ``by_name_ci``: file name -> path (exact / lower-cased).
          * ``by_stem`` / ``by_stem_ci``: stem -> path (exact / lower-cased).
          * ``by_stem_trim2`` / ``by_stem_trim2_ci``: stem without its last
            two characters -> path, only where that trimmed key identifies a
            single stem.

        When several files share a key in the name/stem tables, the first one
        in sorted order is kept, so the result does not depend on directory
        listing order.
    """
    exts = ALLOWED_EXTS if allowed_exts is None else allowed_exts
    exts = {ext.lower() for ext in exts}

    # Sorted so that every tie-break below is deterministic.
    files = sorted(
        p for p in Path(images_dir).iterdir()
        if p.is_file() and p.suffix.lower() in exts
    )

    index_by_name = {}
    index_by_name_ci = {}
    index_by_stem = {}
    index_by_stem_ci = {}
    # trimmed key -> {full stem -> first path with that stem}
    trim2_groups = {}
    trim2_ci_groups = {}

    for p in files:
        index_by_name.setdefault(p.name, p)
        index_by_name_ci.setdefault(p.name.lower(), p)
        index_by_stem.setdefault(p.stem, p)
        index_by_stem_ci.setdefault(p.stem.lower(), p)
        if len(p.stem) >= 2:
            trimmed = p.stem[:-2]
            trim2_groups.setdefault(trimmed, {}).setdefault(p.stem, p)
            trim2_ci_groups.setdefault(trimmed.lower(), {}).setdefault(p.stem.lower(), p)

    # Two distinct stems that differ only in their last two characters are
    # different images; mapping their shared prefix to either one would attach
    # a label to the wrong file, so such keys are left out.
    index_by_stem_trim2 = {
        key: next(iter(group.values()))
        for key, group in trim2_groups.items() if len(group) == 1
    }
    index_by_stem_trim2_ci = {
        key: next(iter(group.values()))
        for key, group in trim2_ci_groups.items() if len(group) == 1
    }

    return {
        "files": files,
        "by_name": index_by_name,
        "by_name_ci": index_by_name_ci,
        "by_stem": index_by_stem,
        "by_stem_ci": index_by_stem_ci,
        "by_stem_trim2": index_by_stem_trim2,
        "by_stem_trim2_ci": index_by_stem_trim2_ci,
    }


def resolve_image_path_factory(indices: dict, allowed_exts=None):
    """Build a resolver that maps label-sheet image IDs to file paths.

    Args:
        indices: Lookup tables with the keys ``files``, ``by_name``,
            ``by_name_ci``, ``by_stem``, ``by_stem_ci``, ``by_stem_trim2`` and
            ``by_stem_trim2_ci`` (the shape ``build_file_indices`` returns).
        allowed_exts: Optional iterable of extensions (with leading dot) to
            try for IDs without one. Defaults to the module-level
            ``ALLOWED_EXTS``.

    Returns:
        A function ``resolve_image_path(img_id)`` returning the matched path
        as a string, or ``None`` when the ID is missing, unmatched, or only
        matches ambiguously.
    """
    files = indices["files"]
    by_name = indices["by_name"]
    by_name_ci = indices["by_name_ci"]
    by_stem = indices["by_stem"]
    by_stem_ci = indices["by_stem_ci"]
    by_stem_trim2 = indices["by_stem_trim2"]
    by_stem_trim2_ci = indices["by_stem_trim2_ci"]

    exts = tuple(ALLOWED_EXTS if allowed_exts is None else allowed_exts)
    known_exts = {ext.lower() for ext in exts}

    def resolve_image_path(img_id: str) -> "str | None":
        # A missing cell (None / NaN) must not be stringified into the ID "nan".
        if img_id is None or (isinstance(img_id, float) and img_id != img_id):
            return None
        img_id = str(img_id).strip()
        if not img_id:
            return None
        lower_id = img_id.lower()

        # The ID names a file with a real image extension: exact, then
        # case-insensitive. A dot that is merely part of the ID does not
        # count as an extension and falls through to the stem lookups.
        if Path(img_id).suffix.lower() in known_exts:
            p = by_name.get(img_id) or by_name_ci.get(lower_id)
            return str(p) if p else None

        # No extension: try stem exact / case-insensitive.
        p = by_stem.get(img_id) or by_stem_ci.get(lower_id)
        if p:
            return str(p)

        # Try adding common extensions via full-name lookup.
        for ext in exts:
            name = img_id + ext
            p = by_name.get(name) or by_name_ci.get(name.lower())
            if p:
                return str(p)

        # Handle the "last two characters differ" case.
        p = by_stem_trim2.get(img_id) or by_stem_trim2_ci.get(lower_id)
        if p:
            return str(p)

        # Last resort: prefix match on stems. It is accepted only when every
        # hit shares one stem; returning the first of several candidates
        # would silently pair the label with an arbitrary image.
        hits = {}
        for cand in files:
            stem = cand.stem.lower()
            if stem.startswith(lower_id):
                hits.setdefault(stem, cand)
        if len(hits) == 1:
            return str(next(iter(hits.values())))

        return None

    return resolve_image_path


def _stratified_three_way_split(df, train_ratio, val_ratio, test_ratio, seed=42):
    """Split ``df`` into (train, val, test) frames, stratified on ``label_id``."""
    if df.empty:
        raise ValueError("No labelled rows resolved to an image file; nothing to split.")

    # 1 - 0.70 evaluates to 0.30000000000000004 in binary floating point.
    # Rounding removes that excess so the hold-out size is not pushed up by
    # one sample when len(df) * 0.30 is a whole number.
    holdout = round(1 - train_ratio, 10)
    first = StratifiedShuffleSplit(n_splits=1, test_size=holdout, random_state=seed)
    train_idx, temp_idx = next(first.split(np.zeros(len(df)), df["label_id"].values))
    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_temp = df.iloc[temp_idx].reset_index(drop=True)

    # Share of the hold-out that becomes the test set (0.5 for 15/15).
    test_portion = test_ratio / (val_ratio + test_ratio)
    second = StratifiedShuffleSplit(n_splits=1, test_size=test_portion, random_state=seed)
    val_idx, test_idx = next(
        second.split(np.zeros(len(df_temp)), df_temp["label_id"].values)
    )
    df_val = df_temp.iloc[val_idx].reset_index(drop=True)
    df_test = df_temp.iloc[test_idx].reset_index(drop=True)
    return df_train, df_val, df_test


def _export_unlabeled(sources, dest_dir, move):
    """Copy (or move) ``sources`` into ``dest_dir``; return ``(transferred, skipped)``."""
    import filecmp

    transferred = 0
    skipped = 0
    for src in sources:
        dst = dest_dir / src.name
        counter = 1
        duplicate = False
        while dst.exists():
            # An identical file left by an earlier run is reused rather than
            # duplicated under a "_1", "_2", ... name.
            if filecmp.cmp(str(src), str(dst), shallow=False):
                duplicate = True
                break
            dst = dest_dir / f"{src.stem}_{counter}{src.suffix}"
            counter += 1

        if duplicate and not move:
            skipped += 1
            continue
        if move:
            # For a duplicate this replaces the identical copy and removes src.
            shutil.move(str(src), str(dst))
        else:
            shutil.copy2(str(src), str(dst))
        transferred += 1
    return transferred, skipped


def main():
    """Run the dataset-preparation pipeline.

    Steps: load and normalise the labels, resolve each row to an image file,
    drop unresolved and duplicate rows, write stratified 70/15/15 split CSVs
    under ``OUTPUT_DIR / "splits"``, write per-class training counts with
    suggested class weights, and copy (or move, per ``MOVE_UNLABELED``) the
    images that no label refers to into ``OUTPUT_DIR / "unlabeled"``.

    Raises:
        ValueError: if a label is unknown, or no labelled row resolves to a file.
    """
    # 1) Load labels
    df = read_labels_safely(CSV_PATH)

    # 2) Normalize labels and map to IDs
    df["label_norm"] = df["label"].apply(normalize_label)
    df["label_id"] = df["label_norm"].map(CLASS_TO_ID)

    # 3) Resolve image file paths
    indices = build_file_indices(IMAGES_DIR)
    resolve_image_path = resolve_image_path_factory(indices)
    df["resolved_path"] = df["image"].apply(resolve_image_path)

    # 4) Report and drop rows that didn't resolve to a file
    unresolved = df["resolved_path"].isna()
    missing = unresolved.sum()
    if missing:
        print(f"[WARN] {missing} rows have no matching image. Showing a few:")
        print(df.loc[unresolved, ["image", "label"]].head(5))
    df = df.dropna(subset=["resolved_path"]).copy()

    # 5) De-duplicate by actual file path (if any duplicates in labels)
    before = len(df)
    df = df.drop_duplicates(subset=["resolved_path"]).reset_index(drop=True)
    removed_dups = before - len(df)
    if removed_dups:
        print(f"[INFO] Removed {removed_dups} duplicate rows by resolved_path")

    # 6) Stratified split 70/15/15
    # NOTE(review): rows are split per image. If several images belong to the
    # same subject, that subject can land in more than one split -- confirm
    # this is acceptable for the evaluation.
    TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.70, 0.15, 0.15
    df_train, df_val, df_test = _stratified_three_way_split(
        df, TRAIN_RATIO, VAL_RATIO, TEST_RATIO
    )

    # 7) Save split CSVs
    cols = ["resolved_path", "image", "label", "label_norm", "label_id"]
    splits_dir = OUTPUT_DIR / "splits"
    splits_dir.mkdir(parents=True, exist_ok=True)
    for split_name, frame in (("train", df_train), ("val", df_val), ("test", df_test)):
        frame[cols].to_csv(splits_dir / f"{split_name}.csv", index=False, encoding="utf-8")

    print("Saved:", splits_dir / "train.csv", len(df_train))
    print("Saved:", splits_dir / "val.csv", len(df_val))
    print("Saved:", splits_dir / "test.csv", len(df_test))

    # 8) Class distribution + suggested class weights
    counts = df_train["label_norm"].value_counts().reindex(ORDERED_CLASSES, fill_value=0)
    total = counts.sum()
    # total / (n_classes * class_count); max(1, ...) avoids dividing by zero
    # for a class with no training rows.
    weights = {c: (total / (len(ORDERED_CLASSES) * max(1, counts[c]))) for c in ORDERED_CLASSES}
    pd.DataFrame({"count": counts, "class_weight": pd.Series(weights)}).to_csv(
        OUTPUT_DIR / "train_class_stats.csv"
    )
    print("\nTrain distribution:\n", counts)
    print("\nSuggested class weights:\n", weights)

    # 9) Collect unlabeled images (present on disk but not referenced by labels)
    UNLABELED_DIR = OUTPUT_DIR / "unlabeled"
    UNLABELED_DIR.mkdir(parents=True, exist_ok=True)

    # Same top-level listing that was used to resolve the labels.
    all_imgs = indices["files"]
    labeled_paths = {Path(p).resolve() for p in df["resolved_path"]}
    unlabeled = [p for p in all_imgs if p.resolve() not in labeled_paths]

    print(f"\nFound {len(unlabeled)} unlabeled images.")
    moved_or_copied, skipped = _export_unlabeled(unlabeled, UNLABELED_DIR, MOVE_UNLABELED)
    print(f"{'Moved' if MOVE_UNLABELED else 'Copied'} {moved_or_copied} files to: {UNLABELED_DIR}")
    if skipped:
        print(f"[INFO] Skipped {skipped} files already present in: {UNLABELED_DIR}")

    print("\nAll done. Ready for training with the CSVs in:", OUTPUT_DIR / "splits")


# Entry point: run the whole pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

[WARN] 5 rows have no matching image. Showing a few:
                image    label
0   A02F10-JTP-4231HA    Happy
11    A03F7-JTPC-4438  Neutral
70   A12F5-JTPWC-6533  Disgust
71   A12F5-JTPWC-6554      Sad
72   A12F5-JTPWC-6557      Sad
Saved: C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions\splits\train.csv 79
Saved: C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions\splits\val.csv 17
Saved: C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions\splits\test.csv 18

Train distribution:
 label_norm
happy       21
sad         14
anger        5
neutral     17
disgust      8
surprise     8
fear         6
Name: count, dtype: int64

Suggested class weights:
 {'happy': np.float64(0.5374149659863946), 'sad': np.float64(0.8061224489795918), 'anger': np.float64(2.257142857142857), 'neutral': np.float64(0.6638655462184874), 'disgust': np.float64(1.4107142857142858), 'surprise': np.float64(1.4107142857142858), 'fear': np.float64(1.880952380952381)}



### Count how many images are in each folder

In [29]:
from collections import Counter

ROOT = Path(r"C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions\splits")  # <-- change this to your top folder

# These settings use names of their own so that running this cell does not
# rebind the ALLOWED_EXTS / CSV_PATH globals read by the pipeline cell.
COUNT_IMAGE_EXTS = frozenset({".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"})
SAVE_CSV = True  # set False if you don't want a CSV file
COUNTS_CSV_PATH = ROOT / "image_counts.csv"


def is_image(p: Path) -> bool:
    """Return True when ``p`` is a regular file with an image extension."""
    return p.is_file() and p.suffix.lower() in COUNT_IMAGE_EXTS


# 1) Immediate: ROOT itself plus each of its direct sub-folders
print("== Immediate folders ==")
immediate_rows = []
targets = [ROOT] + [p for p in ROOT.iterdir() if p.is_dir()]
for d in targets:
    count = sum(1 for p in d.iterdir() if is_image(p))
    immediate_rows.append((str(d), count))
for folder, count in sorted(immediate_rows, key=lambda x: x[1], reverse=True):
    print(f"{count:6}  {folder}")
print(f"TOTAL (immediate): {sum(c for _, c in immediate_rows)}\n")

# 2) Recursive: count for EVERY folder in the tree
print("== Recursive (every folder) ==")
counts = Counter()
for p in ROOT.rglob("*"):
    if is_image(p):
        counts[str(p.parent)] += 1

recursive_rows = sorted(counts.items(), key=lambda x: x[1], reverse=True)
for folder, count in recursive_rows:
    print(f"{count:6}  {folder}")
print(f"TOTAL (recursive): {sum(counts.values())}")

# 3) Optionally persist the recursive per-folder counts
if SAVE_CSV:
    pd.DataFrame(recursive_rows, columns=["folder", "image_count"]).to_csv(
        COUNTS_CSV_PATH, index=False, encoding="utf-8"
    )
    print(f"Saved: {COUNTS_CSV_PATH}")


== Immediate folders ==
     0  C:\Users\ADMIN\Downloads\emotion classification\Data\data_emotions\splits
TOTAL (immediate): 0

== Recursive (every folder) ==
TOTAL (recursive): 0
