In [1]:
pip install pandas matplotlib seaborn


Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.10.6-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.2-cp313-cp313-win_amd64.whl.metadata (111 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp313-cp313-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.6-cp313-cp313-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   ------------------- -----

In [2]:
from pathlib import Path
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Root folder of the dataset. The other two paths are derived from it, so this
# is the only line to edit when the data lives somewhere else.
AUG_DIR = Path(r"C:\Users\ADMIN\Downloads\emotion classification\data_emotions")  # <-- change if needed
# CSV that the EDA cells load.
CSV_IN = AUG_DIR / "augmented" / "train_with_aug.csv"
# Output folder for the EDA artifacts; created (with parents) when it does not exist yet.
OUT_DIR = AUG_DIR / "augmented" / "EDA"
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [7]:
# --- Plot configuration ---
# Order in which the classes are listed on every chart.
ORDERED_CLASSES = [
    "happy", "sad", "anger", "neutral", "disgust", "surprise", "fear",
]
# Resolution used when figures are written to disk.
SAVE_DPI = 180

sns.set_theme(context="talk", style="whitegrid")

# --- Read the CSV and normalize header names (trim whitespace, lower-case) ---
df = pd.read_csv(CSV_IN)
df.columns = df.columns.map(lambda col: col.strip().lower())

# Stop early when a column the later cells rely on is absent.
needed = {"resolved_path", "image", "label", "label_norm", "label_id"}
missing = needed.difference(df.columns)
assert not missing, f"CSV is missing columns: {missing}"

# --- Augmented files carry their settings in the filename:
#     ..._r<rotation>_<hf|orig>_bc<name>_c<crop>... (matched case-insensitively) ---
pat = re.compile(
    r"_r(?P<rot>-?\d+)_(?P<flip>hf|orig)_bc(?P<bc>[a-z]+)_c(?P<crop>\d+)",
    flags=re.IGNORECASE,
)


def parse_aug(name: str):
    """Extract the augmentation tags embedded in an image filename.

    Parameters
    ----------
    name : str
        Filename (or path) to scan with ``pat``.

    Returns
    -------
    pandas.Series
        Always indexed by ``is_augmented``, ``rot``, ``flip``, ``bc`` and
        ``crop``. When the pattern is not found, ``is_augmented`` is False and
        the other four entries are None. Otherwise ``rot`` and ``crop`` are
        ints, ``flip`` is "flipped" (tag ``hf``) or "original", and ``bc`` is
        the lower-cased tag text.
    """
    found = pat.search(name)
    if found is None:
        fields = {"is_augmented": False, "rot": None, "flip": None, "bc": None, "crop": None}
    else:
        flip_tag = found["flip"].lower()
        fields = {
            "is_augmented": True,
            "rot": int(found["rot"]),
            "flip": "original" if flip_tag != "hf" else "flipped",
            "bc": found["bc"].lower(),  # dark / orig / bright
            "crop": int(found["crop"]),
        }
    return pd.Series(fields)

# Parse every filename (cast to str first), then append the resulting tag
# columns to the right of the existing ones.
image_names = df["image"].astype(str)
tags = image_names.apply(parse_aug)
df = pd.concat([df, tags], axis="columns")



## Class counts 

In [9]:
# Images per class, listed in ORDERED_CLASSES order; a class with no rows gets 0.
label_totals = df["label_norm"].value_counts().reindex(ORDERED_CLASSES, fill_value=0)
counts = label_totals.rename_axis("label_norm").reset_index(name="count")

# Draw the bar chart, write it to OUT_DIR and close the figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=counts,
    x="label_norm",
    y="count",
    order=ORDERED_CLASSES,
    edgecolor="black",
    errorbar=None,
    ax=ax,
)
ax.set_title("Class distribution (train_with_aug)")
ax.set_xlabel("Class")
ax.set_ylabel("# images")
fig.tight_layout()
fig.savefig(OUT_DIR / "class_counts_bar.png", dpi=SAVE_DPI)
plt.close(fig)


## Class percentage 

In [10]:
# Share of each class as a percentage of all counted images.
counts["percent"] = counts["count"].div(counts["count"].sum()).mul(100)

# Same chart as the raw counts, with percentages on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=counts,
    x="label_norm",
    y="percent",
    order=ORDERED_CLASSES,
    edgecolor="black",
    errorbar=None,
    ax=ax,
)
ax.set_title("Class distribution (%) (train_with_aug)")
ax.set_xlabel("Class")
ax.set_ylabel("Percent")
fig.tight_layout()
fig.savefig(OUT_DIR / "class_percent_bar.png", dpi=SAVE_DPI)
plt.close(fig)

## Original vs Augmented per class

In [11]:
# Row count for every (class, is_augmented) pair, with a readable label for the legend.
stack = df.groupby(["label_norm", "is_augmented"]).size().reset_index(name="count")
stack["kind"] = stack["is_augmented"].map({False: "original", True: "augmented"})
# Ordered categorical so the classes follow ORDERED_CLASSES.
stack["label_norm"] = pd.Categorical(stack["label_norm"], categories=ORDERED_CLASSES, ordered=True)

# Side-by-side bars per class: original vs augmented.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=stack,
    x="label_norm",
    y="count",
    hue="kind",
    order=ORDERED_CLASSES,
    edgecolor="black",
    errorbar=None,
    ax=ax,
)
ax.set_title("Original vs Augmented per class")
ax.set_xlabel("Class")
ax.set_ylabel("# images")
ax.legend(title="")
fig.tight_layout()
fig.savefig(OUT_DIR / "orig_vs_aug_per_class.png", dpi=SAVE_DPI)
plt.close(fig)
