In [7]:
#!/usr/bin/env python
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split

# ---- 1. Config -------------------------------------------------------------
# Dataset root; the contents/, styles/ and stylizations/ folders live under it.
ROOT = pathlib.Path("images")
# Destination of the generated manifest, written inside the dataset root.
OUT = ROOT.joinpath("manifest1k.csv")
# Single seed reused by every sampling, shuffling and splitting step.
SEED = 42

# ---- 2. Gather all image paths & labels -----------------------------------
# Each source folder maps to a binary label: 0 = real image, 1 = stylized (fake).
rows = []
for folder, label in [
    ("contents",     0),   # real originals
    ("styles",       0),   # real style-only
    ("stylizations", 1),   # stylized (fake)
]:
    # Path.glob yields matches in arbitrary, filesystem-dependent order.
    # Sorting pins the row order so the seeded sampling further down selects
    # the same files on every run and on every machine.
    rows.extend((str(p), label) for p in sorted((ROOT / folder).glob("*.jpg")))

# One row per image: its path (as a string) and its 0/1 label.
df = pd.DataFrame(rows, columns=["path","label"])
print(f"Found {len(df)} images: {df.label.value_counts().to_dict()}")

# ---- 3. Down-sample to at most `max_per_label` rows per class --------------
max_per_label = 500

# Draw a seeded random subset from each class (0, then 1); a class holding
# fewer than max_per_label rows contributes all of its rows.
balanced_parts = [
    subset.sample(n=min(max_per_label, len(subset)), random_state=SEED)
    for subset in (df[df["label"] == cls] for cls in (0, 1))
]

# Stack the per-class samples, shuffle them together, and renumber from 0.
merged = pd.concat(balanced_parts)
balanced = merged.sample(frac=1, random_state=SEED).reset_index(drop=True)
print(f"After balancing: {balanced.label.value_counts().to_dict()}")

# ---- 4. Stratified train/test split (80/20) -------------------------------
# Stratifying on the label keeps the class proportions equal in both splits.
train_df, test_df = train_test_split(
    balanced,
    test_size=0.2,
    random_state=SEED,
    stratify=balanced["label"],
)

# The frames returned above are row-selections of `balanced`; writing a column
# into them in place can raise SettingWithCopyWarning on pandas versions
# without Copy-on-Write. assign() builds fresh frames instead.
train_df = train_df.assign(split="train")
test_df = test_df.assign(split="test")

# Train rows first, then test rows, with a fresh 0..n-1 index.
result = pd.concat([train_df, test_df]).reset_index(drop=True)
print(f"Train/test split: {result.split.value_counts().to_dict()}")
print(f"Final class counts: {result.label.value_counts().to_dict()}")

# ---- 5. Write the manifest to OUT -----------------------------------------
# index=False leaves the DataFrame index out of the CSV, so the file holds
# only the DataFrame's columns.
row_count = len(result)
result.to_csv(path_or_buf=OUT, index=False)
print(f"Wrote {row_count} rows to {OUT}")


Found 10000 images: {0: 5000, 1: 5000}
After balancing: {1: 500, 0: 500}
Train/test split: {'train': 800, 'test': 200}
Final class counts: {1: 500, 0: 500}
Wrote 1000 rows to images\manifest1k.csv
