In [1]:
import os
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data")

# List all subfolders (e.g., black_shirt, blue_dress, etc.)
categories = [p.name for p in DATA_DIR.iterdir() if p.is_dir()]
print(f"Found {len(categories)} folders:")
print(categories[:5])  # print a few to preview

# Create DataFrame of all image paths with folder labels
image_data = []
for label in categories:
    folder = DATA_DIR / label
    for img_file in folder.iterdir():
        if img_file.suffix.lower() in [".jpg", ".jpeg", ".png"]:
            image_data.append({
                "filepath": str(img_file.resolve()),
                "raw_label": label
            })

df = pd.DataFrame(image_data)
print(f"Total images: {len(df)}")
df.head()


Found 24 folders:
['green_shirt', 'black_shoes', 'red_dress', 'black_shirt', 'white_dress']
Total images: 11385


Unnamed: 0,filepath,raw_label
0,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt
1,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt
2,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt
3,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt
4,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt


Extract Color & Category from Folder Names

In [2]:
# Assuming all folder names are in format: color_category (e.g., red_shoes)
df[["color", "category"]] = df["raw_label"].str.split("_", expand=True)

print(df["color"].value_counts())
print(df["category"].value_counts())

df.head()


color
black    3130
blue     2863
white    1812
red      1718
green    1047
brown     815
Name: count, dtype: int64
category
shoes     3418
pants     2789
dress     2570
shirt     1686
shorts     922
Name: count, dtype: int64


Unnamed: 0,filepath,raw_label,color,category
0,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt,green,shirt
1,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt,green,shirt
2,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt,green,shirt
3,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt,green,shirt
4,/Users/mac/Documents/TestProjects/catecut-test...,green_shirt,green,shirt


Stratified Split: Train/Val/Test

In [3]:
from sklearn.model_selection import train_test_split

# Use combined label to stratify by both color & category
df["stratify_group"] = df["color"] + "_" + df["category"]

train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["stratify_group"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["stratify_group"], random_state=42)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

# Drop stratify_group and save splits
for split_df, name in zip([train_df, val_df, test_df], ["train", "val", "test"]):
    split_df.drop(columns=["stratify_group"], inplace=True)
    split_df.to_csv(f"../data/{name}_labels.csv", index=False)


Train: 7969 | Val: 1708 | Test: 1708
