# Construire la liste des images (train/test)

In [1]:
import os
from pathlib import Path

import pandas as pd

# Dataset root: the code below expects one sub-directory per category.
# Set the MVTEC_AD_ROOT environment variable to run the notebook on another
# machine; the original local path is kept as the default.
ROOT = Path(
    os.environ.get("MVTEC_AD_ROOT", r"C:\Users\othni\Projects\mvtec_ad\data")
)

# Fail early with a readable message instead of an error raised from iterdir().
if not ROOT.is_dir():
    raise FileNotFoundError(f"Dataset root not found: {ROOT}")

# Category names are the sub-directory names, sorted for a reproducible order.
categories = sorted(d.name for d in ROOT.iterdir() if d.is_dir())
categories

['bottle',
 'cable',
 'capsule',
 'carpet',
 'grid',
 'hazelnut',
 'leather',
 'metal_nut',
 'pill',
 'screw',
 'tile',
 'toothbrush',
 'transistor',
 'wood',
 'zipper']

In [2]:
# One record (dict) per image; converted to a DataFrame in a later cell.
rows = []

for cat in categories:
    category_dir = ROOT / cat

    # train/good: every image is recorded as a normal sample (label 0).
    rows.extend(
        {
            "path": str(png),
            "category": cat,
            "split": "train",         # only the training images at this point
            "label": 0,               # 0 = normal, 1 = defect
            "defect_type": "good",
        }
        for png in sorted((category_dir / "train" / "good").glob("*.png"))
    )

    # test/<sub-folder>: the folder name is the defect type; "good" is normal
    # (label 0) and every other folder is a defect (label 1).
    defect_dirs = [p for p in sorted((category_dir / "test").iterdir()) if p.is_dir()]
    for defect_dir in defect_dirs:
        kind = defect_dir.name
        rows.extend(
            {
                "path": str(png),
                "category": cat,
                "split": "test",      # split into val/test in a later cell
                "label": int(kind != "good"),
                "defect_type": kind,
            }
            for png in sorted(defect_dir.glob("*.png"))
        )

len(rows)

5354

In [3]:
# Pin the schema explicitly: without `columns=`, an empty `rows` list yields a
# frame with no columns and later cells fail with an opaque KeyError.
IMAGE_COLUMNS = ["path", "category", "split", "label", "defect_type"]
df = pd.DataFrame(rows, columns=IMAGE_COLUMNS)

# An empty index means no image was collected; stop here with a clear message.
if df.empty:
    raise ValueError("No image was collected: check the dataset root and its layout.")

df.head()

Unnamed: 0,path,category,split,label,defect_type
0,C:\Users\othni\Projects\mvtec_ad\data\bottle\t...,bottle,train,0,good
1,C:\Users\othni\Projects\mvtec_ad\data\bottle\t...,bottle,train,0,good
2,C:\Users\othni\Projects\mvtec_ad\data\bottle\t...,bottle,train,0,good
3,C:\Users\othni\Projects\mvtec_ad\data\bottle\t...,bottle,train,0,good
4,C:\Users\othni\Projects\mvtec_ad\data\bottle\t...,bottle,train,0,good


# Séparer val et test à partir de test

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

# Seed and fraction used for the val/test split, named so they are easy to tune.
SPLIT_SEED = 42
TEST_FRACTION = 0.5

# Final partition column (train / val / test). It starts as a copy of the
# original split; the original "test" rows are re-labelled below.
df["final_split"] = df["split"]

val_indices = []
test_indices = []

# Only rows originally marked "test" are re-split; "train" rows are untouched.
df_test = df[df["split"] == "test"].copy()

# Split inside each (category, label) group to keep normal and defective
# images balanced between val and test for every category.
for (cat, label), group in df_test.groupby(["category", "label"]):
    idx = group.index.values

    if len(idx) == 1:
        # A single sample cannot be split: keep it in test.
        test_indices.extend(idx.tolist())
        continue

    idx_val, idx_test = train_test_split(
        idx,
        test_size=TEST_FRACTION,
        random_state=SPLIT_SEED,  # the same seed is reused for every group
        shuffle=True,
    )
    val_indices.extend(idx_val.tolist())
    test_indices.extend(idx_test.tolist())

# Sanity checks: val and test are disjoint and together cover every row that
# was originally marked "test".
assert not set(val_indices) & set(test_indices), "val and test overlap"
assert len(val_indices) + len(test_indices) == len(df_test), "some test rows were not assigned"

# Write the new labels back into the main frame.
df.loc[val_indices, "final_split"] = "val"
df.loc[test_indices, "final_split"] = "test"

# Quick check of the resulting partition sizes.
df["final_split"].value_counts()

final_split
train    3629
test      869
val       856
Name: count, dtype: int64

# Vérifier les effectifs par catégorie / split / label

In [5]:
# Image counts with one row per category and one column per
# (final_split, label) pair; missing combinations are filled with 0.
(
    df.groupby(["category", "final_split", "label"])
    .size()
    .unstack(level=["final_split", "label"])
    .fillna(0)
)

final_split,test,test,train,val,val
label,0,1,0,0,1
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bottle,10,32,209,10,31
cable,29,46,224,29,46
capsule,12,55,219,11,54
carpet,14,45,280,14,44
grid,11,29,264,10,28
hazelnut,20,35,391,20,35
leather,16,46,245,16,46
metal_nut,11,47,220,11,46
pill,13,71,267,13,70
screw,21,60,320,20,59


In [6]:
# Destination of the image-level index. Set the MVTEC_AD_INDEX_CSV environment
# variable to write elsewhere; the original local path is kept as the default.
output_path = Path(
    os.environ.get(
        "MVTEC_AD_INDEX_CSV",
        r"C:\Users\othni\Projects\mvtec_ad\experiments\image_level_df.csv",
    )
)

# to_csv does not create missing folders, so make sure the parent exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
df.to_csv(output_path, index=False)
output_path

WindowsPath('C:/Users/othni/Projects/mvtec_ad/experiments/image_level_df.csv')