In [13]:
import json
import argparse
from collections import defaultdict,Counter
from sklearn.model_selection import train_test_split

def split_coco_by_material(coco_json_path, train_out, test_out, test_size=0.2, random_state=72):
    """
    Split a COCO dataset into train/test text files stratified by material composition.

    Each image that has at least one "textile" annotation with a positive
    material attribute receives a composition label: the sorted names of those
    attributes joined with "_" (e.g. "cotton_polyester"). Compositions that
    occur only once cannot be stratified and are left out of both splits.
    The output files hold one entry per line (no trailing newline); each entry
    is the part of the image's ``file_name`` before its first underscore.

    Parameters
    ----------
    coco_json_path : str
        Path to COCO-style annotation JSON.
    train_out : str
        Path to output train.txt.
    test_out : str
        Path to output test.txt.
    test_size : float
        Fraction for test split (e.g. 0.2 for 20%).
    random_state : int
        Random seed for reproducibility.

    Raises
    ------
    ValueError
        If no image carries a textile annotation with a positive material
        attribute, or if every composition occurs only once so that nothing
        is left to stratify.
    """

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(coco_json_path, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # Map image_id -> file-name prefix (everything before the first underscore).
    id2name = {img["id"]: img["file_name"].split('_')[0] for img in coco["images"]}

    # Map category_id -> name once instead of scanning the category list for
    # every annotation. setdefault keeps the first entry when ids repeat.
    cat_names = {}
    for cat in coco.get("categories", []):
        cat_names.setdefault(cat["id"], cat.get("name"))

    # Collect textile compositions per image
    image_materials = defaultdict(set)

    for ann in coco["annotations"]:
        cat_id = ann.get("category_id")
        if cat_id is None or cat_names.get(cat_id) != "textile":
            continue

        # Material attributes are expected as {material: percentage};
        # a missing or null "attributes" entry is treated as empty.
        attributes = ann.get("attributes") or {}

        for material, perc in attributes.items():
            # Non-numeric values (None, strings, ...) cannot be a percentage;
            # skip them instead of failing on the comparison.
            if isinstance(perc, (int, float)) and perc > 0:
                image_materials[ann["image_id"]].add(material)

    # Build dataset (image_name, composition_label)
    data = [
        (file_name, "_".join(sorted(image_materials[img_id])))
        for img_id, file_name in id2name.items()
        if img_id in image_materials
    ]

    if not data:
        raise ValueError("No textile images found in dataset!")

    # A stratified split needs at least two samples per class, so
    # compositions seen only once are removed.
    counts = Counter(label for _, label in data)
    filtered_data = [d for d in data if counts[d[1]] > 1]

    if not filtered_data:
        raise ValueError(
            "Every material composition occurs only once; nothing left to stratify."
        )

    dropped = len(data) - len(filtered_data)
    if dropped:
        print(f"Dropped {dropped} images whose composition occurs only once")

    X = [d[0] for d in filtered_data]   # file names
    y = [d[1] for d in filtered_data]   # composition labels
    print(y)

    # Stratified split
    X_train, X_test, _, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Write outputs
    with open(train_out, "w", encoding="utf-8") as f:
        f.write("\n".join(X_train))
    with open(test_out, "w", encoding="utf-8") as f:
        f.write("\n".join(X_test))

    print(f"Train split: {len(X_train)} images")
    print(f"Test split: {len(X_test)} images")

In [14]:
# Write the stratified train/test lists for the Smartex textile annotations,
# using the function's default test fraction and seed.
split_coco_by_material(
    coco_json_path='../smartex_acquisition_ws/smartex_textile/labels/smartex_annotations_cocostyle.json',
    train_out='train.txt',
    test_out='test.txt',
)

['elastan_polyester', 'nylon_viscosa', 'acrylic', 'nylon_viscosa', 'wool', 'polyester_wool', 'cotton_polyester', 'cotton_polyester', 'cotton', 'polyester', 'cotton_elastan', 'acrylic_wool', 'acrylic_wool', 'cotton_polyester', 'acrylic', 'nylon', 'cotton', 'elastan_polyester', 'nylon', 'cotton_elastan', 'cotton', 'acrylic_wool', 'cotton', 'acrylic_wool', 'cotton', 'nylon_viscosa', 'polyester_wool', 'cotton_elastan', 'polyester_wool', 'cotton_polyester', 'polyester', 'cotton_polyester', 'nylon', 'polyester', 'acrylic_wool', 'nylon', 'polyester_wool', 'wool', 'acrylic_wool', 'acrylic', 'elastan_polyester', 'nylon', 'acrylic_wool', 'cotton', 'acrylic', 'nylon', 'nylon', 'nylon', 'wool', 'cotton', 'polyester_wool', 'cotton', 'polyester', 'polyester_wool', 'cotton_polyester', 'acrylic', 'wool', 'nylon_viscosa', 'cotton', 'acrylic_wool', 'polyester', 'wool', 'cotton_polyester', 'acrylic_wool', 'acrylic', 'cotton', 'nylon', 'acrylic', 'polyester_wool', 'wool', 'cotton_elastan', 'nylon', 'cotto