In [None]:
from PIL import Image
import torch
import matplotlib.pyplot as plt
from transformers import YolosImageProcessor, YolosForObjectDetection
from torchvision.transforms import ToTensor, ToPILImage

In [None]:
# Here you should put the path of your image
IMAGE_PATH = "data/image_examples/person-3070570300.jpg"

In [None]:
# This is the order of the categories list. DO NOT CHANGE the order: labels are looked up by index in this list.
cats = ['shirt, blouse', 'top, t-shirt, sweatshirt', 'sweater', 'cardigan', 'jacket', 'vest', 'pants', 'shorts', 'skirt', 'coat', 'dress', 'jumpsuit', 'cape', 'glasses', 'hat', 'headband, head covering, hair accessory', 'tie', 'glove', 'watch', 'belt', 'leg warmer', 'tights, stockings', 'sock', 'shoe', 'bag, wallet', 'scarf', 'umbrella', 'hood', 'collar', 'lapel', 'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet', 'ruffle', 'sequin', 'tassel']

In [None]:
def fix_channels(t):
    """
    Convert an image tensor to a PIL image with three channels.

    Some images have 4 channels (transparent images), 2 channels (grayscale
    plus alpha) or just 1 channel (black and white images). The alpha channel
    is dropped, and a single grayscale channel is repeated three times.
    :param t: image tensor, either 2-D or with the channels in dimension 0
    :return: PIL image built from the three-channel tensor
    """
    if len(t.shape) == 2:
        # Plain 2-D grayscale plane: repeat it to build three channels.
        t = torch.stack([t, t, t])
    elif t.shape[0] in (1, 2):
        # Grayscale, optionally followed by an alpha plane: keep only the
        # first plane and repeat it three times.
        t = torch.stack([t[0], t[0], t[0]])
    elif t.shape[0] == 4:
        # Drop the fourth (alpha) channel.
        t = t[:3]
    return ToPILImage()(t)

In [None]:
def idx_to_text(i):
    """Return the category name stored at index ``i`` of the ``cats`` list."""
    category_name = cats[i]
    return category_name

In [None]:
# Random colors used for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """
    Convert boxes from (center_x, center_y, width, height) to
    (x_min, y_min, x_max, y_max).

    :param x: tensor whose dimension 1 holds the four box values
    :return: tensor with the four corner values stacked along dimension 1
    """
    center_x, center_y, width, height = x.unbind(1)
    half_width = 0.5 * width
    half_height = 0.5 * height
    corners = (
        center_x - half_width,
        center_y - half_height,
        center_x + half_width,
        center_y + half_height,
    )
    return torch.stack(corners, dim=1)

def rescale_bboxes(out_bbox, size):
    """
    Convert center-format boxes to corner format and multiply them by the
    image width and height.

    :param out_bbox: boxes in the format accepted by box_cxcywh_to_xyxy
    :param size: (width, height) pair
    :return: tensor of (x_min, y_min, x_max, y_max) boxes scaled by ``size``
    """
    img_w, img_h = size
    # x values are scaled by the width, y values by the height
    scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return box_cxcywh_to_xyxy(out_bbox) * scale

def plot_results(pil_img, prob, boxes):
    """
    Draw the predicted boxes and their category labels on top of the image,
    save the figure to "image.png" and show it.

    :param pil_img: image handed to plt.imshow
    :param prob: per-box class scores; the argmax of each row selects the label
    :param boxes: tensor of (xmin, ymin, xmax, ymax) boxes in image coordinates
    """
    plt.figure(figsize=(16,10))
    plt.imshow(pil_img)
    ax = plt.gca()
    for i, (p, (xmin, ymin, xmax, ymax)) in enumerate(zip(prob, boxes.tolist())):
        # Cycle through the palette so every box is drawn, however many there are.
        c = COLORS[i % len(COLORS)]
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color=c, linewidth=3))
        cl = p.argmax()
        ax.text(xmin, ymin, idx_to_text(cl), fontsize=10,
                bbox=dict(facecolor=c, alpha=0.8))
    plt.axis('off')
    # Save BEFORE showing: once plt.show() has displayed the figure it is
    # typically released, and a later savefig would write an empty canvas.
    plt.savefig("image.png")
    plt.show()

In [None]:
def visualize_predictions(image, outputs, threshold=0.8):
    """
    Plot the detections whose best class probability is above ``threshold``.

    :param image: image providing ``size`` for rescaling; also passed to plot_results
    :param outputs: model outputs with ``logits`` and ``pred_boxes``; only the
        first batch element is used
    :param threshold: a detection is kept when its best class probability is
        strictly greater than this value
    """
    # class probabilities of the first batch element, last class column dropped
    class_probs = outputs.logits.softmax(-1)[0, :, :-1]
    best_scores = class_probs.max(-1).values
    selected = best_scores > threshold

    # scale the selected boxes by the image size
    selected_boxes = outputs.pred_boxes[0, selected].cpu()
    scaled_boxes = rescale_bboxes(selected_boxes, image.size)

    plot_results(image, class_probs[selected], scaled_boxes)

In [None]:
MODEL_NAME = "valentinafeve/yolos-fashionpedia"

In [None]:
feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosForObjectDetection.from_pretrained(MODEL_NAME)

In [None]:
# Open by path inside a context manager so the underlying file is closed as
# soon as the pixel data has been converted to a tensor.
with Image.open(IMAGE_PATH) as source_image:
    image = fix_channels(ToTensor()(source_image))
image = image.resize((600, 800))
image

In [None]:
inputs = feature_extractor(images=image, return_tensors="pt")
# Inference only: disable gradient tracking so no autograd graph is built
# for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# visualize_predictions draws the figure itself and has no return value,
# so there is nothing to capture or display afterwards.
visualize_predictions(image, outputs, threshold=0.5)

In [None]:
def get_clothing_crops(image, outputs, threshold=0.5):
    """
    Cut every detected main clothing item out of the image.

    :param image: PIL image the detections refer to
    :param outputs: model outputs with ``logits`` and ``pred_boxes``; only the
        first batch element is used
    :param threshold: a detection is kept when its best class probability is
        strictly greater than this value
    :return: list of dicts with the keys "image" (cropped PIL image),
        "label" (category name from ``cats``) and "confidence" (float)
    """
    # 1. Class probabilities of the first batch element, last class column dropped
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > threshold

    # 2. Rescale the kept boxes by the image size
    bboxes_scaled = rescale_bboxes(outputs.pred_boxes[0, keep].cpu(), image.size)
    confidences, indices = probas[keep].max(-1)

    # 3. Indices 28 to 45 of 'cats' are garment parts and details
    # (collar, lapel, epaulette, sleeve, pocket, neckline, buckle, etc.)
    forbidden_indices = set(range(28, 46))

    img_w, img_h = image.size

    crops = []
    for (xmin, ymin, xmax, ymax), confidence, label_idx in zip(
        bboxes_scaled.tolist(), confidences.tolist(), indices.tolist()
    ):
        # SKIP if the detection is a "part" or "detail"
        if label_idx in forbidden_indices:
            continue

        # Clamp the box to the image so the crop holds real pixels only;
        # a box reaching past the border would otherwise add filler pixels.
        left = max(0, round(xmin))
        upper = max(0, round(ymin))
        right = min(img_w, round(xmax))
        lower = min(img_h, round(ymax))

        # Nothing left to crop once the box is clamped and rounded
        if right <= left or lower <= upper:
            continue

        crops.append({
            "image": image.crop((left, upper, right, lower)),
            "label": cats[label_idx],
            "confidence": confidence
        })

    return crops

In [None]:
# Assuming 'image' and 'outputs' have been computed by the cells above
detected_crops = get_clothing_crops(image, outputs, threshold=0.5)

for i, crop_data in enumerate(detected_crops):
    crop_img = crop_data["image"]
    label = crop_data["label"]

    # Save the crop
    filename = f"crop_{i}_{label.replace(', ', '_')}.png"
    crop_img.save(filename)
    print(f"Saved: {filename} (Confidence: {crop_data['confidence']:.2f})")

    # Optional: display in notebook
    # display(crop_img)

In [None]:
def show_crops(crops):
    """
    Show all crops side by side in a single row, each titled with its label
    and confidence. Does nothing when ``crops`` is empty.

    :param crops: list of dicts with "image", "label" and "confidence" entries
    """
    count = len(crops)
    if not count:
        return

    # squeeze=False always yields a 2-D array of axes, so the single-crop
    # case needs no special handling.
    _, grid = plt.subplots(1, count, figsize=(5 * count, 5), squeeze=False)

    for panel, crop in zip(grid[0], crops):
        panel.imshow(crop["image"])
        panel.set_title(f"{crop['label']}\n{crop['confidence']:.2f}")
        panel.axis('off')
    plt.show()

# Use it:
show_crops(detected_crops)

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load CLIP
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define our formality labels
# TIP: Using "a photo of..." or descriptive phrases helps CLIP's accuracy
formality_labels = [
    "formal evening wear or black tie",
    "business professional suit",
    "smart business casual clothing",
    "relaxed casual everyday clothes",
    "sportswear or gym clothes"
]

In [None]:
import numpy as np
from sklearn.cluster import KMeans

def get_dominant_color(pil_img, k=1):
    """
    Estimate the dominant color of an image with K-Means clustering.

    :param pil_img: PIL image; it is converted to RGB before clustering
    :param k: number of color clusters; the center of the most populated
        cluster is reported
    :return: ((r, g, b) tuple of ints, "#rrggbb" hex string)
    """
    # 1. Work on an RGB copy so that RGBA, grayscale and palette images all
    #    yield exactly three values per pixel; the caller's image is untouched.
    img = pil_img.convert("RGB")

    # 2. Shrink for speed (full resolution is not needed to find "Blue")
    img.thumbnail((100, 100))

    # 3. Flatten to a list of RGB pixels
    pixels = np.asarray(img).reshape(-1, 3)

    # 4. Cluster the pixels; the fixed seed makes repeated runs agree
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(pixels)

    # 5. The dominant color is the center of the cluster holding the most
    #    pixels, which is not necessarily cluster 0 when k > 1.
    cluster_sizes = np.bincount(kmeans.labels_, minlength=k)
    dominant_center = kmeans.cluster_centers_[cluster_sizes.argmax()]
    dominant_rgb = tuple(int(channel) for channel in dominant_center)

    # Convert to Hex for easy use in apps
    hex_code = '#{:02x}{:02x}{:02x}'.format(*dominant_rgb)

    return dominant_rgb, hex_code

In [None]:
def get_color_name(rgb):
    """
    Return the name of the reference color closest to ``rgb``.

    Closeness is the Euclidean distance
    $d = \\sqrt{(r_2-r_1)^2 + (g_2-g_1)^2 + (b_2-b_1)^2}$.
    On a tie the color listed first wins.
    :param rgb: (r, g, b) values
    :return: name of the closest reference color
    """
    # Small reference palette of named colors
    palette = {
        "Black": (0, 0, 0), "White": (255, 255, 255), "Grey": (128, 128, 128),
        "Red": (255, 0, 0), "Navy": (0, 0, 128), "Blue": (0, 0, 255),
        "Green": (0, 128, 0), "Beige": (245, 245, 220), "Brown": (165, 42, 42),
        "Burgundy": (128, 0, 32), "Khaki": (240, 230, 140)
    }

    query = np.array(rgb)
    closest_name, closest_dist = "Unknown", float('inf')

    for name, reference in palette.items():
        offset = query - np.array(reference)
        distance = np.sqrt(np.sum(offset ** 2))
        # strict comparison keeps the earliest color on equal distances
        if distance < closest_dist:
            closest_name, closest_dist = name, distance
    return closest_name

In [None]:
from rembg import remove
def analyze_wardrobe(crops):
    """
    Label every crop with a CLIP formality class and a dominant color.

    :param crops: list of dicts with "image" and "label" entries
    :return: list of dicts with the keys "category", "formality",
        "confidence" (string with two decimals), "color", "hex" and "image"
    """
    results = []

    for crop in crops:
        # Background removal is switched off for now because it distorted
        # the colors:
        # clean_img = remove(crop["image"]).convert("RGB")
        clean_img = crop["image"]

        # Formality: score the crop against every formality label with CLIP
        clip_inputs = clip_processor(
            text=formality_labels,
            images=clean_img,
            return_tensors="pt",
            padding=True,
        ).to(device)
        with torch.no_grad():
            clip_outputs = clip_model(**clip_inputs)

        label_probs = clip_outputs.logits_per_image.softmax(dim=1)
        top_idx = label_probs.argmax().item()
        conf_score = label_probs[0][top_idx].item()

        # Color: dominant RGB value, then its closest named color
        rgb, hex_val = get_dominant_color(clean_img)

        results.append({
            "category": crop["label"],
            "formality": formality_labels[top_idx],
            "confidence": f"{conf_score:.2f}",
            "color": get_color_name(rgb),
            "hex": hex_val,
            "image": clean_img
        })

    return results

In [None]:
# Assuming you already have 'detected_crops' from the YOLOS step
final_wardrobe = analyze_wardrobe(detected_crops)

for item in final_wardrobe:
    print(f"--- {item['category'].upper()} ---")
    print(f"Style:      {item['formality']} (Match: {item['confidence']})")
    print(f"Color:      {item['color']} ({item['hex']})")
    print("-" * 30)
    display(item['image']) # Show the clean, no-background crop

In [None]:
detected_crops

# V2 with weather suitability

In [None]:
material_labels = [
    "denim fabric", "leather or faux leather", "knitted wool or sweater material",
    "cotton or jersey fabric", "silk or satin", "synthetic gym wear material",
    "heavy puffer jacket material", "linen or thin summer fabric"
]

weather_labels = [
    "heavy winter cold weather clothing",
    "mild autumn or spring clothing",
    "hot summer weather clothing",
    "rainy and waterproof clothing"
]

In [None]:
def analyze_wardrobe_v2(crops):
    """
    Label every crop with formality, material, weather suitability and a
    dominant color.

    :param crops: list of dicts with "image" and "label" entries
    :return: list of dicts with the keys "category", "formality", "material",
        "weather", "color", "hex" and "image"
    """
    # Defined once, outside the loop, instead of being rebuilt for every crop.
    def get_best_label(img, labels):
        """Return (label, probability) of the label CLIP scores highest for img."""
        inputs = clip_processor(text=labels, images=img, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            outputs = clip_model(**inputs)
        probs = outputs.logits_per_image.softmax(dim=1)
        idx = probs.argmax().item()
        return labels[idx], probs[0][idx].item()

    results = []
    for crop in crops:
        clean_img = crop["image"]

        # 1. Analyze Style/Formality
        formality, _ = get_best_label(clean_img, formality_labels)

        # 2. Analyze Material
        material, _ = get_best_label(clean_img, material_labels)

        # 3. Analyze Weather Suitability
        weather, _ = get_best_label(clean_img, weather_labels)

        # 4. Get Color: dominant RGB value, then its closest named color
        rgb, hex_val = get_dominant_color(clean_img)
        color_name = get_color_name(rgb)

        results.append({
            "category": crop["label"],
            "formality": formality,
            # Strip the " material" / " fabric" words from the label text
            "material": material.replace(" material", "").replace(" fabric", ""),
            "weather": weather,
            "color": color_name,
            "hex": hex_val,
            "image": clean_img
        })

    return results

In [None]:
# Assuming you already have 'detected_crops' from the YOLOS step
final_wardrobe = analyze_wardrobe_v2(detected_crops)

for item in final_wardrobe:
    print(f"--- {item['category'].upper()} ---")
    print(f"Style:      {item['formality']}")
    print(f"Color:      {item['color']} ({item['hex']})")
    print(f"Material:   {item['material']}")
    print(f"Weather:    {item['weather']}")
    print("-" * 30)
    display(item['image']) # Show the clean, no-background crop

## Trying out the Fashionpedia dataset

In [None]:
# !pip install "deeplake<4"

In [None]:
# import deeplake
# import torchvision.transforms as transforms

# ds_train = deeplake.load('hub://activeloop/fashionpedia-train')
# # ds_test = deeplake.load('hub://activeloop/fashionpedia-test')

# # Creating dataloader
# # train_dataloader = ds_train.pytorch(transform={'images': transforms.Compose([
# #     transforms.Resize((512, 512)),
# #     transforms.ToTensor(),
# # ])}, num_workers=0, batch_size=4, shuffle=False)
# # test_dataloader = ds_test.pytorch(num_workers=0, batch_size=4, shuffle=False)

In [None]:
# images, images_meta, masks, boxes, categories, super_categories, areas, iscrowds, attributes = next(iter(train_dataloader))

In [None]:
# ds_train.visualize()

In [None]:
# import torch
# from torchvision import transforms

# # 1. Setup the loader to ONLY handle images (this always works)
# tform = transforms.Compose([
#     transforms.Resize((512, 512)),
#     transforms.ToTensor(),
# ])

# train_dataloader = ds_train.pytorch(
#     tensors=['images'], # Only pull images automatically
#     batch_size=4,
#     transform={'images': tform},
#     decode_method={'images': 'pil'},
#     return_index=True, # THIS IS KEY: It gives us the index of the sample
#     num_workers=0
# )

# # 2. Get a batch
# for data in train_dataloader:
#   images = data['images']
#   idxs = data['index'] # These are the row numbers in the dataset

#   # 3. Use those indexes to pull labels DIRECTLY from the dataset
#   # This bypasses the DataLoader's empty tensor issue
#   batch_categories = [ds_train.categories[i].numpy() for i in idxs]
#   batch_attributes = [ds_train.attributes[i].numpy() for i in idxs]

#   print(f"Batch Image Shape: {images.shape}")
#   print(f"Indices in this batch: {idxs.tolist()}")
#   print(f"Actual Categories for first image: {batch_categories[0]}")

In [None]:
# batch

In [None]:
# print(ds_train)

In [None]:
# from torchvision import transforms

# tform = transforms.Compose([
#     transforms.Resize((512, 512)),
#     transforms.ToTensor(),
# ])

# # 1. Ask for images and index
# train_dataloader = ds_train.pytorch(
#     tensors=['images'],
#     batch_size=4,
#     transform={'images': tform},
#     decode_method={'images': 'pil'},
#     return_index=True  # This gives us the 'row number' in the dataset
# )

# # 2. Grab a batch
# batch = next(iter(train_dataloader))
# indices = batch['index']

# print(f"Indices in this batch: {indices.tolist()}")

# # 3. Use those indices to pull the labels directly from the dataset object
# for i in indices:
#     idx = i.item()
#     cat = ds_train.categories[idx].numpy()
#     print(f"Image Index {idx} Categories: {cat}")

In [None]:
# # 1. Get the class names list from the dataset
# class_names = ds_train.categories.info.class_names

# # 2. Get the indices from your last successful batch
# indices = batch['index'].tolist()

# # 3. Print the "Human Readable" labels
# for i, idx in enumerate(indices):
#     raw_ids = ds_train.categories[idx].numpy()

#     # Convert IDs to names (ignoring padding/background if necessary)
#     names = [class_names[cid] for cid in raw_ids if cid < len(class_names)]

#     print(f"Image {i} (Index {idx}) contains: {', '.join(names)}")

In [None]:
# def display_batch(features_batch, labels_batch):
#     """
#     This function displays the batch of images and their labels
#     :param features_batch: the batch of images as a tensor
#     :param labels_batch: the batch of labels as a tensor
#     :return:
#     """
#     len_batch = len(features_batch)
#     cols = 7
#     rows = len_batch-cols
#     fig = plt.figure(figsize=(10,rows*2))

#     for i in range(1, len_batch):
#         image, label = features_batch[i], labels_batch[i]
#         fig.add_subplot(rows, cols, i)
#         imshow(image, label=f"{i}- {class_names[label]}")
#     fig.show();

In [None]:
# # Check if your dataset has the class names stored
# class_names = ds_train.categories.info.class_names
# print(f"Category ID 23 is: {class_names[23]}")

In [None]:
# # Access the categories from your batch
# labels = batch['categories']

# print("--- Category Labels ---")
# print(labels)

# # If you want to see how many objects are in each of the 4 images:
# for i, sample_labels in enumerate(labels):
#     # Filter out padding (Fashionpedia often uses 0 or -1 for padding)
#     true_labels = sample_labels[sample_labels != 0]
#     print(f"Image {i} has {len(true_labels)} objects. Category IDs: {true_labels.tolist()}")

In [None]:
# # Show a sample
# torch.manual_seed(RANDOM_SEED)
# random_idx = torch.randint(0, len(train_features_batch), size=[1]).item()
# image, label = train_features_batch[random_idx], train_labels_batch[random_idx]
# display_batch(train_features_batch, train_labels_batch) # Utilized the created functoin here

In [2]:
from datasets import load_dataset

dataset = load_dataset('detection-datasets/fashionpedia')
dataset

DatasetDict({
    train: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 45623
    })
    val: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 1158
    })
})

In [3]:
import torch
from torchvision import transforms

def collate_fn(batch):
    """
    Combine dataset items into one batch.

    :param batch: sequence of dicts with "pixel_values" and "objects" entries
    :return: (images, targets) where images is the "pixel_values" tensors
        stacked along a new first dimension and targets is the list of
        "objects" entries
    """
    pixel_values = [sample["pixel_values"] for sample in batch]
    annotations = [sample["objects"] for sample in batch]

    # torch.stack needs every image tensor to have the same shape
    return torch.stack(pixel_values), annotations

def transform_fn(examples):
    """
    Add a "pixel_values" entry holding one 512x512 tensor per image.

    :param examples: dict with an "image" entry holding PIL images; it is
        modified in place
    :return: the same dict, with every other entry (such as "objects") left
        untouched for collate_fn
    """
    # Resize to a fixed size, then convert PIL -> Tensor
    to_tensor_512 = transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
    ])

    pixel_values = []
    for img in examples["image"]:
        # force three channels before the tensor conversion
        pixel_values.append(to_tensor_512(img.convert("RGB")))

    examples["pixel_values"] = pixel_values
    return examples

# Apply the transformation to the dataset
transformed_dataset = dataset["train"].with_transform(transform_fn)

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    transformed_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

# Test a single batch
batch = next(iter(train_dataloader))
images, targets = batch
print(f"Batch images shape: {images.shape}")
print(f"Number of target dicts: {len(targets)}")

Batch images shape: torch.Size([32, 3, 512, 512])
Number of target dicts: 32
