In [9]:
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch

# Zero-shot style lookup with CLIP: compare one garment image against a fixed
# list of candidate words and report the word with the highest probability.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

# Candidate words the image is scored against.
# NOTE(review): "jeans" names a garment rather than a style -- confirm it is
# meant to be part of this list.
style_labels = [
    "casual",
    "formal",
    "sporty",
    "streetwear",
    "bohemian",
    "business",
    "vintage",
    "jeans",
]

# Source picture, forced to RGB.
# Presumably already cropped to a single clothing item -- verify upstream.
image_path = "../gui/wardrobe_items/pants/item_0_852749.png"
image = Image.open(image_path).convert("RGB")

# Build one batch of PyTorch tensors from all label strings plus the image;
# padding=True lets labels of different token lengths share a batch.
inputs = processor(
    text=style_labels,
    images=image,
    return_tensors="pt",
    padding=True,
)

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    # Normalise the scores along dim 1 so they sum to one.
    probs = torch.softmax(logits_per_image, dim=1)

# argmax() without a dim returns an index into the flattened tensor; with a
# single input image this is presumably the position within style_labels.
best_index = int(probs.argmax())
predicted_style = style_labels[best_index]
print("Predicted style:", predicted_style)


Predicted style: jeans


In [8]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# Prompted image captioning with BLIP: generate a caption for one garment
# image, conditioned on a text prefix that steers it towards naming a style.
#
# The BLIP objects get their own names. The CLIP cell in this notebook also
# assigns `processor`, `model` and `inputs`; reusing those names here would
# make their meaning depend on which cell happened to run last.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Text prefix handed to the processor together with the image.
prompt = "a photo of a clothing item whose style is "
image = Image.open("../gui/wardrobe_items/pants/item_0_852749.png").convert("RGB")
blip_inputs = blip_processor(images=image, text=prompt, return_tensors="pt")

# Generate token ids with the model's default generation settings, then decode
# the first (only) sequence back to text without special tokens.
out = blip_model.generate(**blip_inputs)
caption = blip_processor.decode(out[0], skip_special_tokens=True)
print("Caption:", caption)

# TODO: extract the style word from `caption`.
# Example caption: "A casual blue denim jacket with a relaxed fit." -> "casual".
# NOTE(review): the recorded output shows the decoded caption beginning with
# the prompt text, so the extraction presumably has to look at what follows it.


Caption: a photo of a clothing item whose style is a jeans
