In [None]:
# Extract candidate tags from the description text with a bag-of-words pass,
# then filter them by part of speech using spaCy.

from sklearn.feature_extraction.text import CountVectorizer
import spacy
# Load spaCy's small English pipeline once at module level; get_tags() below
# calls it on every candidate term to read part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

def get_tags(desc):
    """Extract candidate tag terms from a free-text description.

    Unigrams and bigrams (English stop words removed, at most 100 terms) are
    collected with ``CountVectorizer``. A term is kept only when spaCy tags
    every one of its tokens as a noun or an adjective.

    Args:
        desc: Description text. ``None``, an empty string, or whitespace-only
            text yields an empty list.

    Returns:
        A sorted list of unique lower-case tag strings. The list is empty
        when the description contains no usable terms.
    """
    # Nothing to extract; also avoids scikit-learn's "empty vocabulary" error.
    if not desc or not desc.strip():
        return []

    vectorizer = CountVectorizer(max_features=100, stop_words='english', ngram_range=(1, 2))
    try:
        # Only the learned vocabulary is needed, so skip building the count matrix.
        vectorizer.fit([desc])
    except ValueError as exc:
        # scikit-learn raises this when no token survives tokenisation and
        # stop-word removal (e.g. a description made only of stop words).
        if "empty vocabulary" in str(exc):
            return []
        raise

    allowed_pos = {"NOUN", "ADJ"}
    keep = set()
    for term in vectorizer.get_feature_names_out():
        doc = nlp(term.replace("_", " "))
        # Bigrams are kept only if both words pass the part-of-speech filter.
        if all(token.pos_ in allowed_pos for token in doc):
            keep.add(term.lower())

    # Sorted so the same description always produces the same tag order.
    return sorted(keep)

In [None]:
# Vectorize title + desc + image
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the FashionCLIP model and its matching processor from Hugging Face.
# Both are created once at module level and reused by vectorize() below.
model_name = "patrickjohncyh/fashion-clip"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

def vectorize(text, img):
    """Embed a text and an image into one unit-length vector.

    The image and the text are embedded separately with the module-level
    ``model``/``processor``, each embedding is L2-normalised, the two are
    averaged, and the average is L2-normalised again.

    Args:
        text: Text to embed; the callers in this notebook pass a single
            string built from the product name and description.
        img: Anything ``PIL.Image.open`` accepts (the callers pass a path).

    Returns:
        The first row of the combined embedding as a plain list of floats.
    """
    # The context manager releases the file handle once the pixels have been
    # decoded into the converted RGB copy.
    with Image.open(img) as raw_image:
        image = raw_image.convert("RGB")

    # Inference only: skip autograd bookkeeping so no graph is kept per item.
    with torch.no_grad():
        # 1. Get image embedding
        img_inputs = processor(images=image, return_tensors="pt")
        image_emb = model.get_image_features(**img_inputs)

        # 2. Get text embedding
        # Truncate over-long text to the tokenizer's maximum length instead of
        # letting the text encoder fail on a sequence it cannot represent.
        txt_inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True)
        text_emb = model.get_text_features(**txt_inputs)

        # 3. Normalize both embeddings so neither modality dominates the mean
        image_emb = torch.nn.functional.normalize(image_emb, p=2, dim=1)
        text_emb = torch.nn.functional.normalize(text_emb, p=2, dim=1)

        # 4. Combine and normalize again (the mean of two unit vectors is
        #    generally shorter than unit length)
        combined_emb = (image_emb + text_emb) / 2
        combined_emb = torch.nn.functional.normalize(combined_emb, p=2, dim=1)

    return combined_emb[0].cpu().tolist()

In [None]:
# men
# Load the men's catalogue JSON, tag and embed every record, and upsert it
# into the "menfit" Pinecone index.
import json
import os

from pinecone import Pinecone, PodSpec

# Close the file as soon as it is parsed; JSON text is UTF-8, so do not depend
# on the platform's default encoding.
with open("men_db.json", 'r', encoding="utf-8") as f:
    d = json.load(f)

# Read the key from the environment so it never has to be written into the
# notebook; falls back to the previous empty placeholder when it is unset.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
index = pc.Index("menfit")

# Each record is read as a dict with 'id', 'name', 'description' and 'path'.
for i in d:
    # Tags are stored on the record itself, so they are sent as metadata too.
    i['tags'] = get_tags(i['description'])
    text = f"{i['name']} . {i['description']}"
    img = i['path']

    # get vector
    vec = vectorize(text, img)

    # save in pinecone (the whole record, including the new tags, is the metadata)
    index.upsert([{
        "id": i['id'],
        "values": vec,
        "metadata": i
    }])

In [None]:
# women
# Load the women's catalogue JSON, tag and embed every record, and upsert it
# into the "womanfit" Pinecone index.
import json
import os

from pinecone import Pinecone, PodSpec

# Close the file as soon as it is parsed; JSON text is UTF-8, so do not depend
# on the platform's default encoding.
with open("women_db.json", 'r', encoding="utf-8") as f:
    d = json.load(f)

# Read the key from the environment so it never has to be written into the
# notebook; falls back to the previous empty placeholder when it is unset.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
index = pc.Index("womanfit")

# Each record is read as a dict with 'id', 'brand', 'name', 'description'
# and 'path'.
for i in d:
    # Tags are stored on the record itself, so they are sent as metadata too.
    i['tags'] = get_tags(i['description'])
    # Unlike the men's catalogue text, the brand is prepended here.
    text = f"{i['brand']} {i['name']} . {i['description']}"
    img = i['path']

    # get vector
    vec = vectorize(text, img)

    # save in pinecone (the whole record, including the new tags, is the metadata)
    index.upsert([{
        "id": i['id'],
        "values": vec,
        "metadata": i
    }])