# 캡션 생성 : gpt-4o-mini를 통해 crop 된 이미지의 캡션을 생성

## top

In [None]:
# -*- coding: utf-8 -*-
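# Stage 1: image → GPT-4o-mini tagging → CSV
# - Output CSV: OUT_CSV ("29cm/top_caption.csv"; columns: product_id, combined_text)
# - category/type are looked up in the product CSV (PRODUCT_INFO_CSV, "29cm/29cm_1000.csv")
# - Only tops (대분류 == "상의") are processed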

import os, re, io, base64, asyncio
from typing import List, Dict, Tuple
import pandas as pd
from PIL import Image
from dotenv import load_dotenv
from openai import AsyncOpenAI

# ================== Paths / environment ==================
IMAGE_DIR = "crop_29cm"  # directory listed for top_* image files
PRODUCT_INFO_CSV = "29cm/29cm_1000.csv"  # product table; columns read: 상품코드, 대분류, 소분류
OUT_CSV = "29cm/top_caption.csv"  # destination CSV (product_id, combined_text)

MAX_SIDE = 768  # default cap for the longer image side before encoding
JPEG_QUALITY = 85  # default JPEG quality used when re-encoding
MAX_CONCURRENCY = 4  # size of the semaphore that limits concurrent captioning

load_dotenv()
aclient = AsyncOpenAI()  # requires OPENAI_API_KEY

# ================== Token keys ==================
# Attribute keys, in the order the prompt asks the model to emit them.
TOK_KEYS = [
    "color.main","color.sub","pattern","pattern_scale","material.fabric",
    "sleeve.length","sleeve.width","sleeve.style","fit","neckline","collar",
    "closure","graphic","graphic_position","graphic_size","length.top",
    "sleeve.cuff","shoulder"
]

# ================== GPT 프롬프트 ==================
PROMPT = """
You are a fashion vision tagger for tops (e.g., t-shirt, hoodie, blouse, sweater, sleeveless, athleisure).
Analyze ONLY the top garment. Ignore accessories, pants, skirts, skin, hair, or background.
If the attribute truly does not exist, output "none".

Return EXACTLY 18 lowercase, comma-separated tokens as key=value pairs,
using these keys IN THIS ORDER (keys must match exactly; no extra fields):

1) color.main
   black / white / gray / beige / cream / brown / navy / blue / green / yellow / orange / red / pink / purple / unknown
2) color.sub
   second-most visible (≥15% of garment area) else none
3) pattern
   solid / stripe / check / houndstooth / herringbone / dot / floral / paisley / animal / camouflage / text / scenic / logo / geometric / abstract / lace-knit / mixed / unknown
4) pattern_scale
   small / medium / large / none / unknown   (if pattern=solid ⇒ none)
5) material.fabric
   knit / denim / leather / suede / corduroy / chiffon / satin / lace / tweed / wool-blend / woven-cotton / woven-poly / other / unknown
6) sleeve.length
   sleeveless / short / half / three-quarter / long / unknown
7) sleeve.width
   slim / regular / wide / unknown
8) sleeve.style
   none / puff / balloon / raglan / kimono / off-shoulder / cold-shoulder / bishop / roll-up / spaghetti / tank / unknown
9) fit
   slim / regular / oversized / unknown
10) neckline
   round / v / square / halter / off-shoulder / strapless / cowl / one-shoulder / boat / unknown
11) collar
   none / shirt / polo / mandarin / high-neck / hood / unknown
12) closure
   zipper / buttons / hooks / drawstring / pullover / none / unknown
13) graphic
   none / logo / text / image / photo / art / abstract / all-over / unknown
14) graphic_position
   chest / sleeve / back / hem / multi / center / none / unknown
15) graphic_size
   small / medium / large / all-over / none / unknown
16) length.top
   cropped / regular / longline / tunic / unknown
17) sleeve.cuff
   plain / ribbed / elastic / buttoned / rolled / none / unknown
18) shoulder
   dropped / raglan / regular / padded / off-shoulder / one-shoulder / unknown

---

CONSISTENCY RULES
- if pattern=solid ⇒ pattern_scale=none
- if graphic=none ⇒ graphic_position=none and graphic_size=none
- if graphic=all-over ⇒ pattern=solid
- if pattern ≠ solid and graphic ≠ none ⇒ both can coexist (e.g., floral shirt with chest logo)
- if sleeve.length=sleeveless and sleeve.style in {spaghetti, tank} ⇒ keep both
- neckline and collar are independent; use none if not present
- When unsure, output "unknown"

---

FORMAT GUARD
- Exactly 18 tokens, lowercase, comma-separated
- key=value for every token
- No spaces around commas, no explanations
- Example:
  "color.main=white,color.sub=none,pattern=solid,pattern_scale=none,material.fabric=knit,sleeve.length=short,sleeve.width=regular,sleeve.style=none,fit=slim,neckline=round,collar=shirt,closure=buttons,graphic=logo,graphic_position=chest,graphic_size=small,length.top=regular,sleeve.cuff=ribbed,shoulder=regular"

"""

# ================== 도우미 함수 ==================
def image_to_b64(image_path: str, max_side: int = MAX_SIDE, jpeg_quality: int = JPEG_QUALITY):
    """Load an image, downscale it if needed, and return it as base64 JPEG.

    Args:
        image_path: Path of the image file to encode.
        max_side: Upper bound, in pixels, for the longer image side. Larger
            images are shrunk proportionally; smaller ones are left as-is.
        jpeg_quality: Quality setting passed to the JPEG encoder.

    Returns:
        A ``(base64_text, mime_type)`` tuple; ``mime_type`` is always
        ``"image/jpeg"``.
    """
    # Close the source file deterministically instead of waiting for garbage
    # collection; convert() hands back an independent in-memory image.
    with Image.open(image_path) as src:
        im = src.convert("RGB")
    w, h = im.size
    scale = max(w, h) / float(max_side)
    if scale > 1.0:
        # Newer Pillow exposes the filter under Image.Resampling; fall back
        # to the module-level constant otherwise.
        try:
            resample = Image.Resampling.BICUBIC
        except AttributeError:
            resample = Image.BICUBIC
        # Clamp to 1 px so extreme aspect ratios never yield a zero-sized side.
        new_size = (max(1, int(w / scale)), max(1, int(h / scale)))
        im = im.resize(new_size, resample)
    buf = io.BytesIO()
    im.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
    return base64.b64encode(buf.getvalue()).decode("utf-8"), "image/jpeg"

# Extract the product_id from a file name
PID_FROM_NAME = re.compile(r"^top_([^.\\/]+)", re.IGNORECASE)
def extract_product_id_from_filename(path: str) -> str:
    """Return the lower-cased product id embedded in a ``top_<id>...`` file name.

    Only the last path component is inspected. The id is everything after
    ``top_`` up to the first ``.``, ``/`` or backslash. An empty string is
    returned when the name does not start with ``top_`` (case-insensitive).
    """
    filename = os.path.basename(path)
    found = PID_FROM_NAME.search(filename)
    if not found:
        return ""
    return found.group(1).strip().lower()

# Load the product CSV (tops only)
def load_top_map(csv_path: str) -> Dict[str, Tuple[str,str]]:
    """Build a ``product code -> (major, sub)`` lookup for rows whose 대분류 is "상의".

    Every cell is read as text, blank-filled, stripped and lower-cased before
    filtering, so the returned codes are lower-case.

    Args:
        csv_path: CSV file with 상품코드, 대분류 and 소분류 columns.

    Returns:
        Dict mapping each product code to its ``(대분류, 소분류)`` pair.
    """
    table = pd.read_csv(csv_path, dtype=str).fillna("")
    table = table.apply(lambda column: column.str.strip().str.lower())
    tops = table[table["대분류"] == "상의"]
    categories = zip(tops["대분류"], tops["소분류"])
    return dict(zip(tops["상품코드"], categories))

def map_categories_and_sub(major: str, sub: str) -> Tuple[str, str]:
    """Translate Korean major/sub category labels into English tags.

    Labels are compared after removing spaces, ``/`` and ``-`` and
    lower-casing, so spellings such as "셔츠/블라우스" or "니트 스웨터" resolve
    to the same entry as their compact form. This mirrors the normalisation
    used by ``map_categories_and_type`` in the pants cell.

    Args:
        major: Major category label (대분류).
        sub: Sub category label (소분류).

    Returns:
        ``(category, type)``. The category falls back to ``"top"`` and the
        type to ``"unknown"`` when a label is not recognised.
    """
    def _compact(label: str) -> str:
        # Separator-insensitive key; also tolerates None.
        return (label or "").replace(" ", "").replace("/", "").replace("-", "").strip().lower()

    major_map = {"상의": "top"}
    sub_map = {
        "후드티": "hoodie", "셔츠블라우스": "shirt-blouse", "긴소매": "longsleeve",
        "반소매": "shortsleeve", "피케카라": "polo", "니트스웨터": "knit-sweater",
        "슬리브리스": "sleeveless",
    }
    return major_map.get(_compact(major), "top"), sub_map.get(_compact(sub), "unknown")

# ================== normalize ==================
def normalize_caption_18(text: str) -> str:
    """Normalise a raw model reply into one ``key=value`` token per ``TOK_KEYS`` entry.

    Args:
        text: Raw reply; non-string input is converted with ``str()``.

    Returns:
        Comma-separated ``key=value`` tokens in ``TOK_KEYS`` order. Keys that
        are missing from the reply, or present with an empty value, are
        emitted as ``unknown``. Tokens without ``=`` and unrecognised keys
        are ignored.
    """
    if not isinstance(text, str):
        text = str(text)
    # Line breaks act as separators too, so a one-token-per-line reply still
    # splits into individual tokens instead of collapsing into one.
    text = ",".join(text.splitlines()).strip().strip('"').strip("'")

    # Parse once; partition keeps everything after the first "=" as the value.
    parsed = {}
    for part in text.split(","):
        key, sep, value = part.partition("=")
        if sep:
            parsed[key.strip().lower()] = value.strip().lower()

    # Fill in missing or empty values.
    kv = {k: (parsed.get(k) or "unknown") for k in TOK_KEYS}

    # Rule fixes mirrored from the prompt's consistency rules.
    if kv["pattern"] == "solid":
        kv["pattern_scale"] = "none"
    if kv["graphic"] == "none":
        kv["graphic_position"] = "none"
        kv["graphic_size"] = "none"

    return ",".join(f"{k}={kv[k]}" for k in TOK_KEYS)

def tokens_to_combined_text(tokens_csv: str, category: str, type_: str) -> str:
    """Render caption tokens as the pipe-separated ``combined_text`` value.

    Tokens already containing ``=`` are kept verbatim; bare values are
    labelled with the ``TOK_KEYS`` entry at the same position.

    Args:
        tokens_csv: Comma-separated caption tokens.
        category: English category tag placed first.
        type_: English type tag placed second.

    Returns:
        ``"category=... | type=... | k1=v1 | ..."``.
    """
    labelled = [
        token if "=" in token else f"{TOK_KEYS[pos]}={token}"
        for pos, token in enumerate(piece.strip() for piece in tokens_csv.split(","))
    ]
    prefix = f"category={category} | type={type_} | "
    return prefix + " | ".join(labelled)

# ================== GPT 호출 ==================
async def tag_one(image_path: str) -> Tuple[str, str]:
    """Caption one image with gpt-4o-mini.

    The image is encoded by ``image_to_b64`` and sent as a data URL together
    with ``PROMPT`` in a single user message.

    Args:
        image_path: Path of the image to tag.

    Returns:
        ``(image_path, caption)`` where the caption has been passed through
        ``normalize_caption_18``.
    """
    encoded, mime_type = image_to_b64(image_path)
    data_url = f"data:{mime_type};base64,{encoded}"
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }
    # Fixed sampling settings are passed on every call.
    request = dict(
        model="gpt-4o-mini",
        messages=[user_message],
        max_tokens=160,
        temperature=0.0,
        top_p=1.0,
        seed=12345,
    )
    completion = await aclient.chat.completions.create(**request)
    raw_reply = completion.choices[0].message.content.strip()
    return image_path, normalize_caption_18(raw_reply)

async def run_stage1(image_paths: List[str], pid2cat: Dict[str, Tuple[str,str]]) -> pd.DataFrame:
    """Caption every image concurrently and collect one row per product.

    At most ``MAX_CONCURRENCY`` images are processed at a time. A failed
    caption is logged and replaced by an all-``unknown`` caption. The
    de-duplicated rows are written to ``OUT_CSV`` after every 10 completions
    and once more when the last task finishes.

    Args:
        image_paths: Image files to caption.
        pid2cat: Product id -> ``(major, sub)`` category labels.

    Returns:
        DataFrame with ``product_id`` and ``combined_text`` columns,
        de-duplicated on ``product_id``.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)

    async def worker(p):
        async with limiter:
            try:
                _, caption = await tag_one(p)
            except Exception as e:
                print("caption error:", p, e)
                caption = ",".join(f"{k}=unknown" for k in TOK_KEYS)
            pid = extract_product_id_from_filename(p)
            # Products missing from the lookup default to the top category.
            major, sub = pid2cat.get(pid, ("상의",""))
            cat_en, type_en = map_categories_and_sub(major, sub)
            return {
                "product_id": pid,
                "combined_text": tokens_to_combined_text(caption, cat_en, type_en),
            }

    pending = [asyncio.create_task(worker(path)) for path in image_paths]
    total = len(pending)
    rows = []

    for finished, fut in enumerate(asyncio.as_completed(pending), start=1):
        rows.append(await fut)
        if finished % 10 == 0 or finished == total:
            # Checkpoint: overwrite the CSV with everything gathered so far.
            snapshot = pd.DataFrame(rows).drop_duplicates("product_id")
            snapshot.to_csv(OUT_CSV, index=False, encoding="utf-8")
            print(f"[Stage1] {finished}/{total} saved → {OUT_CSV}")

    return pd.DataFrame(rows).drop_duplicates("product_id")

# ================== Run ==================
exts = (".jpg",".jpeg",".png",".webp",".bmp",".jfif")
# Keep files in IMAGE_DIR whose lower-cased name starts with "top_" and ends
# with one of the extensions above.
image_paths = [
    os.path.join(IMAGE_DIR,f)
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith(exts) and f.lower().startswith("top_")
]
if len(image_paths) == 0:
    raise RuntimeError("처리할 top 이미지가 없습니다.")

pid2cat = load_top_map(PRODUCT_INFO_CSV)

import nest_asyncio
nest_asyncio.apply()

# NOTE(review): top-level await — presumably this cell only runs inside a
# notebook/IPython kernel; confirm before moving it into a plain script.
df = await run_stage1(image_paths, pid2cat)
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("✅ 저장 완료:", OUT_CSV)
print(df.head())


## pants

In [None]:
# -*- coding: utf-8 -*-
# Stage 1: image → GPT-4o-mini tagging → build combined_text → CSV
# - Output CSV: OUT_CSV ("pants_caption.csv"; columns: product_id, combined_text)
# - category/type are looked up in the product CSV (PRODUCT_INFO_CSV, "29cm_1000.csv")
# - Only pants (대분류 "하의" or "바지") are processed

import os, re, io, base64, asyncio
from typing import List, Dict, Tuple
import pandas as pd
from PIL import Image
from dotenv import load_dotenv
from openai import AsyncOpenAI

# ================== Paths / environment ==================
IMAGE_DIR = "crop_0.5"          # cropped-image folder (file names: pants_<product code>.ext / jeans_ / shorts_ etc.)
PRODUCT_INFO_CSV = "29cm_1000.csv"  # [상품코드, 대분류, 소분류]
OUT_CSV = "pants_caption.csv"   # final output

MAX_SIDE = 768  # default cap for the longer image side before encoding
JPEG_QUALITY = 85  # default JPEG quality used when re-encoding
MAX_CONCURRENCY = 4  # size of the semaphore that limits concurrent captioning

load_dotenv()
aclient = AsyncOpenAI()  # requires OPENAI_API_KEY

# ================== Token keys (16 for PANTS) ==================
# Attribute keys, in the order the prompt asks the model to emit them.
TOK_KEYS = [
    "fit","rise","waistband","closure","cuffs","front.structure",
    "pockets.style","pockets.secure","material.fabric","denim.wash",
    "pattern","pattern_scale","color.main","color.sub","leg.length","hem.opening"
]

# ================== Allowed values / aliases ==================
# Pattern vocabulary accepted by _normalize_pattern_value.
ALLOWED_PATTERN = {
    "solid","stripe","check","houndstooth","herringbone","dot","floral","paisley",
    "animal","camouflage","text","scenic","logo","geometric","abstract","lace-knit","mixed","unknown"
}
# Free-form pattern labels rewritten to an allowed pattern before validation.
PATTERN_ALIASES = {
    "newspaper":"text","typographic":"text","letters":"text","letter":"text","textual":"text","script":"text","font":"text",
    "city":"scenic","building":"scenic","buildings":"scenic","map":"scenic","landmark":"scenic","architecture":"scenic",
    "chevron":"herringbone",
    "animal print":"animal","leopard":"animal","zebra":"animal","snake":"animal","giraffe":"animal","cow":"animal","tiger":"animal",
    "camo":"camouflage","military":"camouflage",
    "monogram":"logo"
}
ALLOWED_PATSCALE = {"small","medium","large","none","unknown"}
ALLOWED_COLOR = {"black","white","gray","beige","cream","brown","navy","blue","green","yellow","orange","red","pink","purple","unknown"}

# Per-attribute vocabularies; they list the same values as the prompt text.
ALLOWED_FIT = {"skinny","slim","straight","tapered","wide","relaxed","bootcut","flared","loose","unknown"}
ALLOWED_RISE = {"low","mid","high","unknown"}
ALLOWED_WAISTBAND = {"fixed","elastic","drawstring","elastic+drawstring","unknown"}
ALLOWED_CLOSURE = {"zipper","buttons","drawstring","none","unknown"}
ALLOWED_CUFFS = {"none","elastic","rib","rolled","raw","zipped","unknown"}
ALLOWED_FRONT = {"flat-front","pleated-single","pleated-double","darts-only","unknown"}
ALLOWED_POCKET_STYLE = {"5-pocket","slant","welt","patch","cargo","zip","none","unknown"}
ALLOWED_POCKET_SECURE = {"none","button","zip","flap","mixed","unknown"}
ALLOWED_FABRIC = {"denim","jersey","fleece","woven-cotton","twill","corduroy","woven-poly","nylon","ripstop","wool-blend","leather","satin","other","unknown"}
ALLOWED_DENIM_WASH = {"raw","dark-wash","mid-wash","light-wash","acid-wash","stone-wash","bleach","coated","colored","whiskered-faded","distressed","clean","none","unknown"}
ALLOWED_LENGTH = {"shorts","cropped","ankle","full","unknown"}
ALLOWED_HEMOPEN = {"narrow","regular","wide","unknown"}

def _normalize_pattern_value(p: str) -> str:
    """Map a raw pattern label onto ``ALLOWED_PATTERN``.

    The label is trimmed and lower-cased, translated through
    ``PATTERN_ALIASES`` when it is a known alias, and replaced by
    ``"unknown"`` if the result is still not an allowed pattern.
    """
    label = (p or "").strip().lower()
    canonical = PATTERN_ALIASES.get(label, label)
    if canonical in ALLOWED_PATTERN:
        return canonical
    return "unknown"

def _nz(s: str) -> str:
    """Return *s* trimmed and lower-cased; falsy input yields an empty string."""
    if not s:
        return ""
    return s.strip().lower()

# ================== GPT 프롬프트 (바지 16토큰) ==================
PROMPT = """
You are a fashion vision tagger for fashion product retrieval.
Analyze ONLY the pants region even if other items/body parts are visible.
Never infer hidden details; if not clearly visible, output "unknown".
If not applicable, output "none".
Ignore tops, footwear, accessories, or background items. Only describe the pants themselves.
IMPORTANT: Do NOT output product category (e.g., denim/jogger/slacks/…). These are provided externally.

OUTPUT
Return ONE line with EXACTLY 16 lowercase, comma-separated tokens as key=value pairs,
using these keys IN THIS ORDER (keys must match exactly; no extra fields):

1) fit
   skinny / slim / straight / tapered / wide / relaxed / bootcut / flared / loose / unknown
2) rise
   low / mid / high / unknown
3) waistband
   fixed / elastic / drawstring / elastic+drawstring / unknown
4) closure
   zipper / buttons / drawstring / none / unknown
5) cuffs
   none / elastic / rib / rolled / raw / zipped / unknown
6) front.structure
   flat-front / pleated-single / pleated-double / darts-only / unknown
7) pockets.style
   5-pocket / slant / welt / patch / cargo / zip / none / unknown
8) pockets.secure
   none / button / zip / flap / mixed / unknown
9) material.fabric
   denim / jersey / fleece / woven-cotton / twill / corduroy / woven-poly / nylon / ripstop / wool-blend / leather / satin / other / unknown
10) denim.wash
   raw / dark-wash / mid-wash / light-wash / acid-wash / stone-wash / bleach / coated / colored / whiskered-faded / distressed / clean / none / unknown
   (if material.fabric ≠ denim ⇒ set to "none")
11) pattern
   solid / stripe / check / houndstooth / herringbone / dot / floral / paisley / animal / camouflage / text / scenic / logo / geometric / abstract / lace-knit / mixed / unknown
12) pattern_scale
   small / medium / large / none / unknown (if pattern=solid ⇒ none)
13) color.main
   black / white / gray / beige / cream / brown / navy / blue / green / yellow / orange / red / pink / purple / unknown
14) color.sub
   second-most color on the pants (≥15% of pant area) else none
15) leg.length
   shorts / cropped / ankle / full / unknown
16) hem.opening
   narrow / regular / wide / unknown

---

SELECTION GUIDELINES
- fit: overall leg silhouette (tapered narrows to hem; straight stays constant; relaxed/loose is roomy; bootcut/flared widens from knee).
- rise: relative to natural waist (low <, high >; else mid). If covered by tops ⇒ unknown.
- waistband vs closure:
  elastic / elastic+drawstring ⇒ gathered stretch; drawstring shows a visible cord.
  fixed waist ⇒ usually zipper or buttons; if only a cord is visible ⇒ closure=drawstring.
- cuffs: rib/elastic bands ⇒ jogger-like; rolled = turn-ups; raw = cut-off fray; zipped = hem zippers.
- front.structure: flat-front (no pleats), pleated-single/double, darts-only (no visible pleats).
- pockets.style: 5-pocket (jeans layout), slant (chino/slacks), welt (back slit with welts), patch (sewn-on), cargo (large thigh), zip (visible zipper pockets).
  If multiple are equally present, prefer cargo > 5-pocket > slant > welt > patch > zip.
- material.fabric: pick the dominant cloth; coated denim is still denim (mark coating under denim.wash).
- denim.wash (DENIM ONLY): raw, dark-/mid-/light-wash; acid-/stone-/bleach; coated/colored; whiskered-faded; distressed; clean. If not denim ⇒ none.
- pattern vs wash: whiskers/fades/distressing are denim.wash, NOT pattern. pattern is prints/weaves (check, camo, stripe, etc.).
- pattern_scale: small < ~1/20; medium ~1/20–1/6; large > ~1/6 of visible leg height.
- colors: measure on pant area only (ignore belt/background/shoes).
- leg.length: shorts above knee; cropped noticeably above ankle; ankle around ankle bone; full covers ankle.
- hem.opening: the openness at hem relative to knee/thigh. flared/bootcut ⇒ typically wide; skinny/tapered ⇒ typically narrow.
- A small single brand logo patch/embroidery does not count as a pattern; keep pattern=solid unless logos repeat across the fabric.

---

CONSISTENCY RULES
- waistband ∈ {elastic, elastic+drawstring} without a fixed band ⇒ closure should be drawstring or none (avoid zipper/buttons unless clearly visible).
- if pattern=solid ⇒ pattern_scale=none (enforce).
- if material.fabric ≠ denim ⇒ denim.wash=none (enforce).
- fit vs hem.opening: if fit ∈ {flared, bootcut} and hem.opening is unknown ⇒ set hem.opening=wide. If fit ∈ {skinny, slim, tapered} and hem.opening is unknown ⇒ set hem.opening=narrow.
- cuffs ∈ {elastic, rib, zipped} with unknown hem.opening ⇒ prefer hem.opening=narrow.
- when attributes conflict and you cannot resolve, set the conflicting token(s) to "unknown".

---

PATTERN DISAMBIGUATION PRIORITY
animal > (stripe | check) > camouflage > logo > text > scenic
> (houndstooth | herringbone) > (floral | paisley) > (geometric | abstract) > lace-knit > solid.

---

FORMAT RULES
- Exactly 16 tokens, lowercase, comma-separated
- If <70% certain ⇒ unknown; if truly absent ⇒ none.
- Each token must be key=value
- No extra words, no explanations
- Example valid output:
  "fit=slim,rise=mid,waistband=fixed,closure=zipper,cuffs=none,front.structure=flat-front,pockets.style=5-pocket,pockets.secure=none,material.fabric=denim,denim.wash=mid-wash,pattern=solid,pattern_scale=none,color.main=blue,color.sub=none,leg.length=full,hem.opening=regular"

"""

# ================== 도우미 함수 ==================
def image_to_b64(image_path: str, max_side: int = MAX_SIDE, jpeg_quality: int = JPEG_QUALITY):
    """Load an image, downscale it if needed, and return it as base64 JPEG.

    Args:
        image_path: Path of the image file to encode.
        max_side: Upper bound, in pixels, for the longer image side. Larger
            images are shrunk proportionally; smaller ones are left as-is.
        jpeg_quality: Quality setting passed to the JPEG encoder.

    Returns:
        A ``(base64_text, mime_type)`` tuple; ``mime_type`` is always
        ``"image/jpeg"``.
    """
    # Close the source file deterministically instead of waiting for garbage
    # collection; convert() hands back an independent in-memory image.
    with Image.open(image_path) as src:
        im = src.convert("RGB")
    w, h = im.size
    scale = max(w, h) / float(max_side)
    if scale > 1.0:
        # Newer Pillow exposes the filter under Image.Resampling; fall back
        # to the module-level constant otherwise.
        try:
            resample = Image.Resampling.BICUBIC
        except AttributeError:
            resample = Image.BICUBIC
        # Clamp to 1 px so extreme aspect ratios never yield a zero-sized side.
        new_size = (max(1, int(w / scale)), max(1, int(h / scale)))
        im = im.resize(new_size, resample)
    buf = io.BytesIO()
    im.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
    return base64.b64encode(buf.getvalue()).decode("utf-8"), "image/jpeg"

# Extract the product_id from a file name — pants/jeans/shorts prefixes are accepted
PID_FROM_NAME = re.compile(r"^(?:pants|jeans|shorts)_([^.\\/]+)", re.IGNORECASE)
def extract_product_id_from_filename(path: str) -> str:
    """Return the lower-cased product id from a ``pants_`` / ``jeans_`` / ``shorts_`` file name.

    Only the last path component is examined. The id runs from the prefix's
    underscore to the first ``.``, ``/`` or backslash. Names with another
    prefix give an empty string.
    """
    match = PID_FROM_NAME.search(os.path.basename(path))
    product_id = match.group(1) if match else ""
    return product_id.strip().lower()

# Load the product CSV (pants only)
def load_pants_map(csv_path: str) -> Dict[str, Tuple[str,str]]:
    """Build a ``product code -> (major, sub)`` lookup for pants rows.

    Every cell is read as text, blank-filled, stripped and lower-cased. Rows
    are kept when 대분류 is "하의" or "바지".

    Args:
        csv_path: CSV file with 상품코드, 대분류 and 소분류 columns.

    Returns:
        Dict mapping each product code to its ``(대분류, 소분류)`` pair.
    """
    frame = pd.read_csv(csv_path, dtype=str).fillna("")
    frame = frame.apply(lambda series: series.str.strip().str.lower())
    # Only rows whose major category is "하의" or "바지" are used.
    bottoms = frame[frame["대분류"].isin(["하의","바지"])]
    return {
        code: (major, sub)
        for code, major, sub in zip(bottoms["상품코드"], bottoms["대분류"], bottoms["소분류"])
    }

# Category/type mapping (ko → en)
def map_categories_and_type(major: str, sub: str) -> Tuple[str, str]:
    """Translate Korean pants category labels into English tags.

    Labels are matched after dropping spaces, ``/`` and ``-`` and
    lower-casing.

    Args:
        major: Major category label (대분류).
        sub: Sub category label (소분류).

    Returns:
        ``(category, type)``; unrecognised labels fall back to ``"pants"``
        and ``"unknown"`` respectively.
    """
    def norm(x: str) -> str:
        cleaned = x or ""
        for separator in (" ", "/", "-"):
            cleaned = cleaned.replace(separator, "")
        return cleaned.strip().lower()

    categories = {"하의": "pants", "바지": "pants"}
    types = {
        "데님팬츠": "denim-pants",
        "트레이닝조거팬츠": "jogger-pants",
        "코튼팬츠": "cotton-pants",
        "슈트팬츠슬랙스": "slacks",
        "슈트슬랙스": "slacks",
        "숏팬츠": "short-pants",
        "레깅스": "leggings",
        "카고팬츠": "cargo-pants",
    }
    return categories.get(norm(major), "pants"), types.get(norm(sub), "unknown")

# Caption normalisation (only the 16 token values are normalised; a leading "key=" is tolerated)
def normalize_caption_pants16(text: str) -> str:
    """Validate a raw pants caption and return its 16 values, comma-joined.

    Tokens are read positionally in the prompt's key order. A ``key=`` prefix
    is stripped, the list is padded with ``unknown`` or truncated to 16
    entries, each value is checked against its allowed set, and the
    consistency rules are applied.

    Args:
        text: Raw reply; non-string input is converted with ``str()``.

    Returns:
        Sixteen comma-separated values without keys.
    """
    if not isinstance(text, str):
        text = str(text)
    flat = " ".join(text.splitlines()).strip().strip('"').strip("'")

    # Keep only the value side of every non-empty token.
    vals = []
    for token in flat.split(","):
        if not token:
            continue
        token = token.strip().lower()
        vals.append(token.split("=", 1)[1].strip() if "=" in token else token)

    # Pad / trim to exactly 16 entries.
    vals = (vals + ["unknown"] * 16)[:16]

    (fit, rise, waist, closure, cuffs, front, pstyle, psecure,
     fabric, wash, pattern, pscale, cmain, csub, length, hem) = vals

    def checked(value, allowed):
        return value if value in allowed else "unknown"

    fit = checked(fit, ALLOWED_FIT)
    rise = checked(rise, ALLOWED_RISE)
    waist = checked(waist, ALLOWED_WAISTBAND)
    closure = checked(closure, ALLOWED_CLOSURE)
    cuffs = checked(cuffs, ALLOWED_CUFFS)
    front = checked(front, ALLOWED_FRONT)
    pstyle = checked(pstyle, ALLOWED_POCKET_STYLE)
    psecure = checked(psecure, ALLOWED_POCKET_SECURE)
    fabric = checked(fabric, ALLOWED_FABRIC)

    # denim.wash is forced to "none" unless the fabric is denim.
    wash = _nz(wash)
    if fabric != "denim":
        wash = "none"
    elif wash not in ALLOWED_DENIM_WASH:
        wash = "unknown"

    # Pattern / scale: solid patterns never carry a scale.
    pattern = _normalize_pattern_value(pattern)
    pscale = _nz(pscale)
    if pattern == "solid":
        pscale = "none"
    elif pscale not in ALLOWED_PATSCALE:
        pscale = "unknown"

    # Colours: the secondary colour may additionally be "none".
    cmain = checked(cmain, ALLOWED_COLOR)
    csub = _nz(csub)
    if not (csub in ALLOWED_COLOR or csub == "none"):
        csub = "unknown"

    # Length / hem.
    length = checked(length, ALLOWED_LENGTH)
    hem = checked(hem, ALLOWED_HEMOPEN)

    # Consistency fixes.
    if waist in {"elastic", "elastic+drawstring"} and closure in {"zipper", "buttons"}:
        closure = "unknown"
    if hem == "unknown":
        if fit in {"flared", "bootcut"}:
            hem = "wide"
        elif fit in {"skinny", "slim", "tapered"}:
            hem = "narrow"
    if hem == "unknown" and cuffs in {"elastic", "rib", "zipped"}:
        hem = "narrow"

    return ",".join([
        fit, rise, waist, closure, cuffs, front, pstyle, psecure,
        fabric, wash, pattern, pscale, cmain, csub, length, hem,
    ])


# ================== GPT 호출 ==================
async def tag_one(image_path: str) -> Tuple[str, str]:
    """Caption one pants image with gpt-4o-mini.

    Sends ``PROMPT`` and the image (as a base64 data URL produced by
    ``image_to_b64``) in one user message.

    Args:
        image_path: Path of the image to tag.

    Returns:
        ``(image_path, caption)`` with the caption normalised by
        ``normalize_caption_pants16``.
    """
    payload, mime_type = image_to_b64(image_path)
    text_part = {"type": "text", "text": PROMPT}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{payload}"},
    }
    conversation = [{"role": "user", "content": [text_part, image_part]}]
    reply = await aclient.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=160,
        temperature=0.0,
        top_p=1.0,
        seed=12345,
    )
    caption = reply.choices[0].message.content.strip()
    return image_path, normalize_caption_pants16(caption)

def _pants_combined_text(tokens_csv: str, category: str, type_: str) -> str:
    """Render caption tokens as the pipe-separated ``combined_text`` value.

    Tokens already containing ``=`` are kept verbatim; bare values are
    labelled with the ``TOK_KEYS`` entry at the same position. Defined
    locally so this cell does not depend on a helper from another cell.
    """
    labelled = []
    for pos, raw in enumerate(tokens_csv.split(",")):
        tok = raw.strip()
        labelled.append(tok if "=" in tok else f"{TOK_KEYS[pos]}={tok}")
    return f"category={category} | type={type_} | " + " | ".join(labelled)


async def run_stage1(image_paths: List[str], pid2cat: Dict[str, Tuple[str,str]]) -> pd.DataFrame:
    """Caption every pants image concurrently and collect one row per product.

    At most ``MAX_CONCURRENCY`` images are processed at a time. A failed
    caption is logged and replaced by an all-``unknown`` caption. The
    de-duplicated rows are written to ``OUT_CSV`` after every 10 completions
    and once more when the last task finishes.

    Args:
        image_paths: Image files to caption.
        pid2cat: Product id -> ``(major, sub)`` category labels.

    Returns:
        DataFrame with ``product_id`` and ``combined_text`` columns,
        de-duplicated on ``product_id`` (first occurrence kept).
    """
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    out = []
    done = 0

    async def worker(p):
        async with sem:
            try:
                _, cap16 = await tag_one(p)
            except Exception as e:
                print("caption error:", p, e)
                cap16 = ",".join([f"{k}=unknown" for k in TOK_KEYS])
            pid = extract_product_id_from_filename(p)
            major, sub = pid2cat.get(pid, ("하의", ""))  # pants only
            cat_en, type_en = map_categories_and_type(major, sub)
            # Use the in-cell helper: tokens_to_combined_text is not defined
            # in this cell, so calling it raised NameError on a fresh kernel.
            combined = _pants_combined_text(cap16, cat_en, type_en)
            return {"product_id": pid, "combined_text": combined}

    tasks = [asyncio.create_task(worker(p)) for p in image_paths]

    for fut in asyncio.as_completed(tasks):
        row = await fut
        out.append(row)
        done += 1

        # Intermediate save every 10 rows (overwrites the file)
        if done % 10 == 0 or done == len(tasks):
            df_partial = pd.DataFrame(out).drop_duplicates(subset=["product_id"], keep="first")
            df_partial.to_csv(OUT_CSV, index=False, encoding="utf-8")
            print(f"[Stage1] {done}/{len(tasks)} saved → {OUT_CSV}")

    # Return the final DataFrame
    return pd.DataFrame(out).drop_duplicates(subset=["product_id"], keep="first")

# ================== Run ==================
# Select only pants_* / jeans_* / shorts_* images
exts = (".jpg",".jpeg",".png",".webp",".bmp",".jfif")
image_paths = [
    os.path.join(IMAGE_DIR, f)
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith(exts) and (f.lower().startswith("pants_") or f.lower().startswith("jeans_") or f.lower().startswith("shorts_"))
]
if len(image_paths) == 0:
    raise RuntimeError("처리할 pants/jeans/shorts 이미지가 없습니다.")

pid2cat = load_pants_map(PRODUCT_INFO_CSV)

# In Jupyter, use await instead of asyncio.run()
import nest_asyncio, asyncio
nest_asyncio.apply()

df = await run_stage1(image_paths, pid2cat)   # executed here via top-level await
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("✅ 저장 완료:", OUT_CSV)
df.head()


## skirt

In [None]:
# -*- coding: utf-8 -*-
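# Stage 1: image → GPT-4o-mini tagging → build combined_text → CSV
# - Output CSV: OUT_CSV ("29cm/skirt_caption.csv"; columns: product_id, combined_text)
# - category/type are looked up in the product CSV (PRODUCT_INFO_CSV, "29cm/29cm_1000.csv")
# - Only skirts (대분류 == "스커트") are processed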

import os, re, io, base64, asyncio
from typing import List, Dict, Tuple
import pandas as pd
from PIL import Image
from dotenv import load_dotenv
from openai import AsyncOpenAI

# ================== Paths / environment ==================
IMAGE_DIR = "crop_29cm"  # directory holding the cropped images
PRODUCT_INFO_CSV = "29cm/29cm_1000.csv"  # product table; columns read: 상품코드, 대분류, 소분류
OUT_CSV = "29cm/skirt_caption.csv"  # destination CSV (product_id, combined_text)

MAX_SIDE = 768  # default cap for the longer image side before encoding
JPEG_QUALITY = 85  # default JPEG quality used when re-encoding
MAX_CONCURRENCY = 4  # size of the semaphore that limits concurrent captioning

load_dotenv()
aclient = AsyncOpenAI()  # requires OPENAI_API_KEY

# ================== Token keys (15) ==================
# Attribute keys, in the order the prompt asks the model to emit them.
TOK_KEYS = [
    "color.main","color.sub","color.tone","pattern","pattern_scale",
    "material.fabric","silhouette","pleated","flare_level","wrap",
    "closure","details.pockets","details.slit","details.hem_finish",
    "style"
]

# ================== GPT 프롬프트 ==================
PROMPT = """
You are a vision tagger for fashion product retrieval.
Analyze ONLY the skirt region even if other items/body parts are visible.
Never infer hidden details.  
If an attribute is not clearly visible:
- For closure, details.pockets, details.slit: output "none"
- For all other attributes: output "unknown"

If the attribute truly does not exist, output "none".  
Ignore hands, legs, accessories, or background items. Only describe the skirt itself.

Return ONE line with EXACTLY 15 lowercase, comma-separated tokens as key=value pairs,
using these keys IN THIS ORDER (keys must match exactly; no extra fields):

1) color.main
   black / white / gray / beige / cream / brown / navy / blue / green / yellow / orange / red / pink / purple / unknown
2) color.sub
   second-most visible (≥15% of skirt area) else none
3) color.tone
   light / mid / dark / unknown
4) pattern
   solid / stripe / check / houndstooth / herringbone / dot / floral / paisley / animal / camouflage / text / scenic / logo / geometric / abstract / lace-knit / mixed / unknown
5) pattern_scale
   small / medium / large / none / unknown (if pattern=solid ⇒ none)
6) material.fabric
   knit / denim / leather / suede / corduroy / chiffon / satin / lace / tweed / wool-blend / woven-cotton / woven-poly / tulle / other / unknown
7) silhouette
   a-line / h-line / pencil / trumpet / mermaid / pleated / tulip / bubble / asymmetric / wrap / layered / gypsy / tiered / prairie / flounced / bias / draped / peplum / pant-skirt / slit / sarong / other / unknown
8) pleated
   yes / no / unknown
9) flare_level
   low / medium / high / none / unknown
10) wrap
   yes / no / unknown
11) closure
   zipper / buttons / hooks / drawstring / none / unknown
12) details.pockets
   cargo / welt / patch / none / unknown
13) details.slit
   front / side / back / none / unknown
14) details.hem_finish
   cutoff / clean / uneven / rolled / raw / unknown
15) style
   casual / formal / office / evening / street / school / sporty / unknown

---

CONSISTENCY RULES
- if pattern=solid ⇒ pattern_scale=none
- if silhouette=pleated ⇒ pleated=yes
- if pleated=yes but silhouette≠pleated ⇒ silhouette must not be "unknown"
- if wrap=yes ⇒ closure=none

---

FORMAT RULES
- Exactly 15 tokens, lowercase, comma-separated
- key=value for every token
- No extra words, no explanations
- Example valid output:
  "color.main=black,color.sub=none,color.tone=dark,pattern=solid,pattern_scale=none,material.fabric=chiffon,silhouette=pleated,pleated=yes,flare_level=high,wrap=no,closure=zipper,details.pockets=none,details.slit=side,details.hem_finish=clean,style=casual"

"""

# ================== normalize ==================
def normalize_caption_15(text: str) -> str:
    """Normalise a raw skirt caption into one ``key=value`` token per ``TOK_KEYS`` entry.

    Both ``key=value`` tokens (the format the prompt requests) and bare
    values are accepted. A token with a recognised key is stored under that
    key; any other token is stored by position. Validation and the
    consistency rules run on the values, never on whole ``key=value`` tokens.

    Args:
        text: Raw reply; non-string input is converted with ``str()``.

    Returns:
        Comma-separated ``key=value`` tokens in ``TOK_KEYS`` order. Missing
        or empty values are ``unknown``; tokens beyond the last key are
        dropped.
    """
    if not isinstance(text, str):
        text = str(text)
    text = " ".join(text.splitlines()).strip().strip('"').strip("'")

    slot_of = {name: idx for idx, name in enumerate(TOK_KEYS)}
    values = ["unknown"] * len(TOK_KEYS)
    tokens = [t.strip().lower() for t in text.split(",") if t]
    for pos, token in enumerate(tokens):
        key, sep, val = token.partition("=")
        if sep:
            # Prefer the slot named by the key; unrecognised keys fall back
            # to the token's position.
            idx = slot_of.get(key.strip(), pos)
            val = val.strip()
        else:
            idx, val = pos, token
        if idx < len(values) and val:
            values[idx] = val

    kv = dict(zip(TOK_KEYS, values))

    # Closed vocabularies; anything else becomes "unknown".
    allowed = {
        "pleated": ("yes", "no", "unknown"),
        "wrap": ("yes", "no", "unknown"),
        "flare_level": ("low", "medium", "high", "none", "unknown"),
        "details.slit": ("front", "side", "back", "none", "unknown"),
        "details.hem_finish": ("cutoff", "clean", "uneven", "rolled", "raw", "unknown"),
    }
    for key, choices in allowed.items():
        if kv[key] not in choices:
            kv[key] = "unknown"

    # Consistency rules mirrored from the prompt.
    if kv["pattern"] == "solid":
        kv["pattern_scale"] = "none"
    if kv["silhouette"] == "pleated":
        kv["pleated"] = "yes"
    if kv["wrap"] == "yes":
        kv["closure"] = "none"

    return ",".join(f"{k}={kv[k]}" for k in TOK_KEYS)

def tokens_to_combined_text(tokens_csv: str, category: str, type_: str) -> str:
    """Render caption tokens as the pipe-separated ``combined_text`` value.

    Tokens that already contain ``=`` pass through unchanged; a bare value
    is prefixed with the ``TOK_KEYS`` entry at the same index.

    Args:
        tokens_csv: Comma-separated caption tokens.
        category: English category tag placed first.
        type_: English type tag placed second.

    Returns:
        ``"category=... | type=... | k1=v1 | ..."``.
    """
    def labelled(index: int, token: str) -> str:
        return token if "=" in token else f"{TOK_KEYS[index]}={token}"

    stripped = (piece.strip() for piece in tokens_csv.split(","))
    body = " | ".join(labelled(index, token) for index, token in enumerate(stripped))
    return f"category={category} | type={type_} | " + body

# ================== 제품 CSV 매핑 ==================
def load_category_map_fixed(csv_path: str) -> Dict[str, Tuple[str, str]]:
    """Build a ``product code -> (major, sub)`` lookup for rows whose 대분류 is "스커트".

    Every cell is read as text, blank-filled, stripped and lower-cased before
    filtering.

    Args:
        csv_path: CSV file with 상품코드, 대분류 and 소분류 columns.

    Returns:
        Dict mapping each product code to its ``(대분류, 소분류)`` pair.
    """
    raw = pd.read_csv(csv_path, dtype=str)
    cleaned = raw.fillna("").apply(lambda column: column.str.strip().str.lower())
    skirts = cleaned[cleaned["대분류"] == "스커트"]
    pairs = zip(skirts["대분류"], skirts["소분류"])
    return dict(zip(skirts["상품코드"], pairs))

PID_RE = re.compile(r"^skirt_([^.\\/]+)", re.IGNORECASE)
def extract_product_id_from_filename(path: str) -> str:
    """Return the lower-cased product id embedded in a ``skirt_<id>...`` file name.

    Only the last path component is checked; names that do not start with
    ``skirt_`` (case-insensitive) give an empty string.
    """
    hit = PID_RE.search(os.path.basename(path))
    if hit is None:
        return ""
    return hit.group(1).strip().lower()

def map_categories_and_sub(major: str, sub: str) -> Tuple[str, str]:
    """Translate Korean skirt category labels into English tags.

    Args:
        major: Major category label (대분류).
        sub: Sub category label (소분류).

    Returns:
        ``(category, type)``; unrecognised labels fall back to ``"skirt"``
        and ``"unknown"`` respectively.
    """
    category = {"스커트": "skirt"}.get(major, "skirt")
    skirt_types = {
        "미니스커트": "miniskirt",
        "미디스커트": "midiskirt",
        "롱스커트": "longskirt",
    }
    return category, skirt_types.get(sub, "unknown")

# ================== GPT 호출 ==================
def image_to_b64(image_path: str, max_side: int = MAX_SIDE, jpeg_quality: int = JPEG_QUALITY):
    """Load an image, downscale it if needed, and return it as base64 JPEG.

    Args:
        image_path: Path of the image file to encode.
        max_side: Upper bound, in pixels, for the longer image side. Larger
            images are shrunk proportionally; smaller ones are left as-is.
        jpeg_quality: Quality setting passed to the JPEG encoder.

    Returns:
        A ``(base64_text, mime_type)`` tuple; ``mime_type`` is always
        ``"image/jpeg"``.
    """
    # Close the source file deterministically instead of waiting for garbage
    # collection; convert() hands back an independent in-memory image.
    with Image.open(image_path) as src:
        im = src.convert("RGB")
    w, h = im.size
    scale = max(w, h) / float(max_side)
    if scale > 1.0:
        # Newer Pillow exposes the filter under Image.Resampling; fall back
        # to the module-level constant otherwise.
        try:
            resample = Image.Resampling.BICUBIC
        except AttributeError:
            resample = Image.BICUBIC
        # Clamp to 1 px so extreme aspect ratios never yield a zero-sized side.
        new_size = (max(1, int(w / scale)), max(1, int(h / scale)))
        im = im.resize(new_size, resample)
    buf = io.BytesIO()
    im.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
    return base64.b64encode(buf.getvalue()).decode("utf-8"), "image/jpeg"

async def tag_one(image_path: str) -> Tuple[str, str]:
    """Caption one image with gpt-4o-mini and return (image_path, caption).

    The image is sent as a base64 data URL together with PROMPT; the reply
    text is passed through normalize_caption_15 before being returned.
    """
    encoded, mime_type = image_to_b64(image_path)
    text_part = {"type": "text", "text": PROMPT}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
    }
    completion = await aclient.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": [text_part, image_part]}],
        max_tokens=160,
        temperature=0.0,
        top_p=1.0,
        seed=12345,
    )
    raw_caption = completion.choices[0].message.content.strip()
    return image_path, normalize_caption_15(raw_caption)

# ================== Stage1 실행 ==================
async def run_stage1(image_paths: List[str], pid2cat: Dict[str, Tuple[str,str]]) -> pd.DataFrame:
    """Caption every image concurrently and return one row per product id.

    At most MAX_CONCURRENCY captions run at a time. A failed caption is
    replaced by "<key>=unknown" for every key in TOK_KEYS. The rows collected
    so far are written to OUT_CSV after every 10th result and after the last.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)
    rows = []

    async def worker(path):
        async with limiter:
            try:
                _path, caption = await tag_one(path)
            except Exception as e:
                print("caption error:", path, e)
                caption = ",".join(f"{k}=unknown" for k in TOK_KEYS)
            pid = extract_product_id_from_filename(path)
            major, sub = pid2cat.get(pid, ("스커트", ""))
            cat_en, type_en = map_categories_and_sub(major, sub)
            return {
                "product_id": pid,
                "combined_text": tokens_to_combined_text(caption, cat_en, type_en),
            }

    tasks = [asyncio.create_task(worker(p)) for p in image_paths]
    total = len(tasks)

    for done, fut in enumerate(asyncio.as_completed(tasks), start=1):
        rows.append(await fut)
        if done % 10 == 0 or done == total:
            snapshot = pd.DataFrame(rows).drop_duplicates("product_id")
            snapshot.to_csv(OUT_CSV, index=False, encoding="utf-8")
            print(f"[Stage1] {done}/{total} saved → {OUT_CSV}")

    return pd.DataFrame(rows).drop_duplicates("product_id")

# ================== Run ==================
# Collect the skirt_* files in IMAGE_DIR that have a supported image extension.
exts = (".jpg",".jpeg",".png",".webp",".bmp",".jfif")
image_paths = [
    os.path.join(IMAGE_DIR,f)
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith(exts) and f.lower().startswith("skirt_")
]
if len(image_paths) == 0:
    raise RuntimeError("처리할 skirt 이미지가 없습니다.")

pid2cat = load_category_map_fixed(PRODUCT_INFO_CSV)

# nest_asyncio is applied before awaiting; presumably needed because the
# notebook already runs an event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

# Top-level await: this cell is meant to run inside a notebook.
df = await run_stage1(image_paths, pid2cat)
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("✅ 저장 완료:", OUT_CSV)
df.head()


## dress

In [None]:
# -*- coding: utf-8 -*-
# Stage 1: 이미지 → GPT-4o-mini 태깅 → combined_text 생성 → CSV 저장
# - 결과 CSV: dress_caption.csv  (columns: product_id, combined_text)
# - category/type은 원본 CSV(29cm_1000.csv)에서 조회
# - dress (원피스)만 처리

import os, re, io, base64, asyncio
from typing import List, Dict, Tuple
import pandas as pd
from PIL import Image
from dotenv import load_dotenv
from openai import AsyncOpenAI

# ================== Paths / environment ==================
# NOTE(review): the caption-split cell reads "29cm/dress_caption.csv" and the
# other cells use "crop_29cm" / "29cm/29cm_1000.csv" — confirm that the
# paths below are the intended ones.
IMAGE_DIR = "crop_0.5"          # folder of cropped images (file names like dress_<product code>.ext)
PRODUCT_INFO_CSV = "29cm_1000.csv"  # columns: [상품코드, 대분류, 소분류]
OUT_CSV = "dress_caption.csv"       # final output

MAX_SIDE = 768          # default longest image side (px) used by image_to_b64
JPEG_QUALITY = 85       # default JPEG quality used by image_to_b64
MAX_CONCURRENCY = 4     # size of the semaphore that limits concurrent captions

load_dotenv()
aclient = AsyncOpenAI()  # requires OPENAI_API_KEY

# ================== Token keys ==================
# The 17 attribute keys, in the same order the prompt asks the model to emit them.
TOK_KEYS = [
    "bodice.fit","skirt.volume","dress.length","hemline.shape","hem.finish",
    "waistline","neckline","sleeve.length","sleeve.style","skirt.structure",
    "pattern","pattern_scale","material.fabric","color.main","color.sub","closure","skirt.slit"
]

# ================== Allowed values / aliases ==================
# Values accepted for the "pattern" field; anything else is mapped to "unknown".
ALLOWED_PATTERNS = {
    "solid","stripe","check","houndstooth","herringbone","dot","floral","paisley",
    "animal","camouflage","text","scenic","logo","geometric","abstract","lace-knit","mixed","unknown"
}
# Synonyms rewritten to a canonical pattern before the allow-list check.
PATTERN_ALIASES = {
    "newspaper":"text","typographic":"text","letters":"text","letter":"text","textual":"text","script":"text","font":"text",
    "city":"scenic","building":"scenic","map":"scenic","chevron":"herringbone",
    "animal print":"animal","leopard":"animal","zebra":"animal","snake":"animal","tiger":"animal","cow":"animal",
    "camo":"camouflage","monogram":"logo","heather":"abstract","marled":"abstract","tie-dye":"abstract","ikat":"abstract"
}
# Allow-lists for the remaining validated fields.
ALLOWED_PATSCALE = {"small","medium","large","none","unknown"}
ALLOWED_COLOR = {"black","white","gray","beige","cream","brown","navy","blue","green","yellow","orange","red","pink","purple","unknown"}
ALLOWED_HEM_FINISH = {"clean","rolled","lettuce","scalloped","lace-trim","ruffle-trim","fringed","binding","raw","cutoff","pleated-hem","unknown"}
ALLOWED_SLIT = {"none","front","side","back","two-side","high-slit","unknown"}
ALLOWED_MATERIAL = {
    "cotton","linen","wool","silk","satin","denim","leather","suede","tweed","knit","rib-knit",
    "lace","chiffon","tulle","velvet","corduroy","fleece","jersey","terry","seersucker","poplin",
    "crepe","organza","brocade","jacquard","modal","rayon","viscose","lyocell","tencel",
    "polyester","nylon","elastane","spandex","acrylic","pu","mesh","eyelet","crochet",
    "faux-fur","faux-leather","blended","unknown"
}
# Synonyms / misspellings rewritten to a canonical material before the allow-list check.
# NOTE(review): "spandex" is both an allowed material and an alias for "elastane" — confirm intent.
MATERIAL_ALIASES = {
    "poly":"polyester","polyamide":"nylon","spandex":"elastane","elastan":"elastane","lycra":"elastane",
    "pleather":"faux-leather","fake leather":"faux-leather","artificial leather":"faux-leather",
    "tencel™":"tencel","viscosa":"viscose","modal rayon":"modal","cotten":"cotton"
}

def _normalize_pattern_value(p: str) -> str:
    """Return the canonical pattern name for ``p``, or "unknown".

    The value is stripped and lower-cased, rewritten through PATTERN_ALIASES,
    and accepted only if it is in ALLOWED_PATTERNS.
    """
    cleaned = (p or "").strip().lower()
    canonical = PATTERN_ALIASES.get(cleaned, cleaned)
    if canonical in ALLOWED_PATTERNS:
        return canonical
    return "unknown"

def _nz(s: str) -> str:
    """Strip and lower-case ``s``; falsy input (None, "") yields ""."""
    if not s:
        return ""
    return s.strip().lower()

# ================== GPT prompt ==================
# Instruction text sent with every dress image. It asks for exactly 17
# key=value tokens; the keys and their order match TOK_KEYS.
PROMPT = """
You are a vision tagger for fashion product retrieval.
Analyze ONLY the dress (one-piece) region even if other items/body parts are visible.
Never infer hidden details; if not clearly visible, output "unknown".
If not applicable, output "none".
Ignore hands, legs, accessories, or background items. Only describe the dress itself.

Return ONE line with EXACTLY 17 lowercase, comma-separated tokens as key=value pairs,
using these keys IN THIS ORDER (keys must match exactly; no extra fields):

1) bodice.fit
   fitted / semi / relaxed / unknown
2) skirt.volume
   slim / straight / a-line / full / mermaid / ball-gown / pencil / tulip / unknown
3) dress.length
   mini / knee / midi / ankle / maxi / unknown
4) hemline.shape
   straight / high-low / asymmetric / mermaid / ruffled / layered / wrap / train / handkerchief / bubble / shirttail / none / unknown
5) hem.finish
   clean / rolled / lettuce / scalloped / lace-trim / ruffle-trim / fringed / binding / raw / cutoff / pleated-hem / unknown
6) waistline
   none / natural / high / empire / drop / unknown
7) neckline
   round / v / square / halter / collar / off-shoulder / strapless / cowl / keyhole / boat / one-shoulder / unknown
8) sleeve.length
   sleeveless / cap / short / elbow / three-quarter / long / one-shoulder / strapless / unknown
9) sleeve.style
   none / puff / balloon / raglan / kimono / off-shoulder / cold-shoulder / bishop / roll-up / unknown
10) skirt.structure
   none / pleated / gathered / tiered / circle / bias / ruched / wrap / peplum / unknown
11) pattern
   solid / stripe / check / houndstooth / herringbone / dot / floral / paisley / animal / camouflage / text / scenic / logo / geometric / abstract / lace-knit / mixed / unknown
12) pattern_scale
   small / medium / large / none / unknown (if pattern=solid ⇒ none)
13) material.fabric
   cotton / linen / wool / silk / satin / denim / leather / suede / tweed / knit / rib-knit / lace / chiffon / tulle / velvet / corduroy / fleece / jersey / terry / seersucker / poplin / crepe / organza / brocade / jacquard / modal / rayon / viscose / lyocell / tencel / polyester / nylon / elastane / spandex / acrylic / pu / mesh / eyelet / crochet / faux-fur / faux-leather / blended / unknown
14) color.main
   black / white / gray / beige / cream / brown / navy / blue / green / yellow / orange / red / pink / purple / unknown
15) color.sub
   second-most ≥ ~15% else none
16) closure
   zipper / buttons / hooks / drawstring / none / unknown
17) skirt.slit
   none / front / side / back / two-side / high-slit / unknown

---

CONSISTENCY
- if pattern=solid ⇒ pattern_scale=none.
- if sleeve.length=strapless ⇒ neckline=strapless and sleeve.style=none.
- if sleeve.length=one-shoulder ⇒ neckline=one-shoulder and sleeve.style=none.
- hemline.shape describes the shape; hem.finish describes the finishing/trim; treat them independently.
- When unsure, output "unknown".

---

FORMAT RULES
- Exactly 17 tokens, lowercase, comma-separated
- Each token must be key=value
- No extra words, no explanations
- Example valid output:
  "bodice.fit=fitted,skirt.volume=a-line,dress.length=midi,hemline.shape=straight,hem.finish=clean,waistline=natural,neckline=round,sleeve.length=short,sleeve.style=none,skirt.structure=pleated,pattern=solid,pattern_scale=none,material.fabric=cotton,color.main=blue,color.sub=none,closure=zipper,skirt.slit=side"

"""

# ================== 도우미 함수 ==================
def image_to_b64(image_path: str, max_side: int = MAX_SIDE, jpeg_quality: int = JPEG_QUALITY):
    """Read an image and return it as a base64-encoded RGB JPEG.

    Images whose longer side is larger than ``max_side`` are shrunk with
    bicubic resampling (aspect ratio preserved) before encoding.

    Args:
        image_path: Path of the image file to read.
        max_side: Upper bound for the longer side, in pixels.
        jpeg_quality: Quality setting passed to the JPEG encoder.

    Returns:
        A ``(base64_string, mime_type)`` tuple; the MIME type is always
        ``"image/jpeg"``.
    """
    # The context manager releases the file handle once the RGB copy is made,
    # rather than keeping it open until garbage collection.
    with Image.open(image_path) as src:
        im = src.convert("RGB")
    w, h = im.size
    scale = max(w, h) / float(max_side)
    if scale > 1.0:
        try:
            resample = Image.Resampling.BICUBIC
        except AttributeError:  # Pillow versions without Image.Resampling
            resample = Image.BICUBIC
        # Never let the shorter side round down to 0 px.
        target = (max(1, int(w / scale)), max(1, int(h / scale)))
        im = im.resize(target, resample)
    buf = io.BytesIO()
    im.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
    return base64.b64encode(buf.getvalue()).decode("utf-8"), "image/jpeg"

# Extract the product_id from a file name
PID_FROM_NAME = re.compile(r"^dress_([^.\\/]+)", re.IGNORECASE)
def extract_product_id_from_filename(path: str) -> str:
    """Return the lower-cased id from a ``dress_<id>.<ext>`` file name, or ""."""
    filename = os.path.basename(path)
    found = PID_FROM_NAME.search(filename)
    product_id = found.group(1) if found else ""
    return product_id.strip().lower()

# Load the product CSV (dresses only)
def load_dress_map(csv_path: str) -> Dict[str, Tuple[str,str]]:
    """Map each dress product code to its (major, sub) category labels.

    Cells are read as text, blank-filled, stripped and lower-cased; only rows
    whose 대분류 equals "원피스" are kept.
    """
    frame = pd.read_csv(csv_path, dtype=str).fillna("")
    frame = frame.apply(lambda col: col.str.strip().str.lower())
    dresses = frame[frame["대분류"] == "원피스"]  # keep dress rows only
    labels = zip(dresses["대분류"], dresses["소분류"])
    return {code: pair for code, pair in zip(dresses["상품코드"], labels)}

def map_categories_and_length(major: str, sub: str) -> Tuple[str, str]:
    """Translate the Korean dress category labels to English.

    Unknown major labels fall back to "dress"; unknown sub labels fall back
    to "unknown".
    """
    category = {"원피스":"dress"}.get(major, "dress")
    length_names = {
        "미니원피스":"minidress",
        "미디원피스":"mididress",
        "맥시원피스":"maxidress",
    }
    dress_type = length_names.get(sub, "unknown")
    return category, dress_type

# Caption normalization
def normalize_caption_dress17(text: str) -> str:
    """Normalize a raw model caption into one token per ``TOK_KEYS`` entry.

    The caption is flattened to one line, split on commas, lower-cased, and
    padded with ``"unknown"`` or truncated to ``len(TOK_KEYS)`` tokens. The
    validated fields (pattern, pattern_scale, color.main, color.sub,
    material.fabric, hem.finish, skirt.slit) are then checked against their
    allow-lists; any other value becomes ``"unknown"``.

    Tokens may be bare values (``"solid"``) or ``key=value`` pairs
    (``"pattern=solid"``, the format the prompt asks for). Validation looks at
    the value part only, and each token keeps the form it arrived in.

    Args:
        text: Raw caption text; non-strings are converted with ``str()``.

    Returns:
        The normalized tokens joined by ``","``, in ``TOK_KEYS`` order.
    """
    if not isinstance(text, str):
        text = str(text)
    text = " ".join(text.splitlines()).strip().strip('"').strip("'")
    tokens = [p.strip().lower() for p in text.split(",") if p]

    # Pad with "unknown" and truncate so there is exactly one token per key.
    n_keys = len(TOK_KEYS)
    tokens = (tokens + ["unknown"] * n_keys)[:n_keys]

    # Separate an optional "key=" prefix from the value. Comparing the whole
    # token ("pattern=solid") with the allow-lists would never match and would
    # turn every validated field into "unknown".
    prefixes, values = [], []
    for tok in tokens:
        key, sep, val = tok.partition("=")
        if sep:
            prefixes.append(key.strip() + "=")
            values.append(val.strip())
        else:
            prefixes.append("")
            values.append(tok)

    IDX = {k: i for i, k in enumerate(TOK_KEYS)}

    # pattern <-> pattern_scale
    values[IDX["pattern"]] = _normalize_pattern_value(values[IDX["pattern"]])
    if values[IDX["pattern"]] == "solid":
        values[IDX["pattern_scale"]] = "none"
    elif values[IDX["pattern_scale"]] not in ALLOWED_PATSCALE:
        values[IDX["pattern_scale"]] = "unknown"

    # colors ("none" is additionally allowed for the secondary color)
    if values[IDX["color.main"]] not in ALLOWED_COLOR:
        values[IDX["color.main"]] = "unknown"
    if values[IDX["color.sub"]] not in ALLOWED_COLOR and values[IDX["color.sub"]] != "none":
        values[IDX["color.sub"]] = "unknown"

    # material
    mat = _nz(values[IDX["material.fabric"]])
    mat = MATERIAL_ALIASES.get(mat, mat)
    values[IDX["material.fabric"]] = mat if mat in ALLOWED_MATERIAL else "unknown"

    # hem.finish / slit
    if values[IDX["hem.finish"]] not in ALLOWED_HEM_FINISH:
        values[IDX["hem.finish"]] = "unknown"
    if values[IDX["skirt.slit"]] not in ALLOWED_SLIT:
        values[IDX["skirt.slit"]] = "unknown"

    return ",".join(prefix + value for prefix, value in zip(prefixes, values))

# ================== Tokens → combined_text ==================
def tokens_to_combined_text(tokens_csv: str, category: str, type_: str) -> str:
    """Build the ` | `-separated combined text for one product.

    Args:
        tokens_csv: Comma-separated caption tokens. A token that already
            contains ``=`` is kept as is; a bare value is prefixed with the
            ``TOK_KEYS`` entry at the same position.
        category: English category name placed first in the output.
        type_: English type name placed second in the output.

    Returns:
        ``"category=<category> | type=<type_> | <token> | <token> ..."``.
    """
    parts = [p.strip() for p in tokens_csv.split(",")]
    fixed = [
        tok if "=" in tok else f"{TOK_KEYS[i]}={tok}"
        for i, tok in enumerate(parts)
    ]
    return f"category={category} | type={type_} | " + " | ".join(fixed)


# ================== GPT 호출 ==================
async def tag_one(image_path: str) -> Tuple[str, str]:
    """Caption one dress image with gpt-4o-mini.

    Returns ``(image_path, caption)`` where the caption is the reply text
    after normalize_caption_dress17.
    """
    payload, mime_type = image_to_b64(image_path)
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": PROMPT},
            {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{payload}"}},
        ],
    }
    reply = await aclient.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        max_tokens=160,
        temperature=0.0,
        top_p=1.0,
        seed=12345,
    )
    answer = reply.choices[0].message.content.strip()
    return image_path, normalize_caption_dress17(answer)

async def run_stage1(image_paths: List[str], pid2cat: Dict[str, Tuple[str,str]]) -> pd.DataFrame:
    """Caption every dress image concurrently and return one row per product id.

    At most MAX_CONCURRENCY captions run at a time. A failed caption is
    replaced by "<key>=unknown" for every key in TOK_KEYS. The rows collected
    so far overwrite OUT_CSV after every 10th result and after the last.
    """
    gate = asyncio.Semaphore(MAX_CONCURRENCY)
    collected = []

    def dedupe(rows):
        return pd.DataFrame(rows).drop_duplicates(subset=["product_id"], keep="first")

    async def worker(path):
        async with gate:
            try:
                _path, caption = await tag_one(path)
            except Exception as e:
                print("caption error:", path, e)
                caption = ",".join(f"{k}=unknown" for k in TOK_KEYS)
            pid = extract_product_id_from_filename(path)
            major, sub = pid2cat.get(pid, ("원피스", ""))  # dress only
            cat_en, type_en = map_categories_and_length(major, sub)
            return {
                "product_id": pid,
                "combined_text": tokens_to_combined_text(caption, cat_en, type_en),
            }

    tasks = [asyncio.create_task(worker(p)) for p in image_paths]
    total = len(tasks)

    for done, fut in enumerate(asyncio.as_completed(tasks), start=1):
        collected.append(await fut)

        # Checkpoint every 10 results (overwrites the file)
        if done % 10 == 0 or done == total:
            dedupe(collected).to_csv(OUT_CSV, index=False, encoding="utf-8")
            print(f"[Stage1] {done}/{total} saved → {OUT_CSV}")

    # Return the final DataFrame
    return dedupe(collected)



# ================== Run ==================
# Collect only the dress_* images
exts = (".jpg",".jpeg",".png",".webp",".bmp",".jfif")
image_paths = [
    os.path.join(IMAGE_DIR, f)
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith(exts) and f.lower().startswith("dress_")
]
if len(image_paths) == 0:
    raise RuntimeError("처리할 dress 이미지가 없습니다.")

pid2cat = load_dress_map(PRODUCT_INFO_CSV)

# In Jupyter, use await instead of asyncio.run()
import nest_asyncio, asyncio
nest_asyncio.apply()

df = await run_stage1(image_paths, pid2cat)   # executed here via top-level await
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("✅ 저장 완료:", OUT_CSV)
df.head()


# caption split : 77 토큰이 넘어 캡션을 두 개로 나누기

## top

In [None]:
import pandas as pd

# Load the top caption CSV
df = pd.read_csv("29cm/top_caption.csv")

def split_top_caption(text: str):
    """Split a top combined_text into (caption1, caption2).

    Segments are separated by "|". A segment whose key (the text before the
    first "=") is one of the identity keys goes to caption1; every other
    segment goes to caption2. Missing input yields two empty strings.
    """
    if pd.isna(text):
        return "", ""

    # caption1 keys (front part: item identity)
    identity_keys = {
        "category", "type", "color.main", "color.sub", "pattern", "pattern_scale", "material.fabric"
    }

    segments = [seg.strip() for seg in text.split("|")]
    head = [seg for seg in segments if seg.split("=")[0].strip() in identity_keys]
    tail = [seg for seg in segments if seg.split("=")[0].strip() not in identity_keys]
    return " | ".join(head), " | ".join(tail)

# Create the caption1 / caption2 columns
# NOTE(review): str(x) turns a missing value into the text "nan", so the
# pd.isna guard inside split_top_caption is never hit from this call.
df[["caption1", "caption2"]] = df["combined_text"].apply(
    lambda x: pd.Series(split_top_caption(str(x)))
)

# Save
df.to_csv("29cm/top_caption_split.csv", index=False, encoding="utf-8-sig")

print("✅ caption1, caption2 분리 완료 → 29cm/top_caption_split.csv")


## pants

In [None]:
import pandas as pd

# Load the pants caption CSV
df = pd.read_csv("29cm/pants_caption.csv")

def split_pants_caption(text: str):
    """Split a pants combined_text into (caption1, caption2).

    Segments are separated by "|". Segments whose key is an overall-look key
    go to caption1, all others to caption2. Missing input yields two empty
    strings.
    """
    if pd.isna(text):
        return "", ""

    # caption1: overall look (fit, length, material, color, pattern)
    overall_keys = (
        "category", "type", "fit", "rise", "leg.length",
        "material.fabric", "pattern", "pattern_scale",
        "color.main", "color.sub",
    )

    # buckets[True] collects caption1 segments, buckets[False] caption2 segments
    buckets = {True: [], False: []}
    for raw in text.split("|"):
        segment = raw.strip()
        name = segment.split("=")[0].strip()
        buckets[name in overall_keys].append(segment)

    return " | ".join(buckets[True]), " | ".join(buckets[False])

# Create the caption1 / caption2 columns
# NOTE(review): str(x) turns a missing value into the text "nan", so the
# pd.isna guard inside split_pants_caption is never hit from this call.
df[["caption1", "caption2"]] = df["combined_text"].apply(
    lambda x: pd.Series(split_pants_caption(str(x)))
)

# Save
df.to_csv("29cm/pants_caption_split.csv", index=False, encoding="utf-8-sig")

print("✅ caption1, caption2 분리 완료 → 29cm/pants_caption_split.csv")


## skirt

In [None]:
import pandas as pd

# Load the skirt caption CSV
df = pd.read_csv("29cm/skirt_caption.csv")

def split_skirt_caption(text: str):
    """Split a skirt combined_text into (caption1, caption2).

    Segments are separated by "|". Segments whose key is an overall-look key
    go to caption1, all others to caption2. Missing input yields two empty
    strings.
    """
    if pd.isna(text):
        return "", ""

    # caption1: overall look (category, type, color, pattern, material, silhouette)
    overall_keys = frozenset([
        "category", "type", "skirt.length", "silhouette",
        "material.fabric", "pattern", "pattern_scale",
        "color.main", "color.sub", "color.tone"
    ])

    first, second = [], []
    for segment in (piece.strip() for piece in text.split("|")):
        name = segment.split("=")[0].strip()
        target = first if name in overall_keys else second
        target.append(segment)

    return " | ".join(first), " | ".join(second)

# Create the caption1 / caption2 columns
# NOTE(review): str(x) turns a missing value into the text "nan", so the
# pd.isna guard inside split_skirt_caption is never hit from this call.
df[["caption1", "caption2"]] = df["combined_text"].apply(
    lambda x: pd.Series(split_skirt_caption(str(x)))
)

# Save
df.to_csv("29cm/skirt_caption_split.csv", index=False, encoding="utf-8-sig")

print("✅ caption1, caption2 분리 완료 → 29cm/skirt_caption_split.csv")


## dress

In [None]:
import pandas as pd

# Load the dress caption CSV
df = pd.read_csv("29cm/dress_caption.csv")

def split_dress_caption(text: str):
    """Split a dress combined_text into (caption1, caption2).

    Segments are separated by "|". Segments whose key is an overall-look key
    go to caption1, all others to caption2. Missing input yields two empty
    strings.
    """
    if pd.isna(text):
        return "", ""

    # caption1: overall look (dress length, silhouette, material, color, pattern)
    overall_keys = {
        "category", "type", "dress.length", "skirt.volume",
        "material.fabric", "pattern", "pattern_scale",
        "color.main", "color.sub",
    }

    def belongs_to_first(segment):
        return segment.partition("=")[0].strip() in overall_keys

    segments = [piece.strip() for piece in text.split("|")]
    first = " | ".join(filter(belongs_to_first, segments))
    second = " | ".join(seg for seg in segments if not belongs_to_first(seg))
    return first, second

# Create the caption1 / caption2 columns
# NOTE(review): str(x) turns a missing value into the text "nan", so the
# pd.isna guard inside split_dress_caption is never hit from this call.
df[["caption1", "caption2"]] = df["combined_text"].apply(
    lambda x: pd.Series(split_dress_caption(str(x)))
)

# Save
df.to_csv("29cm/dress_caption_split.csv", index=False, encoding="utf-8-sig")

print("✅ caption1, caption2 분리 완료 → 29cm/dress_caption_split.csv")


# Embedding : 두 개로 나뉜 캡션을 각각 임베딩하여 평균을 내어 합치고 text 와 crop된 이미지를 Clip 모델을 통해 임베딩 

## top

In [None]:
# -*- coding: utf-8 -*-
# Top Embedding Pipeline (caption1+caption2 평균 text 임베딩 + multi = text+image 0.5:0.5)
# CSV 저장 시 information(원본 캡션) 컬럼만 남김

import os, io, json, torch
import numpy as np
import pandas as pd
from PIL import Image
from fashion_clip.fashion_clip import FashionCLIP

# ==============================
# 0) Configuration
# ==============================
IMAGE_DIR = "crop_29cm"                       # folder of cropped images
CAPTIONS_CSV = "29cm/top_caption_split.csv"   # CSV containing caption1 and caption2
PRODUCT_INFO_CSV = "29cm/29cm_1000.csv"       # columns: [상품코드, 대분류, 소분류]
OUTPUT_CSV = "29cm/top_embeddings_real_final.csv"  # final output

# Device selection: CUDA first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the model
fclip = FashionCLIP("fashion-clip")
# Best-effort move to the selected device; any failure is ignored.
# NOTE(review): if FashionCLIP has no .to(), the model silently stays on its
# default device — confirm.
try:
    fclip.to(device)
except Exception:
    pass

# ==============================
# 1) 유틸 함수
# ==============================
def l2_normalize(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale every row of ``x`` to unit L2 length and return float32.

    A 1-D input is treated as a single row. Row norms are floored at ``eps``
    so all-zero rows stay zero instead of dividing by zero.
    """
    vectors = np.asarray(x, dtype=np.float32)
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)
    norms = np.maximum(np.linalg.norm(vectors, axis=1, keepdims=True), eps)
    return np.divide(vectors, norms).astype(np.float32)

def _to_numpy_2d(x) -> np.ndarray:
    """Convert a tensor, array, or sequence to a float32 NumPy array.

    Torch tensors are detached and moved to the CPU first; a 1-D result is
    given a leading axis of length 1.
    """
    if torch.is_tensor(x):
        data = x.detach().cpu().numpy()
    elif isinstance(x, np.ndarray):
        data = x
    else:
        data = np.asarray(x)
    if data.ndim == 1:
        data = data[np.newaxis, :]
    return data.astype(np.float32)

def torch_empty_cache():
    """Release cached accelerator memory for the selected device, if any.

    Does nothing on CPU; on MPS any failure of the cache call is ignored.
    """
    if device == "cuda":
        torch.cuda.empty_cache()
        return
    if device != "mps":
        return
    try:
        torch.mps.empty_cache()
    except Exception:
        pass

def determine_best_batch_size(start: int = 512) -> int:
    """Pick the largest batch size that survives a text + image warm-up pass.

    Candidates are tried from ``start`` downwards; the first size for which
    both ``fclip.encode_text`` and ``fclip.encode_images`` complete is
    returned. Any exception (not only out-of-memory) counts as a failure for
    that size.

    Args:
        start: Largest batch size to try; values below 1 are treated as 1.

    Returns:
        The first batch size that worked, or 1 if none did.
    """
    start = max(1, int(start))
    if start >= 512:
        ladder = [384,256,192,128,96,64,48,32,24,16,12,8,4,2,1]
    elif start >= 256:
        ladder = [192,128,96,64,48,32,24,16,12,8,4,2,1]
    else:
        ladder = [48,32,24,16,12,8,4,2,1]
    # Only fall back to sizes strictly below `start`, so a small `start`
    # never retries a larger or duplicate batch size.
    candidates = [start] + [bs for bs in ladder if bs < start]

    dummy = Image.new("RGB", (224,224), (255,255,255))
    for bs in candidates:
        try:
            with torch.no_grad():
                _ = fclip.encode_text(["warmup"] * bs, batch_size=bs)
                _ = fclip.encode_images([dummy] * bs, batch_size=bs)
            print(f">>> OK batch_size={bs}")
            return bs
        except Exception as e:
            print(f"OOM/Fail at batch_size={bs} → {e}")
            torch_empty_cache()
    return 1

# ==============================
# 2) 임베딩 함수
# ==============================
def embed_in_batches(records, batch_size: int):
    """Embed caption pairs and images batch by batch and build output rows.

    For each record the text vector is the L2-normalized mean of the caption1
    and caption2 embeddings, and the multi vector is the L2-normalized
    0.5/0.5 blend of that text vector with the L2-normalized image embedding.

    Args:
        records: Sequence of dicts with the keys ``product_id``, ``category``,
            ``type``, ``caption1``, ``caption2`` and ``image_path``.
        batch_size: Number of records encoded per model call.

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``type``,
        ``information`` (caption1 and caption2 joined by " | "), ``text`` and
        ``multi`` (both JSON-encoded float lists).
    """
    out = []
    n = len(records)
    total_batches = (n + batch_size - 1) // batch_size

    for bi in range(total_batches):
        s, e = bi*batch_size, min((bi+1)*batch_size, n)
        chunk = records[s:e]

        texts1 = [r["caption1"] for r in chunk]
        texts2 = [r["caption2"] for r in chunk]
        # Stored as the "information" column only; it is not embedded, because
        # the text vector is built from the two caption halves.
        combined_texts = [r["caption1"] + " | " + r["caption2"] for r in chunk]

        images = []
        try:
            # Close each source file right after its RGB copy is made, so no
            # file handle outlives the batch even if a later open fails.
            for r in chunk:
                with Image.open(r["image_path"]) as src:
                    images.append(src.convert("RGB"))

            with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                # caption1 / caption2 embeddings
                t1_emb = fclip.encode_text(texts1, batch_size=len(texts1))
                t2_emb = fclip.encode_text(texts2, batch_size=len(texts2))
                # image embeddings
                v_emb = fclip.encode_images(images, batch_size=len(images))
        finally:
            for img in images:
                try:
                    img.close()
                except Exception:
                    # Best-effort cleanup; never mask the real error.
                    pass

        t1_np = _to_numpy_2d(t1_emb)
        t2_np = _to_numpy_2d(t2_emb)
        v_np  = _to_numpy_2d(v_emb)

        # text-only = mean of caption1 and caption2
        text_only = l2_normalize((t1_np + t2_np) / 2)

        # multi = text + image (0.5:0.5)
        multi = l2_normalize(0.5*text_only + 0.5*l2_normalize(v_np))

        # Store only "information" instead of caption1 / caption2
        for r, info, t_vec, m_vec in zip(chunk, combined_texts, text_only, multi):
            out.append({
                "id": r["product_id"],
                "category": r["category"],
                "type": r["type"],
                "information": info,
                "text": json.dumps(t_vec.tolist()),
                "multi": json.dumps(m_vec.tolist())
            })

        torch_empty_cache()
        print(f"[EMBED] batch {bi+1}/{total_batches} → {e}/{n}")

    return out

# ==============================
# 3) 메인
# ==============================
def main():
    """Embed every top product that has captions and a cropped image.

    Reads CAPTIONS_CSV and PRODUCT_INFO_CSV, maps the Korean category/type
    labels to English, pairs each caption row with its ``top_<id>.<ext>``
    image under IMAGE_DIR (rows without an image are skipped), embeds the
    records and writes them to OUTPUT_CSV.

    Raises:
        FileNotFoundError: If either input CSV does not exist.
        RuntimeError: If no caption row has a matching image.
    """
    if not os.path.exists(CAPTIONS_CSV):
        raise FileNotFoundError(f"캡션 CSV가 없습니다: {CAPTIONS_CSV}")
    if not os.path.exists(PRODUCT_INFO_CSV):
        raise FileNotFoundError(f"상품정보 CSV가 없습니다: {PRODUCT_INFO_CSV}")

    # Read both files as text. Product ids then stay comparable with the
    # product_map keys (no numeric coercion such as "0123" -> 123 or
    # 123 -> "123.0"), and empty captions become "" instead of the text "nan".
    captions_df = pd.read_csv(CAPTIONS_CSV, dtype=str).fillna("")
    product_df = pd.read_csv(PRODUCT_INFO_CSV, dtype=str).fillna("")
    product_df = product_df.apply(lambda col: col.str.strip().str.lower())

    product_map = dict(zip(product_df["상품코드"], zip(product_df["대분류"], product_df["소분류"])))

    # Korean → English mapping
    major_map = {"상의": "top"}
    sub_map = {
        "후드티": "hoodie",
        "셔츠블라우스": "shirt-blouse",
        "긴소매": "longsleeve",
        "반소매": "shortsleeve",
        "피케카라": "polo",
        "니트스웨터": "knit-sweater",
        "슬리브리스": "sleeveless",
    }

    records = []
    for _, row in captions_df.iterrows():
        pid = str(row["product_id"]).strip().lower()
        caption1 = str(row["caption1"])
        caption2 = str(row["caption2"])

        # category / type lookup; unmapped labels are kept as they are
        raw_category, raw_type = product_map.get(pid, ("unknown","unknown"))
        category = major_map.get(raw_category, raw_category)
        type_ = sub_map.get(raw_type, raw_type)

        # Find the image file for this product
        img_path = None
        for ext in [".jpg",".jpeg",".png",".webp",".bmp",".jfif"]:
            candidate = os.path.join(IMAGE_DIR, f"top_{pid}{ext}")
            if os.path.exists(candidate):
                img_path = candidate
                break
        if not img_path:
            continue

        records.append({
            "product_id": pid,
            "category": category,
            "type": type_,
            "caption1": caption1,
            "caption2": caption2,
            "image_path": img_path
        })

    if not records:
        raise RuntimeError("처리할 레코드가 없습니다.")

    # Choose the batch size
    start_bs = 512 if device in ("cuda","mps") else 256
    best_bs = determine_best_batch_size(start=start_bs)
    print(f">>> Using batch_size={best_bs} on {device}")

    embedded = embed_in_batches(records, batch_size=best_bs)

    # Save (final columns: id, category, type, information, text, multi)
    df = pd.DataFrame(embedded, columns=["id","category","type","information","text","multi"])
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"✅ 임베딩 저장 완료: {OUTPUT_CSV}")

if __name__ == "__main__":
    main()


## pants

In [None]:
# -*- coding: utf-8 -*-
# Pants Embedding Pipeline (text embedding = mean of caption1 and caption2; multi = text + image, 0.5:0.5)
# CSV 저장 시 information(원본 캡션) 컬럼만 남김

import os, io, json, torch
import numpy as np
import pandas as pd
from PIL import Image
from fashion_clip.fashion_clip import FashionCLIP

# ==============================
# 0) Configuration
# ==============================
IMAGE_DIR = "crop_29cm"                       # folder of cropped images
CAPTIONS_CSV = "29cm/pants_caption_split.csv"   # CSV containing caption1 and caption2
PRODUCT_INFO_CSV = "29cm/29cm_1000.csv"       # columns: [상품코드, 대분류, 소분류]
OUTPUT_CSV = "29cm/pants_embeddings_real_final.csv"  # final output

# Device selection: CUDA first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the model
fclip = FashionCLIP("fashion-clip")
# Best-effort move to the selected device; any failure is ignored.
# NOTE(review): if FashionCLIP has no .to(), the model silently stays on its
# default device — confirm.
try:
    fclip.to(device)
except Exception:
    pass

# ==============================
# 1) 유틸 함수
# ==============================
def l2_normalize(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Return ``x`` as float32 with each row divided by its L2 norm.

    One-dimensional input becomes a single-row matrix. Norms smaller than
    ``eps`` are replaced by ``eps`` before dividing.
    """
    matrix = np.asarray(x, dtype=np.float32)
    matrix = matrix[None, :] if matrix.ndim == 1 else matrix
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    safe_norms = np.maximum(row_norms, eps)
    unit_rows = matrix / safe_norms
    return unit_rows.astype(np.float32)

def _to_numpy_2d(x) -> np.ndarray:
    """Return ``x`` as a float32 NumPy array with at least a leading row axis.

    Accepts NumPy arrays, torch tensors (detached and copied to the CPU) and
    anything ``np.asarray`` understands.
    """
    if not isinstance(x, np.ndarray):
        x = x.detach().cpu().numpy() if torch.is_tensor(x) else np.asarray(x)
    as_rows = x[None, :] if x.ndim == 1 else x
    return as_rows.astype(np.float32)

def torch_empty_cache():
    """Free cached memory on the active accelerator; no-op on CPU.

    On MPS a failing cache call is ignored.
    """
    if device == "mps":
        try:
            torch.mps.empty_cache()
        except Exception:
            pass
    elif device == "cuda":
        torch.cuda.empty_cache()

def determine_best_batch_size(start: int = 512) -> int:
    """Return the largest batch size for which a warm-up encode succeeds.

    Sizes are tried from ``start`` downwards. A size passes when both
    ``fclip.encode_text`` and ``fclip.encode_images`` finish without raising;
    any exception (not only out-of-memory) counts as a failure.

    Args:
        start: Largest batch size to try; values below 1 are treated as 1.

    Returns:
        The first batch size that worked, or 1 if none did.
    """
    start = max(1, int(start))
    if start >= 512:
        fallbacks = [384,256,192,128,96,64,48,32,24,16,12,8,4,2,1]
    elif start >= 256:
        fallbacks = [192,128,96,64,48,32,24,16,12,8,4,2,1]
    else:
        fallbacks = [48,32,24,16,12,8,4,2,1]
    # Keep only fallbacks strictly smaller than `start`, so the search is
    # always descending and never tries the same size twice.
    candidates = [start] + [bs for bs in fallbacks if bs < start]

    dummy = Image.new("RGB", (224,224), (255,255,255))
    for bs in candidates:
        try:
            with torch.no_grad():
                _ = fclip.encode_text(["warmup"] * bs, batch_size=bs)
                _ = fclip.encode_images([dummy] * bs, batch_size=bs)
            print(f">>> OK batch_size={bs}")
            return bs
        except Exception as e:
            print(f"OOM/Fail at batch_size={bs} → {e}")
            torch_empty_cache()
    return 1

# ==============================
# 2) 임베딩 함수
# ==============================
def embed_in_batches(records, batch_size: int):
    """Embed caption pairs and images batch by batch and build output rows.

    For each record the text vector is the L2-normalized mean of the caption1
    and caption2 embeddings, and the multi vector is the L2-normalized
    0.5/0.5 blend of that text vector with the L2-normalized image embedding.

    Args:
        records: Sequence of dicts with the keys ``product_id``, ``category``,
            ``type``, ``caption1``, ``caption2`` and ``image_path``.
        batch_size: Number of records encoded per model call.

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``type``,
        ``information`` (caption1 and caption2 joined by " | "), ``text`` and
        ``multi`` (both JSON-encoded float lists).
    """
    out = []
    n = len(records)
    total_batches = (n + batch_size - 1) // batch_size

    for bi in range(total_batches):
        s, e = bi*batch_size, min((bi+1)*batch_size, n)
        chunk = records[s:e]

        texts1 = [r["caption1"] for r in chunk]
        texts2 = [r["caption2"] for r in chunk]
        # Kept only for the "information" column; the joined caption is not
        # embedded, since the text vector comes from the two halves.
        combined_texts = [r["caption1"] + " | " + r["caption2"] for r in chunk]

        images = []
        try:
            # Each source file is closed as soon as its RGB copy exists, so
            # no handle leaks even when opening a later image fails.
            for r in chunk:
                with Image.open(r["image_path"]) as src:
                    images.append(src.convert("RGB"))

            with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                # caption1 / caption2 embeddings
                t1_emb = fclip.encode_text(texts1, batch_size=len(texts1))
                t2_emb = fclip.encode_text(texts2, batch_size=len(texts2))
                # image embeddings
                v_emb = fclip.encode_images(images, batch_size=len(images))
        finally:
            for img in images:
                try:
                    img.close()
                except Exception:
                    # Best-effort cleanup; never mask the real error.
                    pass

        t1_np = _to_numpy_2d(t1_emb)
        t2_np = _to_numpy_2d(t2_emb)
        v_np  = _to_numpy_2d(v_emb)

        # text-only = mean of caption1 and caption2
        text_only = l2_normalize((t1_np + t2_np) / 2)

        # multi = text + image (0.5:0.5)
        multi = l2_normalize(0.5*text_only + 0.5*l2_normalize(v_np))

        # Store only "information" instead of caption1 / caption2
        for r, info, t_vec, m_vec in zip(chunk, combined_texts, text_only, multi):
            out.append({
                "id": r["product_id"],
                "category": r["category"],
                "type": r["type"],
                "information": info,
                "text": json.dumps(t_vec.tolist()),
                "multi": json.dumps(m_vec.tolist())
            })

        torch_empty_cache()
        print(f"[EMBED] batch {bi+1}/{total_batches} → {e}/{n}")

    return out

# ==============================
# 3) 메인
# ==============================
def main():
    """Build pants records from the caption CSV, embed them and write OUTPUT_CSV.

    Reads CAPTIONS_CSV (columns product_id, caption1, caption2) and
    PRODUCT_INFO_CSV (columns 상품코드, 대분류, 소분류), maps each product's
    category/type to English labels, locates its cropped image
    ``pants_<product_id><ext>`` under IMAGE_DIR, embeds every record with
    ``embed_in_batches`` and saves the result.

    Raises:
        FileNotFoundError: if either input CSV does not exist.
        RuntimeError: if no caption row has a matching image file.
    """
    if not os.path.exists(CAPTIONS_CSV):
        raise FileNotFoundError(f"캡션 CSV가 없습니다: {CAPTIONS_CSV}")
    if not os.path.exists(PRODUCT_INFO_CSV):
        raise FileNotFoundError(f"상품정보 CSV가 없습니다: {PRODUCT_INFO_CSV}")

    # Read captions as text too: ids then compare equal to the str-typed
    # product_map keys, and a missing caption becomes "" instead of the
    # literal string "nan" being embedded.
    captions_df = pd.read_csv(CAPTIONS_CSV, dtype=str).fillna("")
    product_df = pd.read_csv(PRODUCT_INFO_CSV, dtype=str).fillna("")
    product_df = product_df.apply(lambda col: col.str.strip().str.lower())

    # product code -> (major category, sub category)
    product_map = dict(zip(product_df["상품코드"], zip(product_df["대분류"], product_df["소분류"])))

    # Korean -> English label mapping
    major_map = {
        "하의": "pants",
        "바지": "pants",
    }
    sub_map = {
        "데님팬츠": "denim-pants",
        "트레이닝조거팬츠": "jogger-pants",
        "코튼팬츠": "cotton-pants",
        "슈트팬츠슬랙스": "slacks",  # merged form of '슈트 팬츠/슬랙스'
        "슈트슬랙스": "slacks",
        "숏팬츠": "short-pants",
        "카고팬츠": "cargo-pants"
    }
    image_exts = (".jpg", ".jpeg", ".png", ".webp", ".bmp", ".jfif")

    records = []
    skipped = 0
    for raw_pid, caption1, caption2 in zip(
        captions_df["product_id"], captions_df["caption1"], captions_df["caption2"]
    ):
        # Same normalisation as the product_map keys (strip + lower).
        pid = raw_pid.strip().lower()

        # Unmapped labels fall through unchanged; unknown products get "unknown".
        raw_category, raw_type = product_map.get(pid, ("unknown", "unknown"))
        category = major_map.get(raw_category, raw_category)
        type_ = sub_map.get(raw_type, raw_type)

        # First existing file among the supported extensions, in list order.
        candidates = (os.path.join(IMAGE_DIR, f"pants_{pid}{ext}") for ext in image_exts)
        img_path = next((p for p in candidates if os.path.exists(p)), None)
        if not img_path:
            skipped += 1
            continue

        records.append({
            "product_id": pid,
            "category": category,
            "type": type_,
            "caption1": caption1,
            "caption2": caption2,
            "image_path": img_path
        })

    if skipped:
        # Make dropped rows visible instead of silently shrinking the output.
        print(f"[WARN] skipped {skipped} caption rows with no image under {IMAGE_DIR}")

    if not records:
        raise RuntimeError("처리할 레코드가 없습니다.")

    # Probe for a workable batch size; accelerators start higher than CPU.
    start_bs = 512 if device in ("cuda","mps") else 256
    best_bs = determine_best_batch_size(start=start_bs)
    print(f">>> Using batch_size={best_bs} on {device}")

    embedded = embed_in_batches(records, batch_size=best_bs)

    # Final columns: id, category, type, information, text, multi
    df = pd.DataFrame(embedded, columns=["id","category","type","information","text","multi"])
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"✅ 임베딩 저장 완료: {OUTPUT_CSV}")

if __name__ == "__main__":
    main()


## skirt

In [None]:
# -*- coding: utf-8 -*-
# Skirt embedding pipeline (text = mean of caption1/caption2 embeddings; multi = 0.5 text + 0.5 image)
# CSV 저장 시 information(원본 캡션) 컬럼만 남김

import os, io, json, torch
import numpy as np
import pandas as pd
from PIL import Image
from fashion_clip.fashion_clip import FashionCLIP

# ==============================
# 0) 설정
# ==============================
# Input/output locations for the skirt embedding run.
IMAGE_DIR = "crop_29cm"                       # folder of cropped images
CAPTIONS_CSV = "29cm/skirt_caption_split.csv"   # CSV containing caption1 / caption2
PRODUCT_INFO_CSV = "29cm/29cm_1000.csv"       # columns: 상품코드, 대분류, 소분류
OUTPUT_CSV = "29cm/skirt_embeddings_real_final.csv"  # final artifact

# Device selection: prefer CUDA, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the model.
fclip = FashionCLIP("fashion-clip")
try:
    fclip.to(device)
except Exception as exc:
    # Moving the wrapper is best-effort, so do not abort; but report the
    # failure, because `device` may then differ from where the model runs.
    print(f"[WARN] could not move FashionCLIP to {device}: {exc}")

# ==============================
# 1) 유틸 함수
# ==============================
def l2_normalize(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale every row of ``x`` to unit Euclidean length and return float32.

    A 1-D input is treated as a single row. Row norms are floored at ``eps``
    so an all-zero row stays zero instead of triggering a division by zero.
    """
    rows = np.asarray(x, dtype=np.float32)
    if rows.ndim == 1:
        rows = rows.reshape(1, -1)
    norms = np.maximum(np.linalg.norm(rows, axis=1, keepdims=True), eps)
    unit = rows / norms
    return unit.astype(np.float32)

def _to_numpy_2d(x) -> np.ndarray:
    if isinstance(x, np.ndarray):
        arr = x
    elif torch.is_tensor(x):
        arr = x.detach().cpu().numpy()
    else:
        arr = np.asarray(x)
    if arr.ndim == 1:
        arr = arr[None, :]
    return arr.astype(np.float32)

def torch_empty_cache():
    """Release cached accelerator memory for the active ``device``.

    Calls the CUDA or MPS cache-clearing routine depending on the
    module-level ``device``; does nothing for any other device.
    """
    if device == "cuda":
        torch.cuda.empty_cache()
        return
    if device != "mps":
        return
    try:
        torch.mps.empty_cache()
    except Exception:
        # Best-effort: a failure to clear the MPS cache is ignored.
        pass

def determine_best_batch_size(start: int = 512) -> int:
    """Return the largest batch size that survives a warm-up encode.

    Tries ``start`` first and then a descending ladder of smaller sizes. For
    each candidate one dummy text batch and one dummy image batch are
    encoded; the first candidate that completes without raising is returned.

    Args:
        start: first (largest) batch size to try; values below 1 are
            treated as 1.

    Returns:
        The first batch size that encoded successfully, or 1 if all failed.
    """
    start = max(1, start)
    ladder = (384, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 4, 2, 1)
    # Only ever step *down* from start. The previous fixed lists could retry
    # sizes larger than a start that had just failed (e.g. start=16 -> 48).
    candidates = [start] + [bs for bs in ladder if bs < start]

    # One blank image is enough; it is reused for every candidate.
    dummy = Image.new("RGB", (224, 224), (255, 255, 255))

    for bs in candidates:
        try:
            with torch.no_grad():
                _ = fclip.encode_text(["warmup"] * bs, batch_size=bs)
                _ = fclip.encode_images([dummy] * bs, batch_size=bs)
            print(f">>> OK batch_size={bs}")
            return bs
        except Exception as e:
            # Any failure (typically out-of-memory) means "try a smaller size".
            print(f"OOM/Fail at batch_size={bs} → {e}")
            torch_empty_cache()
    return 1

# ==============================
# 2) 임베딩 함수
# ==============================
def embed_in_batches(records, batch_size: int):
    """Embed caption/image records batch by batch with FashionCLIP.

    Args:
        records: list of dicts with keys ``product_id``, ``category``,
            ``type``, ``caption1``, ``caption2`` and ``image_path``.
        batch_size: number of records encoded per model call; must be >= 1.

    Returns:
        One dict per record with keys ``id``, ``category``, ``type``,
        ``information`` (``"caption1 | caption2"``), ``text`` and ``multi``.
        ``text`` is the L2-normalised mean of the two caption embeddings and
        ``multi`` the L2-normalised 0.5/0.5 blend of that vector with the
        L2-normalised image embedding; both are JSON-encoded lists.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    out = []
    n = len(records)
    total_batches = (n + batch_size - 1) // batch_size

    for bi in range(total_batches):
        s, e = bi*batch_size, min((bi+1)*batch_size, n)
        chunk = records[s:e]

        texts1 = [r["caption1"] for r in chunk]
        texts2 = [r["caption2"] for r in chunk]
        # Stored verbatim as `information`. It is not embedded: no output
        # column uses a combined-caption vector.
        combined_texts = [r["caption1"] + " | " + r["caption2"] for r in chunk]

        images = []
        try:
            # Open inside the try so images already loaded are still closed
            # if a later file fails to open.
            for r in chunk:
                # convert() yields an independent in-memory image, so the
                # source file can be closed right away.
                with Image.open(r["image_path"]) as src:
                    images.append(src.convert("RGB"))

            with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                # caption1 / caption2 embeddings
                t1_emb = fclip.encode_text(texts1, batch_size=len(texts1))
                t2_emb = fclip.encode_text(texts2, batch_size=len(texts2))
                # image embeddings
                v_emb = fclip.encode_images(images, batch_size=len(images))
        finally:
            for img in images:
                try:
                    img.close()
                except Exception:
                    # Closing is best-effort cleanup; never mask the real error.
                    pass

        t1_np = _to_numpy_2d(t1_emb)
        t2_np = _to_numpy_2d(t2_emb)
        v_np = _to_numpy_2d(v_emb)

        # text-only = mean of caption1 and caption2 embeddings
        text_only = l2_normalize((t1_np + t2_np) / 2)

        # multi = text + image (0.5:0.5)
        multi = l2_normalize(0.5*text_only + 0.5*l2_normalize(v_np))

        # Store only `information` instead of the separate captions.
        for r, info, t_vec, m_vec in zip(chunk, combined_texts, text_only, multi):
            out.append({
                "id": r["product_id"],
                "category": r["category"],
                "type": r["type"],
                "information": info,
                "text": json.dumps(t_vec.tolist()),
                "multi": json.dumps(m_vec.tolist())
            })

        torch_empty_cache()
        print(f"[EMBED] batch {bi+1}/{total_batches} → {e}/{n}")

    return out

# ==============================
# 3) 메인
# ==============================
def main():
    """Build skirt records from the caption CSV, embed them and write OUTPUT_CSV.

    Reads CAPTIONS_CSV (columns product_id, caption1, caption2) and
    PRODUCT_INFO_CSV (columns 상품코드, 대분류, 소분류), maps each product's
    category/type to English labels, locates its cropped image
    ``skirt_<product_id><ext>`` under IMAGE_DIR, embeds every record with
    ``embed_in_batches`` and saves the result.

    Raises:
        FileNotFoundError: if either input CSV does not exist.
        RuntimeError: if no caption row has a matching image file.
    """
    if not os.path.exists(CAPTIONS_CSV):
        raise FileNotFoundError(f"캡션 CSV가 없습니다: {CAPTIONS_CSV}")
    if not os.path.exists(PRODUCT_INFO_CSV):
        raise FileNotFoundError(f"상품정보 CSV가 없습니다: {PRODUCT_INFO_CSV}")

    # Read captions as text too: ids then compare equal to the str-typed
    # product_map keys, and a missing caption becomes "" instead of the
    # literal string "nan" being embedded.
    captions_df = pd.read_csv(CAPTIONS_CSV, dtype=str).fillna("")
    product_df = pd.read_csv(PRODUCT_INFO_CSV, dtype=str).fillna("")
    product_df = product_df.apply(lambda col: col.str.strip().str.lower())

    # product code -> (major category, sub category)
    product_map = dict(zip(product_df["상품코드"], zip(product_df["대분류"], product_df["소분류"])))

    # Korean -> English label mapping
    major_map = {"스커트": "skirt"}
    sub_map = {
        "미니스커트": "miniskirt",
        "미디스커트": "midiskirt",
        "롱스커트": "longskirt",
    }
    image_exts = (".jpg", ".jpeg", ".png", ".webp", ".bmp", ".jfif")

    records = []
    skipped = 0
    for raw_pid, caption1, caption2 in zip(
        captions_df["product_id"], captions_df["caption1"], captions_df["caption2"]
    ):
        # Same normalisation as the product_map keys (strip + lower).
        pid = raw_pid.strip().lower()

        # Unmapped labels fall through unchanged; unknown products get "unknown".
        raw_category, raw_type = product_map.get(pid, ("unknown", "unknown"))
        category = major_map.get(raw_category, raw_category)
        type_ = sub_map.get(raw_type, raw_type)

        # First existing file among the supported extensions, in list order.
        candidates = (os.path.join(IMAGE_DIR, f"skirt_{pid}{ext}") for ext in image_exts)
        img_path = next((p for p in candidates if os.path.exists(p)), None)
        if not img_path:
            skipped += 1
            continue

        records.append({
            "product_id": pid,
            "category": category,
            "type": type_,
            "caption1": caption1,
            "caption2": caption2,
            "image_path": img_path
        })

    if skipped:
        # Make dropped rows visible instead of silently shrinking the output.
        print(f"[WARN] skipped {skipped} caption rows with no image under {IMAGE_DIR}")

    if not records:
        raise RuntimeError("처리할 레코드가 없습니다.")

    # Probe for a workable batch size; accelerators start higher than CPU.
    start_bs = 512 if device in ("cuda","mps") else 256
    best_bs = determine_best_batch_size(start=start_bs)
    print(f">>> Using batch_size={best_bs} on {device}")

    embedded = embed_in_batches(records, batch_size=best_bs)

    # Final columns: id, category, type, information, text, multi
    df = pd.DataFrame(embedded, columns=["id","category","type","information","text","multi"])
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"✅ 임베딩 저장 완료: {OUTPUT_CSV}")

if __name__ == "__main__":
    main()


## dress

In [None]:
# -*- coding: utf-8 -*-
# Dress embedding pipeline (text = mean of caption1/caption2 embeddings; multi = 0.5 text + 0.5 image)
# CSV 저장 시 information(원본 캡션) 컬럼만 남김

import os, io, json, torch
import numpy as np
import pandas as pd
from PIL import Image
from fashion_clip.fashion_clip import FashionCLIP

# ==============================
# 0) 설정
# ==============================
# Input/output locations for the dress embedding run.
IMAGE_DIR = "crop_29cm"                       # folder of cropped images
CAPTIONS_CSV = "29cm/dress_caption_split.csv"   # CSV containing caption1 / caption2
PRODUCT_INFO_CSV = "29cm/29cm_1000.csv"       # columns: 상품코드, 대분류, 소분류
OUTPUT_CSV = "29cm/dress_embeddings_real_final.csv"  # final artifact

# Device selection: prefer CUDA, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the model.
fclip = FashionCLIP("fashion-clip")
try:
    fclip.to(device)
except Exception as exc:
    # Moving the wrapper is best-effort, so do not abort; but report the
    # failure, because `device` may then differ from where the model runs.
    print(f"[WARN] could not move FashionCLIP to {device}: {exc}")

# ==============================
# 1) 유틸 함수
# ==============================
def l2_normalize(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Return ``x`` with each row divided by its L2 norm, as float32.

    1-D input is promoted to one row. Norms smaller than ``eps`` are replaced
    by ``eps`` before dividing, which keeps all-zero rows at zero.
    """
    mat = np.asarray(x, dtype=np.float32)
    mat = mat.reshape(1, -1) if mat.ndim == 1 else mat
    row_norms = np.linalg.norm(mat, axis=1, keepdims=True)
    safe_norms = np.maximum(row_norms, eps)
    return np.divide(mat, safe_norms).astype(np.float32)

def _to_numpy_2d(x) -> np.ndarray:
    if isinstance(x, np.ndarray):
        arr = x
    elif torch.is_tensor(x):
        arr = x.detach().cpu().numpy()
    else:
        arr = np.asarray(x)
    if arr.ndim == 1:
        arr = arr[None, :]
    return arr.astype(np.float32)

def torch_empty_cache():
    """Free cached accelerator memory for the module-level ``device``.

    CUDA and MPS each use their own cache-clearing call; any other device
    is a no-op.
    """
    on_cuda = device == "cuda"
    on_mps = device == "mps"
    if on_cuda:
        torch.cuda.empty_cache()
    elif on_mps:
        try:
            torch.mps.empty_cache()
        except Exception:
            # Best-effort: ignore a failure to clear the MPS cache.
            pass

def determine_best_batch_size(start: int = 512) -> int:
    """Return the largest batch size that survives a warm-up encode.

    Tries ``start`` first and then a descending ladder of smaller sizes. For
    each candidate one dummy text batch and one dummy image batch are
    encoded; the first candidate that completes without raising is returned.

    Args:
        start: first (largest) batch size to try; values below 1 are
            treated as 1.

    Returns:
        The first batch size that encoded successfully, or 1 if all failed.
    """
    start = max(1, start)
    ladder = (384, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 4, 2, 1)
    # Only ever step *down* from start. The previous fixed lists could retry
    # sizes larger than a start that had just failed (e.g. start=16 -> 48).
    candidates = [start] + [bs for bs in ladder if bs < start]

    # One blank image is enough; it is reused for every candidate.
    dummy = Image.new("RGB", (224, 224), (255, 255, 255))

    for bs in candidates:
        try:
            with torch.no_grad():
                _ = fclip.encode_text(["warmup"] * bs, batch_size=bs)
                _ = fclip.encode_images([dummy] * bs, batch_size=bs)
            print(f">>> OK batch_size={bs}")
            return bs
        except Exception as e:
            # Any failure (typically out-of-memory) means "try a smaller size".
            print(f"OOM/Fail at batch_size={bs} → {e}")
            torch_empty_cache()
    return 1

# ==============================
# 2) 임베딩 함수
# ==============================
def embed_in_batches(records, batch_size: int):
    """Embed caption/image records batch by batch with FashionCLIP.

    Args:
        records: list of dicts with keys ``product_id``, ``category``,
            ``type``, ``caption1``, ``caption2`` and ``image_path``.
        batch_size: number of records encoded per model call; must be >= 1.

    Returns:
        One dict per record with keys ``id``, ``category``, ``type``,
        ``information`` (``"caption1 | caption2"``), ``text`` and ``multi``.
        ``text`` is the L2-normalised mean of the two caption embeddings and
        ``multi`` the L2-normalised 0.5/0.5 blend of that vector with the
        L2-normalised image embedding; both are JSON-encoded lists.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    out = []
    n = len(records)
    total_batches = (n + batch_size - 1) // batch_size

    for bi in range(total_batches):
        s, e = bi*batch_size, min((bi+1)*batch_size, n)
        chunk = records[s:e]

        texts1 = [r["caption1"] for r in chunk]
        texts2 = [r["caption2"] for r in chunk]
        # Stored verbatim as `information`. It is not embedded: no output
        # column uses a combined-caption vector.
        combined_texts = [r["caption1"] + " | " + r["caption2"] for r in chunk]

        images = []
        try:
            # Open inside the try so images already loaded are still closed
            # if a later file fails to open.
            for r in chunk:
                # convert() yields an independent in-memory image, so the
                # source file can be closed right away.
                with Image.open(r["image_path"]) as src:
                    images.append(src.convert("RGB"))

            with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                # caption1 / caption2 embeddings
                t1_emb = fclip.encode_text(texts1, batch_size=len(texts1))
                t2_emb = fclip.encode_text(texts2, batch_size=len(texts2))
                # image embeddings
                v_emb = fclip.encode_images(images, batch_size=len(images))
        finally:
            for img in images:
                try:
                    img.close()
                except Exception:
                    # Closing is best-effort cleanup; never mask the real error.
                    pass

        t1_np = _to_numpy_2d(t1_emb)
        t2_np = _to_numpy_2d(t2_emb)
        v_np = _to_numpy_2d(v_emb)

        # text-only = mean of caption1 and caption2 embeddings
        text_only = l2_normalize((t1_np + t2_np) / 2)

        # multi = text + image (0.5:0.5)
        multi = l2_normalize(0.5*text_only + 0.5*l2_normalize(v_np))

        # Store only `information` instead of the separate captions.
        for r, info, t_vec, m_vec in zip(chunk, combined_texts, text_only, multi):
            out.append({
                "id": r["product_id"],
                "category": r["category"],
                "type": r["type"],
                "information": info,
                "text": json.dumps(t_vec.tolist()),
                "multi": json.dumps(m_vec.tolist())
            })

        torch_empty_cache()
        print(f"[EMBED] batch {bi+1}/{total_batches} → {e}/{n}")

    return out

# ==============================
# 3) 메인
# ==============================
def main():
    """Build dress records from the caption CSV, embed them and write OUTPUT_CSV.

    Reads CAPTIONS_CSV (columns product_id, caption1, caption2) and
    PRODUCT_INFO_CSV (columns 상품코드, 대분류, 소분류), maps each product's
    category/type to English labels, locates its cropped image
    ``dress_<product_id><ext>`` under IMAGE_DIR, embeds every record with
    ``embed_in_batches`` and saves the result.

    Raises:
        FileNotFoundError: if either input CSV does not exist.
        RuntimeError: if no caption row has a matching image file.
    """
    if not os.path.exists(CAPTIONS_CSV):
        raise FileNotFoundError(f"캡션 CSV가 없습니다: {CAPTIONS_CSV}")
    if not os.path.exists(PRODUCT_INFO_CSV):
        raise FileNotFoundError(f"상품정보 CSV가 없습니다: {PRODUCT_INFO_CSV}")

    # Read captions as text too: ids then compare equal to the str-typed
    # product_map keys, and a missing caption becomes "" instead of the
    # literal string "nan" being embedded.
    captions_df = pd.read_csv(CAPTIONS_CSV, dtype=str).fillna("")
    product_df = pd.read_csv(PRODUCT_INFO_CSV, dtype=str).fillna("")
    product_df = product_df.apply(lambda col: col.str.strip().str.lower())

    # product code -> (major category, sub category)
    product_map = dict(zip(product_df["상품코드"], zip(product_df["대분류"], product_df["소분류"])))

    # Korean -> English label mapping
    major_map = {"원피스":"dress"}
    sub_map = {
        "미니원피스":"minidress",
        "미디원피스":"mididress",
        "맥시원피스":"maxidress",
    }
    image_exts = (".jpg", ".jpeg", ".png", ".webp", ".bmp", ".jfif")

    records = []
    skipped = 0
    for raw_pid, caption1, caption2 in zip(
        captions_df["product_id"], captions_df["caption1"], captions_df["caption2"]
    ):
        # Same normalisation as the product_map keys (strip + lower).
        pid = raw_pid.strip().lower()

        # Unmapped labels fall through unchanged; unknown products get "unknown".
        raw_category, raw_type = product_map.get(pid, ("unknown", "unknown"))
        category = major_map.get(raw_category, raw_category)
        type_ = sub_map.get(raw_type, raw_type)

        # First existing file among the supported extensions, in list order.
        candidates = (os.path.join(IMAGE_DIR, f"dress_{pid}{ext}") for ext in image_exts)
        img_path = next((p for p in candidates if os.path.exists(p)), None)
        if not img_path:
            skipped += 1
            continue

        records.append({
            "product_id": pid,
            "category": category,
            "type": type_,
            "caption1": caption1,
            "caption2": caption2,
            "image_path": img_path
        })

    if skipped:
        # Make dropped rows visible instead of silently shrinking the output.
        print(f"[WARN] skipped {skipped} caption rows with no image under {IMAGE_DIR}")

    if not records:
        raise RuntimeError("처리할 레코드가 없습니다.")

    # Probe for a workable batch size; accelerators start higher than CPU.
    start_bs = 512 if device in ("cuda","mps") else 256
    best_bs = determine_best_batch_size(start=start_bs)
    print(f">>> Using batch_size={best_bs} on {device}")

    embedded = embed_in_batches(records, batch_size=best_bs)

    # Final columns: id, category, type, information, text, multi
    df = pd.DataFrame(embedded, columns=["id","category","type","information","text","multi"])
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"✅ 임베딩 저장 완료: {OUTPUT_CSV}")

if __name__ == "__main__":
    main()


# Qdrant upload : Qdrant 에 데이터 삽입

## top

In [None]:
import pandas as pd
import ast
import os
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv

# 0. Load variables from .env
load_dotenv()

# 1. Load the embedding CSV.
# keep_default_na=False keeps empty category/type/information cells as ""
# rather than NaN floats, which would otherwise be placed in the payload.
df = pd.read_csv("29cm/top_embeddings_real_final.csv", keep_default_na=False)

# 2. Read connection settings from the environment.
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    print("[WARN] QDRANT_API_KEY is not set; connecting without an API key")
# The endpoint can be overridden with QDRANT_URL; the default is unchanged.
# NOTE(review): the default is plain http, so the API key is sent
# unencrypted — confirm whether an https endpoint is available.
QDRANT_URL = os.getenv("QDRANT_URL", "http://43.201.185.192:6333")

# 3. Connect the client
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False,
    timeout=120.0
)

# 4. Collection name
COLLECTION_NAME = "ivlle"

# 5. Batched upsert
BATCH = 500
total = len(df)

for i in range(0, total, BATCH):
    sl = df.iloc[i:i+BATCH]
    points = []
    for _, row in sl.iterrows():
        payload = {
            "category": row["category"],
            "type": row["type"],
            "information": row["information"],
        }
        # The vector columns hold stringified lists; parse them back.
        vec_text = ast.literal_eval(row["text"])
        vec_multi = ast.literal_eval(row["multi"])

        points.append(
            models.PointStruct(
                id=int(row["id"]),
                vector={
                    "text": vec_text,
                    "multi": vec_multi,
                },
                payload=payload
            )
        )

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"✅ Uploaded {i+len(sl)}/{total}")

print("🎉 모든 데이터 업서트 완료")


## pants

In [None]:
import pandas as pd
import ast
import os
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv

# 0. Load variables from .env
load_dotenv()

# 1. Load the embedding CSV.
# keep_default_na=False keeps empty category/type/information cells as ""
# rather than NaN floats, which would otherwise be placed in the payload.
df = pd.read_csv("29cm/pants_embeddings_real_final.csv", keep_default_na=False)

# 2. Read connection settings from the environment.
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    print("[WARN] QDRANT_API_KEY is not set; connecting without an API key")
# The endpoint can be overridden with QDRANT_URL; the default is unchanged.
# NOTE(review): the default is plain http, so the API key is sent
# unencrypted — confirm whether an https endpoint is available.
QDRANT_URL = os.getenv("QDRANT_URL", "http://43.201.185.192:6333")

# 3. Connect the client
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False,
    timeout=120.0
)

# 4. Collection name
COLLECTION_NAME = "ivlle"

# 5. Batched upsert
BATCH = 500
total = len(df)

for i in range(0, total, BATCH):
    sl = df.iloc[i:i+BATCH]
    points = []
    for _, row in sl.iterrows():
        payload = {
            "category": row["category"],
            "type": row["type"],
            "information": row["information"],
        }
        # The vector columns hold stringified lists; parse them back.
        vec_text = ast.literal_eval(row["text"])
        vec_multi = ast.literal_eval(row["multi"])

        points.append(
            models.PointStruct(
                id=int(row["id"]),
                vector={
                    "text": vec_text,
                    "multi": vec_multi,
                },
                payload=payload
            )
        )

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"✅ Uploaded {i+len(sl)}/{total}")

print("🎉 모든 데이터 업서트 완료")


## skirt

In [None]:
import pandas as pd
import ast
import os
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv

# 0. Load variables from .env
load_dotenv()

# 1. Load the embedding CSV.
# keep_default_na=False keeps empty category/type/information cells as ""
# rather than NaN floats, which would otherwise be placed in the payload.
df = pd.read_csv("29cm/skirt_embeddings_real_final.csv", keep_default_na=False)

# 2. Read connection settings from the environment.
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    print("[WARN] QDRANT_API_KEY is not set; connecting without an API key")
# The endpoint can be overridden with QDRANT_URL; the default is unchanged.
# NOTE(review): the default is plain http, so the API key is sent
# unencrypted — confirm whether an https endpoint is available.
QDRANT_URL = os.getenv("QDRANT_URL", "http://43.201.185.192:6333")

# 3. Connect the client
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False,
    timeout=120.0
)

# 4. Collection name
COLLECTION_NAME = "ivlle"

# 5. Batched upsert
BATCH = 500
total = len(df)

for i in range(0, total, BATCH):
    sl = df.iloc[i:i+BATCH]
    points = []
    for _, row in sl.iterrows():
        payload = {
            "category": row["category"],
            "type": row["type"],
            "information": row["information"],
        }
        # The vector columns hold stringified lists; parse them back.
        vec_text = ast.literal_eval(row["text"])
        vec_multi = ast.literal_eval(row["multi"])

        points.append(
            models.PointStruct(
                id=int(row["id"]),
                vector={
                    "text": vec_text,
                    "multi": vec_multi,
                },
                payload=payload
            )
        )

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"✅ Uploaded {i+len(sl)}/{total}")

print("🎉 모든 데이터 업서트 완료")


## dress

In [None]:
import pandas as pd
import ast
import os
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv

# 0. Load variables from .env
load_dotenv()

# 1. Load the embedding CSV.
# keep_default_na=False keeps empty category/type/information cells as ""
# rather than NaN floats, which would otherwise be placed in the payload.
df = pd.read_csv("29cm/dress_embeddings_real_final.csv", keep_default_na=False)

# 2. Read connection settings from the environment.
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    print("[WARN] QDRANT_API_KEY is not set; connecting without an API key")
# The endpoint can be overridden with QDRANT_URL; the default is unchanged.
# NOTE(review): the default is plain http, so the API key is sent
# unencrypted — confirm whether an https endpoint is available.
QDRANT_URL = os.getenv("QDRANT_URL", "http://43.201.185.192:6333")

# 3. Connect the client
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False,
    timeout=120.0
)

# 4. Collection name
COLLECTION_NAME = "ivlle"

# 5. Batched upsert
BATCH = 500
total = len(df)

for i in range(0, total, BATCH):
    sl = df.iloc[i:i+BATCH]
    points = []
    for _, row in sl.iterrows():
        payload = {
            "category": row["category"],
            "type": row["type"],
            "information": row["information"],
        }
        # The vector columns hold stringified lists; parse them back.
        vec_text = ast.literal_eval(row["text"])
        vec_multi = ast.literal_eval(row["multi"])

        points.append(
            models.PointStruct(
                id=int(row["id"]),
                vector={
                    "text": vec_text,
                    "multi": vec_multi,
                },
                payload=payload
            )
        )

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"✅ Uploaded {i+len(sl)}/{total}")

print("🎉 모든 데이터 업서트 완료")
