In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import torch
import json
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import os
import openai
from tqdm.notebook import tqdm

from dotenv import load_dotenv
load_dotenv()

pd.set_option('display.max_columns', None)

In [2]:
from text_utils import create_embeddings

In [3]:
# initialize openai
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
# Splade 모델 github에서 다운로드 필요 https://github.com/naver/splade
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer

sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# splade = 'naver/splade-v3'
sparse_model = Splade(sparse_model_id, agg='max')
# sparse_model.to('cpu')  # move to GPU if possible
sparse_model.eval()

splade_tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

In [5]:
def gen_sparse_vector(text):
    tokens = splade_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        sparse_emb = sparse_model(
            d_kwargs=tokens.to('cpu')
        )['d_rep'].squeeze()

    return sparse_emb

In [6]:
input_text = "pants"
# input_text = "low waist"
# input_text = "trousers" # (pants의 동의어)

In [7]:
texts = [
   "pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), waistline_name : low waist,",
   "pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), geometric pattern",
   "symmetrical, silhouette_fit_name : regular (fit), waistline_name : low waist,",
   "symmetrical, silhouette_fit_name : regular (fit), waistline_name : high waist,"
]

In [8]:
embs = create_embeddings(texts)
input_emb = create_embeddings([input_text])

print("Dense vector similarities :", cosine_similarity(input_emb, embs))

Dense vector similarities : [[0.42016653 0.43070957 0.31788163 0.31183107]]


In [16]:
for i in texts:
    print('-', i)

print()
s_embs = [gen_sparse_vector(t).numpy() for t in texts]
input_s_emb = [gen_sparse_vector(input_text).numpy()]

print("Sparse vector similarities :", cosine_similarity(input_s_emb, s_embs))

- pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), waistline_name : low waist,
- pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), geometric pattern
- symmetrical, silhouette_fit_name : regular (fit), waistline_name : low waist,
- symmetrical, silhouette_fit_name : regular (fit), waistline_name : high waist,

Sparse vector similarities : [[0.4135003  0.43643478 0.0507601  0.05063569]]


- 각 단어별 weight 살펴보기

In [17]:
# create the tokens that will be input into the model
tokens = splade_tokenizer(texts[0], return_tensors="pt")
splade_tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])

with torch.no_grad():
    sparse_emb = sparse_model(
        d_kwargs=tokens.to('cpu')
    )['d_rep'].squeeze()
sparse_emb.shape

indices = sparse_emb.nonzero().squeeze().cpu().tolist()
values = sparse_emb[indices].cpu().tolist()

print(len(indices))

idx2token = {idx: token for token, idx in splade_tokenizer.get_vocab().items()}

92


"pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), waistline_name : low waist,"

In [19]:
sparse_dict_tokens = {
    idx2token[idx]: round(weight, 2) for idx, weight in zip(indices, values)
}
# sort so we can see most relevant tokens first
sparse_dict_tokens = {
    k: v for k, v in sorted(
        sparse_dict_tokens.items(),
        key=lambda item: item[1],
        reverse=True
    )
}
sparse_dict_tokens

{'silhouette': 2.4,
 'pants': 1.71,
 'waist': 1.66,
 'straight': 1.61,
 'symmetrical': 1.58,
 'low': 1.45,
 'fit': 1.45,
 'regular': 1.42,
 'name': 1.3,
 'jeans': 1.26,
 'symmetry': 1.19,
 'trousers': 1.04,
 '##line': 0.95,
 'lower': 0.85,
 'meaning': 0.74,
 'clothing': 0.72,
 'names': 0.69,
 'reynolds': 0.68,
 'flat': 0.6,
 '_': 0.57,
 'shadow': 0.56,
 'clothes': 0.55,
 'fitting': 0.47,
 'shape': 0.46,
 'definition': 0.46,
 'webb': 0.45,
 'word': 0.41,
 'morris': 0.4,
 'justin': 0.37,
 'style': 0.36,
 'dress': 0.35,
 'line': 0.34,
 'design': 0.33,
 'fashion': 0.33,
 'belt': 0.3,
 'jean': 0.29,
 'gender': 0.29,
 'sport': 0.28,
 'photography': 0.27,
 'logo': 0.27,
 'named': 0.25,
 'lowest': 0.25,
 'zipper': 0.25,
 'madison': 0.23,
 'prefix': 0.23,
 'sex': 0.22,
 'shoe': 0.22,
 'quote': 0.21,
 '##tour': 0.21,
 'image': 0.2,
 'clark': 0.19,
 'math': 0.19,
 'torso': 0.18,
 'abbreviation': 0.16,
 'symbol': 0.15,
 'simon': 0.14,
 'kelly': 0.14,
 'print': 0.14,
 'thomas': 0.13,
 'sizes': 0.13

## sparse vector

예시
- document : 

```json
silhouette_name : symmetrical,
silhouette_fit_name : regular (fit),
waistline_name : low waist,
length_name : maxi (length),
opening_type_name : fly (opening),
non-textile material type_name : no non-textile material
```

In [6]:
def listify(string, encap_type="()"):
    return [int(num) for num in string.strip(encap_type).split(', ')]

In [7]:
attributes = pd.read_csv("../data/imaterialist-fashion-2020-fgvc7/attribute_specific.csv")
new_df = pd.read_csv("../data/imaterialist-fashion-2020-fgvc7/clothes_final2.csv")

new_df['bbox'] = [listify(i, "[]") for i in new_df['bbox']]
new_df['bbox_big'] = [listify(i, "[]") for i in new_df['bbox_big']]

In [8]:
new_df.head(2)

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames
0,0,00000663ed1ff0c4e0132b9b9ac53f6e,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,115136143154230295316317,,"[1163, 3923, 2839, 5212]","[1122, 3891, 2880, 5214]",1758,1323,2325834,6,pants,lowerbody,"symmetrical, regular (fit), low waist, maxi (l...",
1,1,00000663ed1ff0c4e0132b9b9ac53f6e,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,115136142146225295316317,163.0,"[1212, 1371, 2394, 3978]","[1183, 1306, 2423, 4043]",1240,2737,3393880,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...",shirt (collar)


In [9]:
new_df['name'].unique()

array(['pants', 'shirt, blouse', 'jacket', 'top, t-shirt, sweatshirt',
       'dress', 'shoe', 'glasses', 'skirt', 'bag, wallet', 'belt',
       'headband, head covering, hair accessory', 'sock', 'hat', 'watch',
       'glove', 'tights, stockings', 'sweater', 'tie', 'shorts', 'scarf',
       'coat', 'vest', 'umbrella', 'cardigan', 'cape', 'jumpsuit',
       'leg warmer'], dtype=object)

### 1. 각 supercategory 별로 group

#### Supercategory 별로 attribute를 구분하기

- 조금 더 자세히 살펴보면, 우리들이 갖고 있는 attribute은 몇 가지 레벨로 나눌 수 있다
	- 가장 큰 단위는 당연히 classID - 바지, 상의, 신발 등
	- 더 세밀하게 나눠보면 classID에 따른 특징들을 그룹화 할 수 있다
		- 하위 attribute들은 각자 다른 특징을 나타낸다
		- 핏감, 전체적인 옷의 형태, 질감, 마감, 길이 등
		- 따라서 이런 attribute들을 모두 각자의 그룹에 맞게 고려되어야 한다

In [10]:
attributes.head(3)

Unnamed: 0,id,name,supercategory,level,supercategory2,taxonomy_id
0,0,classic (t-shirt),nickname,1,main_category,att000002_00
1,1,polo (shirt),nickname,1,main_category,att000003_00
2,2,undershirt,nickname,1,main_category,att000004_00


In [11]:
attributes['supercategory2'].unique()

array(['main_category', 'silhouette', 'silhouette_fit', 'waistline',
       'length', 'collar_type', 'neckline_type', 'sleeve_type',
       'pocket_type', 'opening_type', 'non-textile material type',
       'leather', 'textile finishing, manufacturing techniques',
       'textile pattern', 'animal', 'other'], dtype=object)

아무런 attribute이 없는 항목은 'normal'이라는 attribute을 임의로 부여

In [12]:
new_df.loc[new_df['AttributesIds'].isna(), 'AttributesIds'] = "999"
new_df.loc[new_df['AttributesNames'].isna(), 'AttributesNames'] = "normal"

In [13]:
new_df.tail(2)

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames
97922,97922,fffe20b555b98c3c1f26c8dfff275cbc,2446541 2 2449539 5 2452536 10 2455534 14 2458...,3000,2001,0,115136142146225295316322,204160163,"[815, 640, 1128, 1576]","[808, 617, 1135, 1599]",327,982,321114,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...","set-in sleeve, wrist-length, shirt (collar)"
97923,97923,ffffbf7014a9e408bfbb81a75bc70638,63365 36 63852 77 64343 85 64838 89 65332 94 6...,500,375,10,102128142150295308317,157,"[126, 115, 297, 442]","[122, 107, 301, 450]",179,343,61397,10,dress,wholebody,"sheath (dress), straight, normal waist, above-...",short (length)


- Main attribute과 secondary attribute들을 하나로 묶음
    - 이는 supercategory2를 임의로 지정하여 sleeve, collar들에 해당하는 attribute ID를 별도로 처리할 수 있기 때문

In [14]:
def merge_columns(row):
    # Check if either value is np.nan and return the other value in such cases
    if pd.isna(row['AttributesIds']) and not pd.isna(row['second_AttributesIds']):
        return row['second_AttributesIds']
    elif not pd.isna(row['AttributesIds']) and pd.isna(row['second_AttributesIds']):
        return row['AttributesIds']
    elif pd.isna(row['AttributesIds']) and pd.isna(row['second_AttributesIds']):
        return np.nan
    else:
        # Both values are not np.nan, merge with a comma
        return f"{row['AttributesIds']},{row['second_AttributesIds']}"

new_df['AttributesIds_merged'] = new_df.apply(merge_columns, axis=1)

In [15]:
new_df.head(2)

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames,AttributesIds_merged
0,0,00000663ed1ff0c4e0132b9b9ac53f6e,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,115136143154230295316317,,"[1163, 3923, 2839, 5212]","[1122, 3891, 2880, 5214]",1758,1323,2325834,6,pants,lowerbody,"symmetrical, regular (fit), low waist, maxi (l...",,115136143154230295316317
1,1,00000663ed1ff0c4e0132b9b9ac53f6e,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,115136142146225295316317,163.0,"[1212, 1371, 2394, 3978]","[1183, 1306, 2423, 4043]",1240,2737,3393880,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...",shirt (collar),115136142146225295316317163


In [16]:
def convert2list(string):
    if pd.isna(string):
        return np.nan
    else:
        return list(set([i for i in string.split(',')]))

new_df['AttributesIds_list'] = new_df['AttributesIds_merged'].apply(convert2list)

In [17]:
new_df.head(2)

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames,AttributesIds_merged,AttributesIds_list
0,0,00000663ed1ff0c4e0132b9b9ac53f6e,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,115136143154230295316317,,"[1163, 3923, 2839, 5212]","[1122, 3891, 2880, 5214]",1758,1323,2325834,6,pants,lowerbody,"symmetrical, regular (fit), low waist, maxi (l...",,115136143154230295316317,"[317, 154, 316, 136, 295, 230, 115, 143]"
1,1,00000663ed1ff0c4e0132b9b9ac53f6e,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,115136142146225295316317,163.0,"[1212, 1371, 2394, 3978]","[1183, 1306, 2423, 4043]",1240,2737,3393880,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...",shirt (collar),115136142146225295316317163,"[146, 225, 317, 163, 316, 136, 295, 115, 142]"


#### 각 attribute들을 분류하여 각 컬럼에 배치
- `attributes`를 참고하며, 각 attribute ID를 attribute name으로 변환

In [18]:
attributes.head(2)

Unnamed: 0,id,name,supercategory,level,supercategory2,taxonomy_id
0,0,classic (t-shirt),nickname,1,main_category,att000002_00
1,1,polo (shirt),nickname,1,main_category,att000003_00


In [19]:
attributes['id'] = attributes['id'].astype(str)

# Create a mapping of id to supercategory2
id_to_supercategory2 = attributes.set_index('id')['supercategory2'].to_dict()

In [20]:
id_to_supercategory2

{'0': 'main_category',
 '1': 'main_category',
 '2': 'main_category',
 '3': 'main_category',
 '4': 'main_category',
 '5': 'main_category',
 '6': 'main_category',
 '7': 'main_category',
 '8': 'main_category',
 '9': 'main_category',
 '10': 'main_category',
 '11': 'main_category',
 '12': 'main_category',
 '13': 'main_category',
 '14': 'main_category',
 '15': 'main_category',
 '16': 'main_category',
 '17': 'main_category',
 '18': 'main_category',
 '19': 'main_category',
 '20': 'main_category',
 '21': 'main_category',
 '22': 'main_category',
 '23': 'main_category',
 '24': 'main_category',
 '25': 'main_category',
 '26': 'main_category',
 '27': 'main_category',
 '28': 'main_category',
 '29': 'main_category',
 '30': 'main_category',
 '31': 'main_category',
 '32': 'main_category',
 '33': 'main_category',
 '34': 'main_category',
 '35': 'main_category',
 '36': 'main_category',
 '37': 'main_category',
 '38': 'main_category',
 '39': 'main_category',
 '40': 'main_category',
 '41': 'main_category',
 '

In [21]:
# 각 카테고리별로 list를 만든다
category_distributions = list()

# row를 루프를 돌면서 각 dictionary value에 채워 넣는다
for idx, row in tqdm(new_df.iterrows()):
    tmp_dict = {k:'' for k in attributes['supercategory2'].unique()}
    for attr in row['AttributesIds_list']:
        supercat_type = id_to_supercategory2[attr]
        if tmp_dict[supercat_type]=='':
            tmp_dict[supercat_type] += attr
        else:
            tmp_dict[supercat_type] += "," + attr
        # break
    category_distributions.append(tmp_dict)

0it [00:00, ?it/s]

In [22]:
category_distributions[0]

{'main_category': '',
 'silhouette': '115',
 'silhouette_fit': '136',
 'waistline': '143',
 'length': '154',
 'collar_type': '',
 'neckline_type': '',
 'sleeve_type': '',
 'pocket_type': '',
 'opening_type': '230',
 'non-textile material type': '295',
 'leather': '',
 'textile finishing, manufacturing techniques': '316',
 'textile pattern': '317',
 'animal': '',
 'other': ''}

In [23]:
# 각 dictionary를 dataframe 형태로 변환
category_dist_df = [pd.DataFrame([d]) for d in category_distributions]
# list of dataframe을 하나의 dataframe으로 concat
category_dist_df = pd.concat(category_dist_df, axis=0)
# reset index
category_dist_df.reset_index(inplace=True, drop=True)

category_dist_df = category_dist_df.replace('', np.nan)

In [24]:
category_dist_df.tail()

Unnamed: 0,main_category,silhouette,silhouette_fit,waistline,length,collar_type,neckline_type,sleeve_type,pocket_type,opening_type,non-textile material type,leather,"textile finishing, manufacturing techniques",textile pattern,animal,other
97919,,,,,,,,,,,,,,,,999.0
97920,,,,,,,,,,,,,,,,999.0
97921,36.0,128115.0,136.0,142.0,154.0,,,,,230.0,295.0,,298.0,317.0,,
97922,,115.0,136.0,142.0,146160.0,163.0,,204.0,,225.0,295.0,,316.0,322.0,,
97923,102.0,128.0,,142.0,157150.0,,,,,,295.0,,308.0,317.0,,


- 기존의 데이터셋과 결합

In [25]:
new_df = pd.concat([new_df, category_dist_df], axis=1)
new_df = new_df.replace('', np.nan)
new_df.head()

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames,AttributesIds_merged,AttributesIds_list,main_category,silhouette,silhouette_fit,waistline,length,collar_type,neckline_type,sleeve_type,pocket_type,opening_type,non-textile material type,leather,"textile finishing, manufacturing techniques",textile pattern,animal,other
0,0,00000663ed1ff0c4e0132b9b9ac53f6e,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,115136143154230295316317,,"[1163, 3923, 2839, 5212]","[1122, 3891, 2880, 5214]",1758,1323,2325834,6,pants,lowerbody,"symmetrical, regular (fit), low waist, maxi (l...",,115136143154230295316317,"[317, 154, 316, 136, 295, 230, 115, 143]",,115,136.0,143,154,,,,,230.0,295,,316,317,,
1,1,00000663ed1ff0c4e0132b9b9ac53f6e,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,115136142146225295316317,163.0,"[1212, 1371, 2394, 3978]","[1183, 1306, 2423, 4043]",1240,2737,3393880,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...",shirt (collar),115136142146225295316317163,"[146, 225, 317, 163, 316, 136, 295, 115, 142]",,115,136.0,142,146,163.0,,,,225.0,295,,316,317,,
2,2,00000663ed1ff0c4e0132b9b9ac53f6e,4566382 8 4571592 25 4576803 41 4582013 58 458...,5214,3676,4,17115136145149225295311317,219204160174.0,"[875, 1437, 3309, 4871]","[815, 1352, 3369, 4956]",2554,3604,9204616,4,jacket,upperbody,"blazer, symmetrical, regular (fit), no waistli...","welt (pocket), set-in sleeve, wrist-length, no...","17,115,136,145,149,225,295,311,317,219,204,160...","[225, 317, 219, 204, 149, 17, 160, 145, 311, 1...",17.0,115,136.0,145,149160,174.0,,204.0,219.0,225.0,295,,311,317,,
3,3,0000fe7c9191fba733c8a69cfaf962b7,1343707 9 1346138 27 1348569 44 1351000 62 135...,2448,2448,1,0115145146295316317,190.0,"[548, 405, 1946, 2263]","[514, 359, 1980, 2309]",1466,1950,2858700,1,"top, t-shirt, sweatshirt",upperbody,"classic (t-shirt), symmetrical, no waistline, ...",scoop (neck),0115145146295316317190,"[146, 115, 317, 190, 316, 145, 295, 0]",0.0,115,,145,146,,190.0,,,,295,,316,317,,
4,4,0002ec21ddb8477e98b2cbb87ea2e269,2287509 4 2290504 12 2290588 26 2293501 18 229...,3000,1997,10,102128142150229295301318,182.0,"[762, 719, 1262, 1967]","[750, 688, 1274, 1998]",524,1310,686440,10,dress,wholebody,"sheath (dress), straight, normal waist, above-...",round (neck),102128142150229295301318182,"[318, 128, 182, 229, 102, 301, 295, 142, 150]",102.0,128,,142,150,,182.0,,,229.0,295,,301,318,,


### 2. 하나의 document로 변환

In [26]:
category_dist_df.head()

Unnamed: 0,main_category,silhouette,silhouette_fit,waistline,length,collar_type,neckline_type,sleeve_type,pocket_type,opening_type,non-textile material type,leather,"textile finishing, manufacturing techniques",textile pattern,animal,other
0,,115,136.0,143,154,,,,,230.0,295,,316,317,,
1,,115,136.0,142,146,163.0,,,,225.0,295,,316,317,,
2,17.0,115,136.0,145,149160,174.0,,204.0,219.0,225.0,295,,311,317,,
3,0.0,115,,145,146,,190.0,,,,295,,316,317,,
4,102.0,128,,142,150,,182.0,,,229.0,295,,301,318,,


In [27]:
id_to_name = pd.Series(attributes.name.values, index=attributes.id).to_dict()

# Define a function to convert IDs to names
def ids_to_names(ids, id_to_name=id_to_name):
    if pd.isna(ids):
        return np.nan
    names = [id_to_name.get(id_, 'Unknown') for id_ in ids.split(',')]
    return ', '.join(names)


for col in category_dist_df.columns:
    if 'name' not in col:
        category_dist_df[col+"_name"] = category_dist_df[col].apply(ids_to_names)

In [28]:
category_dist_df.head(2)

Unnamed: 0,main_category,silhouette,silhouette_fit,waistline,length,collar_type,neckline_type,sleeve_type,pocket_type,opening_type,non-textile material type,leather,"textile finishing, manufacturing techniques",textile pattern,animal,other,main_category_name,silhouette_name,silhouette_fit_name,waistline_name,length_name,collar_type_name,neckline_type_name,sleeve_type_name,pocket_type_name,opening_type_name,non-textile material type_name,leather_name,"textile finishing, manufacturing techniques_name",textile pattern_name,animal_name,other_name
0,,115,136,143,154,,,,,230,295,,316,317,,,,symmetrical,regular (fit),low waist,maxi (length),,,,,fly (opening),no non-textile material,,no special manufacturing technique,plain (pattern),,
1,,115,136,142,146,163.0,,,,225,295,,316,317,,,,symmetrical,regular (fit),normal waist,above-the-hip (length),shirt (collar),,,,single breasted,no non-textile material,,no special manufacturing technique,plain (pattern),,


In [29]:
# human-readable한 값들로 변환된 컬럼들을 활용
named_df = category_dist_df[[i for i in category_dist_df.columns if '_name' in i]]

# 각 row 별로 하나의 string 값으로 변환
def row_to_string(row):
    return ',\n'.join([f"{col} : {row[col]}" for col in named_df.columns if pd.notna(row[col])])

# Applying the function to each row of the DataFrame and storing the results in a list
list_of_strings = named_df.apply(row_to_string, axis=1).tolist()

In [30]:
print(list_of_strings[0])

silhouette_name : symmetrical,
silhouette_fit_name : regular (fit),
waistline_name : low waist,
length_name : maxi (length),
opening_type_name : fly (opening),
non-textile material type_name : no non-textile material,
textile finishing, manufacturing techniques_name : no special manufacturing technique,
textile pattern_name : plain (pattern)


In [31]:
new_df['doc'] = list_of_strings

In [32]:
new_df.head(2)

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames,AttributesIds_merged,AttributesIds_list,main_category,silhouette,silhouette_fit,waistline,length,collar_type,neckline_type,sleeve_type,pocket_type,opening_type,non-textile material type,leather,"textile finishing, manufacturing techniques",textile pattern,animal,other,doc
0,0,00000663ed1ff0c4e0132b9b9ac53f6e,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,115136143154230295316317,,"[1163, 3923, 2839, 5212]","[1122, 3891, 2880, 5214]",1758,1323,2325834,6,pants,lowerbody,"symmetrical, regular (fit), low waist, maxi (l...",,115136143154230295316317,"[317, 154, 316, 136, 295, 230, 115, 143]",,115,136,143,154,,,,,230,295,,316,317,,,"silhouette_name : symmetrical,\nsilhouette_fit..."
1,1,00000663ed1ff0c4e0132b9b9ac53f6e,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,115136142146225295316317,163.0,"[1212, 1371, 2394, 3978]","[1183, 1306, 2423, 4043]",1240,2737,3393880,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...",shirt (collar),115136142146225295316317163,"[146, 225, 317, 163, 316, 136, 295, 115, 142]",,115,136,142,146,163.0,,,,225,295,,316,317,,,"silhouette_name : symmetrical,\nsilhouette_fit..."


In [33]:
new_df.head(2)

Unnamed: 0,entity_id,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds,second_AttributesIds,bbox,bbox_big,width,height,area,id,name,supercategory,AttributesNames,second_AttributesNames,AttributesIds_merged,AttributesIds_list,main_category,silhouette,silhouette_fit,waistline,length,collar_type,neckline_type,sleeve_type,pocket_type,opening_type,non-textile material type,leather,"textile finishing, manufacturing techniques",textile pattern,animal,other,doc
0,0,00000663ed1ff0c4e0132b9b9ac53f6e,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,115136143154230295316317,,"[1163, 3923, 2839, 5212]","[1122, 3891, 2880, 5214]",1758,1323,2325834,6,pants,lowerbody,"symmetrical, regular (fit), low waist, maxi (l...",,115136143154230295316317,"[317, 154, 316, 136, 295, 230, 115, 143]",,115,136,143,154,,,,,230,295,,316,317,,,"silhouette_name : symmetrical,\nsilhouette_fit..."
1,1,00000663ed1ff0c4e0132b9b9ac53f6e,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,115136142146225295316317,163.0,"[1212, 1371, 2394, 3978]","[1183, 1306, 2423, 4043]",1240,2737,3393880,0,"shirt, blouse",upperbody,"symmetrical, regular (fit), normal waist, abov...",shirt (collar),115136142146225295316317163,"[146, 225, 317, 163, 316, 136, 295, 115, 142]",,115,136,142,146,163.0,,,,225,295,,316,317,,,"silhouette_name : symmetrical,\nsilhouette_fit..."


In [34]:
# # 로컬에 저장
# new_df.to_csv("../data/imaterialist-fashion-2020-fgvc7/clothes_final_sparse_doc.csv", index=False)

In [35]:
base_path = "../data/imaterialist-fashion-2020-fgvc7/cropped_images/"

new_df['img_path'] = base_path + new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str) + ".jpg"
# image df와의 join을 위한 키 생성
new_df['img_id'] = new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str)

In [None]:
new_df.loc[2022, 'img_path']

Image.open("../data/imaterialist-fashion-2020-fgvc7/train/054f0ae9527a9a79a4de6f3acc166e5b.jpg")

In [None]:
i = 2020
print(new_df.loc[i, 'name'])
print(new_df.loc[i, 'doc'])
Image.open(new_df.loc[i, 'img_path'])

In [None]:
i = 2022
print(new_df.loc[i, 'name'])
print(new_df.loc[i, 'doc'])
Image.open(new_df.loc[i, 'img_path'])

---

In [49]:
new_df = pd.read_csv("../data/imaterialist-fashion-2020-fgvc7/clothes_final_sparse_doc.csv")

  new_df = pd.read_csv("../data/imaterialist-fashion-2020-fgvc7/clothes_final_sparse_doc.csv")


### 이미지 embeddings와 함께 merge하여 하나의 dataframe으로 결합

In [50]:
embeddings = {}

with open('../data/imaterialist-fashion-2020-fgvc7/img_embeddings_fashion_fine_tuned.json', 'r') as file:
    for line in file:
        # Convert each line to a dictionary
        embedding_dict = json.loads(line.strip())
        
        # Convert the list back to a NumPy array if necessary
        for img_name, emb_list in embedding_dict.items():
            embeddings[img_name] = np.array(emb_list)

image_embedddings = pd.DataFrame([embeddings]).T.reset_index()
image_embedddings.rename(columns={"index":"img_id", 0:"img_emb"}, inplace=True)

In [51]:
image_embedddings.head(2)

Unnamed: 0,img_id,img_emb
0,00000663ed1ff0c4e0132b9b9ac53f6e_0,"[0.20991066098213196, 0.8551046848297119, -0.1..."
1,00000663ed1ff0c4e0132b9b9ac53f6e_1,"[0.005228467285633087, 0.6059291362762451, -1...."


In [52]:
new_df['img_id'] = new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str)

In [53]:
new_df = pd.merge(new_df, image_embedddings, on='img_id', how='left')

In [54]:
# 모두 잘 join 되었는지 확인
new_df.img_emb.isna().sum()

0

### 3. CLIP : SPLADE = Dense : sparse vector

- hybrid search를 위해서는 dense vector와 sparse vector를 짝을 지어줘야 함

In [55]:
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer

sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# splade = 'naver/splade-v3'
sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to('cpu')  # move to GPU if possible
sparse_model.eval()

splade_tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

### 4. pineconeDB upsert 형태로 변환

#### Upsert 형식

```json
{"id" : "0838a48a7b0bfa789a5181ab0e8f4ee2_3040", # 이미지 파일 이름 + entity ID
 "values" : [-0.08405803143978119, -0.7088879346847534, ...], # CLIP embeddings
 "sparse_values" : {
    "indices" : [1045, 1062, ...], # non-zero index
    "values" : [1.3038887977600098, 0.304147332906723, ...] # non-zero values
    },
"metadata" : {
    # 이미지 파일 path
    "img_path": "../data/imaterialist-fashion-2020-fgvc7/cropped_images/0838a48a7b0bfa789a5181ab0e8f4ee2_3040.jpg",
    "category": "coat"
} 
}

```

In [56]:
def gen_sparse_vector(text):
    tokens = splade_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        sparse_emb = sparse_model(
            d_kwargs=tokens.to('cpu')
        )['d_rep'].squeeze()
    
    indices = sparse_emb.nonzero().squeeze().cpu().tolist()
    values = sparse_emb[indices].cpu().tolist()

    return indices, values

def upsert_format(id, text, img_emb):
    index, value = gen_sparse_vector(text)
    
    sparse_values = {
        "indices": index,
        "values": value
    }
    
    upsert = {
        "id": id,
        "values": img_emb,
        "sparse_values":sparse_values,
        "metadata":{"img_path":"../data/imaterialist-fashion-2020-fgvc7/cropped_images/"+id+".jpg"}
    }
    return upsert

In [None]:
tmp = new_df.head(5)

In [None]:
upserts = list()

for _, row in tqdm(tmp.iterrows(), total=tmp.shape[0]):
    upserts.append(upsert_format(row['img_id'], row['doc'], row['img_emb'].tolist()))

In [None]:
type(upserts[0]['values'])

In [None]:
type(upserts[0]['sparse_values'])

In [None]:
upserts[0]['sparse_values'].keys()

In [None]:
upserts[0].keys()

In [None]:
upserts[0]['id']

In [None]:
upserts[0]['metadata']

In [None]:
upserts[0]['sparse_values'].keys()

`02. generate_SPLADE_embeddings.ipynb` 참고

### 만들어진 sparse vector 읽어오기

In [57]:
data_read = []

# Open the file in read mode
with open("../data/imaterialist-fashion-2020-fgvc7/upsert_vectors_fashion_fine_tuned.json", 'r') as file:
    # Iterate through each line in the file
    for line in file:
        # Parse the JSON string into a Python dictionary
        data = json.loads(line)
        # Append the dictionary to the list
        data_read.append(data)

# Now, data_read contains all the dictionaries read from the file
print(f"Successfully read {len(data_read)} items from upsert_vectors_fashion_fine_tuned.json")

Successfully read 97924 items from upsert_vectors_fashion_fine_tuned.json


In [58]:
data_read[0].keys()

dict_keys(['id', 'values', 'sparse_values', 'metadata'])

In [59]:
data_read[0]['metadata']

{'img_path': 'imaterialist-fashion-2020-fgvc7/cropped_images/00000663ed1ff0c4e0132b9b9ac53f6e_0.jpg'}