In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import tqdm
import random
import glob
import json

In [2]:
RANDOM_SEED=42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
ZAPPOS_ROOT = '/home/amrith/zappos-50k/'
ZAPPOS_IMAGES_ROOT = '/home/amrith/zappos-50k/ut-zap50k-images-square'

In [4]:
df = pd.read_csv(f'{ZAPPOS_ROOT}/ut-zap50k-data/meta-data-bin.csv')
df['HeelHeight.High.heel'] = df['HeelHeight.4in...4.3.4in'] + df['HeelHeight.5in...over']
df['HeelHeight.Short.heel'] = df['HeelHeight.Flat'] + df['HeelHeight.Under.1in'] + df['HeelHeight.1in...1.3.4in']

In [5]:
df.shape

(50025, 154)

In [6]:
print(sorted(df.columns))

['CID', 'Category.Boots', 'Category.Sandals', 'Category.Shoes', 'Category.Slippers', 'Closure.Adjustable', 'Closure.Ankle.Strap', 'Closure.Ankle.Wrap', 'Closure.Belt', 'Closure.Buckle', 'Closure.Bungee', 'Closure.Button.Loop', 'Closure.Elastic.Gore', 'Closure.Hook.and.Loop', 'Closure.Lace.up', 'Closure.Monk.Strap', 'Closure.Pull.on', 'Closure.Sling.Back', 'Closure.Slip.On', 'Closure.Snap', 'Closure.Spat.Strap', 'Closure.T.Strap', 'Closure.Toggle', 'Closure.Zipper', 'Gender.Boys', 'Gender.Girls', 'Gender.Men', 'Gender.Women', 'HeelHeight.1in...1.3.4in', 'HeelHeight.2in...2.3.4in', 'HeelHeight.3in...3.3.4in', 'HeelHeight.4in...4.3.4in', 'HeelHeight.5in...over', 'HeelHeight.Flat', 'HeelHeight.High.heel', 'HeelHeight.Short.heel', 'HeelHeight.Under.1in', 'Insole.EVA', 'Insole.Gel', 'Insole.Hypoallergenic', 'Insole.Latex.Lined', 'Insole.Leather', 'Insole.Memory.Foam', 'Insole.Moisture.Wicking', 'Insole.Orthotic.Friendly', 'Insole.Padded', 'Insole.Polyurethane', 'Insole.Poron', 'Insole.Remova

In [7]:
# Attribute list
ATTR_LIST_1 = ['Category.Shoes', 'Category.Sandals', 'SubCategory.Oxfords', 'SubCategory.Heel', 'SubCategory.Boot', 'SubCategory.Slipper.Flats', 'SubCategory.Flats', 'SubCategory.Slipper.Heels', 'SubCategory.Athletic', 'SubCategory.Knee.High', 'SubCategory.Crib.Shoes', 'SubCategory.Over.the.Knee', 'HeelHeight.High.heel', 'HeelHeight.Short.heel', 'Closure.Pull.on', 'Closure.Ankle.Strap', 'Closure.Zipper', 'Closure.Elastic.Gore', 'Closure.Sling.Back', 'Closure.Toggle', 'Closure.Snap', 'Closure.T.Strap', 'Closure.Spat.Strap', 'Gender.Men', 'Gender.Boys', 'Material.Rubber', 'Material.Wool', 'Material.Silk', 'Material.Aluminum', 'Material.Plastic', 'ToeStyle.Capped Toe', 'ToeStyle.Square Toe', 'ToeStyle.Snub Toe', 'ToeStyle.Bicycle Toe', 'ToeStyle.Open Toe', 'ToeStyle.Pointed Toe', 'ToeStyle.Almond', 'ToeStyle.Apron Toe', 'ToeStyle.Snip Toe', 'ToeStyle.Medallion']
ATTR_LIST_2 = ['Category.Boots', 'Category.Slippers', 'SubCategory.Mid.Calf', 'SubCategory.Ankle', 'SubCategory.Loafers', 'SubCategory.Boat.Shoes', 'SubCategory.Clogs.and.Mules', 'SubCategory.Sneakers.and.Athletic.Shoes', 'SubCategory.Heels', 'SubCategory.Prewalker', 'SubCategory.Prewalker.Boots', 'SubCategory.Firstwalker', 'Closure.Lace.up', 'Closure.Buckle', 'Closure.Hook.and.Loop', 'Closure.Slip.On', 'Closure.Ankle.Wrap', 'Closure.Bungee', 'Closure.Adjustable',  'Closure.Button.Loop', 'Closure.Monk.Strap', 'Closure.Belt', 'Gender.Women', 'Gender.Girls', 'Material.Suede', 'Material.Snakeskin', 'Material.Corduroy', 'Material.Horse.Hair', 'Material.Stingray', 'ToeStyle.Round Toe', 'ToeStyle.Closed Toe', 'ToeStyle.Moc Toe', 'ToeStyle.Wingtip', 'ToeStyle.Center Seam', 'ToeStyle.Algonquin', 'ToeStyle.Bump Toe', 'ToeStyle.Wide Toe Box', 'ToeStyle.Peep Toe']
FULL_ATTR_LIST = sorted(list(set(ATTR_LIST_1 + ATTR_LIST_2)))
for x in FULL_ATTR_LIST:
    if x not in df.columns: 
        print(x)
print(len(FULL_ATTR_LIST))

78


In [8]:
ATTR_LIST = []
MIN_IMAGES_PER_ATTR = 1000
for a, x in zip(FULL_ATTR_LIST, df.loc[:, FULL_ATTR_LIST].apply(sum, axis=0)):
    if x > MIN_IMAGES_PER_ATTR:
        ATTR_LIST.append(a)
print(len(ATTR_LIST))

38


In [9]:
ATTR_LIST

['Category.Boots',
 'Category.Sandals',
 'Category.Shoes',
 'Category.Slippers',
 'Closure.Ankle.Strap',
 'Closure.Buckle',
 'Closure.Elastic.Gore',
 'Closure.Hook.and.Loop',
 'Closure.Lace.up',
 'Closure.Pull.on',
 'Closure.Slip.On',
 'Closure.Zipper',
 'Gender.Boys',
 'Gender.Girls',
 'Gender.Men',
 'Gender.Women',
 'HeelHeight.High.heel',
 'HeelHeight.Short.heel',
 'Material.Rubber',
 'Material.Suede',
 'SubCategory.Ankle',
 'SubCategory.Clogs.and.Mules',
 'SubCategory.Flats',
 'SubCategory.Heels',
 'SubCategory.Knee.High',
 'SubCategory.Loafers',
 'SubCategory.Mid.Calf',
 'SubCategory.Oxfords',
 'SubCategory.Slipper.Flats',
 'SubCategory.Sneakers.and.Athletic.Shoes',
 'ToeStyle.Almond',
 'ToeStyle.Capped Toe',
 'ToeStyle.Closed Toe',
 'ToeStyle.Moc Toe',
 'ToeStyle.Open Toe',
 'ToeStyle.Peep Toe',
 'ToeStyle.Pointed Toe',
 'ToeStyle.Round Toe']

1. logic to sample attribute list
2. logic to check if attributes are from same category
3. logic to sample images that are positive for the attribute pair

In [10]:
def is_same_category(attr_pair):
    return len(set([attr.lower().split(".")[0] for attr in attrs])) < len(attr_pair)
    
def sample_attrs(attributes_list, n_attrs=2):
    return np.random.choice(attributes_list, n_attrs, replace=False)

def get_all_positive_images(df, attrs):
    return df[attrs].apply(sum, axis=1) == len(attrs)

def get_all_negative_images(df, attrs):
    return (1-df[attrs]).apply(sum, axis=1) == len(attrs)

def sample_images(df, selected_images, n_samples):
    return list(np.random.choice(df[selected_images]["CID"].values, n_samples, replace=False))

In [11]:
N_SAMPLES = 8000
POSITIVE_IMAGES_THRESHOLD = 20
NEGATIVE_IMAGES_THRESHOLD = 20
N_SUPPORT = 10
N_QUERY = 10

In [12]:
CID_to_impath = {x.split("/")[-1][:-4].replace(".", "-"):x for x in glob.glob(f"{ZAPPOS_IMAGES_ROOT}/**/*.jpg", recursive=True)}

In [None]:
# main logic for sampling
n_selected = 0
dataset = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
counter = 0
import time
st = time.time()
while n_selected < N_SAMPLES:
    counter += 1
    # get attributes
    attrs = sample_attrs(ATTR_LIST)
    print(attrs)
    # if any pair of the sampled attributes are from the same category then reject
    if is_same_category(attrs):
        # print("Dropped -- same category", attrs)
        continue
    # get images which satisfy the attributes
    positive_images = get_all_positive_images(df, attrs)
    negative_images = positive_images.apply(lambda x: not x)
    assert all([(x != y) for x,y in zip(positive_images, negative_images)])
    # if we want to enforce 0s for all attributes then we need to change the logic above
    # to include get_all_negative_images(df, attrs)
    
    # if the no of images that are positive for the chosen attributes is too less, then reject 
    if (sum(positive_images) < POSITIVE_IMAGES_THRESHOLD) or (sum(negative_images) < NEGATIVE_IMAGES_THRESHOLD) :
        # print("Dropped -- didnt meet threshold", attrs)
        continue
    
    positive = [CID_to_impath[z] for z in sample_images(df, positive_images, N_SUPPORT+N_QUERY)]
    negative = [CID_to_impath[z] for z in sample_images(df, negative_images, N_SUPPORT+N_QUERY)]
    
    dataset[n_selected][0]['support'] = negative[:N_SUPPORT]
    dataset[n_selected][0]['query'] = negative[N_SUPPORT:]
    
    dataset[n_selected][1]['support'] = positive[:N_SUPPORT]
    dataset[n_selected][1]['query'] = positive[N_SUPPORT:]
    
    dataset[n_selected][0]['attributes'] = ";".join(["!"+a for a in attrs])
    dataset[n_selected][1]['attributes'] = ";".join(attrs)
    
    
    n_selected += 1
    print(f'Selected {n_selected}/{counter} Time Elapsed {time.time()-st}')
        

['ToeStyle.Moc Toe' 'ToeStyle.Pointed Toe']
['Gender.Women' 'SubCategory.Clogs.and.Mules']
Selected 1/2 Time Elapsed 0.7242496013641357
['ToeStyle.Capped Toe' 'SubCategory.Ankle']
Selected 2/3 Time Elapsed 1.4317758083343506
['SubCategory.Heels' 'ToeStyle.Pointed Toe']
Selected 3/4 Time Elapsed 2.172649621963501
['Closure.Slip.On' 'ToeStyle.Moc Toe']
Selected 4/5 Time Elapsed 2.9156758785247803
['HeelHeight.High.heel' 'SubCategory.Knee.High']
Selected 5/6 Time Elapsed 3.629976272583008
['Gender.Women' 'Category.Slippers']
Selected 6/7 Time Elapsed 4.352325201034546
['Gender.Boys' 'Closure.Hook.and.Loop']
Selected 7/8 Time Elapsed 5.058362007141113
['ToeStyle.Pointed Toe' 'SubCategory.Ankle']
Selected 8/9 Time Elapsed 5.774469614028931
['SubCategory.Clogs.and.Mules' 'ToeStyle.Capped Toe']
['SubCategory.Slipper.Flats' 'Closure.Zipper']
['Closure.Lace.up' 'ToeStyle.Moc Toe']
Selected 9/12 Time Elapsed 7.837586164474487
['ToeStyle.Open Toe' 'ToeStyle.Moc Toe']
['Category.Sandals' 'Gender.M

['SubCategory.Oxfords' 'Gender.Men']
Selected 71/114 Time Elapsed 66.71055030822754
['ToeStyle.Closed Toe' 'ToeStyle.Pointed Toe']
['Closure.Lace.up' 'HeelHeight.Short.heel']
Selected 72/116 Time Elapsed 67.43828463554382
['Closure.Zipper' 'HeelHeight.Short.heel']
Selected 73/117 Time Elapsed 68.19641137123108
['SubCategory.Oxfords' 'SubCategory.Ankle']
['Material.Rubber' 'Closure.Lace.up']
Selected 74/119 Time Elapsed 68.92776393890381
['Closure.Buckle' 'ToeStyle.Open Toe']
Selected 75/120 Time Elapsed 69.65725183486938
['Closure.Pull.on' 'Closure.Hook.and.Loop']
['SubCategory.Mid.Calf' 'SubCategory.Slipper.Flats']
['Gender.Girls' 'Closure.Hook.and.Loop']
Selected 76/123 Time Elapsed 70.38961172103882
['Material.Suede' 'Category.Shoes']
Selected 77/124 Time Elapsed 71.11578822135925
['HeelHeight.High.heel' 'SubCategory.Slipper.Flats']
['Category.Shoes' 'SubCategory.Sneakers.and.Athletic.Shoes']
Selected 78/126 Time Elapsed 72.52158665657043
['ToeStyle.Moc Toe' 'ToeStyle.Pointed Toe']


In [None]:
len(dataset)

In [None]:
json.dump(dataset, open(f'zappos-minImg{MIN_IMAGES_PER_ATTR}-alltrue_vs_anyfalse-nsamp{N_SAMPLES}-ns{N_SUPPORT}-nq{N_QUERY}.json', 'w'))