In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import tqdm
import random
import glob
import json
import networkx as nx

In [2]:
# Dataset locations. The image directory is derived from ZAPPOS_ROOT so that
# relocating the dataset only requires editing a single path.
# ZAPPOS_ROOT keeps its trailing slash, so the derived value is unchanged.
ZAPPOS_ROOT = '/home/amrith/zappos-50k/'
ZAPPOS_IMAGES_ROOT = f'{ZAPPOS_ROOT}ut-zap50k-images-square'

In [3]:
# Load the attribute table shipped as `meta-data-bin.csv` under ZAPPOS_ROOT.
meta_csv_path = f'{ZAPPOS_ROOT}/ut-zap50k-data/meta-data-bin.csv'
df = pd.read_csv(meta_csv_path)

# Derived column: the two tallest heel-height columns added together.
heel_4in = df['HeelHeight.4in...4.3.4in']
heel_5in_plus = df['HeelHeight.5in...over']
df['HeelHeight.High.heel'] = heel_4in + heel_5in_plus

# Derived column: the three lowest heel-height columns added together.
# The 2in and 3in heel-height columns are not part of either derived column.
heel_flat = df['HeelHeight.Flat']
heel_under_1in = df['HeelHeight.Under.1in']
heel_1in = df['HeelHeight.1in...1.3.4in']
df['HeelHeight.Short.heel'] = heel_flat + heel_under_1in + heel_1in

In [4]:
# Sanity check: (rows, columns) of the table held in `df`.
df.shape

(50025, 154)

In [5]:
# Show every column name of `df` in sorted order.
column_names = sorted(df.columns)
print(column_names)

['CID', 'Category.Boots', 'Category.Sandals', 'Category.Shoes', 'Category.Slippers', 'Closure.Adjustable', 'Closure.Ankle.Strap', 'Closure.Ankle.Wrap', 'Closure.Belt', 'Closure.Buckle', 'Closure.Bungee', 'Closure.Button.Loop', 'Closure.Elastic.Gore', 'Closure.Hook.and.Loop', 'Closure.Lace.up', 'Closure.Monk.Strap', 'Closure.Pull.on', 'Closure.Sling.Back', 'Closure.Slip.On', 'Closure.Snap', 'Closure.Spat.Strap', 'Closure.T.Strap', 'Closure.Toggle', 'Closure.Zipper', 'Gender.Boys', 'Gender.Girls', 'Gender.Men', 'Gender.Women', 'HeelHeight.1in...1.3.4in', 'HeelHeight.2in...2.3.4in', 'HeelHeight.3in...3.3.4in', 'HeelHeight.4in...4.3.4in', 'HeelHeight.5in...over', 'HeelHeight.Flat', 'HeelHeight.High.heel', 'HeelHeight.Short.heel', 'HeelHeight.Under.1in', 'Insole.EVA', 'Insole.Gel', 'Insole.Hypoallergenic', 'Insole.Latex.Lined', 'Insole.Leather', 'Insole.Memory.Foam', 'Insole.Moisture.Wicking', 'Insole.Orthotic.Friendly', 'Insole.Padded', 'Insole.Polyurethane', 'Insole.Poron', 'Insole.Remova

In [6]:
# Attribute list
# Two hand-picked attribute groups (column names of `df`), laid out one
# attribute family per line.
ATTR_LIST_1 = [
    'Category.Shoes', 'Category.Sandals',
    'SubCategory.Oxfords', 'SubCategory.Heel', 'SubCategory.Boot',
    'SubCategory.Slipper.Flats', 'SubCategory.Flats', 'SubCategory.Slipper.Heels',
    'SubCategory.Athletic', 'SubCategory.Knee.High', 'SubCategory.Crib.Shoes',
    'SubCategory.Over.the.Knee',
    'HeelHeight.High.heel', 'HeelHeight.Short.heel',
    'Closure.Pull.on', 'Closure.Ankle.Strap', 'Closure.Zipper',
    'Closure.Elastic.Gore', 'Closure.Sling.Back', 'Closure.Toggle',
    'Closure.Snap', 'Closure.T.Strap', 'Closure.Spat.Strap',
    'Gender.Men', 'Gender.Boys',
    'Material.Rubber', 'Material.Wool', 'Material.Silk',
    'Material.Aluminum', 'Material.Plastic',
    'ToeStyle.Capped Toe', 'ToeStyle.Square Toe', 'ToeStyle.Snub Toe',
    'ToeStyle.Bicycle Toe', 'ToeStyle.Open Toe', 'ToeStyle.Pointed Toe',
    'ToeStyle.Almond', 'ToeStyle.Apron Toe', 'ToeStyle.Snip Toe',
    'ToeStyle.Medallion',
]
ATTR_LIST_2 = [
    'Category.Boots', 'Category.Slippers',
    'SubCategory.Mid.Calf', 'SubCategory.Ankle', 'SubCategory.Loafers',
    'SubCategory.Boat.Shoes', 'SubCategory.Clogs.and.Mules',
    'SubCategory.Sneakers.and.Athletic.Shoes', 'SubCategory.Heels',
    'SubCategory.Prewalker', 'SubCategory.Prewalker.Boots',
    'SubCategory.Firstwalker',
    'Closure.Lace.up', 'Closure.Buckle', 'Closure.Hook.and.Loop',
    'Closure.Slip.On', 'Closure.Ankle.Wrap', 'Closure.Bungee',
    'Closure.Adjustable', 'Closure.Button.Loop', 'Closure.Monk.Strap',
    'Closure.Belt',
    'Gender.Women', 'Gender.Girls',
    'Material.Suede', 'Material.Snakeskin', 'Material.Corduroy',
    'Material.Horse.Hair', 'Material.Stingray',
    'ToeStyle.Round Toe', 'ToeStyle.Closed Toe', 'ToeStyle.Moc Toe',
    'ToeStyle.Wingtip', 'ToeStyle.Center Seam', 'ToeStyle.Algonquin',
    'ToeStyle.Bump Toe', 'ToeStyle.Wide Toe Box', 'ToeStyle.Peep Toe',
]
# De-duplicated union of both groups in sorted order.
FULL_ATTR_LIST = sorted(set(ATTR_LIST_1) | set(ATTR_LIST_2))
# Print any selected attribute that is not a column of `df`, then the number
# of selected attributes.
missing_columns = [name for name in FULL_ATTR_LIST if name not in df.columns]
for name in missing_columns:
    print(name)
print(len(FULL_ATTR_LIST))

78


In [7]:
# Column total for each selected attribute.
# Uses the vectorised DataFrame.sum instead of applying Python's builtin `sum`
# column by column. NOTE(review): DataFrame.sum skips NaN by default, whereas
# builtin `sum` would propagate it; the integer totals shown in this cell's
# output suggest these columns contain no NaN — confirm.
attr_totals = df[FULL_ATTR_LIST].sum(axis=0)
for attr_name, total in zip(FULL_ATTR_LIST, attr_totals):
    print(f'{attr_name} \t\t {total}')

Category.Boots 		 12832
Category.Sandals 		 5741
Category.Shoes 		 30169
Category.Slippers 		 1283
Closure.Adjustable 		 1
Closure.Ankle.Strap 		 1425
Closure.Ankle.Wrap 		 102
Closure.Belt 		 8
Closure.Buckle 		 3648
Closure.Bungee 		 640
Closure.Button.Loop 		 111
Closure.Elastic.Gore 		 1741
Closure.Hook.and.Loop 		 4853
Closure.Lace.up 		 16221
Closure.Monk.Strap 		 84
Closure.Pull.on 		 4618
Closure.Sling.Back 		 482
Closure.Slip.On 		 15725
Closure.Snap 		 70
Closure.Spat.Strap 		 20
Closure.T.Strap 		 28
Closure.Toggle 		 410
Closure.Zipper 		 5063
Gender.Boys 		 4495
Gender.Girls 		 6550
Gender.Men 		 13337
Gender.Women 		 28188
HeelHeight.High.heel 		 3527
HeelHeight.Short.heel 		 19605
Material.Aluminum 		 14
Material.Corduroy 		 68
Material.Horse.Hair 		 4
Material.Plastic 		 3
Material.Rubber 		 6830
Material.Silk 		 28
Material.Snakeskin 		 13
Material.Stingray 		 4
Material.Suede 		 8222
Material.Wool 		 434
SubCategory.Ankle 		 5855
SubCategory.Athletic 		 14
SubCategory

In [8]:
def is_same_category(attrs):
    """Tell whether at least two of the given attributes share a category.

    An attribute's category is the text before its first ".", compared
    case-insensitively (e.g. "closure" for "Closure.Zipper").

    Args:
        attrs: sized collection of attribute-name strings.

    Returns:
        True when there are fewer distinct categories than attributes,
        otherwise False.
    """
    categories = {name.lower().split(".")[0] for name in attrs}
    return len(attrs) > len(categories)
    
def get_all_positive_images(df, attrs):
    """Return a boolean row mask of images that carry every attribute in `attrs`.

    A row is positive only when each of the listed columns is strictly
    greater than zero. This replaces the earlier "row sum == number of
    attributes" test, which misclassified rows whenever a column held a value
    other than 0/1 (e.g. 2 + 0 == 2). Columns built by adding indicator
    columns together can exceed 1 if the source indicators are not mutually
    exclusive. The check is vectorised rather than applying Python's builtin
    `sum` to each row.

    Args:
        df: DataFrame with one numeric (or boolean) column per attribute.
        attrs: list of column names that must all be positive.

    Returns:
        Boolean Series aligned with `df.index`; True where every column in
        `attrs` is > 0.
    """
    return df[attrs].gt(0).all(axis=1)

In [14]:
from tqdm import tqdm
import concurrent.futures
from multiprocessing import Pool

# Every unordered pair of distinct attributes. Each pair is stored as a sorted
# two-element list so that the same two attributes always give the same key.
attr_pairs = [
    sorted((first_attr, second_attr))
    for position, first_attr in enumerate(FULL_ATTR_LIST)
    for second_attr in FULL_ATTR_LIST[position + 1:]
]

# Number of attribute pairs generated.
print(len(attr_pairs))

def get_positive_images_count(attrs):
    """Count the rows of the module-level `df` that are positive for all `attrs`.

    Args:
        attrs: list of attribute (column) names.

    Returns:
        0 when `is_same_category(attrs)` is true (such pairs get zero weight);
        otherwise the number of True entries in the mask returned by
        `get_all_positive_images(df, attrs)`.
    """
    if not is_same_category(attrs):
        return get_all_positive_images(df, attrs).sum()
    # Attributes from the same category get weight 0.
    return 0

# Compute the co-occurrence count of every attribute pair in parallel.
# The pool is used as a context manager so its worker processes are shut down
# once the (blocking) map has returned, instead of lingering for the lifetime
# of the kernel. `counts` is in the same order as `attr_pairs`.
with Pool() as pool:
    counts = pool.map(get_positive_images_count, attr_pairs)

# Map "attrA;attrB" -> count for every attribute pair; keys that were never
# stored default to 0 through defaultdict(int).
attr_pair_weight = defaultdict(int)
attr_pair_weight.update(
    (";".join(pair), pair_count) for pair, pair_count in zip(attr_pairs, counts)
)

3003


In [15]:
# Display the full pair -> weight mapping (keys are "attrA;attrB" strings).
attr_pair_weight

defaultdict(int,
            {'Category.Boots;Category.Sandals': 0,
             'Category.Boots;Category.Shoes': 0,
             'Category.Boots;Category.Slippers': 0,
             'Category.Boots;Closure.Adjustable': 0,
             'Category.Boots;Closure.Ankle.Strap': 0,
             'Category.Boots;Closure.Ankle.Wrap': 0,
             'Category.Boots;Closure.Belt': 8,
             'Category.Boots;Closure.Buckle': 370,
             'Category.Boots;Closure.Bungee': 248,
             'Category.Boots;Closure.Button.Loop': 82,
             'Category.Boots;Closure.Elastic.Gore': 473,
             'Category.Boots;Closure.Hook.and.Loop': 429,
             'Category.Boots;Closure.Lace.up': 3613,
             'Category.Boots;Closure.Monk.Strap': 0,
             'Category.Boots;Closure.Pull.on': 4496,
             'Category.Boots;Closure.Sling.Back': 0,
             'Category.Boots;Closure.Slip.On': 16,
             'Category.Boots;Closure.Snap': 45,
             'Category.Boots;Closure.Spat

In [16]:
# Build an undirected attribute graph: one node per attribute and one edge per
# attribute pair, weighted by the pair's count.
# NOTE(review): zero-weight pairs are added as edges too, so every pair of
# attributes is connected — confirm whether zero-weight edges should be skipped.
G = nx.Graph()
for a_pair, wt in attr_pair_weight.items():
    a_1, a_2 = a_pair.split(";")
    G.add_edge(a_1, a_2, weight=wt)
# number_connected_components is a module-level networkx function, not a
# method of Graph, so it must be called as nx.number_connected_components(G).
print("no of connected components", nx.number_connected_components(G))

NameError: name 'nx' is not defined