### Settings

In [4]:
import os
import json
import os.path as op

from seal.data.utils import save_json, load_json, get_stat

# Directory holding the original VAW annotation files (inputs of this notebook).
VAW_DIR = '../data/VAW/data'
# Directory that receives the cleaned annotation files written at the end of this notebook.
VAW_CLEAN_DIR = '../data/VAW/clean_data'

# makedirs(exist_ok=True) also creates missing parent directories and is a no-op when the
# directory already exists, so no separate existence check is needed.
os.makedirs(VAW_CLEAN_DIR, exist_ok=True)

# Box-size thresholds for the "Filter Small Objects" step. Despite the MAX_ prefix they act as
# minimums: instances are kept only if instance_bbox[2] >= MAX_BOX_W and instance_bbox[3] >= MAX_BOX_H.
MAX_BOX_W = 50
MAX_BOX_H = 50

### 1 Load VAW

In [5]:
# Read the train/val/test annotation files from VAW_DIR; the training annotations come in two parts.
data_train = load_json(f'{VAW_DIR}/train_part1.json') + load_json(f'{VAW_DIR}/train_part2.json')
data_val = load_json(f'{VAW_DIR}/val.json')
data_test = load_json(f'{VAW_DIR}/test.json')

# Pool all splits so the cleaning steps below operate on the whole dataset at once.
all_data = data_train + data_val + data_test

# Statistics of the raw, unfiltered data.
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(all_data)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

# atts: 620
# objs: 2260
# pairs: 34873
# imgs: 260895


### 2 Clean VAW

#### 2.1 Filter Small Objects

In [6]:
# Keep only instances whose box is at least MAX_BOX_W x MAX_BOX_H
# (assumes instance_bbox is laid out as [x, y, w, h] — TODO confirm).
large_enough = []
for ins in all_data:
    box = ins['instance_bbox']
    if box[2] >= MAX_BOX_W and box[3] >= MAX_BOX_H:
        large_enough.append(ins)
all_data = large_enough

(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(all_data)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

# Write the filtered data next to the raw files, but only if all.json is not there yet:
# an existing all.json is left untouched.
all_json_path = op.join(VAW_DIR, 'all.json')
if not op.exists(all_json_path):
    save_json(all_json_path, all_data)

# atts: 620
# objs: 2115
# pairs: 32402
# imgs: 197185


#### 2.2 Merge object categories with similar meaning (following GraphEmb)

In [7]:
from tqdm import tqdm
from nltk.corpus import wordnet as wn

from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, reused by similar() to reduce object names to their noun lemma.
wnl = WordNetLemmatizer()

def get_ss_name(obj):
    """Return the name of the first WordNet synset found for `obj`.

    When WordNet has no synset for `obj`, `obj` itself is returned instead.
    """
    candidates = wn.synsets(obj)
    return candidates[0].name() if candidates else obj

def similar(obj, group):
    """Tell whether `obj` may belong to `group`.

    `group` is an iterable of (object_name, count) pairs; the count is ignored here.
    `obj` matches as soon as one member has the same first WordNet synset name
    (see get_ss_name) or the same noun lemma, which covers plural forms.
    """
    target_ss = get_ss_name(obj)
    target_lemma = wnl.lemmatize(obj, "n")
    return any(
        target_ss == get_ss_name(member) or target_lemma == wnl.lemmatize(member, "n")
        for member, _count in group
    )

# groups: list of groups; each group is a list of (object_name, count) tuples.
# obj2group: maps an object name to an index into `groups`.
groups = []
obj2group = {}

# Greedy grouping: each object name is compared with the groups built so far, so the
# result depends on the iteration order of cnt_obj.
# (cnt_obj presumably maps object name -> count, as returned by get_stat — verify.)
for obj in tqdm(cnt_obj):
    found = False
    for i, gr in enumerate(groups):
        if similar(obj, gr):
            found = True
            gr.append((obj, cnt_obj[obj]))
            obj2group[obj] = i
            # NOTE(review): there is no `break` here, so an object that matches several groups
            # is appended to each of them and obj2group keeps the index of the LAST match.
            # Confirm this is intended; adding a break would change the resulting groups.
    if not found:
        # No existing group matched: start a new group holding only this object.
        groups.append([(obj, cnt_obj[obj])])
        obj2group[obj] = len(groups) - 1

100%|██████████| 2115/2115 [00:51<00:00, 41.32it/s]


Start renaming each group with its most representative object category

In [8]:
def _most_frequent_name(group):
    """Return the name with the strictly highest count in `group`.

    `group` is a list of (object_name, count) tuples. On ties the first name seen wins;
    None is returned when no count is greater than 0.
    """
    best_name, best_count = None, 0
    for name, count in group:
        if count > best_count:
            best_name, best_count = name, count
    return best_name


# The groups are not modified while renaming, so each representative is computed once per
# group instead of once per instance.
group_representative = [_most_frequent_name(gr) for gr in groups]

# Replace every instance's object name with the representative of the group it was assigned to.
for ins in all_data:
    ins['object_name'] = group_representative[obj2group[ins['object_name']]]

In [9]:
# Preview: show the first 10 groups that contain more than two object names.
preview = [gr for gr in groups if len(gr) > 2][:10]
for gr in preview:
    print(gr)

[('floor', 1819), ('flooring', 72), ('floors', 36)]
[('railing', 342), ('rail', 158), ('rails', 47), ('railings', 31), ('runway', 42)]
[('shrubs', 57), ('bushes', 317), ('shrub', 69), ('bush', 620)]
[('place', 5), ('spot', 100), ('spots', 91)]
[('rocks', 245), ('stone', 107), ('rock', 413), ('stones', 62)]
[('car', 1827), ('cars', 199), ('automobile', 1)]
[('patch', 86), ('patches', 10), ('speckles', 1)]
[('motorcycle', 540), ('bike', 396), ('bikes', 24), ('motorcycles', 28)]
[('phone', 133), ('telephone', 11), ('phones', 1)]
[('airplane', 354), ('plane', 596), ('airplanes', 18), ('aeroplane', 1), ('planes', 41)]


In [10]:
# Recompute the statistics now that object names have been merged.
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(all_data)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

# atts: 620
# objs: 1765
# pairs: 27840
# imgs: 197185


Replace 't-shirt' and 'tee shirt' with 't shirt'.


Why 't shirt'? Because 't shirt' can be indexed by WordNet when we want to merge these shirt objects together.

In [11]:
# Spellings that are normalised to 't shirt'.
t_shirt_aliases = ('t-shirt', 'tee shirt')

n = 0
for ins in all_data:
    if ins['object_name'] in t_shirt_aliases:
        ins['object_name'] = 't shirt'
        n += 1
# n counts the renamed instances.
print(f'# renamed imgs: {n}')

# renamed imgs: 419


In [12]:
# Recompute the statistics after normalising the t-shirt spellings.
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(all_data)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

# atts: 620
# objs: 1763
# pairs: 27810
# imgs: 197185


#### 2.3 Remove bad attributes.

In [13]:
# Attributes that are dropped from both the positive and the negative annotations.
bad_attrs = {
    'light colored', 'dark colored', 'extended', 'close', 'blurry', 'still', 'stopped', # difficult or not relevant to intrinsic properties
    'dark skinned', 'light skinned', 'asian', 'caucasian', # sensitive
    'male', 'female', # sensitive
    'worn', 'printed', 'waiting', 'daytime', 'used', 'wild', 'lined', # noisy or requires too much context
    'lined up', 'interior', 'displayed', 'in the background', 'far away', 'for sale', 'out of focus', 'turning', # noisy or requires too much context
    'water' # not attribute
}

# Number of positive / negative annotations removed over the whole dataset.
n_pos = 0
n_neg = 0

for ins in all_data:
    # One filtering pass per list is enough: the filter does not depend on position, so
    # repeating it once per element (as an index loop would) removes nothing more.
    kept_pos = [attr for attr in ins['positive_attributes'] if attr not in bad_attrs]
    n_pos += len(ins['positive_attributes']) - len(kept_pos)
    ins['positive_attributes'] = kept_pos

    kept_neg = [attr for attr in ins['negative_attributes'] if attr not in bad_attrs]
    n_neg += len(ins['negative_attributes']) - len(kept_neg)
    ins['negative_attributes'] = kept_neg


print(f'Remove {n_pos} positive annotations')
print(f'Remove {n_neg} negative annotations')

Remove 10871 positive annotations
Remove 17344 negative annotations


In [14]:
# Recompute the statistics after the bad attributes have been removed.
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(all_data)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

# atts: 591
# objs: 1763
# pairs: 26382
# imgs: 197185


#### 2.4 Reform relevant metadata files.

In [15]:
# Load the two attribute metadata files that ship with the raw annotations.
# NOTE(review): attribute_parent_types is not referenced again in the visible part of this
# notebook; only attribute_types is pruned and saved.
attribute_parent_types, attribute_types = (
    load_json(op.join(VAW_DIR, metadata_file))
    for metadata_file in ('attribute_parent_types.json', 'attribute_types.json')
)

#### 2.4.1 Check whether the two metadata files list attributes that are not in the data

In [16]:
# Collect every attribute and object name that still occurs in the cleaned data.
attribute_set = set()
object_set = set()
for data in all_data:
    attribute_set.update(data['positive_attributes'])
    attribute_set.update(data['negative_attributes'])
    object_set.add(data['object_name'])

# attribute_flag is keyed by the attributes present in the data (in sorted order); a flag
# turns True once that attribute is found under at least one type in attribute_types.
attribute_flag = dict.fromkeys(sorted(attribute_set), False)

# num_not_exists counts attribute_types entries (once per type they are listed under) that no
# longer occur in the data.
num_not_exists = 0
for attribute_type, attribute_names in attribute_types.items():
    for attribute_name in attribute_names:
        if attribute_name in attribute_flag:
            attribute_flag[attribute_name] = True
        else:
            num_not_exists += 1
print('num redundant attributes: {}'.format(num_not_exists))

# The reverse direction: attributes that ARE in the cleaned data but are not listed under any
# type. The message says so explicitly; labelling this list "redundant ... not in the cleaned
# data" would describe the opposite set.
attribute_not_exist = [k for k, v in attribute_flag.items() if not v]
print('attributes in the cleaned data that are missing from attribute_types:')
print(attribute_not_exist)

num redundant attributes: 76
redundant attributes not in the cleaned data:
['abandoned', 'almost empty', 'backless', 'being used', 'bell shaped', 'bushy', 'chipped', 'crossing arms', 'diagonal', 'dressed', 'holed', 'muscular', 'neat', 'numbered', 'nylon', 'overgrown', 'partially visible', 'perched', 'pulled back', 'raising arm', 'ridged', 'side view', 'staring', 'translucent', 'turned off', 'unmade', 'using phone', 'wearing bow tie', 'wearing headband', 'wheeled', 'white framed', 'wool']


Delete the redundant attributes

In [17]:
# Keep, for every type, only the attributes that are keys of attribute_flag, and remember the
# types that end up empty. Existing keys are reassigned in place, which is safe while iterating.
del_types = []
for type_name, type_attributes in attribute_types.items():
    kept_attributes = [name for name in type_attributes if name in attribute_flag]
    attribute_types[type_name] = kept_attributes
    if not kept_attributes:
        del_types.append(type_name)
    # type name, size before pruning, size after pruning
    print(type_name, len(type_attributes), len(kept_attributes))

# Entries are deleted only after the iteration above has finished.
for empty_type in del_types:
    del attribute_types[empty_type]

color 61 58
letter color 3 3
hair color 6 6
skin color 2 0
wearing color 7 7
tone 4 1
color quantity 6 6
brightness 2 2
height 4 4
length 2 2
width 5 4
fatness 5 5
size 11 9
thickness 2 2
depth 2 2
size comparison 2 0
material 52 48
shape 25 25
orientation 6 6
pattern 14 13
closeness 2 0
face expression 13 13
hand movement 4 4
pose 14 13
activity 33 28
sport activity 11 11
face pose 4 3
weather 10 9
location 2 2
place 6 4
maturity 5 5
newness 3 3
cleanliness 4 4
hardness 2 2
weight 2 1
race 2 0
opaqeness 3 3
gender 2 0
texture 7 7
state 11 11
wearing accessories 6 6
other 285 244


In [18]:
# Assign consecutive integer ids following the sorted order of the names.
attribute_index = {name: position for position, name in enumerate(sorted(attribute_set))}
object_index = {name: position for position, name in enumerate(sorted(object_set))}

#### 2.5 Resplit all data to train, val, test set

In [19]:
import copy
# Deep copy, so exp_data and the splits derived from it share no instance dicts with all_data.
exp_data = copy.deepcopy(all_data)
# Prints the number of entries in exp_data (the label says "imgs").
print(f'# imgs before: {len(exp_data)}')

# imgs before: 197185


In [20]:
def find_id_set(data):
    """Return the set of 'instance_id' values found in the iterable of instance dicts `data`."""
    return {ins['instance_id'] for ins in data}

# Instance ids of the original splits, used below to put every cleaned instance back into its split.
train_id, val_id, test_id = (
    find_id_set(original_split)
    for original_split in (data_train, data_val, data_test)
)

In [21]:
# Route every cleaned instance to the split whose id set contains its instance_id.
exp_train, exp_val, exp_test = (
    [ins for ins in exp_data if ins['instance_id'] in split_ids]
    for split_ids in (train_id, val_id, test_id)
)

In [22]:
# Report the statistics of each split; after the loop the cnt_* variables hold the test-set values.
for split_title, split_data in (
    ('train set', exp_train),
    ('val set', exp_val),
    ('test set', exp_test),
):
    (cnt_attr, cnt_obj, cnt_pair,
     cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(split_data)
    print(
        split_title,
        f'# atts: {len(cnt_attr)}',
        f'# objs: {len(cnt_obj)}',
        f'# pairs: {len(cnt_pair)}',
        f'# imgs: {n_images}',
        sep='\n',
    )


train set
# atts: 591
# objs: 1642
# pairs: 22709
# imgs: 163651
val set
# atts: 576
# objs: 766
# pairs: 6950
# imgs: 9180
test set
# atts: 591
# objs: 977
# pairs: 12697
# imgs: 24354


Delete the samples that have no positive labels left after some attributes were removed.

In [23]:
# Keep only samples that still have at least one positive attribute.
# exp_data itself is not filtered here.
exp_train, exp_val, exp_test = (
    [ins for ins in split_data if len(ins['positive_attributes']) != 0]
    for split_data in (exp_train, exp_val, exp_test)
)

In [24]:
# Statistics of each split after dropping the samples without positive attributes.
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(exp_train)
print(
    'train set',
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

print('val set')
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(exp_val)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

print('test set')
(cnt_attr, cnt_obj, cnt_pair,
 cooc, obj_afford, obj_afford_cooc, n_images) = get_stat(exp_test)
print(
    f'# atts: {len(cnt_attr)}',
    f'# objs: {len(cnt_obj)}',
    f'# pairs: {len(cnt_pair)}',
    f'# imgs: {n_images}',
    sep='\n',
)

train set
# atts: 591
# objs: 1619
# pairs: 22709
# imgs: 141104
val set
# atts: 576
# objs: 750
# pairs: 6950
# imgs: 8221
test set
# atts: 591
# objs: 955
# pairs: 12697
# imgs: 21082


Save the new data.

In [25]:
# All cleaned files are written into the clean-data directory.
save_dir = VAW_CLEAN_DIR

def save_json(filename, data):
    """Write `data` to `filename` as JSON indented by 4 spaces and return `data` unchanged.

    NOTE(review): this definition rebinds the name `save_json` that the Settings cell imports
    from seal.data.utils, so calls executed after this cell use this version.
    """
    with open(filename, mode='w') as out_file:
        json.dump(data, out_file, indent=4)
    return data
# (file name, payload) pairs, written to save_dir in this order.
outputs = (
    ('all.json', exp_data),
    ('train.json', exp_train),
    ('val.json', exp_val),
    ('test.json', exp_test),
    ('attribute_index.json', attribute_index),
    ('object_index.json', object_index),
    ('attribute_types.json', attribute_types),
)
for output_name, payload in outputs:
    save_json(op.join(save_dir, output_name), payload)

"Finish!"

'Finish!'