In [1]:
import json
import os
from pathlib import Path
import random
import shutil

from tqdm.notebook import tqdm


In [2]:
# Root directory of the dataset on disk. The default is the location this
# notebook was originally written against; set the SCREENSHOP_DATA_DIR
# environment variable to run it on another machine without editing the code.
BASE_DIR = Path(os.environ.get('SCREENSHOP_DATA_DIR', '/home/gabriel/data/screenshop_fashionpedia'))


In [3]:
# Location of the source annotation file, relative to the dataset root.
dataset_metadata_path = BASE_DIR.joinpath('annotations/instances_attributes_train2020.json')


In [4]:
# Load the whole annotation file into memory. JSON interchange text is UTF-8,
# so decode it explicitly instead of relying on the platform's locale default.
with open(dataset_metadata_path, encoding='utf-8') as f:
    dataset_metadata = json.load(f)

In [5]:
# Show the top-level keys of the loaded annotation dict.
dataset_metadata.keys()

dict_keys(['annotations', 'images', 'info', 'licenses', 'categories', 'attributes'])

In [6]:
# Supercategories to exclude: any category belonging to one of these is dropped.
bad_supercategories = (
    'garment parts',
    'closures',
    'decorations',
)

# Individual category names to exclude, regardless of their supercategory.
bad_categories = (
    'cape',
    'glove',
    'glasses',
    'scarf',
    'headband, head covering, hair accessory',
    'tie',
    'watch',
    'belt',
    'umbrella',
)


In [7]:
# IDs (as they appear in the source file) of every category that is neither in
# an excluded supercategory nor excluded by name.
valid_category_ids = {
    category['id']
    for category in dataset_metadata['categories']
    if not (
        category['supercategory'] in bad_supercategories
        or category['name'] in bad_categories
    )
}


In [8]:
# Make category IDs zero-indexed and sequential again.
#
# The old -> new mapping is built from the *sorted* original IDs so that it is
# deterministic: iterating a set directly gives no ordering guarantee.
new_category_ids = {cid: idx for idx, cid in enumerate(sorted(valid_category_ids))}

# Build fresh dicts for the kept categories instead of assigning to
# category['id'] in place. `list.copy()` is shallow, so in-place assignment
# would also rewrite the dicts held by dataset_metadata['categories'], and
# re-running this cell (or the one computing valid_category_ids) would then
# operate on already-remapped IDs.
new_categories = [
    {**category, 'id': new_category_ids[category['id']]}
    for category in dataset_metadata['categories']
    if category['id'] in valid_category_ids
]
    

In [9]:
# Report how many categories passed the filter.
print(f'Found {len(valid_category_ids)} valid categories...')

Found 18 valid categories...


In [10]:
# Keep only annotations whose original category ID passed the category filter.
kept_annotations = [
    ann for ann in dataset_metadata['annotations']
    if ann['category_id'] in valid_category_ids
]

# Copy the listed fields from each kept annotation and replace the category ID
# with its re-indexed value.
# TODO: must be one-indexed for D7, but not for Faster R-CNN
clean_annotations = [
    {
        'id': ann['id'],
        'image_id': ann['image_id'],
        'bbox': ann['bbox'],
        'area': ann['area'],
        'iscrowd': ann['iscrowd'],
        'category_id': new_category_ids[ann['category_id']],
    }
    for ann in kept_annotations
]

# IDs of all images that still have at least one annotation after filtering.
images_ids_with_annotation = {ann['image_id'] for ann in kept_annotations}

print(f'Found {len(clean_annotations)} valid annotations...')

Found 139992 valid annotations...


In [11]:
# image_exts = {}

# for image in (BASE_DIR / 'raw_images').glob('*'):
#     ext = image.name.split('.')[-1]
#     assert ext in ('jpg', 'png')
#     image_id = int(image.stem)
#     image_exts[image_id] = ext

In [12]:
# Build the list of image records to keep: an image must still have at least
# one annotation and its file must be present under raw_images/.
clean_images = []

for image in dataset_metadata['images']:
    image_id = image['id']

    # Images left without any annotation after category filtering are dropped.
    if image_id not in images_ids_with_annotation:
        continue

    # Files are looked up by zero-padded 12-digit image ID with a .jpg suffix.
    new_filename = f'{image_id:012d}.jpg'

    # Skip (and report) images whose file is missing on disk. This is the only
    # existence check needed: anything reaching the append below is known to
    # exist, so no further assertion is required.
    if not (BASE_DIR / 'raw_images' / new_filename).exists():
        print(f'Skipping problematic image {new_filename}...')
        continue

    clean_images.append({
        'id': image_id,
        'width': image['width'],
        'height': image['height'],
        'file_name': new_filename,
        'isstatic': image['isstatic'],
    })

print(f'Found {len(clean_images)} valid images...')


Skipping problematic image 000000009498.jpg...
Skipping problematic image 000000019373.jpg...
Skipping problematic image 000000012661.jpg...
Skipping problematic image 000000011740.jpg...
Skipping problematic image 000000015675.jpg...
Skipping problematic image 000000015635.jpg...
Skipping problematic image 000000009436.jpg...
Skipping problematic image 000000016851.jpg...
Skipping problematic image 000000012217.jpg...
Skipping problematic image 000000019832.jpg...
Skipping problematic image 000000008965.jpg...
Skipping problematic image 000000017454.jpg...
Skipping problematic image 000000020419.jpg...
Skipping problematic image 000000011042.jpg...
Skipping problematic image 000000012222.jpg...
Skipping problematic image 000000012610.jpg...
Skipping problematic image 000000019486.jpg...
Skipping problematic image 000000017083.jpg...
Skipping problematic image 000000009218.jpg...
Skipping problematic image 000000018498.jpg...
Skipping problematic image 000000010349.jpg...
Skipping prob

Skipping problematic image 000000023225.jpg...
Skipping problematic image 000000021302.jpg...
Skipping problematic image 000000025420.jpg...
Skipping problematic image 000000025433.jpg...
Skipping problematic image 000000025510.jpg...
Skipping problematic image 000000025535.jpg...
Skipping problematic image 000000025580.jpg...
Skipping problematic image 000000025702.jpg...
Skipping problematic image 000000025764.jpg...
Skipping problematic image 000000025769.jpg...
Skipping problematic image 000000025782.jpg...
Skipping problematic image 000000025828.jpg...
Skipping problematic image 000000026011.jpg...
Skipping problematic image 000000026112.jpg...
Skipping problematic image 000000026161.jpg...
Skipping problematic image 000000026328.jpg...
Skipping problematic image 000000026404.jpg...
Skipping problematic image 000000026462.jpg...
Skipping problematic image 000000026514.jpg...
Skipping problematic image 000000026660.jpg...
Skipping problematic image 000000026872.jpg...
Skipping prob

Skipping problematic image 000000030629.jpg...
Skipping problematic image 000000031070.jpg...
Skipping problematic image 000000031244.jpg...
Skipping problematic image 000000031470.jpg...
Skipping problematic image 000000031796.jpg...
Skipping problematic image 000000031892.jpg...
Skipping problematic image 000000031949.jpg...
Skipping problematic image 000000031982.jpg...
Skipping problematic image 000000032544.jpg...
Skipping problematic image 000000033554.jpg...
Skipping problematic image 000000033708.jpg...
Skipping problematic image 000000033954.jpg...
Skipping problematic image 000000034177.jpg...
Skipping problematic image 000000034388.jpg...
Skipping problematic image 000000034494.jpg...
Skipping problematic image 000000035134.jpg...
Skipping problematic image 000000035601.jpg...
Skipping problematic image 000000035750.jpg...
Skipping problematic image 000000035833.jpg...
Skipping problematic image 000000036135.jpg...
Skipping problematic image 000000036480.jpg...
Skipping prob

In [13]:
# Randomly split the kept image IDs into a train set and a validation set.
#
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts (and leaves the global `random` state untouched); without it every
# run would produce a different train/val partition.
SPLIT_SEED = 42
# Number of images assigned to the training set; the remainder goes to validation.
TRAIN_SET_SIZE = 38_000

ids = [image['id'] for image in clean_images]
random.Random(SPLIT_SEED).shuffle(ids)

train_ids = set(ids[:TRAIN_SET_SIZE])
val_ids = set(ids[TRAIN_SET_SIZE:])

In [14]:
# Partition the image records according to the ID split, preserving the
# order in which they appear in clean_images.
train_images = []
val_images = []

for image_record in clean_images:
    if image_record['id'] in train_ids:
        train_images.append(image_record)
    if image_record['id'] in val_ids:
        val_images.append(image_record)

print(len(train_images), len(val_images))

38000 7155


In [15]:
# Start each run with empty output directories: remove any existing one, then
# create it afresh.
for split_dir_name in ('screenshop_train2019', 'screenshop_val2019'):
    split_dir = BASE_DIR / split_dir_name
    if split_dir.exists():
        shutil.rmtree(split_dir)
    os.makedirs(split_dir, exist_ok=True)


In [16]:
# Copy every kept image from raw_images/ into the directory of its split.
for image_record in tqdm(clean_images):
    file_name = image_record['file_name']

    # Anything not in the train split goes to the validation directory.
    if image_record['id'] in train_ids:
        target_dir = BASE_DIR / 'screenshop_train2019'
    else:
        target_dir = BASE_DIR / 'screenshop_val2019'

    shutil.copyfile(BASE_DIR / 'raw_images' / file_name, target_dir / file_name)


HBox(children=(FloatProgress(value=0.0, max=45155.0), HTML(value='')))




In [17]:
def _annotations_for(image_ids):
    """Return the cleaned annotations whose image_id is in `image_ids`."""
    return [ann for ann in clean_annotations if ann['image_id'] in image_ids]


# Split the annotations the same way the images were split.
train_annotations = _annotations_for(train_ids)
val_annotations = _annotations_for(val_ids)

print(len(train_annotations), len(val_annotations))

117077 22143


In [18]:
# Assemble one metadata dict per split. Both splits share the same
# re-indexed category list.
train_metadata = dict(
    annotations=train_annotations,
    images=train_images,
    categories=new_categories,
)

val_metadata = dict(
    annotations=val_annotations,
    images=val_images,
    categories=new_categories,
)


In [19]:
# Write the training-split metadata as JSON next to the source annotations.
train_metadata_path = BASE_DIR / 'annotations/instances_screenshop_train2019.json'
with train_metadata_path.open('w') as out_file:
    json.dump(train_metadata, out_file)

In [20]:
# Write the validation-split metadata as JSON next to the source annotations.
val_metadata_path = BASE_DIR / 'annotations/instances_screenshop_val2019.json'
with val_metadata_path.open('w') as out_file:
    json.dump(val_metadata, out_file)