In [11]:
import json
import os
from pathlib import Path
import random
import shutil

from tqdm.notebook import tqdm


In [2]:
# Annotation JSON that the rest of the notebook loads and re-splits.
dataset_metadata_path = Path(
    '/home/gabriel/data/screenshop_fashionpedia/annotations/instances_attributes_train2020.json'
)


In [3]:
# Load the whole annotation file into memory.
# The file is decoded explicitly as UTF-8 so that parsing does not depend on
# the machine's locale encoding (the default for text-mode open()).
with open(dataset_metadata_path, encoding='utf-8') as f:
    dataset_metadata = json.load(f)

In [4]:
# Display the top-level sections of the loaded annotation file.
dataset_metadata.keys()

dict_keys(['annotations', 'images', 'info', 'licenses', 'categories', 'attributes'])

In [5]:
# Map every raw image's integer id (parsed from its file stem) to its file
# extension, so file names can later be rebuilt from the ids in the metadata.
raw_images_dir = Path('/home/gabriel/data/screenshop_fashionpedia/raw_images')
image_exts = {}

for image in raw_images_dir.glob('*'):
    # For well-formed `<id>.<ext>` names this equals name.split('.')[-1].
    ext = image.suffix[1:]
    assert ext in ('jpg', 'png'), f'unexpected entry in raw image directory: {image}'
    image_id = int(image.stem)
    # Two files resolving to the same id (e.g. `1.jpg` and `1.png`) would
    # otherwise overwrite each other silently, making the stored extension
    # depend on directory listing order.
    assert image_id not in image_exts, f'duplicate image id {image_id}: {image}'
    image_exts[image_id] = ext

In [6]:
# Build a reduced record for every image listed in the metadata. Each file is
# named by its id zero-padded to 12 digits plus the extension found on disk.
raw_images_dir = Path('/home/gabriel/data/screenshop_fashionpedia/raw_images')
clean_images = []

for source_image in dataset_metadata['images']:
    image_id = source_image["id"]
    extension = image_exts[image_id]
    new_filename = f'{image_id:012d}.{extension}'

    # The file must already exist under its padded name in the raw directory.
    expected_file = raw_images_dir / new_filename
    assert expected_file.exists(), source_image

    reduced_image = dict(
        id=source_image['id'],
        width=source_image['width'],
        height=source_image['height'],
        file_name=new_filename,
        isstatic=source_image['isstatic'],
    )
    clean_images.append(reduced_image)


In [42]:
# Reduce every annotation to the six fields below; all other fields of the
# source annotation are dropped. Category ids are shifted up by one.
clean_annotations = [
    {
        'id': source_annotation['id'],
        'image_id': source_annotation['image_id'],
        'bbox': source_annotation['bbox'],
        'area': source_annotation['area'],
        'iscrowd': source_annotation['iscrowd'],
        'category_id': source_annotation['category_id'] + 1,  # must be one-indexed
    }
    for source_annotation in dataset_metadata['annotations']
]


In [26]:
# Randomly split the image ids into a training set and a validation set.
SPLIT_SEED = 0             # fixed so that Restart & Run All reproduces the same split
NUM_TRAIN_IMAGES = 38_000  # all remaining images form the validation set

ids = [image['id'] for image in clean_images]
assert len(ids) > NUM_TRAIN_IMAGES, (
    f'only {len(ids)} images available; the validation set would be empty'
)

# A dedicated Random instance makes the shuffle deterministic without
# changing the state of the module-level random generator.
random.Random(SPLIT_SEED).shuffle(ids)

train_ids = set(ids[:NUM_TRAIN_IMAGES])
val_ids = set(ids[NUM_TRAIN_IMAGES:])

In [43]:
# Partition the cleaned image records according to the id split, keeping the
# original record order inside each partition.
train_images = []
val_images = []

for record in clean_images:
    record_id = record['id']
    if record_id in train_ids:
        train_images.append(record)
    if record_id in val_ids:
        val_images.append(record)

print(len(train_images), len(val_images))

38000 7623


In [28]:
# Start from empty output directories: first delete any existing copy of each
# split directory, then (re)create both.
split_dirs = (
    '/home/gabriel/data/screenshop_fashionpedia/screenshop_train2019',
    '/home/gabriel/data/screenshop_fashionpedia/screenshop_val2019',
)

for split_dir in split_dirs:
    if Path(split_dir).exists():
        shutil.rmtree(split_dir)

for split_dir in split_dirs:
    os.makedirs(split_dir, exist_ok=True)


In [29]:
# Copy every raw image into the directory of the split its id belongs to.
raw_images_dir = Path('/home/gabriel/data/screenshop_fashionpedia/raw_images')
dataset_root = Path('/home/gabriel/data/screenshop_fashionpedia/')

for image in tqdm(clean_images):
    file_name = image['file_name']

    # Every id that is not in the training set goes to the validation directory.
    if image['id'] in train_ids:
        split_dir_name = 'screenshop_train2019'
    else:
        split_dir_name = 'screenshop_val2019'

    shutil.copyfile(raw_images_dir / file_name, dataset_root / split_dir_name / file_name)


HBox(children=(FloatProgress(value=0.0, max=45623.0), HTML(value='')))




In [44]:
# Assign each cleaned annotation to the split that contains its image.
train_annotations = list(
    filter(lambda annotation: annotation['image_id'] in train_ids, clean_annotations)
)
val_annotations = list(
    filter(lambda annotation: annotation['image_id'] in val_ids, clean_annotations)
)

print(len(train_annotations), len(val_annotations))

277780 55621


In [45]:
# Assemble the per-split metadata dicts (annotations, images, categories).
#
# The cleaned annotations store `category_id + 1` (one-indexed ids), so the
# category table is shifted by the same amount. Passing the categories through
# unchanged would leave every annotation pointing at the wrong (or a missing)
# category entry.
# NOTE(review): assumes the source categories use the same zero-based ids as
# the raw annotations; the assertion below fails loudly if that is not true.
one_indexed_categories = [
    {**category, 'id': category['id'] + 1}
    for category in dataset_metadata['categories']
]

# Every category id referenced by an annotation must be declared.
declared_category_ids = {category['id'] for category in one_indexed_categories}
used_category_ids = {
    annotation['category_id']
    for annotation in train_annotations + val_annotations
}
assert used_category_ids <= declared_category_ids, (
    'annotations reference undeclared category ids: '
    f'{sorted(used_category_ids - declared_category_ids)}'
)

train_metadata = {
    'annotations': train_annotations,
    'images': train_images,
    'categories': one_indexed_categories,
}

val_metadata = {
    'annotations': val_annotations,
    'images': val_images,
    'categories': one_indexed_categories,
}


In [48]:
# Write the training split's metadata to disk as JSON.
train_json_path = '/home/gabriel/data/screenshop_fashionpedia/annotations/instances_screenshop_train2019.json'
with open(train_json_path, 'w') as train_json_file:
    json.dump(train_metadata, train_json_file)

In [49]:
# Write the validation split's metadata to disk as JSON.
val_json_path = '/home/gabriel/data/screenshop_fashionpedia/annotations/instances_screenshop_val2019.json'
with open(val_json_path, 'w') as val_json_file:
    json.dump(val_metadata, val_json_file)