In [1]:
import json
import pandas as pd
import urllib.parse
import os
import hashlib
import random
from collections import defaultdict

In [2]:
## separate train and test in target
label_to_file = {l:[] for l in range(12)}
with open("validation.txt") as fh:
    for l in fh.readlines():
        file_path, label = l.strip().split()
        label_to_file[int(label)].append(l.strip())

In [3]:
target_train = []
target_test = []

for k,v in label_to_file.items():
    len_samples = len(v)
    random.shuffle(v)
    target_train.extend(v[:int(len_samples*.8)])
    target_test.extend(v[int(len_samples*.8):])

In [4]:
with open("real_train.txt", "w") as fh:
    fh.write("\n".join(target_train))
with open("real_test.txt", "w") as fh:
    fh.write("\n".join(target_test))

In [5]:
def short_hash(input_string, len_hash=12):
    hash_hex = hashlib.md5(input_string.encode()).hexdigest()[:len_hash]
    return int(hash_hex, 16)

## do for each domain

In [12]:
json_data = {}

## categories

In [13]:
## get the category ids first.

id_to_classname = {}
with open("syn_train.txt") as fh:
    for l in fh.readlines():
        file_path, label = l.strip().split()
        classname = file_path.split("/")[1]
        id_to_classname[int(label)] = classname
classname_to_id = {v:k for k,v in id_to_classname.items()}

In [14]:
classname_to_id

{'person': 0,
 'car': 1,
 'aeroplane': 2,
 'bicycle': 3,
 'skateboard': 4,
 'train': 5,
 'bus': 6,
 'plant': 7,
 'truck': 8,
 'knife': 9,
 'horse': 10,
 'motorcycle': 11}

In [15]:
categories = [{"category_name":cname,  "category_id":idx} for idx, cname in id_to_classname.items()]
json_data['categories'] = categories

In [16]:
for domain in ["syn", "real"]:
    for split in ["train", "test"]:
        print("Creating for {}/{}".format(domain, split))
        fid_to_filename = {}
        fid_to_label = {}
        fid_to_category = {}

        with open("{}_{}.txt".format(domain, split)) as fh:
            for l in fh.readlines():
                file_path, label = l.strip().split()
                label = int(label)

                file_name = file_path.split("/")[-1].split(".")[0]
                fid = short_hash(file_name)
                fid_to_filename[fid] = file_path
                fid_to_label[fid] = label
                fid_to_category[fid] = id_to_classname[label]

        all_ids = list(fid_to_category.keys())

        ## images

        images = []

        for fid in all_ids:
            images.append({
                "filename" : fid_to_filename[fid],
                "id"       : int(fid),
            })

        ## annotations

        anns = []

        for fid in all_ids:

            anns.append({
                "image_id" : int(fid),
                "category" : fid_to_label[fid],
                'class_name' : fid_to_category[fid]
            })

        ## metadata

        meta = []

        for fid in all_ids:
            fid = int(fid)
            fname = fid_to_filename[fid]
            file_tag = fname.split("/")[-1].split(".")[0]

            meta.append({
                'image_id' : fid,
                'file_tag' : file_tag
            })


        json_data["{}_{}".format(domain, split)] = {
            "images" : images,
            "annotations" : anns,
            "metadata"    : meta
        }

Creating for syn/train
Creating for syn/test
Creating for real/train
Creating for real/test


In [17]:
for k in json_data.keys():
    if k != "categories":
        print(k, len(json_data[k]['images']))

syn_train 149581
syn_test 149581
real_train 44305
real_test 11083


In [28]:
json_data['real_train']['metadata'][10000]

{'image_id': 131988470685331, 'file_tag': 'car_1342861'}

In [21]:
with open("../../metadata/visda2017.json", "w") as fh:
    json.dump(json_data, fh, indent=4)

## check

In [22]:
import json
import os

In [23]:
data = json.load(open("../../metadata/visda2017.json"))

In [24]:
data.keys()

dict_keys(['categories', 'syn_train', 'syn_test', 'real_train', 'real_test'])

In [26]:
data['syn_train']["annotations"]

[{'image_id': 276841816413329, 'category': 0, 'class_name': 'person'},
 {'image_id': 9779928580946, 'category': 0, 'class_name': 'person'},
 {'image_id': 210607589330916, 'category': 0, 'class_name': 'person'},
 {'image_id': 230739998617198, 'category': 0, 'class_name': 'person'},
 {'image_id': 65787496705414, 'category': 0, 'class_name': 'person'},
 {'image_id': 172637932043234, 'category': 0, 'class_name': 'person'},
 {'image_id': 19325704431684, 'category': 0, 'class_name': 'person'},
 {'image_id': 172606216675085, 'category': 0, 'class_name': 'person'},
 {'image_id': 192428655167346, 'category': 0, 'class_name': 'person'},
 {'image_id': 231591016267812, 'category': 0, 'class_name': 'person'},
 {'image_id': 194657344929050, 'category': 0, 'class_name': 'person'},
 {'image_id': 60132808724167, 'category': 0, 'class_name': 'person'},
 {'image_id': 163415790257272, 'category': 0, 'class_name': 'person'},
 {'image_id': 151796418612147, 'category': 0, 'class_name': 'person'},
 {'image_id