In [1]:
import json
import pandas as pd
import urllib.parse
import os
import hashlib

In [2]:
def short_hash(input_string, len_hash=12):
    """Return an integer ID derived from the MD5 digest of ``input_string``.

    The string is encoded to bytes, hashed with MD5, and the first
    ``len_hash`` hexadecimal digits of the digest are parsed as a base-16
    integer.
    """
    digest = hashlib.md5(input_string.encode()).hexdigest()
    return int(digest[:len_hash], 16)

## do for each domain

In [19]:
# Container for everything that is eventually written to the output JSON.
json_data = dict()

## categories

In [20]:
## get the category ids first.

# Each non-empty line of "Art.txt" is "<file_path> <label>"; the class name is
# the second "/"-separated component of the file path.
id_to_classname = {}
with open("Art.txt") as fh:
    for l in fh:
        l = l.strip()
        if not l:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        # Split on the last run of whitespace so a path containing spaces
        # is kept intact instead of breaking the two-value unpacking.
        file_path, label = l.rsplit(None, 1)
        classname = file_path.split("/")[1]
        id_to_classname[int(label)] = classname
# Reverse lookup: class name -> integer label.
classname_to_id = {v: k for k, v in id_to_classname.items()}

In [21]:
# One record per class, pairing the class name with its integer label.
categories = [
    dict(category_name=class_name, category_id=label)
    for label, class_name in id_to_classname.items()
]
json_data["categories"] = categories

In [22]:
# Display the list of category records.
categories

[{'category_name': 'Drill', 'category_id': 0},
 {'category_name': 'Exit_Sign', 'category_id': 1},
 {'category_name': 'Bottle', 'category_id': 2},
 {'category_name': 'Glasses', 'category_id': 3},
 {'category_name': 'Computer', 'category_id': 4},
 {'category_name': 'File_Cabinet', 'category_id': 5},
 {'category_name': 'Shelf', 'category_id': 6},
 {'category_name': 'Toys', 'category_id': 7},
 {'category_name': 'Sink', 'category_id': 8},
 {'category_name': 'Laptop', 'category_id': 9},
 {'category_name': 'Kettle', 'category_id': 10},
 {'category_name': 'Folder', 'category_id': 11},
 {'category_name': 'Keyboard', 'category_id': 12},
 {'category_name': 'Flipflops', 'category_id': 13},
 {'category_name': 'Pencil', 'category_id': 14},
 {'category_name': 'Bed', 'category_id': 15},
 {'category_name': 'Hammer', 'category_id': 16},
 {'category_name': 'ToothBrush', 'category_id': 17},
 {'category_name': 'Couch', 'category_id': 18},
 {'category_name': 'Bike', 'category_id': 19},
 {'category_name': 'P

In [23]:
# For every domain/split pair, read "<domain>_<split>.txt" (one
# "<file_path> <label>" entry per line) and store the images, annotations and
# metadata records under json_data["<domain>_<split>"], with the domain
# lower-cased and "_world" stripped (e.g. "Real_World" -> "real").
for domain in ["Real_World", "Art", "Clipart", "Product"]:
    for split in ["train", "test"]:

        print("Creating for {}/{}".format(domain, split))
        fid_to_filename = {}
        fid_to_label = {}
        fid_to_category = {}

        with open("{}_{}.txt".format(domain, split)) as fh:
            for l in fh:
                l = l.strip()
                if not l:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                # Split on the last run of whitespace so a path containing
                # spaces is kept intact.
                file_path, label = l.rsplit(None, 1)
                label = int(label)

                fid = short_hash(file_path)
                # The id is a truncated hash, so two different paths could map
                # to the same id; fail loudly instead of silently overwriting
                # an earlier image. A repeated identical path is still allowed.
                previous_path = fid_to_filename.get(fid)
                if previous_path is not None and previous_path != file_path:
                    raise ValueError(
                        "Hash collision in {}/{}: {!r} and {!r} both map to id {}".format(
                            domain, split, previous_path, file_path, fid
                        )
                    )
                fid_to_filename[fid] = file_path
                fid_to_label[fid] = label
                fid_to_category[fid] = id_to_classname[label]

        all_ids = list(fid_to_category.keys())

        ## images
        images = [
            {"filename": fid_to_filename[fid], "id": fid}
            for fid in all_ids
        ]

        ## annotations
        anns = [
            {
                "image_id": fid,
                "category": fid_to_label[fid],
                "class_name": fid_to_category[fid],
            }
            for fid in all_ids
        ]

        ## metadata
        # file_tag is the base file name up to its first "." (e.g. "00019").
        meta = [
            {
                "image_id": fid,
                "file_tag": fid_to_filename[fid].split("/")[-1].split(".")[0],
            }
            for fid in all_ids
        ]

        json_data["{}_{}".format(domain.lower().replace("_world",""), split)] = {
            "images" : images,
            "annotations" : anns,
            "metadata"    : meta
        }

Creating for Real_World/train
Creating for Real_World/test
Creating for Art/train
Creating for Art/test
Creating for Clipart/train
Creating for Clipart/test
Creating for Product/train
Creating for Product/test


In [24]:
# Report the number of image records stored for each split entry.
for split_key, split_payload in json_data.items():
    if split_key == "categories":
        continue
    print(split_key, len(split_payload['images']))

real_train 3967
real_test 390
art_train 2041
art_test 386
clipart_train 3975
clipart_test 390
product_train 4049
product_test 390


In [25]:
# List the top-level keys of json_data.
json_data.keys()

dict_keys(['categories', 'real_train', 'real_test', 'art_train', 'art_test', 'clipart_train', 'clipart_test', 'product_train', 'product_test'])

In [27]:
# Spot-check a single metadata record (index 1000) of the art_train split.
json_data['art_train']['metadata'][1000]

{'image_id': 225307374647061, 'file_tag': '00019'}

In [28]:
# Serialize the assembled dictionary to disk as indented JSON.
with open("../../metadata/officeHome.json", mode="w") as out_file:
    json.dump(json_data, out_file, indent=4)

## check

In [15]:
import json
import os

In [16]:
# Load the metadata JSON; the context manager closes the file handle promptly
# instead of leaving it open until garbage collection.
with open("../../metadata/domainnet.json") as metadata_file:
    data = json.load(metadata_file)

In [17]:
# List the top-level keys of the loaded JSON.
data.keys()

dict_keys(['categories', 'real_train', 'real_test', 'clipart_train', 'clipart_test', 'sketch_train', 'sketch_test', 'painting_train', 'painting_test', 'quickdraw_train', 'quickdraw_test', 'infograph_train', 'infograph_test'])

In [19]:
# Display the annotation records stored for the painting_train split.
data['painting_train']["annotations"]

[{'image_id': 239812555587564,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 131732473278032,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 225725372335672,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 202816219127952,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 150778055291405,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 211752138705067,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 61924210820174, 'category': 0, 'class_name': 'aircraft_carrier'},
 {'image_id': 247314311278136,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 145439135604317,
  'category': 0,
  'class_name': 'aircraft_carrier'},
 {'image_id': 39877555589467, 'category': 0, 'class_name': 'aircraft_carrier'},
 {'image_id': 16534695187699, 'category': 0, 'class_name': 'aircraft_carrier'},
 {'image_id': 127831606148617,
  'category': 0,
  'class_name': 'aircraft_carrie