In [None]:
import json
import os
import pandas as pd
from collections import Counter

In [None]:
def flist_reader(flist):
    """Read a space-separated, header-less list file.

    Returns a tuple ``(column_0, column_1)`` of Python lists holding the
    first and second column of the file, in file order.
    """
    table = pd.read_csv(flist, sep=' ', header=None)
    columns = table.to_dict('list')
    first_column = columns[0]
    second_column = columns[1]
    return first_column, second_column

def split_into_train_test(filename):
    """
    Split the file with format <fname label_id> into 80-20 train-test split.

    The split is stratified: for every label, the first ``int(0.8 * n)`` files
    (in file order) go to the train list and the remaining ones to the test
    list. Labels are processed in order of first appearance in the input, so
    the output is deterministic for any label type.

    The two lists are written next to the input file. Their names are derived
    from the input's base name by replacing its last "all" with "train" and
    "test" (e.g. ``usa_all.txt`` -> ``usa_train.txt`` / ``usa_test.txt``).
    Directory components are never rewritten.

    Raises:
        ValueError: if the base name of ``filename`` does not contain "all";
            without it both outputs would resolve to the input path and
            overwrite it.
    """

    # Derive the output paths first so a bad name fails before any work is done.
    directory, basename = os.path.split(filename)
    stem, marker, suffix = basename.rpartition("all")
    if not marker:
        raise ValueError(
            "expected 'all' in the file name of {!r} to derive the "
            "train/test output names".format(filename)
        )
    train_path = os.path.join(directory, stem + "train" + suffix)
    test_path = os.path.join(directory, stem + "test" + suffix)

    files, labels = flist_reader(filename)

    ## group files by label in a single pass (dicts keep insertion order).

    files_by_label = {}
    for fname, label in zip(files, labels):
        files_by_label.setdefault(label, []).append(fname)

    ## split into train, test 80-20 for each label.

    train_files, train_labels = [], []
    test_files, test_labels = [], []

    for label, label_files in files_by_label.items():
        num_train = int(len(label_files) * 0.8)
        num_test = len(label_files) - num_train
        train_files.extend(label_files[:num_train])
        train_labels.extend([label] * num_train)
        test_files.extend(label_files[num_train:])
        test_labels.extend([label] * num_test)

    ## write to train and test files.

    train_df = pd.DataFrame({"fname" : train_files, "label" : train_labels})
    test_df = pd.DataFrame({"fname" : test_files, "label" : test_labels})

    train_df.to_csv(train_path, index=False, header=False, sep=" ")
    test_df.to_csv(test_path, index=False, header=False, sep=" ")

In [None]:
# Load the GeoYFCC metadata. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open("/home/tarun/metadata/geoyfcc.json") as fh:
    geoyfcc = json.load(fh)

In [None]:
# For each domain, write a "<relative file name> <category>" list of the
# training images and then split that list 80-20 into train/test lists.
dataset_root = "/newdata/tarun/datasets/GeoYFCC/"

for domain in ["usa", "asia"]:
    train = geoyfcc["{}_train".format(domain)]
    id_to_class = {ann["image_id"]: ann["category"] for ann in train["annotations"]}

    # Use one path for both writing and splitting, so the split always reads
    # the file that was just written regardless of the working directory.
    all_path = "/home/tarun/LangBasedGeoDA/data/geoyfcc/{}_all.txt".format(domain)

    with open(all_path, "w") as fh:
        for im in train["images"]:
            # Keep only the part after the dataset root. If the root is not
            # present, keep the name unchanged: str.partition would otherwise
            # return "" and produce a line with an empty file name.
            _, found, relative = im["filename"].partition(dataset_root)
            fname = relative if found else im["filename"]
            fh.write("{} {}\n".format(fname, id_to_class[im["id"]]))

    split_into_train_test(all_path)

## Split metadata into train and test

In [1]:
import json
import pandas as pd
from PIL import Image

def flist_reader(flist):
    """Parse a space-separated list file that has no header row.

    Returns ``(names, labels)``: the first and second column of the file as
    Python lists, in file order.
    """
    columns = pd.read_csv(flist, sep=' ', header=None).to_dict('list')
    names, labels = columns[0], columns[1]
    return names, labels

def get_names(name, root="/newdata/tarun/datasets/GeoYFCC/"):
    """Return ``root + name``, the full path for a dataset-relative file name.

    Args:
        name: file name relative to the dataset root, as stored in the
            ``*_train.txt`` / ``*_test.txt`` lists.
        root: prefix prepended verbatim (no separator is inserted, so it
            should end with "/"). Defaults to the original dataset location,
            so existing single-argument calls behave as before.
    """
    return root + name

In [2]:
# Load the original (un-split) metadata. The context manager closes the file
# handle as soon as parsing finishes.
with open("../metadata/geoyfcc_old.json") as fh:
    data = json.load(fh)

In [14]:
# Re-partition each domain's metadata so that "<domain>_train" / "<domain>_test"
# contain exactly the records of the files listed in the corresponding
# train/test list files, in list order.
#
# NOTE: this mutates `data` in place (the "<domain>_train" fields are replaced
# by their train subset). Reload `data` before re-running this cell; otherwise
# the test files are no longer present in the train images and the lookups fail.
for domain in ["usa", "asia"]:
    fields = ["images", "annotations", "metadata"]
    train_key = "{}_train".format(domain)
    test_key = "{}_test".format(domain)

    dom_data = data[train_key]
    train_files = flist_reader("../data/geoyfcc/{}_train.txt".format(domain))[0]
    train_files = list(map(get_names, train_files))
    test_files = flist_reader("../data/geoyfcc/{}_test.txt".format(domain))[0]
    test_files = list(map(get_names, test_files))

    fname_to_fid = {m["filename"]:m["id"] for m in dom_data["images"]}

    train_fids = [fname_to_fid[fn] for fn in train_files]
    test_fids = [fname_to_fid[fn] for fn in test_files]

    for f in fields:

        print("{}/{}".format(domain, f))

        if f == "images":
            id_to_content = {im.get("id"):im for im in dom_data[f]}
        else:
            # Prefer "id" and fall back to "image_id" only when "id" is absent.
            # (dict.get("id", rec["image_id"]) evaluates the fallback eagerly
            # and raises KeyError for records that only have "id".)
            # NOTE(review): if these records carry their own "id" that is not
            # the image id, they should be keyed by "image_id" instead - confirm
            # against the JSON schema.
            id_to_content = {
                (rec["id"] if "id" in rec else rec["image_id"]): rec
                for rec in dom_data[f]
            }

        # Build both subsets before assigning: data[train_key] is dom_data, so
        # the assignment below overwrites the list id_to_content was built from.
        train_set = [id_to_content[fid] for fid in train_fids]
        test_set = [id_to_content[fid] for fid in test_fids]
        data[train_key][f] = train_set
        data[test_key][f] = test_set

with open("../metadata/geoyfcc.json", "w") as fh:
    json.dump(data, fh, indent=4)

usa/images
usa/annotations
usa/metadata
asia/images
asia/annotations
asia/metadata
