In [None]:
import pandas as pd

# Read the train2014 object-feature TSV (tab-separated) into a DataFrame for a quick look.
# NOTE(review): read_csv treats the first line as a header by default, whereas
# load_obj_tsv further down passes explicit FIELDNAMES to its reader — confirm
# whether this file really starts with a header row.
train_tsv = pd.read_csv("./mscoco_imgfeat/train2014_obj36.tsv", sep='\t')
# Bare last expression so the notebook renders the frame.
train_tsv

In [None]:
# Read the validation feature TSV (tab-separated) into a DataFrame for inspection.
# NOTE(review): absolute path — only resolves on the machine where this tree exists.
val_features = pd.read_csv("/home/VizWizVQA/Cross-Attention-VizWiz-VQA/data/SG-Bottom-up-features/val_features.tsv", sep='\t')
# Bare last expression so the notebook renders the frame.
val_features

In [21]:
import sys
import csv
import base64
import time

import numpy as np

# The base64 columns can be very long, so raise the csv module's per-field
# size cap to the largest value ``sys`` reports.
csv.field_size_limit(sys.maxsize)

# Column names handed to csv.DictReader below. Because they are passed
# explicitly, every line of the file (including the first) is parsed as data.
FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf",
              "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"]


def load_obj_tsv(fname, topk=None):
    """Load object features from tsv file.

    :param fname: The path to the tsv file.
    :param topk: Only load features for top K images (lines) in the tsv file.
        Will load all the features if topk is either -1 or None. Any other
        value the row count can never equal (e.g. 0) also loads everything.
    :return: A list of image object features where each feature is a dict.
        See FIELDNAMES above for the keys in the feature dict. ``img_h``,
        ``img_w`` and ``num_boxes`` are converted with ``int``; the six
        base64 columns are decoded into read-only numpy arrays whose leading
        dimension is ``num_boxes``.
    """
    data = []
    start_time = time.time()
    print("Start to load Faster-RCNN detected objects from %s" % fname)
    # newline="" is the mode the csv module documents for files given to a reader.
    with open(fname, newline="") as f:
        reader = csv.DictReader(f, FIELDNAMES, delimiter="\t")
        for item in reader:
            for key in ('img_h', 'img_w', 'num_boxes'):
                item[key] = int(item[key])

            boxes = item['num_boxes']
            # (column, target shape, dtype) for every base64-encoded column.
            decode_config = [
                ('objects_id', (boxes, ), np.int64),
                ('objects_conf', (boxes, ), np.float32),
                ('attrs_id', (boxes, ), np.int64),
                ('attrs_conf', (boxes, ), np.float32),
                ('boxes', (boxes, 4), np.float32),
                ('features', (boxes, -1), np.float32),
            ]
            for key, shape, dtype in decode_config:
                decoded = np.frombuffer(base64.b64decode(item[key]), dtype=dtype)
                decoded = decoded.reshape(shape)
                # Guard the decoded buffers against accidental in-place edits.
                decoded.setflags(write=False)
                item[key] = decoded

            data.append(item)
            if topk is not None and len(data) == topk:
                break
    elapsed_time = time.time() - start_time
    print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time))
    return data

In [29]:
# Smoke-test the loader on the first row of the VizWiz val object-feature file.
# NOTE(review): the result is bound to `train_tsv` although the path is a *val*
# file, and it rebinds the name the first cell used for a pandas DataFrame.
train_tsv = load_obj_tsv("./vizwiz/vizwiz_imgfeat/val_obj36.tsv", topk=1)
# Bare last expression so the notebook renders the loaded row.
train_tsv

Start to load Faster-RCNN detected objects from ./vizwiz/vizwiz_imgfeat/val_obj36.tsv
Loaded 1 images in file ./vizwiz/vizwiz_imgfeat/val_obj36.tsv in 0 seconds.


[OrderedDict([('img_id', 'VizWiz_val_00003667'),
              ('img_h', 25),
              ('img_w', 60),
              ('objects_id',
               array([106, 106, 106, 106, 106, 182, 106, 106, 262, 182, 609, 609, 106,
                      106, 106, 182, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
                      106, 106, 106, 106, 106, 106, 106, 106, 106, 106])),
              ('objects_conf',
               array([0.2877, 0.2409, 0.2146, 0.1856, 0.1651, 0.154 , 0.1532, 0.1523,
                      0.1397, 0.1325, 0.127 , 0.097 , 0.094 , 0.0925, 0.0819, 0.0731,
                      0.0535, 0.0369, 0.0362, 0.0326, 0.0267, 0.0248, 0.0233, 0.0158,
                      0.0148, 0.0146, 0.0132, 0.0117, 0.0117, 0.0112, 0.01  , 0.0099,
                      0.0085, 0.0081, 0.0074, 0.0072], dtype=float32)),
              ('attrs_id',
               array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
import pandas as pd
import ast
import json
import base64
import numpy as np
np.set_printoptions(suppress=True, precision=4)
from os.path import join as pjoin
import yaml


def tsv_writer(values, tsv_file, sep='\t'):
    """Write rows to ``tsv_file`` and line offsets to a sibling ``.lineidx`` file.

    Each row is joined with ``sep`` and terminated by a newline. The
    ``.lineidx`` file (same path with the extension swapped) gets one line per
    row holding the byte offset at which that row starts in the TSV. Both files
    are first written under a ``.tmp`` suffix and then moved into place, so a
    failed run does not leave a half-written target.

    :param values: Iterable of rows; each row is an iterable of fields. ``bytes``
        fields are decoded as UTF-8, everything else is passed through ``str``.
    :param tsv_file: Destination path of the TSV file. Missing parent
        directories are created.
    :param sep: Field separator, tab by default.
    :raises AssertionError: If ``values`` or any row in it is ``None``.
    """
    # Local import keeps the function self-contained: no visible cell imports
    # ``os``, and the ``op`` / ``mkdir`` helpers used before were never defined.
    import os

    assert values is not None
    out_dir = os.path.dirname(tsv_file)
    if out_dir:  # an empty dirname means "current directory": nothing to create
        os.makedirs(out_dir, exist_ok=True)
    lineidx_file = os.path.splitext(tsv_file)[0] + '.lineidx'
    tsv_file_tmp = tsv_file + '.tmp'
    lineidx_file_tmp = lineidx_file + '.tmp'
    offset = 0
    # Fixed encoding and no newline translation, so the recorded offsets match
    # the bytes that actually land on disk on every platform.
    with open(tsv_file_tmp, 'w', encoding='utf-8', newline='') as fp, \
            open(lineidx_file_tmp, 'w') as fpidx:
        for value in values:
            assert value is not None
            # Base64 image strings arrive as bytes under Python 3; decode them
            # so the written text matches what Python 2 would have produced
            # instead of the "b'...'" repr.
            value = [v.decode('utf-8') if isinstance(v, bytes) else v for v in value]
            line = '{0}\n'.format(sep.join(map(str, value)))
            fp.write(line)
            fpidx.write(str(offset) + '\n')
            # Advance by encoded size: character counts drift from byte
            # offsets as soon as a field contains non-ASCII text.
            offset += len(line.encode('utf-8'))
    # os.replace overwrites an existing target on every platform.
    os.replace(tsv_file_tmp, tsv_file)
    os.replace(lineidx_file_tmp, lineidx_file)


def tsv_reader(tsv_file, sep='\t'):
    """Lazily yield each line of ``tsv_file`` as a list of fields.

    Every line is split on ``sep`` and each resulting field has its
    surrounding whitespace (including the trailing newline) stripped.
    """
    with open(tsv_file, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.split(sep)
            yield [field.strip() for field in fields]

# Output of test_sg_net.py from scene_graph_benchmark
# predictions.tsv -> img_id, features (dict)

# Input for run_retrieval.py
# predictions.tsv (let's rename it to labels.tsv to avoid confusion) -> img_id, label (dict)
# features.tsv -> img_id, num_boxes, base64-encoded features
# image_id2idx.json -> mapping from cap_id (for us = img_id) to image idx (row number in features.tsv)

def generate_additional_features(rect, h, w):
    """Build a 6-value float32 position vector for one box.

    ``rect`` is divided element-wise by ``[w, h, w, h]`` and clipped to
    ``[0, 1]``; the differences ``rect[3] - rect[1]`` and
    ``rect[2] - rect[0]`` of the clipped values are appended, in that order.
    Presumably ``rect`` is ``(x1, y1, x2, y2)`` in pixels, which would make the
    two extras the normalised box height and width — confirm against the
    producer of ``rect``.
    """
    scale = np.array([w, h, w, h], dtype=np.float32)
    normalised = np.clip(rect / scale, 0, 1)
    extent_13 = normalised[3] - normalised[1]
    extent_02 = normalised[2] - normalised[0]
    combined = np.hstack((normalised, [extent_13, extent_02]))
    return combined.astype(np.float32)


def generate_features(x):
    """Return one row's per-box features as a base64 string.

    ``x[0]`` is used as the key into the module-level ``hw_df`` to look up the
    image height and width; ``x[1]['objects']`` is the list of detections.
    For every detection the base64 ``'feature'`` payload is decoded as float32
    and concatenated with ``generate_additional_features(rect, height, width)``.
    The per-box vectors are stacked row-wise and the resulting float32 buffer
    is base64-encoded and returned as ``str``.

    Raises ``ValueError`` (from ``np.vstack``) when the row has no objects.
    """
    image_key = x[0]
    detections = x[1]['objects']
    size_entry = hw_df.loc[image_key, 1][0]
    height, width = size_entry['height'], size_entry['width']

    per_box = []
    for detection in detections:
        visual = np.frombuffer(base64.b64decode(detection['feature']), np.float32)
        spatial = generate_additional_features(detection['rect'], height, width)
        per_box.append(np.hstack((visual, spatial)).astype(np.float32))

    stacked = np.vstack(tuple(per_box))
    return base64.b64encode(stacked).decode("utf-8")

def generate_num_boxes(x):
    """Return the number of entries in the row's ``x[1]['objects']`` list."""
    return len(x[1]['objects'])

def generate_labels(x):
    """Build the label dict for one row.

    ``x[0]`` keys into the module-level ``hw_df`` for the image height and
    width; each entry of ``x[1]['objects']`` is reduced to its ``'class'`` and
    ``'rect'`` fields. Returns a dict with keys ``'image_h'``, ``'image_w'``
    and ``'objects'`` (in that order).
    """
    image_key = x[0]
    detections = x[1]['objects']
    size_entry = hw_df.loc[image_key, 1][0]
    return {
        'image_h': size_entry['height'],
        'image_w': size_entry['width'],
        'objects': [
            {'class': detection['class'], 'rect': detection['rect']}
            for detection in detections
        ],
    }

# 将 Scene Graph Benchmark 特征转换为 BUTD 特征 (Convert Scene Graph Benchmark features to BUTD features)

FIELDNAMES 

- img_id
- img_h, img_w
- objects_id
- objects_conf
- attrs_id
- attrs_conf
- num_boxes
- boxes
- features


In [24]:
# Input/output locations of the scene_graph_benchmark run being converted.
data_path = '/home/VizWizVQA/scene_graph_benchmark/data/VizWiz/test'
output_path = '/home/VizWizVQA/scene_graph_benchmark/data/VizWiz/test_output/'
image_id2idxfile = pjoin(output_path, 'image_id2idx.json')

# Image sizes: column 0 becomes the index and column 1 is parsed from its
# Python-literal text. generate_features / generate_labels read it as
# hw_df.loc[img_id, 1][0]['height' | 'width'].
hw_tsv = pjoin(data_path, 'test.hw.tsv')
hw_df = pd.read_csv(hw_tsv, sep='\t', header=None, converters={1:ast.literal_eval}, index_col=0)

# Detector output, read without a header: column 0 is the image id and
# column 1 the JSON-decoded prediction dict.
pred_tsv = pjoin(output_path, 'predictions.tsv')
df = pd.read_csv(pred_tsv, sep='\t', header = None, converters={1:json.loads})

# Derive the new columns row by row. The frame has only columns 0 and 1 at this
# point, so 'feature' has to be computed by generate_features; reading
# df['feature'] before creating it raises KeyError.
df['feature'] = df.apply(generate_features, axis=1)
df['num_boxes'] = df.apply(generate_num_boxes, axis=1)
df['label'] = df.apply(generate_labels, axis=1)
df['idx'] = np.arange(len(df))

# JSON-encode the derived columns (presumably so each can be written out as a
# single TSV field — confirm against the export step).
df['feature'] = df['feature'].apply(json.dumps)
df['num_boxes'] = df['num_boxes'].apply(json.dumps)
df['label'] = df['label'].apply(json.dumps)

# Sanity check: show the encoded label of the first row.
print(df.loc[0, 'label'])

{"image_h": 480, "image_w": 360, "objects": [{"class": "wall", "rect": [0.0, 0.0, 247.6843719482422, 108.7865219116211]}, {"class": "shadow", "rect": [0.0, 384.8695373535156, 203.30682373046875, 477.5732727050781]}, {"class": "bottom", "rect": [29.826379776000977, 238.19744873046875, 328.1093444824219, 360.5655517578125]}, {"class": "appliance", "rect": [73.40457153320312, 0.0, 359.4000244140625, 332.8822326660156]}, {"class": "laptop", "rect": [0.0, 0.0, 209.66915893554688, 399.7511291503906]}, {"class": "top", "rect": [86.78858947753906, 0.0, 359.4000244140625, 224.78053283691406]}, {"class": "table", "rect": [0.0, 193.53802490234375, 258.7671203613281, 479.4000244140625]}, {"class": "surface", "rect": [0.0, 176.1691131591797, 359.4000244140625, 479.4000244140625]}, {"class": "writing", "rect": [250.5624237060547, 64.13314056396484, 282.4050598144531, 130.3645782470703]}, {"class": "text", "rect": [256.51910400390625, 67.26131439208984, 288.42657470703125, 140.51499938964844]}]}
