## VINVL input preparation
* Requirements: [microsoft sg_benchmark](https://github.com/microsoft/scene_graph_benchmark/blob/main/INSTALL.md) + pandas + yaml
* To generate `predictions.tsv` using SG_benchmark, see [this issue](https://github.com/microsoft/scene_graph_benchmark/issues/7#issuecomment-819357369)

In [2]:
import pandas as pd
import ast
import json
import base64
import numpy as np
np.set_printoptions(suppress=True, precision=4)

### To feed the data into VinVL's run_captioning.py on COCO, we need to generate:
* test.yaml file with:
    * test.labels.tsv: labels with confidence intervals
        * image_id, [{'class':str, conf: float, 'rect':[x_tl, y_tl, x_br,y_br]},{},...,{}]
    * test.feature.tsv: features extracted via sg_benchmark
        * image_id, {"num_boxes": n, "features": concatenated base64-encoded features: 2048 spacial features, 6 additional features*}
    * other files in COCO's yaml (test.img.tsv, test.hw.tsv, captions.json) aren't used for inference



* 6 additional features (hypothesis): for each bounding box we have: 
    * [0]: x_top_left/image_w, 
    * [1]: y_top_left/image_h, 
    * [2]: x_bottom_right/image_w, 
    * [3]: y_bottom_right/image_h, 
    * [4]: box_height/image_h = (x_br - x_tl)/image_h = [2] -[0]
    * [5]: (y_br-y_tl)/image_w = [3]-[1]

### We need height and width of the pictures

In [3]:
hw_df = pd.read_csv('./VizWiz/val/val.hw.tsv',sep='\t',header=None,converters={1:ast.literal_eval},index_col=0)
hw_df.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
VizWiz_val_00000000,"[{'height': 1296, 'width': 968}]"
VizWiz_val_00000001,"[{'height': 162, 'width': 121}]"
VizWiz_val_00000002,"[{'height': 648, 'width': 484}]"
VizWiz_val_00000003,"[{'height': 2592, 'width': 1936}]"
VizWiz_val_00000004,"[{'height': 1296, 'width': 968}]"


### We also need `predictions.tsv` with bboxes, its class, confidence and spacial features

In [4]:
sg_tsv = './VizWiz/val_t/predictions.tsv'
df = pd.read_csv(sg_tsv,sep='\t',header = None,converters={1:json.loads})#converters={1:ast.literal_eval})
df[1] = df[1].apply(lambda x: x['objects'])

#example
df.loc[0,1][0]

{'rect': [77.09486389160156,
  283.3519592285156,
  966.3866577148438,
  1288.4888916015625],
 'bbox_id': 0,
 'class': 'computer monitor',
 'conf': 0.8116009831428528,
 'feature': 'mUPGPwAAAAAAAAAAyZWuQAAAAABIMQlBxNYpPgbfkz8AAAAAAAAAAMX5Cj81KkE+AAAAAKKlQDzCCpg9NGAOPgAAAADE8AU+9CVfPwAAAADRLFE813IzP06ZLEDRUM4+AAAAAHJMnEC1AtQ9AAAAAH4RYz61A+g9AAAAAAAAAACKjRFBwTnePwAAAACKwgc+P8eAQADyzD8AAAAA5DoGQAAAAABxWvI+wVvsPU6zGT0AAAAAsC8rQD4+vUF3Ous+yj6TP/D+VkA1TrE8AAAAAESj8T8EWYo/AAAAAAAAAABaBpNAAAAAAFDsG0EAAAAAAAAAAAAAAADGGwU9AFc8P322Ej465XU/AAAAACJ4qD8AAAAAAAAAAPWdAzwAAAAAAAAAAAAAAADg3IM/AAAAALPquj9diRFBUcKePwAAAAA5hDw+DX7EPwAAAACa6OI+AAAAALG7vT1jurtA9beSPgAAAADsCI0/AAAAAAAAAADIhBU+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4CesQAAAAAAAknU+AAAAAAAAAADnDKQ9DfqpQYBNoD8AAAAAETajPxeoWUAAAAAA+DbaPAAAAADXdzg+yb8SPwAAAAAAAAAAqYFeQAAAAACkUx49JF2oPQAAAABBg/Y/vCmJQI+orz89yBU/AAAAAAAAAABKTXJAPIrrP4VLaz4AAAAAjUvaPwAAAABzmUY9AAAAAAAAAAAKHA4+/bR6Pr7I/D4AAAAAAAAAAA+MID0AAAAAEq5mPwAAAAAAAAAAQb11PwAAAAC8wdo/AAAAAELQiT9BpEI/Sm+

In [5]:
def generate_additional_features(rect,h,w):
    mask = np.array([w,h,w,h],dtype=np.float32)
    rect = np.clip(rect/mask,0,1)
    res = np.hstack((rect,[rect[3]-rect[1], rect[2]-rect[0]]))
    return res.astype(np.float32)

def generate_features(x):
    idx, data,num_boxes = x[0],x[1],len(x[1])
    h,w,features_arr = hw_df.loc[idx,1][0]['height'],hw_df.loc[idx,1][0]['width'],[]

    for i in range(num_boxes):
        features = np.frombuffer(base64.b64decode(data[i]['feature']),np.float32)
        pos_feat = generate_additional_features(data[i]['rect'],h,w)
        x = np.hstack((features,pos_feat))
        features_arr.append(x.astype(np.float32))
        
    features = np.vstack(tuple(features_arr))
    features = base64.b64encode(features).decode("utf-8")
    return {"features":features, "num_boxes":num_boxes}

def generate_labels(x):
    data = x[1]
    res = [{"class":el['class'].capitalize(),"conf":el['conf'], "rect": el['rect']} for el in data] 
    return res

In [6]:
df['feature'] = df.apply(generate_features,axis=1)
df['feature'] = df['feature'].apply(json.dumps)

df['label'] = df.apply(generate_labels,axis=1)
df['label'] = df['label'].apply(json.dumps)

In [7]:
df.head()

Unnamed: 0,0,1,feature,label
0,VizWiz_val_00000000,"[{'rect': [77.09486389160156, 283.351959228515...","{""features"": ""mUPGPwAAAAAAAAAAyZWuQAAAAABIMQlB...","[{""class"": ""Computer monitor"", ""conf"": 0.81160..."
1,VizWiz_val_00000001,"[{'rect': [54.770816802978516, 65.501434326171...","{""features"": ""AAAAAES3GD0AAAAAARisPq5IEz9auG8+...","[{""class"": ""Hand"", ""conf"": 0.9663493633270264,..."
2,VizWiz_val_00000002,"[{'rect': [338.03662109375, 337.77288818359375...","{""features"": ""pmGkPmQojT4AAAAAwSs/O2Ny7TwAAAAA...","[{""class"": ""Dog"", ""conf"": 0.9385652542114258, ..."
3,VizWiz_val_00000003,"[{'rect': [286.3694763183594, 1483.44934082031...","{""features"": ""q1N5PQAAAAAAAAAAiI+SQQAAAAAB/Y5A...","[{""class"": ""Box"", ""conf"": 0.801200270652771, ""..."
4,VizWiz_val_00000004,"[{'rect': [789.4127197265625, 256.838256835937...","{""features"": ""AAAAAAAAAAAAAAAAAAAAAAoY0EAAAAAA...","[{""class"": ""Cloud"", ""conf"": 0.7336462736129761..."


In [12]:
np.frombuffer(base64.b64decode(eval(df['feature'][0])['features']),np.float32).shape
# print(type(df['feature'][0]))

(20540,)

### Write to tsv + generate lineidx

In [9]:
import os

OUTPUT_DIR = './VizWiz/val_t/'
LABEL_FILE = os.path.join(OUTPUT_DIR,'label.tsv')
FEATURE_FILE = os.path.join(OUTPUT_DIR,'feature.tsv')
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    print(f"path to {OUTPUT_DIR} created")

In [10]:
from maskrcnn_benchmark.structures.tsv_file_ops import tsv_reader, tsv_writer
tsv_writer(df[[0,'label']].values.tolist(),LABEL_FILE)
tsv_writer(df[[0,'feature']].values.tolist(),FEATURE_FILE)

ModuleNotFoundError: No module named 'maskrcnn_benchmark'

## Generate test.yaml for vinvl run_captioning

In [None]:
import yaml
import os.path as op
yaml_dict = {"label": "label.tsv",
             "feature": "features.tsv"}

with open(op.join(OUTPUT_DIR, 'vinvl_test_yaml.yaml'), 'w') as file:
        yaml.dump(yaml_dict, file)

### Misc: Read generated tsvs

In [None]:
pd.read_csv('vinvl_demo_images_features_nms_03/inference/feature.tsv',header=None,sep='\t').shape

### Encoding correctness check

In [35]:
df.loc[0,1][0]['rect']

[76.71561431884766, 206.3582763671875, 229.50418090820312, 532.83837890625]

In [22]:
features_t = np.frombuffer(base64.b64decode(df.loc[0,'feature']['features']),np.float32).reshape(df.loc[0,'feature']['num_boxes'],-1)
features_t.shape

In [36]:
features_t[0,-6:]

array([0.0639, 0.2579, 0.1913, 0.666 , 0.4081, 0.1273], dtype=float32)

In [31]:
def reverse_transform(feat,h=800,w=1200):
    return feat[:4] * np.array([w,h,w,h])

In [32]:
reverse_transform(features_t[0,-6:])

array([ 76.7156, 206.3583, 229.5042, 532.8384])