In [26]:
import json
import pandas as pd

### Load Data

In [65]:
# Load every input used by the analysis below.
# Each JSON file is opened in a `with` block so its handle is closed as soon as
# the content has been parsed (the original left all six handles open), and the
# encoding is pinned to UTF-8 so parsing does not depend on the platform locale.
# top-level keys noted by the author:
# dict_keys(['data', 'dataset_type', 'dataset_name', 'dataset_version'])

# text questions
with open("../Data/train/TextVQA_0.5.1_train.json", encoding="utf-8") as train_text_file:
    train_text_data = json.load(train_text_file)
with open("../Data/val/TextVQA_0.5.1_val.json", encoding="utf-8") as val_text_file:
    val_text_data = json.load(val_text_file)
with open("../Data/test/TextVQA_0.5.1_test.json", encoding="utf-8") as test_text_file:
    test_text_data = json.load(test_text_file)

# ocr tokens
with open("../Data/train/TextVQA_Rosetta_OCR_v0.2_train.json", encoding="utf-8") as train_ocr_file:
    train_ocr_data = json.load(train_ocr_file)
with open("../Data/val/TextVQA_Rosetta_OCR_v0.2_val.json", encoding="utf-8") as val_ocr_file:
    val_ocr_data = json.load(val_ocr_file)
with open("../Data/test/TextVQA_Rosetta_OCR_v0.2_test.json", encoding="utf-8") as test_ocr_file:
    test_ocr_data = json.load(test_ocr_file)

# objects (bounding-box annotation tables, one CSV per split)
train_img_df = pd.read_csv("../Data/train/train-annotations-bbox.csv")
val_img_df = pd.read_csv("../Data/val/validation-annotations-bbox.csv")
test_img_df = pd.read_csv("../Data/test/test-annotations-bbox.csv")

In [37]:
# Distinct image ids referenced by the question entries of each split
# (train | val | test). The trailing counts are the sizes recorded by the author.
train_image_id_set = {entry["image_id"] for entry in train_text_data["data"]}  # 21953 image_id
val_image_id_set = {entry["image_id"] for entry in val_text_data["data"]}  # 3166 image_id
test_image_id_set = {entry["image_id"] for entry in test_text_data["data"]}  # 3289 image_id


### OCR Token Analysis

In [39]:
# Build one dictionary per split summarising the OCR output.
# key: image id
# value: number of OCR tokens detected for that image (length of its "ocr_info" list)
train_ocr_info_dict = {entry["image_id"]: len(entry["ocr_info"]) for entry in train_ocr_data["data"]}  # 21953 unique images
val_ocr_info_dict = {entry["image_id"]: len(entry["ocr_info"]) for entry in val_ocr_data["data"]}  # 3166 unique images
test_ocr_info_dict = {entry["image_id"]: len(entry["ocr_info"]) for entry in test_ocr_data["data"]}  # 3289 unique images

# Average number of OCR tokens per image in each split.
# The sum runs over the dictionary (one value per distinct image id), so the
# denominator is the dictionary size as well; dividing by len(...["data"])
# would understate the mean if an image id appeared in more than one entry.
print(sum(train_ocr_info_dict.values()) / len(train_ocr_info_dict))  # author recorded ~12.45 ocr tokens per training image
print(sum(val_ocr_info_dict.values()) / len(val_ocr_info_dict))  # author recorded ~12.89 ocr tokens per validation image
print(sum(test_ocr_info_dict.values()) / len(test_ocr_info_dict))  # author recorded ~9.60 ocr tokens per test image


12.447182617409922
12.893240682248894
9.597750076010946
3166


### Object Detection Analysis

In [61]:
# Object-detection counts.
# For each annotation table, count the rows per ImageID (columns: ImageID, count),
# then stack the three per-table results into one pooled frame.
total_img_obj_count = pd.concat(
    [
        split_df.groupby(["ImageID"]).size().reset_index(name='count')
        for split_df in (train_img_df, val_img_df, test_img_df)
    ]
)

# Re-split the pooled counts by the image ids that occur in each question set.
# The figures in the trailing comments are the ones recorded by the author.
train_img_obj_count = total_img_obj_count.loc[
    total_img_obj_count["ImageID"].isin(train_image_id_set)
]  # 21953 images with 5.04 objects detected on average
val_img_obj_count = total_img_obj_count.loc[
    total_img_obj_count["ImageID"].isin(val_image_id_set)
]  # 3166 images with 5.31 objects detected on average
test_img_obj_count = total_img_obj_count.loc[
    total_img_obj_count["ImageID"].isin(test_image_id_set)
]  # 2897 images with 9.14 objects detected on average



5.044321960552089
5.3092229943145925
9.139799792889196
