# Visual feature extraction 

In [1]:
%%capture 
import os
!git clone https://github.com/airsplay/py-bottom-up-attention.git
os.chdir('/content/py-bottom-up-attention') 

# Install python libraries
!pip install -r requirements.txt
!pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

# Install detectron2
!python setup.py build develop

# or if you are on macOS
# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop

# or, as an alternative to `setup.py`, do
# pip install [--editable] .



In [2]:
import os
import io

import detectron2

# import some common detectron2 utilities
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

# import some common libraries
import numpy as np
import cv2
import torch

# Show the image in ipynb
from IPython.display import clear_output, Image, display
import PIL.Image

# Mount Google Drive at /content/drive (Colab only). Other cells in this
# notebook read inputs from, and write feature files to, paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

def showarray(a, fmt='jpg'):
    """Display an array as an inline image in the notebook output.

    Values are clipped to [0, 255] and cast to uint8, encoded in memory with
    PIL using the format `fmt`, and the encoded bytes are handed to IPython's
    `display`.
    """
    pixels = np.clip(a, 0, 255).astype(np.uint8)
    buffer = io.BytesIO()
    encoded = PIL.Image.fromarray(pixels)
    encoded.save(buffer, fmt)
    display(Image(data=buffer.getvalue()))

Mounted at /content/drive


In [9]:
import json 

# Class names for the "vg" metadata entry: the first comma-separated field of
# each line of the class file, lower-cased and stripped. Lines are kept
# one-to-one (no filtering) so that list positions stay aligned with the file.
vg_classes = []
with open('/content/drive/MyDrive/tmp.txt') as f:
    # The loop variable used to be called `object`, which replaced the builtin
    # of that name for the rest of the kernel session.
    for line in f:
        vg_classes.append(line.split(',')[0].lower().strip())

MetadataCatalog.get("vg").thing_classes = vg_classes

# Build the predictor from the VG Faster R-CNN (R-101 C4, caffe) config with
# test-time overrides for the RPN and RoI heads.
cfg = get_cfg()
cfg.merge_from_file("/content/py-bottom-up-attention/configs/VG-Detection/faster_rcnn_R_101_C4_caffe.yaml")
cfg.MODEL.RPN.POST_NMS_TOPK_TEST = 300
cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.6
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2
# VG Weight
cfg.MODEL.WEIGHTS = "http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe.pkl"
predictor = DefaultPredictor(cfg)

Config '/content/py-bottom-up-attention/configs/VG-Detection/faster_rcnn_R_101_C4_caffe.yaml' has no VERSION. Assuming it to be compatible with latest v2.


Modifications for VG in RPN (modeling/proposal_generator/rpn.py):
	Use hidden dim 512 instead fo the same dim as Res4 (1024).

Modifications for VG in RoI heads (modeling/roi_heads/roi_heads.py):
	1. Change the stride of conv1 and shortcut in Res5.Block1 from 2 to 1.
	2. Modifying all conv2 with (padding: 1 --> 2) and (dilation: 1 --> 2).
	For more details, please check 'https://github.com/peteanderson80/bottom-up-attention/blob/master/models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt'.



faster_rcnn_from_caffe.pkl: 255MB [00:09, 26.7MB/s]                           


In [10]:
#NUM_OBJECTS = 36

from torch import nn

from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs, fast_rcnn_inference_single_image
from detectron2.structures.boxes import Boxes
from detectron2.structures.instances import Instances

def doit(raw_image, raw_boxes, verbose=True):
    """Compute pooled RoI features and class predictions for given boxes.

    Uses the notebook-level `predictor`. The supplied boxes are used directly
    as the proposals fed to the RoI head; the RPN is not run.

    Args:
        raw_image: image array indexed (height, width, channel), e.g. the
            result of `cv2.imread`.
        raw_boxes: numpy array of boxes in `raw_image` pixel coordinates
            (presumably one [x0, y0, x1, y1] row per box -- confirm against
            the annotation format in use).
        verbose: when True (the default) print the image sizes and tensor
            shapes for every call.

    Returns:
        A tuple `(instances, roi_features)`. `instances` is a detectron2
        `Instances` carrying the unscaled input boxes plus a score and class
        index per box; `roi_features` is the spatially averaged RoI feature
        tensor with one row per box.
    """
    # Put the boxes on whatever device the model lives on instead of calling
    # `.cuda()` unconditionally, so this also works for a non-CUDA MODEL.DEVICE.
    device = predictor.model.device

    # Process Boxes
    raw_boxes = Boxes(torch.from_numpy(raw_boxes).to(device))

    with torch.no_grad():
        raw_height, raw_width = raw_image.shape[:2]
        if verbose:
            print("Original image size: ", (raw_height, raw_width))

        # Preprocessing
        image = predictor.transform_gen.get_transform(raw_image).apply_image(raw_image)
        if verbose:
            print("Transformed image size: ", image.shape[:2])

        # Scale the box: the boxes must follow the image through the resize.
        new_height, new_width = image.shape[:2]
        scale_x = 1. * new_width / raw_width
        scale_y = 1. * new_height / raw_height
        boxes = raw_boxes.clone()  # keep raw_boxes unscaled for the returned Instances
        boxes.scale(scale_x=scale_x, scale_y=scale_y)

        # ----
        # HWC array -> CHW float tensor, then the model's own preprocessing.
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs = [{"image": image, "height": raw_height, "width": raw_width}]
        images = predictor.model.preprocess_image(inputs)

        # Run Backbone Res1-Res4
        features = predictor.model.backbone(images.tensor)

        # Run RoI head for each proposal (RoI Pooling + Res5)
        proposal_boxes = [boxes]
        features = [features[f] for f in predictor.model.roi_heads.in_features]
        box_features = predictor.model.roi_heads._shared_roi_transform(
            features, proposal_boxes
        )
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
        if verbose:
            print('Pooled features size:', feature_pooled.shape)

        # Predict classes and boxes for each proposal.
        pred_class_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(feature_pooled)
        if verbose:
            print(pred_class_logits.shape)
        pred_class_prob = nn.functional.softmax(pred_class_logits, -1)
        # The final column is dropped before taking the max (presumably the
        # background class -- confirm against the box predictor's layout).
        pred_scores, pred_classes = pred_class_prob[..., :-1].max(-1)

        # Detectron2 Formatting (for visualization only)
        roi_features = feature_pooled
        instances = Instances(
            image_size=(raw_height, raw_width),
            pred_boxes=raw_boxes,
            scores=pred_scores,
            pred_classes=pred_classes
        )

        return instances, roi_features

In [11]:
import os 
os.chdir('/content')
! wget https://guillaumejaume.github.io/FUNSD/dataset.zip
! unzip dataset.zip && mv dataset data && rm -rf dataset.zip __MACOSX

--2022-05-23 13:24:01--  https://guillaumejaume.github.io/FUNSD/dataset.zip
Resolving guillaumejaume.github.io (guillaumejaume.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to guillaumejaume.github.io (guillaumejaume.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16838830 (16M) [application/zip]
Saving to: ‘dataset.zip’


2022-05-23 13:24:02 (201 MB/s) - ‘dataset.zip’ saved [16838830/16838830]

Archive:  dataset.zip
   creating: dataset/
   creating: dataset/training_data/
  inflating: dataset/training_data/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/dataset/
   creating: __MACOSX/dataset/training_data/
  inflating: __MACOSX/dataset/training_data/._.DS_Store  
   creating: dataset/training_data/images/
  inflating: dataset/training_data/images/92091873.png  
   creating: __MACOSX/dataset/training_data/images/
  inflating: __MACOSX/dataset/training_data/images/._92091873.png  
  inflatin

In [35]:
import json

# Look at one training annotation: print each entry of its 'form' list on its
# own line to see what fields an entry carries.
sample_annotation = '/content/data/training_data/annotations/0000971160.json'
with open(sample_annotation) as f:
  data = json.load(f)

for entity in data['form']:
  print(entity)


{'box': [292, 91, 376, 175], 'text': 'R&D', 'label': 'other', 'words': [{'box': [292, 91, 376, 175], 'text': 'R&D'}], 'linking': [], 'id': 0}
{'box': [219, 316, 225, 327], 'text': ':', 'label': 'question', 'words': [{'box': [219, 316, 225, 327], 'text': ':'}], 'linking': [], 'id': 1}
{'box': [95, 355, 169, 370], 'text': 'Suggestion:', 'label': 'question', 'words': [{'box': [95, 355, 169, 370], 'text': 'Suggestion:'}], 'linking': [[2, 16]], 'id': 2}
{'box': [482, 268, 518, 282], 'text': 'Date:', 'label': 'question', 'words': [{'box': [482, 268, 518, 282], 'text': 'Date:'}], 'linking': [[3, 12]], 'id': 3}
{'box': [511, 309, 570, 323], 'text': 'Licensee', 'label': 'answer', 'words': [{'box': [511, 309, 570, 323], 'text': 'Licensee'}], 'linking': [[13, 4]], 'id': 4}
{'box': [211, 651, 217, 662], 'text': '', 'label': 'question', 'words': [{'box': [211, 651, 217, 662], 'text': ''}], 'linking': [], 'id': 5}
{'box': [461, 605, 483, 619], 'text': 'Yes', 'label': 'question', 'words': [{'box': [4

"\n\nimport json\n\nwith open('/content/drive/MyDrive/labeled_dataset.json') as f:\n  data = json.load(f)\n\n# Generating the bbox \nbbox_list = []\npdf_name_list = []\n\nfor pdf_name in data:\n    img_bbox_list=[]\n    pdf_name_list.append(pdf_name[:-4])\n\n    pdf_data = data[pdf_name]['pages']\n    for page_num in range(len(pdf_data)):\n        sub_object_list = data[pdf_name]['pages'][str(page_num)]['objects']\n        for obj_num in sub_object_list:\n            obj_data = data[pdf_name]['pages'][str(page_num)]['objects'][obj_num]['sub_obj']\n            for sub_obj_num in obj_data:\n                sub_obj_data = data[pdf_name]['pages'][str(page_num)]['objects'][obj_num]['sub_obj'][sub_obj_num]\n                img_bbox_list.append(sub_obj_data['coord'])\n    bbox_list.append(img_bbox_list)\n\nprint(len(bbox_list))\nprint(len(pdf_name_list))\n"

In [37]:
# Generating the bbox
# bbox_list gets one inner list per annotation file (in os.listdir order);
# each inner list holds the 'box' of every word of every 'form' entry.
import json
import os

json_ls = os.listdir('/content/data/training_data/annotations')

bbox_list = []
for name in json_ls:
  with open('/content/data/training_data/annotations/' + name) as f:
    form = json.load(f)['form']
  bbox_list.append([word['box'] for entity in form for word in entity['words']])
print(len(bbox_list))

149


In [40]:
# Token-level visual features for the training split: one JSON file of pooled
# RoI features per image, one row per word box.
import json
import os

ann_dir = '/content/data/training_data/annotations'
img_dir = '/content/data/training_data/images'
out_dir = '/content/drive/MyDrive/funsd/visual_feature/token'
os.makedirs(out_dir, exist_ok=True)

img_ls = sorted(os.listdir(img_dir))
for img_name in img_ls:
  file_name = os.path.splitext(img_name)[0]

  # Pair each image with its annotation by file name. Indexing a list built
  # from os.listdir(annotations) with a position from os.listdir(images) is
  # unsafe: listing order is arbitrary and need not agree between the two
  # directories, so an image could be processed with another page's boxes.
  with open(os.path.join(ann_dir, file_name + '.json')) as f:
    form = json.load(f)['form']
  token_boxes = [word['box'] for entity in form for word in entity['words']]

  im = cv2.imread(os.path.join(img_dir, img_name))
  if im is None:
    # cv2.imread signals an unreadable file by returning None.
    raise ValueError('Could not read image: ' + img_name)

  instances, features = doit(im, np.array(token_boxes))
  visual_features = features.tolist()
  with open(os.path.join(out_dir, file_name + '.json'), 'w') as file_object:
    json.dump(visual_features, file_object)

Original image size:  (1000, 802)
Transformed image size:  (998, 800)
Pooled features size: torch.Size([228, 2048])
torch.Size([228, 1601])
Original image size:  (1000, 762)
Transformed image size:  (1050, 800)
Pooled features size: torch.Size([92, 2048])
torch.Size([92, 1601])
Original image size:  (1000, 771)
Transformed image size:  (1038, 800)
Pooled features size: torch.Size([93, 2048])
torch.Size([93, 1601])
Original image size:  (1000, 781)
Transformed image size:  (1024, 800)
Pooled features size: torch.Size([106, 2048])
torch.Size([106, 1601])
Original image size:  (1000, 754)
Transformed image size:  (1061, 800)
Pooled features size: torch.Size([232, 2048])
torch.Size([232, 1601])
Original image size:  (1000, 762)
Transformed image size:  (1050, 800)
Pooled features size: torch.Size([142, 2048])
torch.Size([142, 1601])
Original image size:  (1000, 776)
Transformed image size:  (1031, 800)
Pooled features size: torch.Size([233, 2048])
torch.Size([233, 1601])
Original image siz

In [41]:
# Token-level visual features for the testing split: one JSON file of pooled
# RoI features per image, one row per word box.
import os
import json

ann_dir = '/content/data/testing_data/annotations'
img_dir = '/content/data/testing_data/images'
out_dir = '/content/drive/MyDrive/funsd/visual_feature/token_test'

json_ls = os.listdir(ann_dir)

# Generating the bbox
# Word boxes are keyed by annotation file name (without extension) so each
# image can be matched to its own annotation. Matching by list position is
# unsafe because os.listdir returns entries in arbitrary order, which need not
# agree between the annotations and images directories.
bbox_by_name = {}
for name in json_ls:
  with open(os.path.join(ann_dir, name)) as f:
    form = json.load(f)['form']
  bbox_by_name[os.path.splitext(name)[0]] = [
      word['box'] for entity in form for word in entity['words']
  ]
print(len(bbox_by_name))

# Creates missing parents and is a no-op when the directory already exists.
os.makedirs(out_dir, exist_ok=True)

img_ls = sorted(os.listdir(img_dir))
for img_name in img_ls:
  file_name = os.path.splitext(img_name)[0]
  im = cv2.imread(os.path.join(img_dir, img_name))
  if im is None:
    # cv2.imread signals an unreadable file by returning None.
    raise ValueError('Could not read image: ' + img_name)

  instances, features = doit(im, np.array(bbox_by_name[file_name]))
  visual_features = features.tolist()
  with open(os.path.join(out_dir, file_name + '.json'), 'w') as file_object:
    json.dump(visual_features, file_object)

50
Original image size:  (1000, 777)
Transformed image size:  (1030, 800)
Pooled features size: torch.Size([270, 2048])
torch.Size([270, 1601])
Original image size:  (1000, 754)
Transformed image size:  (1061, 800)
Pooled features size: torch.Size([160, 2048])
torch.Size([160, 1601])
Original image size:  (1000, 780)
Transformed image size:  (1026, 800)
Pooled features size: torch.Size([227, 2048])
torch.Size([227, 1601])
Original image size:  (1000, 802)
Transformed image size:  (998, 800)
Pooled features size: torch.Size([108, 2048])
torch.Size([108, 1601])
Original image size:  (1000, 754)
Transformed image size:  (1061, 800)
Pooled features size: torch.Size([211, 2048])
torch.Size([211, 1601])
Original image size:  (1000, 754)
Transformed image size:  (1061, 800)
Pooled features size: torch.Size([195, 2048])
torch.Size([195, 1601])
Original image size:  (1000, 754)
Transformed image size:  (1061, 800)
Pooled features size: torch.Size([141, 2048])
torch.Size([141, 1601])
Original im

In [42]:
import os
json_ls = os.listdir('/content/data/training_data/annotations')

# Generating the bbox 
img_bbox_list = {}
import json
for i,name in enumerate(json_ls):
  img_bbox_list[name[:-5]] = []
  path = '/content/data/training_data/annotations/'+name
  with open(path) as f:
    json_list = json.load(f)
  json_list = json_list['form']
  for j in range(len(json_list)):
    img_bbox_list[name[:-5]].append(json_list[j]['box'])
print(img_bbox_list)

!mkdir /content/drive/MyDrive/funsd/visual_feature/object_train

img_ls = os.listdir('/content/data/training_data/images')
for i in range(len(img_ls)):
  path = '/content/data/training_data/images/'+img_ls[i]
  im = cv2.imread(path)
  instances, features = doit(im, np.array(img_bbox_list[img_ls[i][:-4]]))
  visual_features = features.tolist()
  file_name = img_ls[i][:-4]
  path = '/content/drive/MyDrive/funsd/visual_feature/object_train/'+file_name+'.json'
  with open(path,'w') as file_object:
    json.dump(visual_features,file_object)

{'716552': [[17, 109, 84, 119], [17, 163, 53, 173], [54, 163, 72, 173], [341, 154, 387, 168], [246, 215, 257, 223], [237, 187, 262, 201], [36, 216, 83, 227], [24, 218, 35, 228], [257, 216, 306, 226], [510, 215, 523, 223], [589, 215, 641, 226], [640, 215, 646, 226], [334, 258, 351, 266], [324, 257, 334, 264], [34, 394, 87, 409], [17, 480, 99, 495], [215, 395, 281, 406], [338, 395, 411, 408], [478, 394, 520, 408], [17, 514, 81, 527], [32, 568, 86, 583], [215, 570, 282, 584], [338, 571, 409, 584], [115, 538, 137, 548], [138, 536, 151, 549], [341, 535, 384, 549], [18, 631, 82, 645], [478, 570, 523, 580], [649, 712, 671, 827], [28, 856, 59, 866], [490, 858, 518, 868], [236, 11, 474, 52], [18, 11, 82, 23], [18, 77, 73, 90], [117, 99, 373, 117], [18, 135, 64, 145], [117, 128, 252, 143], [116, 155, 158, 172], [281, 162, 317, 172], [486, 161, 557, 171], [568, 151, 616, 167], [17, 187, 99, 202], [105, 215, 176, 226], [334, 208, 399, 226], [423, 209, 482, 226], [524, 209, 578, 228], [17, 256, 248

In [43]:
import os
json_ls = os.listdir('/content/data/testing_data/annotations')

# Generating the bbox 
img_bbox_list = {}
import json
for i,name in enumerate(json_ls):
  img_bbox_list[name[:-5]] = []
  path = '/content/data/testing_data/annotations/'+name
  with open(path) as f:
    json_list = json.load(f)
  json_list = json_list['form']
  for j in range(len(json_list)):
    img_bbox_list[name[:-5]].append(json_list[j]['box'])
print(img_bbox_list)

!mkdir /content/drive/MyDrive/funsd/visual_feature/object_test

img_ls = os.listdir('/content/data/testing_data/images')
for i in range(len(img_ls)):
  path = '/content/data/testing_data/images/'+img_ls[i]
  im = cv2.imread(path)
  instances, features = doit(im, np.array(img_bbox_list[img_ls[i][:-4]]))
  visual_features = features.tolist()
  file_name = img_ls[i][:-4]
  path = '/content/drive/MyDrive/funsd/visual_feature/object_test/'+file_name+'.json'
  with open(path,'w') as file_object:
    json.dump(visual_features,file_object)

{'92380595': [[293, 152, 398, 176], [27, 155, 63, 172], [28, 183, 50, 197], [88, 228, 126, 241], [28, 284, 52, 299], [546, 240, 556, 255], [546, 265, 591, 280], [433, 378, 453, 389], [249, 401, 257, 414], [68, 432, 124, 447], [68, 453, 124, 466], [68, 510, 104, 523], [49, 508, 62, 522], [317, 468, 388, 481], [525, 464, 579, 481], [31, 591, 49, 602], [32, 655, 82, 670], [85, 656, 119, 670], [194, 673, 233, 690], [374, 556, 381, 571], [374, 570, 382, 584], [374, 584, 381, 599], [374, 595, 381, 612], [559, 564, 613, 578], [32, 697, 107, 712], [490, 792, 529, 806], [32, 851, 67, 865], [98, 841, 112, 856], [708, 817, 722, 892], [444, 920, 487, 935], [342, 943, 399, 960], [151, 35, 544, 87], [133, 88, 563, 115], [27, 128, 224, 146], [437, 123, 668, 141], [443, 159, 616, 181], [87, 152, 146, 167], [87, 180, 254, 225], [306, 187, 393, 209], [131, 216, 398, 240], [447, 219, 551, 232], [447, 233, 520, 262], [447, 278, 507, 293], [88, 275, 199, 292], [162, 296, 295, 311], [244, 317, 475, 332], [5

In [None]:
import json 
import os

# Creates missing parents and is a no-op when the directory already exists.
out_dir = '/content/drive/MyDrive/G33/visual_feature'
os.makedirs(out_dir, exist_ok=True)

#g33_annotation 
with open('/content/drive/MyDrive/labeled_dataset.json') as f:
    self_labeled_data = json.load(f)

# Parallel lists: png_name_list[n] is a page image name without extension and
# bbox_list[n] holds the 'coord' of every sub-object on that page.
png_name_list = []
bbox_list = []

for pdf_conf in self_labeled_data.values():
    for page_conf in pdf_conf['pages'].values():
        png_name_list.append(os.path.splitext(page_conf['page_name'])[0])
        bbox_list.append([
            sub_obj['coord']
            for obj in page_conf['objects'].values()
            for sub_obj in obj['sub_obj'].values()
        ])

print(len(png_name_list))
print(len(bbox_list))

# The counters live outside the loop; resetting them on every iteration made
# the final ratio reflect only the last page.
success = 0
fail = 0

for png_name, page_boxes in zip(png_name_list, bbox_list):
    path = '/content/drive/MyDrive/COMP5703_dataset/' + png_name + '.png'
    # Everything that can fail for a page sits inside the try, and runs once.
    # Previously an extra, unguarded doit() call ahead of the try raised before
    # the handler could count the failure, and doubled the work on success.
    try:
        im = cv2.imread(path)
        if im is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(path)
        instances, features = doit(im, np.array(page_boxes))
        visual_features = features.tolist()
        out_path = os.path.join(out_dir, png_name + '.json')
        with open(out_path, 'w') as file_object:
            json.dump(visual_features, file_object)
        success = success + 1
    except Exception as err:
        # Still best-effort per page, but report what went wrong and do not
        # swallow KeyboardInterrupt the way a bare `except:` does.
        fail = fail + 1
        print('Failed on', png_name, ':', repr(err))

total = success + fail
# Guard the empty case so the summary line cannot divide by zero.
print(success / total if total else 0.0)
        

