# Dataset creation 
Document understanding dataset creation base on FUNSD.


## 1. Setup google drive
---

In [3]:
%%capture
# PDF data packages 
!pip install pdfminer
!pip install PyMuPDF

# connect to drive
from google.colab import drive
drive.mount('/content/drive')

# global variables
base_path = "/content/drive/MyDrive"
pdf_dataset_path = "/content/drive/MyDrive/form_dataset"
png_dataset_path = "/content/drive/MyDrive/COMP5703_dataset"
dataset_off_set = 7130


## 2. Create PNG
---

In [4]:
# pip packages 
from pathlib import Path
import glob, sys, fitz
import pandas as pd
import numpy as np
import ntpath
import tqdm
import os

# Create folder at google drive 
def create_folder(base_path, folder_name):
  folder_path = base_path + '/' + folder_name
  Path(folder_path).mkdir(parents = True, exist_ok = True)
  return folder_path

# read files under path
def read_files(path, off_set = False):
  if (off_set == False):
    all_files = os.listdir(path)
  else:
    all_files = os.listdir(path)[:off_set]

  return all_files

In [5]:
# pip packages 
from pathlib import Path
import glob, sys, fitz
import pandas as pd
import numpy as np
import ntpath
import tqdm
import os


# Create folder at google drive 
def create_folder(base_path, folder_name):
  folder_path = base_path + '/' + folder_name

  try:
    Path(folder_path).mkdir(parents = True, exist_ok = True)
    return folder_path
  except:
    print("create_folder(): Fail {} at {}".format(base_path, folder_name))
  
  return False


# read files under path
def read_files(path, off_set = False):
  if (off_set == False):
    all_files = os.listdir(path)
  else:
    all_files = os.listdir(path)[:off_set]

  return all_files


def create_dataset(off_set = False):
  print("create_dataset(): Start")

  png_path = create_folder(base_path, "COMP5703_dataset")
  print("    Success create PNG folder at {}".format(png_path))

  all_files = read_files(pdf_dataset_path, off_set = off_set)
  print("    Success read {} pdf files at {}".format(len(all_files), pdf_dataset_path))
  
  pdf_counter = 0
  png_counter = 0
  failure_pdf = []
  failure_png = []

  for pdf_file in all_files:
    # open pdf file 
    try:
      pdf_code = pdf_file.split('.')[0]
      pdf_path = pdf_dataset_path + '/' + pdf_file
      document = fitz.open(pdf_path)

    except:
      failure_pdf = failure_pdf.append(pdf_path)
      pass

    # iterate through all pages
    for page in document:
      try: 
        pix = page.get_pixmap()

        # save png file
        png_path = "{}-{}.png".format(png_dataset_path + '/' + pdf_code, page.number)
        pix.save(png_path)
        png_counter = png_counter + 1
      except:
        failure_png = failure_png.append(png_path)
        pass
    
    pdf_counter = pdf_counter + 1
  
  print("    create_dataset(): {}/{} pdf files readed".format(pdf_counter, len(all_files)))
  print("    create_dataset(): {} png files saved".format(png_counter))

  return failure_png, failure_pdf

def save_json(dict_data):
  try:
    json_file = open('data.json', "w")
    json.dump(dict_data, json_file)
  except:
    print("    save_json(): fail save as JSON")
  finally:
    return dict_data

# ---------- local cell main ----------
#failure_png, failure_pdf = create_dataset(off_set = dataset_off_set)

#if (len(failure_pdf) > 0 or len(failure_png) > 0):
#  print(failure_pdf)
#  print(failure_png)

## 3. Bounding box extraction 
---

In [None]:
# PDF data packages 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage

# CV
import cv2
from google.colab.patches import cv2_imshow

# pip packages 
from pathlib import Path
import glob, sys, fitz
import pandas as pd
import numpy as np
import ntpath
import tqdm
import json
import math
import os


def merge_box(box1, box2):
  return [min(box1[0], box2[0]), max(box1[1], box2[1]), max(box1[2], box2[2]), min(box1[3], box2[3])]

def get_box_distance(box1, box2):
  return min(box2[3] - box1[1], box1[3] - box2[1], box1[1] - box2[3], box2[1] - box1[3])


def merge_algo(sub_boxes, threshold):
  result = sub_boxes
  for i in range(len(sub_boxes)):
    for j in range(len(sub_boxes)):
      if (j <= i):
        continue
      if (get_box_distance(sub_boxes[i], sub_boxes[j]) <= verti_threshold):
        new_box = merge_box(sub_boxes[i], sub_boxes[j])            
        result[i] = new_box
        result.pop(j)
        return True, result

  return False, result


def merge_boxes(box_list, threshold):
  clean_box = []
  for box in box_list:
    width, height = get_box_detail(box)
    if (width > 5 and height > 5):
      clean_box.append(box)
    else:
      continue
  
  need_to_merge = True
  tmp_box = clean_box
  while need_to_merge:
    need_to_merge, tmp_box = merge_algo(tmp_box, threshold)

  sub_obj = {}
  for index in range(len(tmp_box)):
    sub_obj[index] = {}
    sub_obj[index]['coord'] = tmp_box[index]

  return sub_obj


def get_box_detail(coord):
  x1 = coord[0]
  y1 = coord[1]
  x2 = coord[2]
  y2 = coord[3]
  return abs(x2 - x1), abs(y2 - y1)


def get_box(sizeRatioW, sizeRatioH, pageW, pageH, bbox):
  x1 = int(bbox[0] * sizeRatioW)
  y1 = int(pageH - bbox[1] * sizeRatioH)
  x2 = int(bbox[2] * sizeRatioW)
  y2 = int(pageH - bbox[3] * sizeRatioH)
  return [x1, y1, x2, y2]

def get_png_data(layout, sizeRatioW, sizeRatioH, pageW, pageH, merge = False):
  png_dict = {}
  upper_index = 0

  for lt_obj in layout:
    if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
      upper_box = get_box(sizeRatioW, sizeRatioH, pageW, pageH, lt_obj.bbox)
      box_width, box_height = get_box_detail(upper_box)
      if (box_width <= 5 or box_height <= 5):
        continue
      else:
        png_dict[upper_index] = {}
        png_dict[upper_index]['lt_obj'] = upper_box
        # don't merge sub boxs 
        sub_index = 0
        sub_dict = {}
        merge_list = []
        for obj in lt_obj:
          sub_box = get_box(sizeRatioW, sizeRatioH, pageW, pageH, obj.bbox)
          box_width, box_height = get_box_detail(sub_box)
          if (merge == False):
            if (box_width <= 5 or box_height <= 5):
              continue
            else:
              sub_dict[sub_index] = {}
              sub_dict[sub_index]['coord'] = sub_box
              sub_index = sub_index + 1
          else:
            merge_list.append(sub_box)

      if(merge == True):
          sub_dict = merge_boxes(merge_list, global_dist_threshold)
        
      png_dict[upper_index]['sub_obj'] = sub_dict
      upper_index = upper_index + 1
  
      
  return png_dict


def get_pdf_data(pdf_name, merge = False):
  # set config and path 
  pdf_code = pdf_name.split('.')[0]
  pdf_path = pdf_dataset_path + '/' + pdf_name
  pdf_file = open(pdf_path, 'rb')

  # process pdf
  parser = PDFParser(pdf_file)
  document = PDFDocument(parser)
  rsrcmgr = PDFResourceManager()
  laparams = LAParams()
  device = PDFPageAggregator(rsrcmgr, laparams = laparams)
  interpreter = PDFPageInterpreter(rsrcmgr, device)

  # pdf level dict
  pdf_dict = {}
  pdf_dict['document_id'] = pdf_code
  pdf_dict['pages'] = {}

  # iterate all pages 
  page_index = 0
  for page in PDFPage.create_pages(document):
    # set config for image 
    png_name = "{}-{}.png".format(pdf_code, page_index)
    png_path = "{}/{}-{}.png".format(png_dataset_path, pdf_code, page_index)
    pdf_dict['pages'][page_index] = {}
    pdf_dict['pages'][page_index]['page_name'] = png_name

    # read image
    png = cv2.imread(png_path, cv2.IMREAD_UNCHANGED)
    height, width, channels = png.shape
    pageW = page.mediabox[2]
    pageH = page.mediabox[3] 
    sizeRatioW =  width / page.mediabox[2]
    sizeRatioH =  height / page.mediabox[3]
    pdf_dict['pages'][page_index]['height'] = pageH
    pdf_dict['pages'][page_index]['width'] = pageW
    pdf_dict['pages'][page_index]['objects'] = {}

    # get layout
    interpreter.process_page(page)
    layout = device.get_result()
    pdf_dict['pages'][page_index]['objects'] = get_png_data(layout, sizeRatioW, sizeRatioH, pageW, pageH, merge = merge)

    page_index = page_index + 1


  return pdf_dict

# iterate all 
def build_json(off_set = False, show = False, merge = False):
  all_pdf = read_files(pdf_dataset_path, off_set = off_set)
  fail_pdf_list = []
  pdf_level_dict = {}
  process = 0
  print("current: {}".format(process))
  for pdf in all_pdf:
    try:
      pdf_level_dict[pdf] = get_pdf_data(pdf, merge = merge)
    except:
      fail_pdf_list.append(pdf)
    
    if process % 200 == 0 and process >= 200:
      print("{}/{} completed".format(process, len(all_pdf)))
      save_json(pdf_level_dict)

    process = process + 1

  return pdf_level_dict, fail_pdf_list

global_dist_threshold = 5
verti_threshold = 5 
json_data, fail_pdf_list = build_json(off_set = dataset_off_set, show = False, merge = True)
save_json(json_data)


current: 0
200/7130 completed


In [7]:
print(json_data)
print(fail_pdf_list)

{}
['01938902.pdf', '01938906.pdf', '01938979.pdf', '01939104.pdf', '01939177.pdf', '01939210.pdf', '01939212.pdf', '01939244.pdf', '01939260.pdf', '01939347.pdf', '01939411.pdf', '01939443.pdf', '01939460.pdf', '01939560.pdf', '01939568.pdf', '01939632.pdf', '01939635.pdf', '01939638.pdf', '01939652.pdf', '01939653.pdf', '01939655.pdf', '01939715.pdf', '01939724.pdf', '01939836.pdf', '01939894.pdf', '01939897.pdf', '01939925.pdf', '01940013.pdf', '01940025.pdf', '01940026.pdf', '01940141.pdf', '01940195.pdf', '01940205.pdf', '01940271.pdf', '01940315.pdf', '01940380.pdf', '01940433.pdf', '01940453.pdf', '01940663.pdf', '01940686.pdf', '01940701.pdf', '01940716.pdf', '01940761.pdf', '01940815.pdf', '01940823.pdf', '01940859.pdf', '01940868.pdf', '01940877.pdf', '01940926.pdf', '01940966.pdf', '01941035.pdf', '01941038.pdf', '01941039.pdf', '01941043.pdf', '01941237.pdf', '01941291.pdf', '01941348.pdf', '01941602.pdf', '01941735.pdf', '01941743.pdf', '01941753.pdf', '01941775.pdf', '019

In [None]:
def save_json(dict_data):
  try:
    json_file = open('data.json', "w")
    json.dump(dict_data, json_file)
  except:
    print("    save_json(): fail save as JSON")
  finally:
    return dict_data

### 3.3 Visualisation
---

In [8]:
import cv2


# draw all boxes on image 
def draw_box(json_data, off_set = False, show_upper = True):
  for pdf in json_data:
    pages = json_data[pdf]['pages']
    
    for page in pages:
      objects = pages[page]['objects']
      png_name = pages[page]['page_name']
      png_image = cv2.imread(png_dataset_path + '/' + png_name, cv2.IMREAD_UNCHANGED)

      for upper_index in objects:
        upper_box = objects[upper_index]['lt_obj']
        sub_box_list = objects[upper_index]['sub_obj']

        for sub_index in sub_box_list:
          sub_box = sub_box_list[sub_index]['coord']
          cv2.rectangle(png_image, (sub_box[0], sub_box[1]), (sub_box[2], sub_box[3]), (0, 255, 0, 255), 1)

        if (show_upper == True):
          cv2.rectangle(png_image, (upper_box[0], upper_box[1]), (upper_box[2], upper_box[3]), (0, 0, 255, 255), 1)
      
    cv2_imshow(png_image)

  return False


draw_box(json_data, off_set = False, show_upper = False)

False

## OCR

We provide sample codes for OCR by using Google Cloud Vision. You need to create a Google Cloud Vision account with JSON keys. (Each Group Member at least create one account since our dataset is very large.)

In [2]:
!pip3 install --upgrade google-cloud-vision

Collecting google-cloud-vision
  Downloading google_cloud_vision-2.7.2-py2.py3-none-any.whl (383 kB)
[K     |████████████████████████████████| 383 kB 6.6 MB/s 
[?25hCollecting google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  Downloading google_api_core-2.7.1-py3-none-any.whl (114 kB)
[K     |████████████████████████████████| 114 kB 19.4 MB/s 
[?25hCollecting proto-plus>=1.15.0
  Downloading proto_plus-1.20.3-py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 1.9 MB/s 
Collecting grpcio-status<2.0dev,>=1.33.2
  Downloading grpcio_status-1.45.0-py3-none-any.whl (10.0 kB)
Collecting grpcio<2.0dev,>=1.33.2
  Downloading grpcio-1.45.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 37.1 MB/s 
[?25hCollecting protobuf>=3.12.0
  Downloading protobuf-3.19.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4

In [2]:
import os 
print(os.getcwd())

/content


In [3]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ="/content/i-mariner-344411-a88c2da9e583.json"

In [4]:
from google.cloud import vision
client = vision.ImageAnnotatorClient()

### OCR for Bounding Box 

#### Cropping Bounding Box images

In [None]:
import cv2
import os
for img in list(image_dict['/content/form_dataset/00346615'].keys()):
  dir = '/content/'+ img # Change to your image storage directory
  image = cv2.imread(dir)
  img_name = img.split('/')[-1]
  img_name = img_name.split('.')[0]

  for obj in image_dict['00346615'][img]['objects']:
    objt = image_dict['00346615'][img]['objects'][obj]
    #print(objt['bbox'])
    x = objt['bbox'][0]
    y = objt['bbox'][1]
    w = objt['bbox'][2]
    h = objt['bbox'][3]
    
    crop_img = image[h:y, x:w]
    cv2_imshow(crop_img)
    cv2.waitKey(0)
    
    # Change to your Google drive path for storing those cropped images based on the extracted bounding box coordinates
    try:
      os.mkdir('/content/drive/MyDrive/form_understanding_dataset/ocr_testing/'+img_name)
    except:
      pass
    save_dir = '/content/drive/MyDrive/form_understanding_dataset/ocr_testing'+'/'+img_name+'/'+str(obj)+'.jpg'
    cv2.imwrite(save_dir,crop_img)

#### OCR for Croped Images

In [None]:
for pdf in image_dict:
  for image_name in image_dict[pdf]:
    for i in range(len(image_dict[pdf][image_name]['objects'])):
      # Change this to your cropped image path
      dir = '/content/drive/MyDrive/form_understanding_dataset/ocr_testing/00346615-0/'+str(i)+'.jpg'
      with io.open(dir, 'rb') as image_file:
        content = image_file.read()
      image = vision.Image(content=content)
      try:
        response = client.text_detection(image = image)
        texts = response.text_annotations
        image_dict[pdf][image_name]['objects'][str(i)]['text'] = texts[0].description
        # please change code from here to add the token level information based on extracted text from google cloud vision API
      except:
        pass
      if response.error.message:
        raise Exception('{}\nFor more info on error messages, check: ''https://cloud.google.com/apis/design/errors'.format(response.error.message))


In [None]:
import json
with open('/content/drive/MyDrive/form_understanding_dataset/test_dataset_info.json','w') as f:
  json.dump(image_dict,f)

### OCR for Scanned Documents

#### Get Scanned Documents

In [5]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

# Authenticate
drive = None
def authenticate():
  global drive
  
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
  authenticate()
  
  for fileId in fileIds:    
    
    downloaded = drive.CreateFile({"id": fileId[1]})
    downloaded.GetContentFile(fileId[0])

In [6]:
try:
  _ = open("604_5000.zip", "r")
except:
  downloadFiles([["604_5000.zip", "1XEQB9hhVDjKDO5_OdbEjsKJTeqeEna7U"]])

In [7]:
!mkdir /content/form_dataset
!7za e /content/604_5000.zip -o/content/form_dataset


7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 845025911 bytes (806 MiB)

Extracting archive: /content/604_5000.zip
 81% 4096 Open              --
Path = /content/604_5000.zip
Type = zip
Physical Size = 845025911

  0%      2% 110 - 00345560.pdf                         5% 243 - 00348051.pdf                         7% 383 - 00351825.pdf                         9% 498 - 00354082.pdf                        12% 627 - 00357111.pdf                        14% 710 -

### Convert PDF to Images
Please use the Create Image Functions

In [None]:
!pip install fitz
!pip install PyMuPDF
import glob, sys, fitz
import os
from pathlib import Path
def createImages(foldername):
    num = 1
    
    all_files = os.listdir(foldername)[:100]
    Path("./images").mkdir(parents=True, exist_ok=True)
    print(all_files)
    num_10 = 0
    num_5  = 0
    for filename in all_files:
        print(num)
        num+=1
        path = foldername + '/' + filename
        doc = fitz.open(path)  # open document
        i = 0
        for page in doc:  # iterate through the pages
            i += 1
            pix = page.get_pixmap()  # render page to an image
            filename = filename.split('.')[0]
            filename = filename.split('/')[-1]
            # Change the direction to your own google drive
            filename = '/content/images/'+filename
            pix.save("{}_page-{}.png".format(filename, page.number))  # store image as a PNG

foldername = "/content/form_dataset"
createImages(foldername)

### OCR

In [9]:
import io
from google.cloud import vision
for i in range(0,1):
  # Change this to your splitted document image path
  dir = '/content/images/00344564_page-' + str(i) + '.png'
  with io.open(dir, 'rb') as image_file:
    content = image_file.read()
  image = vision.Image(content=content)
  #try:
  
  response = client.text_detection(image = image)
  
  texts = response.text_annotations
  # Please Store the extracted Texts into json file based on the provided json format
  print(texts)  
  #except:
    #print("pass")
    #pass
  if response.error.message:
    raise Exception('{}\nFor more info on error messages, check: ''https://cloud.google.com/apis/design/errors'.format(response.error.message))
  


[locale: "en"
description: "13. Feb. 2013 10:50\nSSB AUSTRALIA 612 92512997\nNc. 5167 P. 1/7\nSALOMON SMITHBARNEY\nAmemberof citigroup.\nfacsimile\nTo:\nThe IndexShares Fund\nFax:\n61 2 8295 8659\nTo:\nAustralian Stock Exchange Limited\nFax:\n1300 300 021\nFrom:\nVictoria Weekes\nFax:\n61 2 8225 5461\nCompany Secretary\nDate:\n13 February 2003\nPages:\n7 (Including cover sheet)\nSubject:\nSubstantial Shareholding Notice\nIf you do not receive all pages, please telephone on 61 2 82254965.\nThis fax is confidential and may be privileged. If you are not the intended recipient, please notify the sender immediately by telephone.\nSee attached.\nDocument2\nSALOMON SMITH BARNEY AUSTRALIA PTY LIMITED\nABN 56 081 472684\nLevel 40, Citigroup Centre, 2 Park Street, Sydney NSW 2000\nGPO Box 557, Sydney NSW 2001 Australia\nTELEPHONE: 61282254000\nFACSIMILE:\n61 2 8225 5461\n"
bounding_poly {
  vertices {
    x: 130
  }
  vertices {
    x: 1558
  }
  vertices {
    x: 1558
    y: 2208
  }
  vertices

In [10]:
import proto
serializable_tags = [proto.Message.to_dict(tag) for tag in texts]
print(serializable_tags)

[{'locale': 'en', 'description': '13. Feb. 2013 10:50\nSSB AUSTRALIA 612 92512997\nNc. 5167 P. 1/7\nSALOMON SMITHBARNEY\nAmemberof citigroup.\nfacsimile\nTo:\nThe IndexShares Fund\nFax:\n61 2 8295 8659\nTo:\nAustralian Stock Exchange Limited\nFax:\n1300 300 021\nFrom:\nVictoria Weekes\nFax:\n61 2 8225 5461\nCompany Secretary\nDate:\n13 February 2003\nPages:\n7 (Including cover sheet)\nSubject:\nSubstantial Shareholding Notice\nIf you do not receive all pages, please telephone on 61 2 82254965.\nThis fax is confidential and may be privileged. If you are not the intended recipient, please notify the sender immediately by telephone.\nSee attached.\nDocument2\nSALOMON SMITH BARNEY AUSTRALIA PTY LIMITED\nABN 56 081 472684\nLevel 40, Citigroup Centre, 2 Park Street, Sydney NSW 2000\nGPO Box 557, Sydney NSW 2001 Australia\nTELEPHONE: 61282254000\nFACSIMILE:\n61 2 8225 5461\n', 'bounding_poly': {'vertices': [{'x': 130, 'y': 0}, {'x': 1558, 'y': 0}, {'x': 1558, 'y': 2208}, {'x': 130, 'y': 2208}

In [11]:
import json
with open('/content/scanned_document.json','w') as f:
  json.dump(serializable_tags,f)