## Distributed OCR 

### 1. Set up Google Cloud Vision

In [1]:
%%capture
!pip3 install --upgrade google-cloud-vision
!pip install --upgrade google-cloud-vision

### 2. Global variables and helper functions

In [2]:
# Mount Google Drive at /content/drive (Colab only) so that the
# /content/drive/MyDrive/... paths configured in this notebook can be resolved.
from google.colab import drive
drive.mount('/content/drive')

# third-party and standard-library imports
from google.colab.patches import cv2_imshow
from google.cloud import vision
from pathlib import Path
import numpy as np
import json
import cv2
import io
import os

# ---- Read a JSON file in Colab ----
def read_json(path):
    """Load the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value, or ``False`` when the file cannot be opened
        or parsed. Callers must check for ``False`` before using the result.
    """
    try:
        # The context manager guarantees the handle is closed even if parsing fails.
        with open(path, "rb") as json_file:
            return json.load(json_file)
    except Exception as exc:
        # Best-effort by design: report the reason and signal failure with False.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit through.
        print("read_json(): fail read JSON...", exc)
        return False

# ---- Save a JSON file in Colab ----
def save_json(dict_data, path):
    """Write ``dict_data`` to ``path`` as JSON.

    Args:
        dict_data: JSON-serializable object to store.
        path: Destination file; it is overwritten if it already exists.

    Returns:
        ``dict_data`` unchanged, whether or not the write succeeded. A failure
        is only reported through a printed message.
    """
    try:
        # Closing the file via ``with`` flushes the data to disk deterministically
        # instead of relying on garbage collection of the handle.
        with open(path, "w") as json_file:
            json.dump(dict_data, json_file)
    except Exception as exc:
        # Best-effort by design: report the reason, but do not abort the caller.
        print("save_json(): fail save as JSON...", exc)

    return dict_data

def get_pdf_list(data):
    """Return the items yielded by iterating ``data`` as a new list.

    For a dict this is the list of its keys, in insertion order.
    """
    return [entry for entry in data]



# TODO - Modify global variables
# Root folder on the mounted Drive; main() prefixes OCR_key with it.
base_path = "/content/drive/MyDrive/FUNSD"
# Path of the service-account key file, relative to base_path (note the leading '/').
OCR_key = "/OCR_credential/key_1.json"
# Folder holding the page images; each page's 'page_name' is appended to it.
png_folder = "/content/drive/MyDrive/COMP5703_dataset"

# NOTE(review): offset_min / offset_max are not referenced by any code visible in
# this notebook — presumably meant to select a slice of PDFs per worker; confirm.
offset_min = 0
offset_max = 1500

# Execution
# NOTE: despite its name, target_folder is the path of the input JSON *file*.
target_folder = "/content/drive/MyDrive/FUNSD/components/origin_data_0.json"
# Absolute path of the JSON file that save_json() is called with for the OCR output.
result_storage = "/content/drive/MyDrive/FUNSD/components/OCR_data_0.json"
# read_json() returns False when the file cannot be read, so `data` may be False here.
data = read_json(target_folder)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


def OCR_extraction(png_config):
    """Run Google Cloud Vision text detection on every sub-object of one page.

    The page image ``png_folder + '/' + png_config['page_name']`` is loaded,
    each ``png_config['objects'][i]['sub_obj'][j]['coord']`` region is cropped
    (rows ``coord[3]:coord[1]``, columns ``coord[0]:coord[2]``), and the
    detected text, with newlines removed, is stored under the sub-object's
    ``'text'`` key. A crop with no detected text gets an empty string.

    Args:
        png_config: Page description with the keys listed above.

    Returns:
        The same ``png_config`` object, updated in place (it is not a copy).

    Raises:
        FileNotFoundError: The page image cannot be read.
        ValueError: A crop cannot be encoded as JPEG.
        RuntimeError: The Vision API reported an error for a crop.
    """
    # generate configuration
    png_name = png_config['page_name']
    png_path = png_folder + '/' + png_name
    png_image = cv2.imread(png_path, cv2.IMREAD_UNCHANGED)
    if png_image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError("OCR_extraction(): cannot read image " + png_path)

    # NOTE: this is an alias, not a copy; the caller's dict is modified in place.
    png_config_copy = png_config

    # One client per page is enough; it does not depend on the crop.
    client = vision.ImageAnnotatorClient()

    # crop image for OCR
    for obj_index in png_config['objects']:
        sub_obj_list = png_config['objects'][obj_index]['sub_obj']
        for sub_obj_item in sub_obj_list:
            coord = sub_obj_list[sub_obj_item]['coord']
            # NOTE(review): row slice runs from coord[3] to coord[1], so this
            # assumes coord[3] <= coord[1] — confirm against the data producer.
            cropped_image = png_image[coord[3]:coord[1], coord[0]:coord[2]]

            # Encode in memory rather than through a shared tmp.jpg on Drive:
            # a failed write there would let the previous crop's bytes be OCR'd.
            encoded_ok, jpeg_buffer = cv2.imencode('.jpg', cropped_image)
            if not encoded_ok:
                raise ValueError(
                    "OCR_extraction(): cannot encode crop {} of {}".format(sub_obj_item, png_name)
                )
            content = jpeg_buffer.tobytes()

            # OCR
            image = vision.Image(content=content)
            response = client.text_detection(image = image)
            if response.error.message:
                raise RuntimeError(
                    "OCR_extraction(): Vision API error: " + response.error.message
                )

            texts = response.text_annotations
            # No annotations means no text was found; store '' instead of
            # failing with IndexError on texts[0].
            total_text = texts[0].description.replace('\n', '') if texts else ''
            png_config_copy['objects'][obj_index]['sub_obj'][sub_obj_item]['text'] = total_text

    return png_config_copy

def main(data):
    """Run OCR over every page of every PDF in ``data``.

    Each entry of ``data[pdf_name]['pages']`` is passed to ``OCR_extraction``
    and the result is stored back in ``data``. A PDF counts as failed as soon
    as one of its pages raises; its remaining pages are skipped and processing
    continues with the next PDF. After every 100 processed PDFs a progress
    line is printed and ``data`` is checkpointed to ``result_storage``.

    Args:
        data: Mapping of PDF name to a dict with a ``'pages'`` mapping.

    Returns:
        The same ``data`` object, updated in place.
    """
    pdf_list = get_pdf_list(data)
    total = len(pdf_list)
    # The Vision client picks up its credentials from this environment variable.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = base_path + OCR_key

    success = 0
    fail = 0

    for pdf_name in pdf_list:
        pdf_config = data[pdf_name]['pages']

        try:
            for png_index in pdf_config:
                png_copy = OCR_extraction(pdf_config[png_index])
                data[pdf_name]['pages'][png_index] = png_copy
            success = success + 1
        except Exception as exc:
            # Keep going with the next PDF, but say why this one failed.
            fail = fail + 1
            print("main(): OCR failed for {}: {!r}".format(pdf_name, exc))

        processed = success + fail
        # Checkpoint at every multiple of 100, including the first one.
        if processed % 100 == 0:
            print("Process = {} success in {}/{} ".format(success, processed, total))
            save_json(data, result_storage)

    return data
    

# Run OCR over every PDF in `data`, then write the returned result to `result_storage`.
tmp = main(data)
save_json(tmp, result_storage)


In [None]:
import cv2


# draw all boxes on image 
def draw_box(json_data, off_set = False, show_upper = True):
    """Display every page image with its bounding boxes drawn on it.

    For each page in ``json_data[pdf]['pages']`` the image
    ``png_folder + '/' + page['page_name']`` is loaded, each sub-object's
    ``'coord'`` box is drawn with colour ``(0, 255, 0, 255)`` and, when
    ``show_upper`` is truthy, each object's ``'lt_obj'`` box with colour
    ``(0, 0, 255, 255)``. The page name is printed and the image is shown with
    ``cv2_imshow``. Pages whose image cannot be read are reported and skipped.

    Args:
        json_data: Mapping of PDF name to a dict with a ``'pages'`` mapping.
        off_set: Not used by this function; kept for backward compatibility.
        show_upper: Whether to also draw the ``'lt_obj'`` boxes.

    Returns:
        Always ``False``.
    """
    for pdf in json_data:
        pages = json_data[pdf]['pages']

        for page in pages:
            objects = pages[page]['objects']
            png_name = pages[page]['page_name']
            png_image = cv2.imread(png_folder + '/' + png_name, cv2.IMREAD_UNCHANGED)
            if png_image is None:
                # cv2.imread returns None for a missing/unreadable file; skip the
                # page instead of letting cv2.rectangle abort the whole loop.
                print("draw_box(): cannot read image " + png_name)
                continue

            for upper_index in objects:
                sub_box_list = objects[upper_index]['sub_obj']

                for sub_index in sub_box_list:
                    sub_box = sub_box_list[sub_index]['coord']
                    cv2.rectangle(png_image, (sub_box[0], sub_box[1]), (sub_box[2], sub_box[3]), (0, 255, 0, 255), 1)

                if show_upper:
                    # 'lt_obj' is only needed (and only looked up) when it is drawn.
                    upper_box = objects[upper_index]['lt_obj']
                    cv2.rectangle(png_image, (upper_box[0], upper_box[1]), (upper_box[2], upper_box[3]), (0, 0, 255, 255), 1)

            print(png_name)
            cv2_imshow(png_image)

    return False


# Show every page with only the sub-object boxes (show_upper=False hides the 'lt_obj' boxes).
# NOTE(review): off_set is never read inside draw_box, so off_set=1 has no effect.
draw_box(data, off_set = 1, show_upper = False)