In [None]:
# Switch to the project directory so the relative paths used further down
# ("./COVID-19_SocialDistancing.pdf", "./temp") resolve against it, then list
# its contents.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
%cd /home/jupyter/yogendra/
!ls

In [41]:
from google.cloud import vision
from google.cloud.vision import types
import io
from PIL import Image, ImageDraw, ImageFont
from enum import Enum
import pandas as pd
from tqdm import tqdm
from pdf2image import convert_from_path
import six
from google.cloud import translate_v2 as translate
import time
import os
import shutil
import uuid
import subprocess

In [42]:
def convert2docx(path_to_file, save_path):
    """Convert a document to .docx with LibreOffice Writer.

    Runs ``lowriter --convert-to docx --outdir <save_path> <path_to_file>``.
    LibreOffice must be installed (``sudo apt install libreoffice``).

    Args:
        path_to_file: Path of the file to convert.
        save_path: Directory the converted file is written to.

    Returns:
        The exit code of the ``lowriter`` process. A failed conversion is
        reported through this value instead of an exception, so callers that
        ignore the result behave exactly as before.
    """
    # Passed as an argument list (no shell), so paths are handed over verbatim.
    command = ['lowriter', '--convert-to', 'docx', '--outdir', save_path, path_to_file]
    completed = subprocess.run(command)
    return completed.returncode

In [43]:
class FeatureType(Enum):
    """Granularity levels used to select which bounding boxes to collect.

    ``get_document_bounds`` compares its ``feature`` argument against BLOCK,
    PARA, WORD and SYMBOL; it has no branch for PAGE, so PAGE yields an empty
    list there.
    """
    # Members are ordered from the coarsest level to the finest.
    PAGE = 1
    BLOCK = 2
    PARA = 3
    WORD = 4
    SYMBOL = 5
    
def get_document_bounds(document, feature):
    """Collect the bounding boxes of every element at one granularity.

    Walks pages -> blocks -> paragraphs -> words -> symbols in document order
    and gathers the ``bounding_box`` of each element whose level matches
    ``feature``.

    Args:
        document: Object exposing ``pages``; each level exposes the next one
            (``blocks``, ``paragraphs``, ``words``, ``symbols``).
        feature: A ``FeatureType`` member. BLOCK, PARA, WORD and SYMBOL are
            handled; any other value produces an empty list.

    Returns:
        List of bounding boxes in traversal order.
    """
    collected = []
    for page in document.pages:
        for block in page.blocks:
            if feature == FeatureType.BLOCK:
                collected.append(block.bounding_box)
            for paragraph in block.paragraphs:
                if feature == FeatureType.PARA:
                    collected.append(paragraph.bounding_box)
                for word in paragraph.words:
                    # Symbols are always iterated; their boxes are only kept
                    # when the symbol level was requested.
                    collected.extend(
                        symbol.bounding_box
                        for symbol in word.symbols
                        if feature == FeatureType.SYMBOL
                    )
                    if feature == FeatureType.WORD:
                        collected.append(word.bounding_box)
    return collected

In [44]:
def assemble_word(word):
    """Return the word's text: the ``text`` of each of its symbols, concatenated in order."""
    return "".join(symbol.text for symbol in word.symbols)

def get_paragraph_level_text(document):
    """Return the text of every paragraph in the document, in document order.

    Each paragraph's words are rebuilt with ``assemble_word`` and joined with
    single spaces. Paragraphs from all blocks of all pages end up in one flat
    list.

    Args:
        document: Object exposing ``pages`` -> ``blocks`` -> ``paragraphs``
            -> ``words``.

    Returns:
        List with one string per paragraph.
    """
    return [
        " ".join(assemble_word(word) for word in paragraph.words)
        for page in document.pages
        for block in page.blocks
        for paragraph in block.paragraphs
    ]

In [45]:
def chinese_text_wrap(text,font,writing,max_width,max_height,separator=' '):
    """Wrap text character by character so each line fits within ``max_width``.

    Every character of ``text`` is treated as one unit. Units are appended to
    the current line; when the rendered line becomes wider than ``max_width``
    the last unit is moved to a new line. A single unit that is wider than
    ``max_width`` on its own stays alone on its line and does not affect how
    the following units are wrapped.

    Args:
        text: The string to wrap.
        font: Font handed to the drawing object's measuring method.
        writing: Drawing object used for measuring. ``textbbox`` is used when
            available; otherwise ``textsize`` (older Pillow releases).
        max_width: Maximum rendered line width.
        max_height: Accepted for signature compatibility; not used.
        separator: String placed between the characters of a line. Defaults
            to ``' '`` (the historical behavior); pass ``''`` to keep the
            characters adjacent.

    Returns:
        The wrapped text, lines joined with ``'\\n'``.
    """
    def rendered_width(line_text):
        # multiline_textsize()/textsize() no longer exist in Pillow 10+.
        # The right edge of the box drawn at x=0 is the horizontal extent.
        if hasattr(writing, 'textbbox'):
            return writing.textbbox((0, 0), line_text, font=font)[2]
        return writing.textsize(line_text, font=font)[0]

    lines = [[]]
    for char in text:
        current = lines[-1]
        current.append(char)
        # Only the current line can have grown, so only it is measured. A line
        # holding a single unit is never split further (nothing to move), which
        # also avoids a leading empty line when the first unit is too wide.
        if len(current) > 1 and rendered_width(separator.join(current)) > max_width:
            lines.append([current.pop()])
    return '\n'.join(separator.join(line) for line in lines)

In [46]:
def text_wrap(text,font,writing,max_width,max_height):
    """Wrap whitespace-separated text so each line fits within ``max_width``.

    Words are appended to the current line; when the rendered line becomes
    wider than ``max_width`` the last word is moved to a new line. A single
    word that is wider than ``max_width`` on its own stays alone on its line
    and does not affect how the following words are wrapped.

    Args:
        text: The string to wrap; split on whitespace.
        font: Font handed to the drawing object's measuring method.
        writing: Drawing object used for measuring. ``textbbox`` is used when
            available; otherwise ``textsize`` (older Pillow releases).
        max_width: Maximum rendered line width.
        max_height: Accepted for signature compatibility; not used.

    Returns:
        The wrapped text, lines joined with ``'\\n'``.
    """
    def rendered_width(line_text):
        # multiline_textsize()/textsize() no longer exist in Pillow 10+.
        # The right edge of the box drawn at x=0 is the horizontal extent.
        if hasattr(writing, 'textbbox'):
            return writing.textbbox((0, 0), line_text, font=font)[2]
        return writing.textsize(line_text, font=font)[0]

    lines = [[]]
    for word in text.split():
        current = lines[-1]
        current.append(word)
        # Only the current line can have grown, so only it is measured. A line
        # holding a single word is never split further (nothing to move), which
        # also avoids a leading empty line when the first word is too wide.
        if len(current) > 1 and rendered_width(' '.join(current)) > max_width:
            lines.append([current.pop()])
    return '\n'.join(' '.join(line) for line in lines)

def get_bounding_box_info(bound):
    """Return the text origin and the available size of a bounding box.

    Args:
        bound: Object whose ``vertices`` sequence holds points with ``x`` and
            ``y`` attributes; vertices 0, 1 and 3 are read.

    Returns:
        Tuple ``(x0, y0, max_width, max_height)`` where ``(x0, y0)`` is
        vertex 0, ``max_width`` is ``x1 - x0`` and ``max_height`` is
        ``y3 - y0``.
    """
    corners = bound.vertices
    origin_x, origin_y = corners[0].x, corners[0].y
    # Width is measured along edge 0 -> 1, height along edge 0 -> 3.
    available_width = corners[1].x - origin_x
    available_height = corners[3].y - origin_y
    return origin_x, origin_y, available_width, available_height

def fill_bounding_box(bound):
    """Return the rectangle ``(x0, y0, x2, y2)`` spanned by vertices 0 and 2.

    Args:
        bound: Object whose ``vertices`` sequence holds points with ``x`` and
            ``y`` attributes.

    Returns:
        Tuple of the coordinates of vertex 0 followed by those of vertex 2.
    """
    first_corner = bound.vertices[0]
    opposite_corner = bound.vertices[2]
    return first_corner.x, first_corner.y, opposite_corner.x, opposite_corner.y

def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Args:
        target: Language code to translate into.
        text: A string, UTF-8 encoded bytes, or a sequence of strings.

    Returns:
        The translated string, or a list of translated strings (same order)
        when ``text`` is a sequence of strings.
    """
    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # format_="text": the input is plain text. Without it the service treats
    # the input as HTML and returns entity-escaped output (e.g. "&#39;").
    result = translate_client.translate(text, target_language=target, format_="text")

    # A single string yields one result mapping; a sequence yields a list of them.
    if isinstance(result, dict):
        return result["translatedText"]
    return [item["translatedText"] for item in result]

In [86]:
def _font_size_for_ratio(text_ratio):
    """Pick the font size for a box from its area per source character."""
    if text_ratio > 10000:
        return 55
    if text_ratio > 6000:
        return 45
    if text_ratio > 3000:
        return 40
    if text_ratio > 1500:
        return 30
    return 22


def translation_pipeline(pdf_file_path, target, output_path):
    """Translate a PDF page by page and write the result as a new PDF.

    Each page is rendered to an image and sent to Cloud Vision document text
    detection. Every detected paragraph is translated, its box is painted
    white with a red outline, and the wrapped translation plus a running
    ``[n]`` label are drawn into it. The pages are saved as
    ``Translated_<name>_<target>.pdf``; a text report listing the original and
    translated paragraphs is saved as ``Translated_<name>_<target>.txt`` and
    converted to .docx next to it.

    Args:
        pdf_file_path: Path of the PDF to translate.
        target: Language name; one of 'Hindi', 'Chinese', 'Arabic', 'Polish',
            'Spanish', 'Tagalog'.
        output_path: Directory for the generated files (created if missing).

    Raises:
        KeyError: If ``target`` is not a supported language name. Raised
            before any page is processed.
    """
    Target = {'Hindi': 'hi','Chinese':'zh-CN', 'Arabic':'ar','Polish':'pl', 'Spanish':'es', 'Tagalog':'tl'}
    font = {
        'Hindi':'/home/jupyter/yogendra/Fonts/ARIALUNI.TTF',
        'Chinese':'/home/jupyter/yogendra/Fonts/ARIALUNI.TTF',
        'Arabic':'/home/jupyter/yogendra/Fonts/ARIALUNI.TTF',
        'Polish':'/home/jupyter/yogendra/Fonts/ARIALUNI.TTF',
        'Spanish':'/home/jupyter/yogendra/Fonts/ARIALUNI.TTF',
        'Tagalog':'/home/jupyter/yogendra/Fonts/ARIALUNI.TTF'
    }

    # Resolve the language settings first so an unsupported target fails
    # before any OCR or translation request is made.
    language_code = Target[target]
    font_path = font[target]

    # splitext keeps dots inside the file name ("a.v2.pdf" -> "a.v2"), so
    # differently named inputs do not collapse onto the same output name.
    name = os.path.basename(pdf_file_path)
    true_name = os.path.splitext(name)[0]
    txt_name = 'Translated_'+true_name+'_'+target+'.txt'
    output_name ='Translated_'+true_name+'_'+target+'.pdf'

    if output_path:
        os.makedirs(output_path, exist_ok=True)

    pages = convert_from_path(pdf_file_path) # original note: size of image 1700x2200

    # Loop-invariant objects: one API client and one label font for the whole run.
    client = vision.ImageAnnotatorClient()
    box_font = ImageFont.truetype(font=font_path, size=18)
    desc_fonts = {}  # font size -> loaded font, so each size is loaded once

    report_parts = []
    counter = 0
    output_list = []

    folder_name = str(uuid.uuid4().hex)
    os.mkdir(folder_name)
    try:
        for page_index, page in enumerate(tqdm(pages)):
            image_path = os.path.join(folder_name, 'page_'+str(page_index)+'.jpg')
            page.save(image_path)

            # Calling google cloud vision API (text detection)
            with io.open(image_path, 'rb') as image_file:
                content = image_file.read()
            content_image = types.Image(content=content)
            response = client.document_text_detection(image=content_image)
            document = response.full_text_annotation

            # Getting the bounds and text (one entry per paragraph, same order)
            bounds = get_document_bounds(document, FeatureType.PARA)
            TEXT = get_paragraph_level_text(document)

            bg = Image.open(image_path)
            writing = ImageDraw.Draw(bg)

            for bound, source_text in zip(bounds, TEXT):
                if not source_text:
                    # Nothing to translate, and the ratio below would divide by zero.
                    continue

                text_x, text_y, max_width, max_height = get_bounding_box_info(bound)
                x0, y0, x2, y2 = fill_bounding_box(bound)

                # Box area per source character decides the font size.
                text_ratio = (max_width*max_height) / len(source_text)
                font_size = _font_size_for_ratio(text_ratio)

                box_no = '['+ str(counter) + ']'
                writing.rectangle((x0, y0, x2, y2), outline='red', fill='white')
                description = translate_text(language_code, source_text)

                report_parts.append('\n\n\t---------- ' + box_no +' ----------\n\n')
                report_parts.append('\tOriginal: \n' + source_text + '\n\n' + '\tTranslated: \n' + description)

                desc_font = desc_fonts.get(font_size)
                if desc_font is None:
                    desc_font = ImageFont.truetype(font=font_path, size=font_size)
                    desc_fonts[font_size] = desc_font

                if(target!= 'Chinese'):
                    description_wrapped = text_wrap(description,desc_font,writing,max_width,max_height)
                else:
                    description_wrapped = chinese_text_wrap(description,desc_font,writing,max_width,max_height)

                # write text in bounding boxes..
                writing.text((text_x,text_y),description_wrapped,font=desc_font, fill='Black')
                # the box label goes 30 px to the left of the text origin
                writing.text(((text_x-30),(text_y)),box_no,font=box_font, fill='Black')

                #Incrementing the counter
                counter = counter + 1
            output_list.append(bg)

        pdf_path = os.path.join(output_path, output_name)
        output_list[0].save(pdf_path, "PDF" ,resolution=100.0, save_all=True, append_images=output_list[1:])
    finally:
        # The page images are scratch files; remove them even when a step fails.
        shutil.rmtree(folder_name)

    text_path = os.path.join(output_path, txt_name)
    # Explicit UTF-8: the report contains non-ASCII translated text.
    with open(text_path, "w", encoding="utf-8") as text_file:
        text_file.write(''.join(report_parts))

    docx_path = os.path.dirname(text_path)
    convert2docx(text_path, docx_path)

    # The intermediate text file is kept on purpose (its removal is disabled):
#     os.remove(text_path)

In [88]:
# Example run: translate one PDF and report how long the whole pipeline took.
pdf_file_path = "./COVID-19_SocialDistancing.pdf"
target = 'Hindi'  # must be one of the language names known to translation_pipeline
output_path = './temp'

# perf_counter() is monotonic, so the measured interval cannot be distorted by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
translation_pipeline(pdf_file_path, target, output_path)
end = time.perf_counter()
print(f"Total time: {end - start}")

100%|██████████| 1/1 [00:02<00:00,  2.21s/it]


Total time: 3.280320644378662


In [8]:
# start = time.time()

# pdf_file_path = "./COVID-19_SocialDistancing.pdf"
# target = 'Hindi'
# output_path = './'

# Target = {'Hindi': 'hi','Chinese':'zh', 'German':'de','French':'fr', 'Urdu':'ur'}
# font = {
#     'Hindi':'/home/jupyter/Lohit-Devanagari.ttf',
#     'Chinese':'/home/jupyter/Lohit-Devanagari.ttf',
#     'German':'/home/jupyter/Lohit-Devanagari.ttf',
#     'French':'/home/jupyter/Lohit-Devanagari.ttf',
#     'Urdu':'/home/jupyter/Lohit-Devanagari.ttf',
# }

# pages = convert_from_path(pdf_file_path) # size of image: 1700x2200

# name = pdf_file_path.split('/')[-1]
# output_name = 'Translated_'+name
# output_list = []

# folder_name = str(uuid.uuid4().hex)
# os.mkdir(folder_name)

# for i in range(len(pages)):
#     image_file = 'page_'+str(i)+'.jpg'
#     pages[i].save(os.path.join(folder_name, image_file))

#     # Calling google cloud vision API (text detection) 
#     client = vision.ImageAnnotatorClient()
#     image_path = os.path.join(folder_name, image_file)
    
#     with io.open(image_path, 'rb') as image_file1:
#         content = image_file1.read()
#     content_image = types.Image(content=content)
#     response = client.document_text_detection(image=content_image)
#     document = response.full_text_annotation

#     # Getting the bounds and text
#     bounds = get_document_bounds(document, FeatureType.PARA)
#     TEXT = get_paragraph_level_text(document)

#     bg = Image.open(image_path)
#     writing = ImageDraw.Draw(bg)

#     for i in tqdm(range(len(bounds))):
#         text_x, text_y, max_width, max_height = get_bounding_box_info(bounds[i])
#         x0, y0, x2, y2 = fill_bounding_box(bounds[i])

#         writing.rectangle((x0, y0, x2, y2), outline=None, fill='white')
#         description = translate_text(Target[target], TEXT[i])
#         desc_font = ImageFont.truetype(font=font[target], size=22)

#         description_wrapped = text_wrap(description,desc_font,writing,max_width,max_height)

#         # write text in bounding boxes..
#         writing.text((text_x,text_y),description_wrapped,font=desc_font, fill='Black')
#     output_list.append(bg)

# im1 = output_list.pop(0)  
# pdf_path = os.path.join(output_path, output_name)
# im1.save(pdf_path, "PDF" ,resolution=100.0, save_all=True, append_images=output_list)
# shutil.rmtree(folder_name)

# end = time.time()
# print(f"Total time: {end - start}")