In [192]:
import fitz  # PyMuPDF for PDF handling
from paddleocr import PaddleOCR  # For OCR extraction of text with bounding boxes
from deep_translator import GoogleTranslator  # For translating text from Chinese to English
from reportlab.pdfgen import canvas  # For generating the translated PDF
from reportlab.lib.pagesizes import letter, A4  # Page size for new PDF
from reportlab.lib import colors  # For coloring text (debug)
from reportlab.pdfbase.ttfonts import TTFont  # For adding custom fonts
from reportlab.pdfbase import pdfmetrics  # For registering custom fonts
import os

In [193]:
# Register a custom font if needed (Optional: Arial Unicode for CJK characters)
pdfmetrics.registerFont(TTFont('ArialUnicode', 'ArialUnicodeMS.ttf'))

In [194]:
# def adjust_cmyk_image_opacity(image_path, opacity=1.0):
#     """
#     Adjusts the opacity of a CMYK image by converting it to RGBA and modifying the alpha channel.
    
#     Args:
#         image_path (str): The path to the input CMYK image.
#         opacity (float): The desired opacity value (0.0 = fully transparent, 1.0 = fully opaque).
    
#     Returns:
#         Image: A PIL Image object with the modified opacity.
#     """
#     # Open the image in CMYK mode
#     image = Image.open(image_path)
    
#     # Convert the CMYK image to RGBA (which supports alpha channel)
#     rgba_image = image.convert("RGBA")
    
#     # Extract the R, G, B, and A channels
#     r, g, b, a = rgba_image.split()
    
#     # Create a new alpha channel with the desired opacity
#     new_alpha = a.point(lambda p: int(p * opacity))
    
#     # Merge back the modified alpha channel with the R, G, and B channels
#     rgba_image_with_opacity = Image.merge("RGBA", (r, g, b, new_alpha))
    
#     return rgba_image_with_opacity

# # Example usage
# adjusted_image = adjust_cmyk_image_opacity("image_33_0.png", opacity=0.5)
# adjusted_image.show() 

In [195]:
# def get_image_opacity(image):
#     """
#     Retrieves the average opacity value from a CMYK image with an alpha channel.
#     Args:
#         image_path (str): Path to the local CMYK image.
#     Returns:
#         float: The average opacity value (0.0 - 1.0) if alpha channel is present, else 1.0.
#     """
#     # image = Image.open(image_path)
    
#     # Convert to RGBA if the image has an alpha channel, otherwise return full opacity (1.0)
#     if image.mode == 'CMYK':
#         rgba_image = image.convert("RGBA")
#     elif image.mode == 'RGBA':
#         rgba_image = image
#     else:
#         # No alpha channel, so return maximum opacity
#         return 1.0

#     # Extract the alpha channel from the RGBA image
#     _, _, _, alpha = rgba_image.split()
    
#     # Calculate the average opacity
#     avg_opacity = sum(alpha.getdata()) / (255 * alpha.size[0] * alpha.size[1])
#     return avg_opacity

In [196]:
# Step 1: Extract text, images, and tables from PDF using OCR
def extract_text_from_pdf(pdf_path, language='ch'):
    # Initialize the OCR model with the Chinese language
    ocr = PaddleOCR(lang=language, use_gpu=False)
    doc = fitz.open(pdf_path)
    extracted_data = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        image_data = []  # Collect image coordinates and sizes

        # Extract text and bounding boxes using OCR
        page_image = page.get_pixmap()  # Get page as an image for OCR
        image_path = f"temp_page_{page_num}.png"
        page_image.save(image_path)

        ocr_result = ocr.ocr(image_path)
        page_text_data = []
        for res in ocr_result[0]:
            # Each 'res' contains [bounding box, text, confidence]
            bounding_box, text = res
            page_text_data.append((bounding_box, text))

        # Collect image data (coordinates, etc.)
        for img_index, img_info in enumerate(image_list):
            xref = img_info[0]
            img = doc.extract_image(xref)
            # opacity = get_image_opacity(img)
            image_filename = f"image_{page_num}_{img_index}.png"
            with open(image_filename, "wb") as img_file:
                img_file.write(img["image"])
            # Extract image coordinates, width, and height
            rect = page.get_image_rects(xref)[0]
            image_data.append((image_filename, rect))

        # Append data for this page
        extracted_data.append({'page_num': page_num, 'text': page_text_data, 'images': image_data})
        # Remove temporary page image
        os.remove(image_path)

    # Reverse the ovelapping images per pages
    for page_data in extracted_data:
        # Sort the images for the current page in reverse order by filename
        page_data['images'].sort(key=lambda item: item[0], reverse=True)
    return extracted_data

In [221]:
extracted_data = extract_text_from_pdf("data/pdf2.pdf")

[2024/10/04 12:13:28] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/user/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/user/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length

In [222]:
extracted_data

[{'page_num': 0,
  'text': [([[405.0, 41.0], [508.0, 43.0], [508.0, 57.0], [405.0, 56.0]],
    ('各国化妆品标签管理指南', 0.9451141953468323)),
   ([[92.0, 92.0], [175.0, 92.0], [175.0, 109.0], [92.0, 109.0]],
    ('一、适用范围', 0.9990265369415283)),
   ([[115.0, 141.0], [506.0, 141.0], [506.0, 155.0], [115.0, 155.0]],
    ('本指南旨在提供我国的化妆品主要出口目标国家（地区）在化妆品标签方面', 0.9992008805274963)),
   ([[90.0, 165.0], [505.0, 165.0], [505.0, 178.0], [90.0, 178.0]],
    ('的法规、标准、要求以及与我国的差异，提出达到目标市场要求的建议和指南。', 0.9996585249900818)),
   ([[89.0, 187.0], [313.0, 187.0], [313.0, 201.0], [89.0, 201.0]],
    ('适用于所有出口或准备出口的化妆品企业。', 0.9998819231987)),
   ([[118.0, 549.0], [428.0, 241.0], [490.0, 307.0], [180.0, 615.0]],
    ('MOFCOM', 0.9954840540885925))],
  'images': [('image_0_0.png',
    Rect(123.77999877929688, 242.75997924804688, 543.5399780273438, 671.1599731445312))]},
 {'page_num': 1,
  'text': [([[89.0, 44.0], [189.0, 44.0], [189.0, 57.0], [89.0, 57.0]],
    ('各国化妆品标签管理指南', 0.9979879260063171)),
   ([[90.0, 94.0], [

In [223]:
def translate_text(text, source_lang='zh-CN', target_lang='en'):
    if not isinstance(text, str):
        print(f"Invalid input type for translation: {type(text)}")  # Debugging line
        return ""  # Return empty string for invalid input

    max_chunk_size = 5000
    translated_chunks = []

    # Split the text into smaller chunks
    for i in range(0, len(text), max_chunk_size):
        chunk = text[i:i + max_chunk_size]
        print(f"Translating chunk: {chunk}")  # Debugging line

        try:
            translated_chunk = GoogleTranslator(source=source_lang, target=target_lang).translate(chunk)
            if translated_chunk:  # Ensure we only append valid strings
                translated_chunks.append(translated_chunk)
            else:
                print(f"Translation returned None for chunk: {chunk}")  # Debugging line

        except Exception as e:
            print(f"Error translating chunk '{chunk}': {e}")  # Print the error for debugging

    # Join all the translated chunks into a single string
    translated_text = ''.join(translated_chunks)
    return translated_text


In [226]:
# Step 3: Reconstruct PDF with translated text and preserved layout
def create_translated_pdf(extracted_data, output_path, page_size=A4):
    c = canvas.Canvas(output_path, pagesize=page_size)
    width, height = page_size

    for page in extracted_data:
        page_num = page['page_num']
        text_data = page['text']
        image_data = page['images']

        # Set the font and size for the text
        c.setFont("ArialUnicode", 10)

        # Draw images at original positions
        for img_filename, rect in image_data:
            x1, y1, x2, y2 = rect  # Image coordinates
            img_width = x2 - x1
            img_height = y2 - y1
            # img_opa = adjust_cmyk_image_opacity(img_filename, opacity=0.5)
            # Save the adjusted image temporarily
            # temp_filename = f"temp_{os.path.basename(img_filename)}"
            # img_opa.save(temp_filename)
            c.drawImage(img_filename, x1, height - y2, width=img_width, height=img_height)

        # Draw translated text at original bounding box positions with adjusted font size
        for bounding_box, text_tuple in text_data:
            if not isinstance(text_tuple, tuple):
                print(f"Invalid text input: {text_tuple}")  # Debugging line
                continue  # Skip invalid text

            # Extract the actual text and ignore the confidence score
            text = text_tuple[0]  # Get the text from the tuple
            
            # Ensure the text is a string
            if not isinstance(text, str):
                print(f"Invalid text extracted: {text}")  # Debugging line
                continue  # Skip if the extracted text is not a string
            
            x_min, y_min = bounding_box[0]
            # x_min = x_min-60
            # print("----------------",x_min,"---------------",y_min,"-------------------------------")
            x_max, y_max = bounding_box[2]
            # x_max = x_max+80
            # print("----------------",x_max,"---------------",y_max,"-------------------------------")

            # # Draw a bounding box around the text
            # c.setStrokeColorRGB(0, 0, 1)  # Set color to blue for bounding box
            # c.rect(x_min, height - y_max, x_max - x_min, y_max - y_min, stroke=1, fill=0)

            # Translate the text
            translated_text = translate_text(text)

            # Calculate the appropriate font size to fit text within bounding box
            box_width = x_max - x_min
            c.setFont("ArialUnicode", 10)  # Default font size
            font_size = adjust_font_size(translated_text, box_width)
            c.setFont("ArialUnicode", font_size)

            # Check if the text is within a table or a paragraph
            if is_within_table(bounding_box, text_data):
                # Adjust alignment and placement if it's part of a table
                c.drawRightString(x_max, height - y_max, translated_text)
            else:
                # Place the translated text normally
                c.drawString(x_min, height - y_max, translated_text)  # Adjust Y-coordinate for PDF format

        # Add a new page for each original PDF page
        c.showPage()
    # delete_all_images(extracted_data)
    c.save()


# Helper function: Adjust font size to fit text within a bounding box
def adjust_font_size(text, box_width, initial_font_size=10):
    """Adjust the font size based on the box width and text length."""
    font_size = initial_font_size
    text_width = len(text) * (font_size * 0.5)  # Approximate text width calculation
    if text_width > box_width:
        font_size = (box_width / len(text)) * 2 # Adjust font size proportionally
        # font_size = ((box_width / len(text)) * 2 ) + 1 # Adjust font size proportionally
    return font_size


# Helper function: Check if the bounding box is part of a table
def is_within_table(bounding_box, text_data):
    """Identify if a text bounding box belongs to a table by checking proximity with other boxes."""
    x_min, y_min, x_max, y_max = bounding_box[0][0], bounding_box[0][1], bounding_box[2][0], bounding_box[2][1]
    for other_box, _ in text_data:
        if other_box != bounding_box:
            other_x_min, other_y_min, other_x_max, other_y_max = other_box[0][0], other_box[0][1], other_box[2][0], other_box[2][1]
            # Check if the bounding boxes are aligned horizontally and vertically
            if abs(other_y_min - y_min) < 5 and abs(other_y_max - y_max) < 5:
                return True
    return False

def delete_all_images(extracted_data):
    # Loop through each page's data in extracted_data
    for page in extracted_data:
        image_data = page.get('images', [])
        
        # Loop through each image tuple in image_data
        for img_tuple in image_data:
            img_filename = img_tuple[0]  # Extract the image filename
            # Delete the image file if it exists
            if os.path.exists(img_filename):
                os.remove(img_filename)


In [227]:
create_translated_pdf(extracted_data, "translated_english_document2.pdf")

Translating chunk: 各国化妆品标签管理指南
Translating chunk: 一、适用范围
Translating chunk: 本指南旨在提供我国的化妆品主要出口目标国家（地区）在化妆品标签方面
Translating chunk: 的法规、标准、要求以及与我国的差异，提出达到目标市场要求的建议和指南。
Translating chunk: 适用于所有出口或准备出口的化妆品企业。
Translating chunk: MOFCOM
Translating chunk: 各国化妆品标签管理指南
Translating chunk: 二、进出口化妆品基本情况概述
Translating chunk: 2.1术语和定义
Translating chunk: 化妆品cosmetios：指以涂抹、洒、喷或其它类似方式，施于人体表面任何部位
Translating chunk: （皮肤、毛发、指甲、口唇）、牙齿和口腔粘膜，以达到清洁、芳香、改变外观、
Translating chunk: 修正人体气味、保养、保持良好状态目的的产品。
Translating chunk: 标签labelling：指粘贴或连接或印在化妆品销售包装上的文字、数字、符号、
Translating chunk: 图案和置于销售包装内的说明性材料。
Translating chunk: 销售包装salespaokaging：指以销售为目的，与内装物一起交付给消费者的包
Translating chunk: 装。
Translating chunk: 内装物contents：指包装容器内所装的产品。
Translating chunk: 展示面displaypanels：指化妆品在陈列时，在自然状态下被消费者能看见的
Translating chunk: 面。
Translating chunk: 可视面visiblepanels：指化妆品在不破坏销售包装的情况下，消费者能够看
Translating chunk: 到的任何面。
Translating chunk: 净含量netcontent：指去除包装容器和其它包装材料后，内装物的实际质量或
Translating chunk: 体积或长度。
Translating chunk: 保质期sheIflife：指在化妆品产品标准和标签

In [183]:
# Step 4: Combine and execute the modules
def translate_pdf(input_pdf, output_pdf):
    # Extract the PDF contents (text, images, tables)
    extracted_data = extract_text_from_pdf(input_pdf)

    # Create a translated PDF with the extracted data
    create_translated_pdf(extracted_data, output_pdf)

In [184]:
# Example usage:
input_pdf_path = "data/pdf2.pdf"  # Replace with your Chinese PDF path
output_pdf_path = "translated_english_document.pdf"  # Output path for the translated PDF
translate_pdf(input_pdf_path, output_pdf_path)

[2024/10/03 17:23:26] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/user/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/user/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length

Image - chinese to english

In [241]:
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from paddleocr import PaddleOCR
from deep_translator import GoogleTranslator

# Step 1: Initialize PaddleOCR for Chinese and English text detection
ocr = PaddleOCR(use_angle_cls=True, lang='ch')  # 'ch' supports Chinese and English text detection

# Step 2: Read the image
image_path = 'pdf_page.png'
image = cv2.imread(image_path)

# Step 3: Use PaddleOCR to detect and extract text along with bounding boxes
ocr_results = ocr.ocr(image, cls=True)

# Step 4: Create a PIL image from the OpenCV image for easier text overlay
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
draw = ImageDraw.Draw(pil_image)

# Load a suitable font for overlaying the translated text
font_path = "ArialUnicodeMS.ttf"  # Specify the path to your TTF font file
font_size = 16  # Adjust based on image size and resolution
font = ImageFont.truetype(font_path, font_size)

# Define the translation function as provided by the user
def translate_text(text, source_lang='zh-CN', target_lang='en'):
    if not isinstance(text, str):
        print(f"Invalid input type for translation: {type(text)}")  # Debugging line
        return ""  # Return empty string for invalid input

    max_chunk_size = 5000
    translated_chunks = []

    # Split the text into smaller chunks
    for i in range(0, len(text), max_chunk_size):
        chunk = text[i:i + max_chunk_size]
        print(f"Translating chunk: {chunk}")  # Debugging line

        try:
            translated_chunk = GoogleTranslator(source=source_lang, target=target_lang).translate(chunk)
            if translated_chunk:  # Ensure we only append valid strings
                translated_chunks.append(translated_chunk)
            else:
                print(f"Translation returned None for chunk: {chunk}")  # Debugging line

        except Exception as e:
            print(f"Error translating chunk '{chunk}': {e}")  # Print the error for debugging

    # Join all the translated chunks into a single string
    translated_text = ''.join(translated_chunks)
    return translated_text

# Step 5: Iterate through detected text boxes, translate, and replace text in the image
if ocr_results[0]:
    for line in ocr_results:
        for (bbox, (text, confidence)) in line:  # Correctly unpack the tuple to get text and confidence
            if isinstance(text, str):  # Check if text is a valid string
                print(f"Detected text: {text}")  # Debugging line
                translated_text = translate_text(text)  # Use the translate_text function

                # Bounding box coordinates
                top_left = bbox[0]
                bottom_right = bbox[2]
                x_min, y_min = int(top_left[0]), int(top_left[1])
                x_max, y_max = int(bottom_right[0]), int(bottom_right[1])

                # Step 6: Draw a white rectangle over the original text region
                draw.rectangle([x_min, y_min, x_max, y_max], fill="white")

                # Step 7: Overlay the translated text back on the image
                draw.text((x_min, y_min), translated_text, fill="black", font=font)

# Step 8: Convert PIL image back to OpenCV format and save the image
final_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
output_path = 'translated_image_paddleocr.jpg'  # Path to save the final image
cv2.imwrite(output_path, final_image)

print(f"Translation and text overlay completed successfully! Image saved at {output_path}")


[2024/10/04 13:20:47] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/user/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/user/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length