# Working With PDF

## 1. Module import

In [10]:
from pypdf import PdfReader
import json
import fitz 
from pdf2image import convert_from_path
import os
from pathlib import Path
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import math


## 2. Handling data

#### PATH

In [11]:
# Source PDF and the root folder that receives every generated artefact.
pdf_path = '../data_import/pdf_mock_file.pdf'
target_path = Path('../data_export/file_pdf')

# One output sub-folder per processing step of the notebook.
target_text_path = target_path.joinpath('extracted_text')
target_image_path = target_path.joinpath('extracted_images')
target_paragraph_details_path = target_path.joinpath("extracted_paragraph_details")
target_convert_uppercase_path = target_path.joinpath("convert_uppercase")

# Create the folders up front; exist_ok keeps the cell safe to re-run.
for output_dir in (
    target_text_path,
    target_image_path,
    target_paragraph_details_path,
    target_convert_uppercase_path,
):
    os.makedirs(output_dir, exist_ok=True)

#### Extract Text

In [12]:
# The handle is intentionally left open: `document` is a notebook-level
# variable that code outside this cell may read.
document = fitz.open(pdf_path)

output_text_path = target_text_path.joinpath("extracted_text.txt")

with open(output_text_path, mode='w', encoding='utf-8') as text_file:
    for page_index in range(len(document)):
        page_dict = document.load_page(page_index).get_text("dict")
        for block in page_dict["blocks"]:
            # Only type-0 (text) blocks are written out; other blocks are skipped.
            if block['type'] != 0:
                continue
            # All spans of a block are glued together and written as one line.
            block_text = "".join(
                span["text"]
                for line in block["lines"]
                for span in line["spans"]
            )
            text_file.write(block_text)
            text_file.write("\n")
        # Two blank lines separate consecutive pages.
        text_file.write("\n\n")

print(f"All text extracted and saved to {output_text_path}")

All text extracted and saved to ../data_export/file_pdf/extracted_text/extracted_text.txt


#### Extract Images

In [13]:
def extract_images(pdf_path, target_image_path):
    """Save every image embedded in a PDF to ``target_image_path``.

    The PDF is opened from ``pdf_path`` inside the function and closed again
    when the function returns, so the result does not depend on any
    notebook-level variable.

    Args:
        pdf_path: Path of the PDF file to read.
        target_image_path: Existing directory (``str`` or ``Path``) that
            receives the image files.

    Returns:
        list[Path]: Paths of the written files, in page order. Files are named
        ``page_<page>_image_<index>.<ext>`` with 1-based page and image
        numbers; ``<ext>`` is the format reported by PyMuPDF for that image.
    """
    output_dir = Path(target_image_path)
    image_files = []

    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(len(pdf_document)):
            page = pdf_document.load_page(page_index)
            image_list = page.get_images(full=True)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each entry is the image's xref
                base_image = pdf_document.extract_image(xref)
                if not base_image:
                    # Nothing could be extracted for this xref; skip it
                    # instead of failing on the subscript below.
                    continue

                # Name the file after the real image format; fall back to the
                # previous hard-coded "jpg" only if no format is reported.
                extension = base_image.get("ext") or "jpg"
                image_path = output_dir / f"page_{page_index + 1}_image_{image_index}.{extension}"

                with open(image_path, "wb") as f:
                    f.write(base_image["image"])

                image_files.append(image_path)

    return image_files

# Run the image extraction for the configured PDF and keep the returned paths.
image_files = extract_images(
    pdf_path=pdf_path,
    target_image_path=target_image_path,
)
print(f"Images have been extracted to {target_image_path}")

Images have been extracted to ../data_export/file_pdf/extracted_images


#### Extract Paragraph Details

In [14]:
def extract_paragraph_details(pdf_path):
    """Collect the text and font formatting of every text block in a PDF.

    Each text block (``type == 0``) of each page becomes one entry. Inside a
    block, consecutive spans that share font name, size, colour, bold and
    italic state are merged into a single ``font_details`` run.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[dict]: One dict per text block with the keys

        * ``"page"`` - 1-based page number,
        * ``"line_content"`` - the text of all spans of the block,
          concatenated in reading order,
        * ``"font_details"`` - list of runs, each a dict with ``"text"``,
          ``"font_name"``, ``"font_size"``, ``"color"``, ``"bold"`` and
          ``"italic"``.
    """
    # Style bits of a PyMuPDF span's "flags" value (bit 1: italic, bit 4: bold).
    italic_flag = 1 << 1
    bold_flag = 1 << 4
    # Keys that must all match for two neighbouring spans to be merged.
    style_keys = ("font_name", "font_size", "bold", "italic", "color")

    paragraph_details = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                if block["type"] != 0:  # Not a text block
                    continue

                paragraph_text = ""
                font_details = []

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"]
                        font = span["font"]
                        flags = span["flags"]

                        # A span counts as bold/italic when either the font
                        # name or the span's style flags say so.
                        is_bold = ("Bold" in font) or bool(flags & bold_flag)
                        is_italic = (
                            ("Italic" in font)
                            or ("Oblique" in font)
                            or bool(flags & italic_flag)
                        )

                        current_span = {
                            "text": text,
                            "font_name": font,
                            "font_size": span["size"],
                            "color": span["color"],
                            "bold": is_bold,
                            "italic": is_italic
                        }

                        # Every span contributes to the block text exactly
                        # once, independent of how the runs are merged.
                        paragraph_text += text

                        previous_span = font_details[-1] if font_details else None
                        same_style = previous_span is not None and all(
                            previous_span[key] == current_span[key]
                            for key in style_keys
                        )
                        if same_style:
                            # Same formatting: extend the current run.
                            previous_span["text"] += text
                        else:
                            font_details.append(current_span)

                paragraph_details.append({
                    "page": page_num + 1,
                    "line_content": paragraph_text,
                    "font_details": font_details
                })

    return paragraph_details

def save_paragraph_details(paragraph_details, output_path):
    """Write the paragraph details to ``output_path`` as UTF-8 JSON.

    The data is indented by four spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes. An existing file is overwritten.
    """
    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json_file.write(
            json.dumps(paragraph_details, ensure_ascii=False, indent=4)
        )

# Extract the per-block formatting details and persist them as JSON.
paragraph_details_output_path = target_paragraph_details_path.joinpath('paragraph_details.json')
paragraph_details = extract_paragraph_details(pdf_path=pdf_path)
save_paragraph_details(
    paragraph_details=paragraph_details,
    output_path=paragraph_details_output_path,
)
print(f"Paragraph details have been extracted to {paragraph_details_output_path}")

Paragraph details have been extracted to ../data_export/file_pdf/extracted_paragraph_details/paragraph_details.json


## 3. Convert To Uppercase

In [15]:
def _color_to_rgb_fractions(color):
    """Convert a stored colour into the 0.0-1.0 RGB floats ReportLab expects.

    ``color`` is either a 3-item list of channel values or an integer that is
    unpacked as ``0xRRGGBB``; channels are divided by 255.
    """
    if isinstance(color, list) and len(color) == 3:
        r, g, b = color
    else:
        r = (color >> 16) & 0xFF
        g = (color >> 8) & 0xFF
        b = color & 0xFF
    return r / 255.0, g / 255.0, b / 255.0


def create_pdf_from_details(pdf_details, output_pdf_path):
    """Render extracted paragraph details into a new letter-size PDF.

    Each entry's ``"line_content"`` is drawn top-down with a 30 pt margin.
    A new output page starts whenever the entry's ``"page"`` changes or the
    text reaches the bottom margin. Text wider than the printable width is cut
    into equally sized character slices.

    Every paragraph is drawn in ONE style, taken from its first
    ``font_details`` run (size, colour, bold -> Helvetica-Bold, otherwise
    Helvetica). The same style is used for measuring, drawing and line
    spacing. Per-run formatting inside a paragraph is not reproduced.

    Args:
        pdf_details: List of dicts with the keys ``"page"``,
            ``"line_content"`` and ``"font_details"``.
        output_pdf_path: Destination file (``Path`` or ``str``).
    """
    margin = 30
    line_spacing_factor = 1.2
    # Used for a paragraph that carries no font information at all.
    fallback_style = {'font_size': 12, 'color': 0, 'bold': False}

    c = canvas.Canvas(Path(output_pdf_path).as_posix(), pagesize=letter)
    width, height = letter
    usable_width = width - 2 * margin
    x = margin
    y = height - margin

    current_page = None
    for detail in pdf_details:
        page_num = detail['page']

        if current_page != page_num:
            if current_page is not None:
                c.showPage()
            current_page = page_num
            y = height - margin

        # Resolve the paragraph style once instead of per segment.
        font_details = detail['font_details']
        style = font_details[0] if font_details else fallback_style
        font_name = 'Helvetica-Bold' if style['bold'] else 'Helvetica'
        font_size = style['font_size']
        fill_rgb = _color_to_rgb_fractions(style['color'])
        leading = font_size * line_spacing_factor

        for line in detail['line_content'].split('\n'):
            # Split the line into segments that fit within the page width,
            # measured with the font that is actually used for drawing.
            text_width = c.stringWidth(line, font_name, font_size)
            if text_width > usable_width:
                # At least one character per slice, otherwise the slicing
                # range below would get a zero step.
                chars_per_line = max(1, int(len(line) * usable_width // text_width))
                line_segments = [
                    line[i:i + chars_per_line]
                    for i in range(0, len(line), chars_per_line)
                ]
            else:
                line_segments = [line]

            for segment in line_segments:
                # Check per drawn segment so wrapped text cannot run off the
                # bottom of the page.
                if y < margin:
                    c.showPage()
                    y = height - margin

                # Set font and colour before every draw: showPage() starts
                # the next page with a fresh graphics state.
                c.setFillColorRGB(*fill_rgb)
                c.setFont(font_name, font_size)
                c.drawString(x, y, segment)
                y -= leading

    c.save()

with open(paragraph_details_output_path, 'r', encoding='utf-8') as f:
    pdf_details = json.load(f)

# Upper-case the text before rendering. New dicts are built so the loaded
# `pdf_details` stays unchanged and the cell can be re-run safely.
pdf_details_uppercase = [
    {
        **detail,
        'line_content': detail['line_content'].upper(),
        'font_details': [
            {**span, 'text': span['text'].upper()}
            for span in detail['font_details']
        ],
    }
    for detail in pdf_details
]

new_pdf_path = target_convert_uppercase_path / 'PDF_uppercase.pdf'

create_pdf_from_details(pdf_details_uppercase, new_pdf_path)

print(f"New PDF with uppercase text has been saved to {new_pdf_path}")

New PDF with uppercase text has been saved to ../data_export/file_pdf/convert_uppercase/PDF_uppercase.pdf
