# Working With PDF

## 1. Module import

In [35]:
from pypdf import PdfReader
import fitz
from pathlib import Path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import simpleSplit
import json
import json


## 2. Handling data

## Task 1

In [2]:
pdf_path = '../data_import/pdf_mock_file.pdf'
target_path = Path('../data_export/file_pdf')

In [3]:
reader = PdfReader(pdf_path)
for  i in range(len(reader.pages)):
    page = reader.pages[i]

In [4]:
target_text_path = target_path / "text"
target_text_path.mkdir(parents=True, exist_ok=True)

doc = fitz.open(pdf_path)

all_text = ""

for page in doc:
    all_text += page.get_text()

output_text_path = target_text_path / "extracted_text.txt"

with open(output_text_path, 'w', encoding='utf-8') as file:
    file.write(all_text)

doc.close()

print(f"Extracted text has been saved to {output_text_path}")

Extracted text has been saved to ../data_export/file_pdf/text/extracted_text.txt


In [6]:
target_images_path = target_path / "images"
target_images_path.mkdir(parents=True, exist_ok=True)

for i, page in enumerate(reader.pages):
    for image_index, img in enumerate(page.images):
        output_images_path = target_images_path / f"page_{i}_image_{image_index}.jpg"
        with open(output_images_path, "wb") as f:
            f.write(img.data)
print(f"Extracted images has been saved to {output_images_path}")

Extracted images has been saved to ../data_export/file_pdf/images/page_2_image_5.jpg


## Task 2

In [32]:
target_text_details_path = target_path / "text_details"
target_text_details_path.mkdir(parents=True, exist_ok=True)


def extract_pdf_details(pdf_path, output_path):
    pdf_details = []
    for page_num, page_layout in enumerate(extract_pages(pdf_path), start=1):
        previous_y = None
        line_content = ""
        font_details = []

        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    if isinstance(text_line, LTTextLine):
                        current_y = text_line.y0
                        if previous_y is not None and abs(previous_y - current_y) > 10:
                            if line_content:
                                pdf_details.append({
                                    'page': page_num,
                                    'line_content': line_content,
                                    'font_details': font_details
                                })
                            line_content = ""
                            font_details = []

                        line_content += text_line.get_text()
                        for character in text_line:
                            if isinstance(character, LTChar):
                                font_details.append({
                                    'text': character.get_text(),
                                    'font_name': character.fontname,
                                    'font_size': character.size,
                                    'color': character.graphicstate.ncolor,
                                    'bold': 'Bold' in character.fontname,
                                    'italic': 'Italic' in character.fontname
                                })
                        previous_y = current_y

        if line_content:
            pdf_details.append({
                'page': page_num,
                'line_content': line_content,
                'font_details': font_details
            })

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(pdf_details, f, ensure_ascii=False, indent=4)

output_file = target_text_details_path / "pdf_details.json"
extract_pdf_details(pdf_path, output_file)

print(f"Extracted text details has been saved to {output_file}")


Extracted text details has been saved to ../data_export/file_pdf/text_details/pdf_details.json


## Task 3

In [51]:
target_pdf_uppercase_path = target_path / "pdf_uppercase"
target_pdf_uppercase_path.mkdir(parents=True, exist_ok=True)

def create_pdf_from_details(pdf_details, output_pdf_path):
    c = canvas.Canvas(output_pdf_path.as_posix(), pagesize=letter)
    width, height = letter

    current_page = None
    for detail in pdf_details:
        page_num = detail['page']
        
        if current_page != page_num:
            if current_page is not None:
                c.showPage()
            current_page = page_num
            y = height - 30  
            x = 30  

        for line in detail['line_content'].split('\n'):
            if y < 30:  
                c.showPage()
                y = height - 30
                x = 30
            
            for char_detail in detail['font_details']:
                
                color = char_detail['color']
                if isinstance(color, list) and len(color) == 3:
                    r, g, b = color
                else:
                    r = g = b = color[0] if isinstance(color, list) else 0
                c.setFillColorRGB(r, g, b)

                if char_detail['bold']:
                    c.setFont('Helvetica-Bold', char_detail['font_size'])
                else:
                    c.setFont('Helvetica', char_detail['font_size'])

            c.drawString(x, y, line.upper())  
            y -= 15

    c.save()

with open(output_file, 'r', encoding='utf-8') as f:
    pdf_details = json.load(f)

output_pdf_uppercase_file = target_pdf_uppercase_path / "uppercase_text.pdf"
create_pdf_from_details(pdf_details, output_pdf_uppercase_file)

print(f"Uppercase text PDF has been saved to {output_pdf_uppercase_file}")

Uppercase text PDF has been saved to ../data_export/file_pdf/pdf_uppercase/uppercase_text.pdf
