In [19]:
import fitz  # PyMuPDF
from docx import Document
from docx.shared import Inches
from PIL import Image
import io
import cv2
import numpy as np
import os

def extract_text_and_images(pdf_path, page_numbers):
    """
    Extract text and embedded images from selected pages of a PDF file.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        page_numbers: Iterable of zero-based page indices to process.
            Indices greater than or equal to the page count are reported
            with a printed message and skipped.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the concatenation of
        ``page.get_text()`` for every processed page, in the order given.
        ``images`` is a list of PIL images converted to RGB, one per entry
        returned by ``page.get_images(full=True)``, in page order.
    """
    text_parts = []
    images = []

    # The context manager closes the document even when extraction raises,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

        for page_num in page_numbers:
            if page_num >= page_count:
                print(f"Page {page_num} does not exist in the document.")
                continue

            page = doc.load_page(page_num)
            # Collect the pieces and join once at the end instead of
            # repeatedly re-building a growing string.
            text_parts.append(page.get_text())

            # Extract images
            for img in page.get_images(full=True):
                xref = img[0]  # first element of each entry is the image xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # convert() decodes the data right away, so the image stays
                # usable after the document has been closed.
                image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
                images.append(image)

    return "".join(text_parts), images

def apply_hand_drawn_effect(image):
    """
    Return a stylized ("hand-drawn") version of an image.

    The input image is left untouched: all resizing happens on a copy.

    Args:
        image: A PIL image. It is converted to RGB before processing.

    Returns:
        A new PIL image produced by ``cv2.stylization``, scaled down so
        that it fits inside 800x800 pixels (aspect ratio preserved).
    """
    # Resize image for faster processing
    max_size = (800, 800)

    # Image.thumbnail() resizes in place, so work on a copy to avoid
    # shrinking the caller's image. convert() always returns a new image
    # and also guarantees the 3-channel layout that the stylization call
    # is given below.
    working = image.convert('RGB')
    working.thumbnail(max_size, Image.Resampling.LANCZOS)

    # Convert image to numpy array
    image_np = np.array(working)

    # Apply stylization effect using OpenCV
    effect = cv2.stylization(image_np, sigma_s=3, sigma_r=0.5)
    return Image.fromarray(effect)

def create_word_document(text, images, output_docx):
    """
    Create a Word document containing the text followed by stylized images.

    Args:
        text: String added to the document as a single paragraph.
        images: Iterable of PIL images. Each one is passed through
            ``apply_hand_drawn_effect`` and inserted 4 inches wide.
        output_docx: Path the document is saved to.

    Side effects:
        Writes ``output_docx`` and prints a confirmation message.
    """
    doc = Document()
    doc.add_paragraph(text)

    for image in images:
        # Apply "hand-drawn" effect
        hand_drawn_image = apply_hand_drawn_effect(image)

        # Encode the picture as PNG into an in-memory stream instead of a
        # fixed-name temporary file: nothing is written to the working
        # directory and nothing is left behind if inserting the picture fails.
        png_stream = io.BytesIO()
        hand_drawn_image.save(png_stream, format="PNG")
        png_stream.seek(0)

        # Add image to Word document
        doc.add_picture(png_stream, width=Inches(4))  # Image width: 4 inches

    doc.save(output_docx)
    print(f"Word document saved as {output_docx}")

def main():
    """
    Run the PDF-to-Word pipeline for a fixed input file and page range.

    Pulls text and images from the selected pages of the source PDF and
    writes them into a Word document via ``create_word_document``.
    """
    source_pdf = "Жирнов_Н_И_Классическая_механика (1).pdf"  # input PDF
    target_docx = "output.docx"  # resulting Word file

    # Zero-based indices 164..179, i.e. printed pages 165 through 180.
    selected_pages = [*range(164, 180)]

    # Stage 1: pull text and pictures out of the PDF.
    extracted = extract_text_and_images(source_pdf, selected_pages)
    page_text, page_images = extracted

    # Stage 2: assemble the Word document.
    create_word_document(page_text, page_images, target_docx)

# Run the extraction pipeline only when this code executes as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()


Word document saved as output.docx


In [None]:
from pdf2docx import Converter

def convert_pdf_to_docx(pdf_file, docx_file, start_page, end_page):
    """
    Convert a range of pages from a PDF file into a DOCX file.

    Args:
        pdf_file: Path of the source PDF, passed to ``Converter``.
        docx_file: Path of the DOCX file to write.
        start_page: Forwarded as ``start`` to ``Converter.convert``.
        end_page: Forwarded as ``end`` to ``Converter.convert``.
            NOTE(review): the calling code in this file treats ``start`` as
            inclusive and ``end`` as exclusive -- confirm against the
            pdf2docx version in use.
    """
    # Create the Converter object
    cv = Converter(pdf_file)
    try:
        # Convert the requested page range of the PDF to DOCX
        cv.convert(docx_file, start=start_page, end=end_page)
    finally:
        # Always release the converter, even when the conversion fails
        cv.close()

# Convert a fixed page range of the PDF when this code runs as the main module.
# NOTE(review): 'output.docx' is the same file name that main() in this file
# writes to, so running both overwrites the earlier result -- confirm intended.
# NOTE(review): main() processes indices 164..179 (16 pages); if end_page is
# exclusive, this call covers 164..178 (15 pages) -- confirm which is wanted.
if __name__ == "__main__":
    pdf_path = "Жирнов_Н_И_Классическая_механика (1).pdf"  # Path to the PDF file
    docx_file = 'output.docx'  # Path to the output DOCX file
    start_page = 164  # First page (inclusive)
    end_page = 179    # Last page (exclusive)
    convert_pdf_to_docx(pdf_path, docx_file, start_page, end_page)
