In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import zipfile
from pypdf import PdfReader
from PIL import Image
import pytesseract
import io

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text layer of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        between pages. A page whose ``extract_text()`` result is falsy
        contributes an empty string.
    """
    with open(pdf_path, "rb") as stream:
        # Pages are consumed while the file is still open.
        page_texts = [page.extract_text() or "" for page in PdfReader(stream).pages]
    return "".join(page_texts)

In [3]:
def ocr_image(image, lang=None):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        image: Image to recognise; it is handed to
            ``pytesseract.image_to_string`` unchanged.
        lang: Optional Tesseract language code(s), e.g. ``"rus"`` or
            ``"rus+eng"``. ``None`` (the default) keeps Tesseract's own
            default language, which is what this function used before the
            parameter existed.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, lang=lang)

In [4]:
def extract_text_from_image(pdf_path):
    """OCR every image embedded in a PDF and return the recognised text.

    Images are obtained through pypdf's public ``page.images`` API instead of
    indexing ``/Resources`` -> ``/XObject`` by hand and reading the private
    ``_data`` attribute. The manual lookup raised ``KeyError`` for pages
    without a direct ``/Resources`` entry, and ``_data`` holds the stream
    bytes as stored in the file, which ``Image.open`` cannot read unless they
    already happen to be a complete image file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of all images, concatenated in page order and then in
        image order, with no separator. Empty string when the document has no
        images.
    """
    recognised = []
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            for image_file in page.images:
                # image_file.data is the image re-encoded as a regular image
                # file, so Pillow can open it straight from memory.
                image = Image.open(io.BytesIO(image_file.data))
                recognised.append(ocr_image(image))
    return "".join(recognised)

In [5]:
def save_pdf_and_extract_text(url, base_dir, session, timeout=30):
    """Download one PDF, store it under ``base_dir`` and write its text next to it.

    The PDF is saved as ``<base_dir>/<last path component of url>`` and the
    extracted text as the same path with a ``.txt`` extension (UTF-8). Text
    comes from ``extract_text_from_pdf``; when that yields nothing but
    whitespace, ``extract_text_from_image`` (OCR) is used instead.

    Args:
        url: Address of the PDF to download.
        base_dir: Directory that receives the ``.pdf`` and ``.txt`` files;
            created if missing.
        session: Object with a requests-style ``get(url, timeout=...)`` method.
        timeout: Seconds passed to ``session.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status.
        In that case nothing is written to disk.
    """
    response = session.get(url, timeout=timeout)
    # Fail before writing: otherwise an HTML error page is stored as a ".pdf".
    response.raise_for_status()

    filename = os.path.join(base_dir, os.path.basename(url))
    target_dir = os.path.dirname(filename)
    if target_dir:
        # exist_ok avoids the check-then-create race; the guard skips
        # makedirs("") when files go to the current directory.
        os.makedirs(target_dir, exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Сохранён PDF: {filename}")

    text = extract_text_from_pdf(filename)
    # A text layer containing only whitespace counts as "no text", so the
    # document still gets the OCR fallback.
    if not text.strip():
        text = extract_text_from_image(filename)

    text_filename = f"{os.path.splitext(filename)[0]}.txt"
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(text)

In [6]:
def crawl_and_download_pdfs(url, base_dir, session, visited=None, count=0,
                            root_url=None, max_pdfs=50):
    """Recursively follow links from ``url`` and download the PDFs it finds.

    Every ``<a href>`` on the page is resolved against ``url``. Links ending
    in ``.pdf`` (case-insensitive) are passed to ``save_pdf_and_extract_text``;
    other links are crawled recursively when they start with ``root_url``.
    Crawling stops once ``max_pdfs`` PDFs have been handled.

    Args:
        url: Page to fetch and scan for links.
        base_dir: Directory handed to ``save_pdf_and_extract_text``.
        session: Object with a requests-style ``get(url)`` method; also
            handed to ``save_pdf_and_extract_text``.
        visited: Set of URLs (pages and PDFs) already handled. Shared across
            the recursion and mutated in place; created when ``None``.
        count: Number of PDFs handled so far.
        root_url: URL prefix a non-PDF link must start with to be crawled.
            Defaults to the ``url`` of the outermost call, so the function no
            longer depends on a module-level ``base_url`` variable.
        max_pdfs: Upper bound on the number of PDFs to download.

    Returns:
        The updated number of PDFs handled.
    """
    if visited is None:
        visited = set()
    if root_url is None:
        # The outermost call defines the crawl scope.
        root_url = url
    if url in visited or count >= max_pdfs:
        return count
    visited.add(url)

    response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a', href=True):
        # Drop the fragment: "page#a" and "page#b" are the same resource.
        full_url = urljoin(url, link['href']).split('#', 1)[0]
        if full_url in visited:
            # Already crawled or downloaded; a PDF linked from several pages
            # must not be fetched and counted more than once.
            continue
        if full_url.lower().endswith('.pdf'):
            visited.add(full_url)
            save_pdf_and_extract_text(full_url, base_dir, session)
            count += 1
        elif full_url.startswith(root_url):
            # Prefix match, not substring: a foreign URL that merely contains
            # the root somewhere (e.g. in its query string) is out of scope.
            count = crawl_and_download_pdfs(
                full_url, base_dir, session, visited, count, root_url, max_pdfs
            )
        if count >= max_pdfs:
            break
    return count

In [7]:
def create_zip_archive(base_dir, zip_filename):
    """Pack every file found under ``base_dir`` into a new ZIP archive.

    Args:
        base_dir: Directory tree to archive. Files are stored under their
            path relative to this directory.
        zip_filename: Path of the archive to create; it is opened in ``'w'``
            mode.
    """
    with zipfile.ZipFile(zip_filename, mode='w') as archive:
        for current_dir, _subdirs, names in os.walk(base_dir):
            for name in names:
                source_path = os.path.join(current_dir, name)
                member_name = os.path.relpath(source_path, start=base_dir)
                archive.write(source_path, arcname=member_name)
    print(f"Создан ZIP-архив: {zip_filename}")

In [8]:
if __name__ == "__main__":
    # Settings for this run: start page, download directory, archive name.
    base_url = "https://pstu.ru/sveden/education/"
    base_dir = "downloaded_pdfs"
    zip_filename = "pdfs.zip"

    # One HTTP session is shared by all requests and closed once crawling
    # finishes, whether or not it raised.
    session = requests.Session()
    try:
        crawl_and_download_pdfs(base_url, base_dir, session)
    finally:
        session.close()

    # Archive whatever ended up in the download directory.
    create_zip_archive(base_dir, zip_filename)

Сохранён PDF: downloaded_pdfs\opop-soo.pdf
Сохранён PDF: downloaded_pdfs\kalendarnyj-grafik.pdf
Сохранён PDF: downloaded_pdfs\{6ea8912d-e61f-4482-af34-fa4a2444b5d7}.pdf
Сохранён PDF: downloaded_pdfs\{49bad6ff-5e98-4b07-83c3-e90160c13d54}.pdf
Сохранён PDF: downloaded_pdfs\{7967d58c-6a2b-4d1b-9eab-e50b785eff3f}.pdf
Сохранён PDF: downloaded_pdfs\{0963265b-7877-4e70-8232-9e38f68f3c36}.pdf
Сохранён PDF: downloaded_pdfs\{2025042b-7295-42ff-9462-7e5dbf2f7e62}.pdf
Сохранён PDF: downloaded_pdfs\{e035b918-2cca-40be-88e3-ef7b3326daa5}.pdf
Сохранён PDF: downloaded_pdfs\{05c24f0b-0c8a-4c46-97a6-a22429fc8555}.pdf
Сохранён PDF: downloaded_pdfs\{dd1bbf13-4610-452d-a6ad-932a3f8568b9}.pdf
Сохранён PDF: downloaded_pdfs\{92ad979b-1e9a-4296-af3b-9bc96d9c9288}.pdf
Сохранён PDF: downloaded_pdfs\{4ca9c360-2fc1-43e3-9949-bb98d84be786}.pdf
Сохранён PDF: downloaded_pdfs\{192c5a69-8e9e-471e-aabe-44bcd11a797f}.pdf
Сохранён PDF: downloaded_pdfs\{a99858ad-45fa-4766-ad29-f11299a19413}.pdf
Сохранён PDF: downloaded_pdf