In [None]:
!pip install requests beautifulsoup4 fpdf html2text weasyprint reportlab -q

In [None]:
import os
import re
import textwrap
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

def _safe_pdf_stem(url):
    """Return a filesystem-safe file stem built from the last segment of *url*."""
    raw = url.split("/")[-1]
    # URL punctuation such as "?", "=", "&" or ":" is not valid in filenames on
    # every platform, so collapse each run of such characters to an underscore.
    # The length cap keeps long query strings below filesystem name limits.
    stem = re.sub(r"[^\w.-]+", "_", raw).strip("_")[:100]
    return stem or "index"


def create_clean_pdf(url, *, timeout=30, output_dir="webpage_pdfs"):
    """Download a web page and save its readable text as an A4 PDF.

    The page's ``<main>``, ``<article>`` or ``<body>`` element (first one
    found) is stripped of scripts, styles and navigation chrome; the text of
    its headings, paragraphs and list items is then written to
    ``<output_dir>/<last-url-segment>.pdf``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
        output_dir: Directory the PDF is written to; created if missing.

    Returns:
        The path of the generated PDF file.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # 1. Fetch the page. Without a timeout a stalled server blocks forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # 2. Parse from raw bytes. ``response.text`` falls back to ISO-8859-1 when
    # the server sends no charset, which garbles UTF-8 pages; letting the
    # parser sniff the encoding (meta tags, BOM) avoids that. A charset that
    # the server did declare is still honoured.
    content_type = response.headers.get("Content-Type", "")
    declared = response.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(response.content, "html.parser", from_encoding=declared)

    # ``soup.title.string`` is None for an empty <title> or one with child
    # tags, so read the text instead and fall back to a generic heading.
    title = soup.title.get_text(strip=True) if soup.title else ""
    if not title:
        title = "Webpage Content"

    main_content = soup.find("main") or soup.find("article") or soup.body or soup

    # Drop elements that never carry article text.
    for tag in main_content.find_all(
        ["script", "style", "nav", "footer", "header", "noscript"]
    ):
        tag.decompose()

    text_parts = []
    for elem in main_content.find_all(["h1", "h2", "h3", "p", "li"]):
        txt = elem.get_text(strip=True)
        if txt:
            text_parts.append(txt)
    clean_text = "\n\n".join(text_parts)

    # 3. Work out where the PDF goes.
    os.makedirs(output_dir, exist_ok=True)
    pdf_path = os.path.join(output_dir, f"{_safe_pdf_stem(url)}.pdf")

    # 4. Lay out and write the PDF.
    doc = SimpleDocTemplate(pdf_path, pagesize=A4,
                            rightMargin=20*mm, leftMargin=20*mm,
                            topMargin=20*mm, bottomMargin=20*mm)

    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='CenterTitle', alignment=TA_CENTER,
                              fontSize=16, leading=20, spaceAfter=10))
    styles.add(ParagraphStyle(name='Body', alignment=TA_LEFT,
                              fontSize=10, leading=12))

    # Paragraph interprets its text as XML-like markup, so anything that came
    # from the page or the URL must be escaped; a stray "&" or "<" would
    # otherwise be treated as markup and break the build.
    flow = [
        Paragraph(f"{escape(title)} — Hof University", styles['CenterTitle']),
        Paragraph(f"Source: {escape(url)}", styles['Body']),
        Spacer(1, 8),
    ]

    for para in clean_text.split("\n\n"):
        if not para.strip():
            # Nothing extracted (or a blank chunk): skip the empty paragraph.
            continue
        wrapped = "\n".join(textwrap.fill(line, 95) for line in para.splitlines())
        # Escape first, then insert the only markup we intend: line breaks.
        flow.append(Paragraph(escape(wrapped).replace("\n", "<br/>"), styles['Body']))
        flow.append(Spacer(1, 4))

    doc.build(flow)
    print(f"✅ PDF created: {pdf_path}")
    return pdf_path

# ---------- RUN ----------
if __name__ == "__main__":
    # Ask for the page address, trim surrounding whitespace, and convert it.
    create_clean_pdf(input("Enter webpage URL: ").strip())
