In [None]:
!pip install requests beautifulsoup4 fpdf html2text weasyprint reportlab lxml -q

In [None]:
# Webpage Extraction with XML

import os
import re
import textwrap
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_LEFT

# -------------------------------------
# STEP 1: Extract all links from sitemap or webpage
# -------------------------------------
def _is_same_site(url, base_host):
    """Return True when *url*'s host equals *base_host* or is a subdomain of it."""
    host = (urlparse(url).hostname or "").lower()
    if not host or not base_host:
        return False
    # Exact or dot-delimited suffix match; a plain substring test would also
    # accept unrelated hosts such as "notexample.com" for base "example.com".
    return host == base_host or host.endswith("." + base_host)


def extract_links(base_url):
    """Collect the same-site URLs listed in a sitemap or linked from a page.

    The response is treated as an XML sitemap when its Content-Type mentions
    XML (but not HTML, so XHTML pages are parsed as pages) or when the URL
    path ends in ``.xml``; every ``<loc>`` value is then collected. Otherwise
    the response is parsed as HTML and every ``<a href>`` is resolved against
    ``base_url`` with its fragment removed.

    NOTE: ``<loc>`` entries are returned as-is, so for a sitemap *index* the
    result contains the child sitemap URLs, not the pages they list.

    Args:
        base_url: URL of the sitemap or HTML page to read.

    Returns:
        Sorted list of unique URLs whose host is the host of ``base_url`` or
        one of its subdomains.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status.
    """
    print(f"üåê Fetching links from: {base_url}")
    response = requests.get(base_url, timeout=15)
    response.raise_for_status()

    parsed_base = urlparse(base_url)
    base_host = (parsed_base.hostname or "").lower()
    content_type = response.headers.get("Content-Type", "").lower()

    # "application/xhtml+xml" contains "xml" but is a regular web page.
    xml_content = "xml" in content_type and "html" not in content_type
    # Test the path only, so "sitemap.xml?page=2" is still recognised.
    is_sitemap = xml_content or parsed_base.path.lower().endswith(".xml")

    all_links = set()

    # ---- Case 1: XML Sitemap ----
    if is_sitemap:
        soup = BeautifulSoup(response.text, "xml")
        for loc in soup.find_all("loc"):
            url = loc.text.strip()
            if _is_same_site(url, base_host):
                all_links.add(url)
        print(f"‚úÖ Found {len(all_links)} links in XML sitemap.")

    # ---- Case 2: Regular HTML page ----
    else:
        soup = BeautifulSoup(response.text, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            full_url = urljoin(base_url, a_tag["href"])
            if _is_same_site(full_url, base_host):
                # Drop the fragment so "#section" variants collapse to one URL.
                all_links.add(full_url.split("#")[0])
        print(f"‚úÖ Found {len(all_links)} HTML links on page.")

    return sorted(all_links)

# -------------------------------------
# STEP 2: Convert webpage → Clean PDF
# -------------------------------------
def _inside_selected_tag(elem, root, tag_names):
    """Return True when *elem* has an ancestor below *root* named in *tag_names*."""
    for parent in elem.parents:
        if parent is root:
            return False
        if parent.name in tag_names:
            return True
    return False


def _pdf_name_for(url):
    """Build a filesystem-safe base name from the URL's path and query string."""
    parsed = urlparse(url)
    raw = parsed.path.strip("/")
    if parsed.query:
        raw = f"{raw}_{parsed.query}"
    # Using the whole path (not just the last segment) keeps ".../a/" and
    # ".../b/" from both becoming "index" and overwriting each other.
    # URLs that differ only by host still map to the same name.
    return re.sub(r"[^A-Za-z0-9_-]", "_", raw) or "index"


def create_clean_pdf(url):
    """Download a web page and save its readable text as a PDF.

    The page's ``<main>``, ``<article>`` or ``<body>`` (first one found) is
    stripped of script/style/navigation tags, the text of its headings,
    paragraphs, list items and spans is collected, and the result is written
    to ``webpage_pdfs/<name>.pdf`` where ``<name>`` is derived from the URL
    path and query.

    Args:
        url: Address of the page to convert.

    Returns:
        Path of the written PDF, or ``None`` when the page could not be
        fetched (the failure is printed, not raised).
    """
    # Local import: the file-level imports do not provide an XML escaper.
    from xml.sax.saxutils import escape

    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"‚ùå Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # `soup.title.string` is None for an empty or nested <title>; get_text()
    # always returns a string.
    title = soup.title.get_text(strip=True) if soup.title else ""
    title = title or "Webpage Content"
    main_content = soup.find("main") or soup.find("article") or soup.body or soup

    # Remove unwanted tags
    for tag in main_content.find_all(["script", "style", "nav", "footer", "header", "noscript"]):
        tag.decompose()

    text_tags = ["h1", "h2", "h3", "p", "li", "span"]
    text_parts = []
    for elem in main_content.find_all(text_tags):
        # An enclosing selected tag already contributes this text; emitting
        # the inner tag as well would duplicate it in the PDF.
        if _inside_selected_tag(elem, main_content, text_tags):
            continue
        # Separator " " keeps words from adjacent inline nodes apart
        # (without it "Hello <b>world</b>" becomes "Helloworld").
        txt = elem.get_text(" ", strip=True)
        if txt:
            text_parts.append(txt)

    # ---------- Save as PDF ----------
    os.makedirs("webpage_pdfs", exist_ok=True)
    pdf_path = os.path.join("webpage_pdfs", f"{_pdf_name_for(url)}.pdf")

    doc = SimpleDocTemplate(pdf_path, pagesize=A4,
                            rightMargin=20*mm, leftMargin=20*mm,
                            topMargin=20*mm, bottomMargin=20*mm)

    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='CenterTitle', alignment=TA_CENTER,
                              fontSize=16, leading=20, spaceAfter=10))
    styles.add(ParagraphStyle(name='Body', alignment=TA_LEFT,
                              fontSize=10, leading=12))

    # Paragraph parses its text as XML-like markup, so scraped text must be
    # escaped before our own <br/> tags are inserted.
    flow = [
        Paragraph(escape(f"{title} ‚Äî Hof University"), styles['CenterTitle']),
        Paragraph(f"Source: {escape(url)}", styles['Body']),
        Spacer(1, 8),
    ]

    for para in text_parts:
        wrapped = "\n".join(textwrap.fill(line, 95) for line in para.splitlines())
        flow.append(Paragraph(escape(wrapped).replace("\n", "<br/>"), styles['Body']))
        flow.append(Spacer(1, 4))

    doc.build(flow)
    print(f"üìÑ PDF created: {pdf_path}")
    return pdf_path

# -------------------------------------
# STEP 3: Run the whole pipeline
# -------------------------------------
if __name__ == "__main__":
    # Replace the placeholder with the sitemap (or page) URL to crawl.
    base_url = " Provide xml file of the website here "
    base_url = base_url.strip()

    # Fail early with a readable message instead of a requests traceback
    # when the placeholder above has not been replaced by a real URL.
    if urlparse(base_url).scheme not in ("http", "https"):
        raise SystemExit(
            "base_url must be an http(s) URL of a sitemap or web page; "
            f"got {base_url!r}"
        )

    # 1. Extract all subpage links
    links = extract_links(base_url)

    # 2. Save to file (optional)
    # Explicit encoding so the output does not depend on the platform default.
    with open("urls.txt", "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in links)
    print("üìù Saved all links to urls.txt")

    # 3. Convert all links to PDFs
    total = len(links)
    for i, link in enumerate(links, 1):
        print(f"\n[{i}/{total}] Processing: {link}")
        create_clean_pdf(link)
