In [1]:
#Required packages
!pip install requests lxml[html_clean] readability-lxml weasyprint

Collecting readability-lxml
  Downloading readability_lxml-0.8.1-py3-none-any.whl.metadata (3.6 kB)
Collecting weasyprint
  Downloading weasyprint-65.0-py3-none-any.whl.metadata (3.7 kB)
Collecting lxml-html-clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Collecting cssselect (from readability-lxml)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pydyf>=0.11.0 (from weasyprint)
  Downloading pydyf-0.11.0-py3-none-any.whl.metadata (2.5 kB)
Collecting tinyhtml5>=2.0.0b1 (from weasyprint)
  Downloading tinyhtml5-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting cssselect2>=0.8.0 (from weasyprint)
  Downloading cssselect2-0.8.0-py3-none-any.whl.metadata (2.9 kB)
Collecting Pyphen>=0.9.1 (from weasyprint)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting brotli>=1.0.1 (from fonttools[woff]>=4.0.0->weasyprint)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64

In [2]:
# Import libraries
import requests
from lxml import html
from readability import Document
from weasyprint import HTML

In [3]:
# Default print configuration.
# Entries that are commented out are planned options that are not implemented yet.

CONFIG = dict(
    font_size=10,
    # print_mode=None,                      # TODO: not implemented
    heading_emphasizing_factor=2.4,
    background_color="white",
    text_alignment="left",
    orientation="portrait",
    font_color="black",
    font_family="Arial, sans-serif",
    page_size="A4",
    margin="15mm",
    line_height=1.5,
    show_hyperlink_url=False,
    # remove_images_or_tables=False,        # TODO: not implemented
    # checklist_of_titles=[],               # TODO: not implemented
    column_count=1,
    # summarize=False,                      # TODO: not implemented
    summarizing_factor=0.5,
    show_page_no=False,
    # space_between_paragraphs="0px",       # TODO: not implemented
    # image_or_table_alignment="center",    # TODO: not implemented
    # image_or_table_size=None,             # TODO: not implemented
)


In [25]:
# Custom print configuration: rebinds CONFIG for this run.
# Defines a smaller font, landscape pages, three columns and page numbers.
CONFIG = dict(
    font_size=8,
    heading_emphasizing_factor=2.4,
    background_color="white",
    text_alignment="left",
    orientation="landscape",
    font_color="black",
    font_family="Arial, sans-serif",
    page_size="A4",
    margin="15mm",
    line_height=1.5,
    show_hyperlink_url=False,
    column_count=3,
    summarizing_factor=0.5,
    show_page_no=True,
)

In [26]:
def css_format(config=CONFIG):
    """
    Generates a customizable CSS string for optimizing web pages for print.

    Args:
        config: Mapping of print options. The keys read here are
            'font_size', 'heading_emphasizing_factor', 'background_color',
            'font_color', 'font_family', 'line_height', 'column_count',
            'text_alignment', 'page_size', 'orientation' and 'margin'
            (all required), plus the optional flags 'show_hyperlink_url'
            and 'show_page_no' (treated as off when missing).

    Returns:
        A single CSS string wrapped in an ``@media print`` block.

    Raises:
        KeyError: If one of the required keys is missing from ``config``.
    """

    hyperlink_css = ""
    if config.get('show_hyperlink_url'):
        # This is a plain string, not an f-string, so braces are written
        # singly; doubled braces would be emitted literally and break the rule.
        hyperlink_css = """
    /* Ensure links show their URLs */
    a {
      text-decoration: none;
    }
    a[href]:after {
        content: " (" attr(href) ")";
    }
    """

    page_number_css = ""
    # Read with .get so a config without the flag simply omits page numbers,
    # matching how 'show_hyperlink_url' is handled above.
    if config.get('show_page_no'):
        page_number_css = f"""
        @bottom-center {{
            content: counter(page) " / " counter(pages);
            font-size: {config['font_size']*0.8}pt;
        }}
        """

    # f-string: literal CSS braces are doubled, single braces interpolate.
    css_print_format = f"""
@media print {{
    /* Set background and text properties */
    body {{
        background-color: {config['background_color']} !important;
        color: {config['font_color']} !important;
        font-size: {config['font_size']}pt;
        font-family: {config['font_family']};
        line-height: {config['line_height']};
        margin: 0;
        padding: 0;
        column-count: {config['column_count']};
        column-gap: 20px;
        text-align: {config['text_alignment']};
    }}
    
    /* Set page size and margins */
    @page {{
        size: {config['page_size']} {config['orientation']};
        margin: {config['margin']};

        {page_number_css}
    }}
    
    /* Ensure images fit within the print area */
    img {{
        max-width: 100% !important;
        height: auto !important;
        page-break-inside: avoid;
        display: block;
        margin: auto;
    }}

    {hyperlink_css}
    
    /* Ensure main-heading spans full width */
    .main-heading {{
        font-size: {config['font_size'] * config['heading_emphasizing_factor'] * 1.3}pt !important;
        font-weight: bold !important;
        text-align: left;
        display: block !important;
        margin-top: 20px !important;
        color: {config['font_color']} !important;
        column-span: all; /* Ensures heading spans across all columns */
        width: 100%;
        break-before: always; /* Forces it to start on a new line */
    }}
    
    h1 {{
        font-size: {config['font_size'] * config['heading_emphasizing_factor']}pt !important;
        font-weight: bold !important;
        text-align: left;
        display: block !important;
        margin-top: 20px !important;
        color: {config['font_color']} !important;
    }}
    
    h2 {{
        font-size: {config['font_size'] * config['heading_emphasizing_factor'] * 0.8}pt !important;
        font-weight: bold !important;
        text-align: left;
        margin-top: 15px !important;
        color: {config['font_color']} !important;
    }}
    
    h3 {{
        font-size: {config['font_size'] * config['heading_emphasizing_factor'] * 0.6}pt !important;
        font-weight: bold;
        text-align: left;
        margin-top: 10px;
        color: {config['font_color']};
    }}
}}
"""
    return css_print_format

In [27]:
def fetch_html(url, timeout=30):
    """
    Fetch raw HTML by HTTP request with a user-agent header to bypass bot detection.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The response body decoded to a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # When a text response declares no charset, requests assumes ISO-8859-1,
    # which garbles UTF-8 pages. Fall back to the encoding detected from the
    # body in that case; an explicitly declared charset is left untouched.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    return response.text

def extract_main_content(raw_html, url):
    """
    Pull the main article out of a page with readability-lxml.

    Args:
        raw_html: Full HTML source of the page.
        url: Address the page was fetched from; forwarded to ``Document``.

    Returns:
        A ``(title, content_html)`` tuple: the document's short title and
        its summary, requested with ``html_partial=True``.
    """
    article = Document(raw_html, url=url)
    # Tuple items evaluate left to right, so the title is read before the summary.
    return article.short_title(), article.summary(html_partial=True)

def build_full_html(title, content_html, url):
    """
    Wrap the extracted content in a minimal HTML document headed by the title.

    Args:
        title: Plain-text page title. It is HTML-escaped before being placed
            in ``<title>`` and the main heading, so characters such as
            ``&`` or ``<`` cannot corrupt the markup.
        content_html: Already-formed HTML for the article body; inserted as-is.
        url: Source address of the page. Currently not rendered; kept so
            callers can keep passing it.

    Returns:
        The complete HTML document as a string.
    """
    # Imported locally under its own name because the file-level name ``html``
    # refers to lxml.html, not the standard-library module.
    from html import escape

    safe_title = escape(title)
    full_html = f"""<!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <title>{safe_title}</title>
        <style>
            .url-text {{
                color: blue;
            }}
        </style>
    </head>
    <body>
        <h1 class="main-heading">{safe_title}</h1>
        {content_html}
    </body>
    </html>"""
    return full_html

def inject_print_css(html_content, css_print_format):
    """
    Add the print stylesheet to an HTML document.

    Args:
        html_content: HTML source to parse.
        css_print_format: CSS text to place inside a new ``<style>`` element.

    Returns:
        The serialized document as a unicode string, with the new ``<style>``
        appended to ``<head>``.
    """
    root = html.fromstring(html_content)

    head_node = root.find("head")
    if head_node is None:
        # No <head> found directly under the parsed root: create one up front.
        head_node = html.Element("head")
        root.insert(0, head_node)

    style_node = html.Element("style")
    style_node.text = css_print_format
    head_node.append(style_node)

    return html.tostring(root, pretty_print=True, encoding="unicode")

def generate_pdf(html_content, output_pdf="output.pdf"):
    """
    Render an HTML string to a PDF file with WeasyPrint.

    Args:
        html_content: HTML source to render.
        output_pdf: Target passed to ``write_pdf``; defaults to "output.pdf".
    """
    document = HTML(string=html_content)
    document.write_pdf(output_pdf)

def main(url, output_pdf="output.pdf", config=CONFIG):
    """
    Turn a webpage into a print-friendly PDF.

    Pipeline: build the print CSS from ``config``, download the page, extract
    its main content, wrap it in a minimal document, inject the CSS, and write
    the PDF. A confirmation line is printed once the PDF has been written.

    Args:
        url: Address of the page to convert.
        output_pdf: Where to write the PDF; defaults to "output.pdf".
        config: Print options forwarded to ``css_format``.
    """
    # The stylesheet is built first, so a bad config fails before any network call.
    print_css = css_format(config)

    page_source = fetch_html(url)
    article_title, article_body = extract_main_content(page_source, url)

    # Wrap the article in a full document, then attach the print stylesheet.
    wrapped_html = build_full_html(article_title, article_body, url)
    styled_html = inject_print_css(wrapped_html, print_css)

    generate_pdf(styled_html, output_pdf)
    print(f"PDF generated successfully: {output_pdf}")

In [28]:
# Sample pages for trying out the pipeline; only url1 is rendered below.
url1, url2, url3, url4, url5, url6 = (
    "https://www.spiceworks.com/tech/artificial-intelligence/articles/what-is-ml/",  # url1
    "https://www.vegrecipesofindia.com/paneer-bhurji-scrambled-cottage-cheese-with-spices/#h-about-paneer-bhurji",  # url2
    "https://in.indeed.com/hire/c/info/ai-in-business?gad_source=1&gclid=Cj0KCQjwytS-BhCKARIsAMGJyzpWiFtLtPDyDphEvwR_p0vwIKkIW-Gc1waoYCrLdRyjX2M0KoXfL6UaAnptEALw_wcB&aceid=&gclsrc=aw.ds",  # url3
    "https://www.geeksforgeeks.org/web-browser/",  # url4
    "https://en.wikipedia.org/wiki/Interstellar_(film)",  # url5
    "https://sallysbakingaddiction.com/easy-healthy-dinner-baked-pineapple-teriyaki-chicken/",  # url6
)

# Convert the first sample page, writing to the default output path.
if __name__ == "__main__":
    main(url1)

PDF generated successfully: output.pdf


## Other notes 

/* Hide navigation, sidebars, ads, and footers */
    nav, aside, .navigation, .sidebar, .advertisement, .ads, .footer {{
        display: none !important;
    }}
    
- Insert the following at the end of `build_full_html`'s `<body>` to show the webpage URL on the last page:
<hr>
        <p class="url-text">Source URL: {url}</p>

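- `summarizing_factor`: fraction of the source text's length to target for the summary
  - Rule of thumb: 1 token ≈ 0.75 words (for English), i.e. 1 word ≈ 1.33 tokens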

summarizing_factor = 0.5

OUTPUT_TOKEN_COUNT = round(len(TEXT.split())*1.33*summarizing_factor)