## Installing packages and importing libraries

In [1]:
!pip install lxml[html_clean] weasyprint PyPDF2
!npm install puppeteer jsdom @mozilla/readability  #JavaScript (Node.js) packages

Collecting weasyprint
  Downloading weasyprint-65.1-py3-none-any.whl.metadata (3.7 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting lxml_html_clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting pydyf>=0.11.0 (from weasyprint)
  Downloading pydyf-0.11.0-py3-none-any.whl.metadata (2.5 kB)
Collecting tinyhtml5>=2.0.0b1 (from weasyprint)
  Downloading tinyhtml5-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting cssselect2>=0.8.0 (from weasyprint)
  Downloading cssselect2-0.8.0-py3-none-any.whl.metadata (2.9 kB)
Collecting Pyphen>=0.9.1 (from weasyprint)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting brotli>=1.0.1 (from fonttools[woff]>=4.0.0->weasyprint)
  Downloading Brotli-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting zopfli>=0.1.4 (from fonttools[woff]>=4.0.0->weasyprint)
  Downloading zopfli-0.2.3.post1-cp3

In [2]:
from lxml import html
import subprocess, json
from bs4 import BeautifulSoup

# not used in main implementation(only used in experimentation)
from PyPDF2 import PdfReader #For counting page in a pdf
import os                    #for renaming file name
import time, copy
import pandas as pd 
from weasyprint import HTML

In [3]:
# Write the Node.js helper used by extract_main_content().
# The script loads the URL given as its first CLI argument in headless
# Chromium (Puppeteer), runs Mozilla Readability over the rendered DOM and
# prints a single JSON object {title, content} on stdout. On failure it logs
# to stderr and exits with status 1.
node_script = '''
const puppeteer = require('puppeteer');
const { Readability } = require('@mozilla/readability');
const { JSDOM } = require('jsdom');

const url = process.argv[2];

(async () => {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled'
    ]
  });

  try {
    const page = await browser.newPage();

    // Set user agent to avoid being blocked
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    );

    // Go to the URL and wait for main content to load
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 5000});

    // Scroll to trigger lazy-loaded content
    await page.evaluate(() => {
      window.scrollBy(0, window.innerHeight);
    });

    // Give lazy-loaded content 500 ms to settle after the scroll
    await new Promise(resolve => setTimeout(resolve, 500));

    const html = await page.content();
    const dom = new JSDOM(html, { url });

    const reader = new Readability(dom.window.document);
    const article = reader.parse();

    let title = '';
    let content = '';

    if (article && article.content) {
      title = article.title;
      content = article.content;
    } else {
      // Fallback if Readability fails
      const fallback = dom.window.document.querySelector('main, article, .mainContent');
      title = dom.window.document.title || 'Untitled';
      content = fallback ? fallback.innerHTML : dom.window.document.body.innerHTML;
    }

    console.log(JSON.stringify({ title, content }));

  } catch (err) {
    console.error("Extraction error:", err.message);
    process.exit(1);
  } finally {
    await browser.close();
  }
})();
'''

# Explicit UTF-8 so the script is written the same way whatever the platform's
# default text encoding is.
with open("extract_readability.js", "w", encoding="utf-8") as f:
    f.write(node_script)

## Functions

In [4]:
def css_format():
    """
    Build the print stylesheet from the notebook's module-level settings.

    Reads these globals, which the configuration cell publishes from CONFIG:
    show_hyperlink_url, show_page_no, image_size, font_size,
    heading_emphasizing_factor, background_color, font_color, font_family,
    line_height, column_count, text_alignment, page_size, orientation and
    margin.

    Returns:
        str: a CSS ``@media print`` block ready to embed in a <style> element.
    """

    # Plain (non-f) string: braces must be single here. It is substituted into
    # the f-string below as a value, so doubled braces would reach the CSS
    # verbatim and invalidate the rule.
    hyperlink_css = ""
    if show_hyperlink_url:
        hyperlink_css = """
    /* Ensure links show their URLs */
    a {
      text-decoration: none;
    }
    a[href]:after {
        content: " (" attr(href) ")";
    }
    """

    # Margin box nested inside the @page rule; the footer text is 80% of the body size.
    page_number_css = ""
    if show_page_no:
        page_number_css = f"""
        @bottom-center {{
            content: counter(page) " / " counter(pages);
            font-size: {font_size * 0.8}pt;
        }}
        """

    # Default: centred block images capped at the container width.
    image_positioning_css = """
    img {
        max-width: 100% !important;    /*ensures some images are not too small due to container*/
        height: auto !important;
        page-break-inside: avoid;
        display: block;
        margin: auto;
    }
    """
    # "small": half-width images floated to the right of the text.
    if image_size == "small":
        image_positioning_css = """
    img {
        max-width: 50% !important;
        height: auto !important;
        float: right !important;
        page-break-inside: avoid;
        margin-left: 15px !important;
        margin-right: 0px !important;
        margin-bottom: 4px !important;
        display: block;
        clear: both;
    }
    """

    # Heading sizes are multiples of the body font size:
    # .main-heading = 1.25x, h1 = 1x, h2 = 0.8x, h3 = 0.6x of
    # (font_size * heading_emphasizing_factor).
    css_print_format = f"""
@media print {{
    body {{
        background-color: {background_color} !important;
        color: {font_color} !important;
        font-size: {font_size}pt;
        font-family: {font_family};
        line-height: {line_height};
        margin: 0;
        padding: 0;
        column-count: {column_count};
        column-gap: 15px;
        text-align: {text_alignment};
    }}
    
    @page {{
        size: {page_size} {orientation};
        margin: {margin}mm;
        {page_number_css}
    }}

    hr {{
        display: none;
    }}

    {image_positioning_css}

    {hyperlink_css}

    .main-heading {{
        font-size: {font_size * heading_emphasizing_factor * 1.25}pt !important;
        font-weight: bold !important;
        text-align: left;
        display: block !important;
        margin-top: 20px !important;
        color: {font_color} !important;
        column-span: all;
        width: 100%;
        break-before: always;
    }}

    h1 {{
        font-size: {font_size * heading_emphasizing_factor}pt !important;
        font-weight: bold !important;
        text-align: left;
        display: block !important;
        margin-top: 20px !important;
        color: {font_color} !important;
    }}

    h2 {{
        font-size: {font_size * heading_emphasizing_factor * 0.8}pt !important;
        font-weight: bold !important;
        text-align: left;
        margin-top: 15px !important;
        color: {font_color} !important;
    }}

    h3 {{
        font-size: {font_size * heading_emphasizing_factor * 0.6}pt !important;
        font-weight: bold;
        text-align: left;
        margin-top: 10px;
        color: {font_color};
    }}
}}
"""
    return css_print_format

def extract_main_content(url):
    """
    Run the Node.js Readability helper on ``url`` and tidy the returned HTML.

    Reads the module-level ``remove_images`` flag to decide whether image
    markup is stripped.

    Returns:
        tuple: ``(title, prettified_html)``, or ``(None, None)`` when the
        Node.js process exits with a non-zero status.
    """
    node_command = ['node', 'extract_readability.js', url]
    try:
        completed = subprocess.run(
            node_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as e:
        print("Node.js Error:", e.stderr)
        return None, None

    # The helper prints one JSON object with "title" and "content"
    payload = json.loads(completed.stdout)
    title = payload.get('title', 'No Title')
    soup = BeautifulSoup(payload.get('content', ''), 'html.parser')

    # Non-content elements are always dropped
    for junk in soup.find_all(['script', 'style', 'noscript']):
        junk.decompose()

    if remove_images:
        image_tags = ['img', 'picture', 'figcaption', 'cite']

        # Whole <figure> blocks go first, then any loose image-related tags
        for figure in soup.find_all('figure'):
            figure.decompose()
        for loose in soup.find_all(image_tags):
            loose.decompose()

        # Finally drop <div>s whose named children are all image-related
        wrapper_tags = image_tags + ['source']
        for div in soup.find_all('div'):
            named_children = [node for node in div.contents if hasattr(node, 'name')]
            if all(node.name in wrapper_tags for node in named_children):
                div.decompose()

    return title, soup.prettify()

def build_full_html_with_print_css(title, content, url, css_print_format):
    """
    Build a full HTML document with the title, content, and print CSS embedded.

    Args:
        title: Plain-text page title. It is HTML-escaped before insertion
            because it comes from the scraped page and may contain ``<`` or ``&``.
        content: Article body as an HTML fragment, inserted as-is.
        url: Source URL. Accepted for interface compatibility; not used here.
        css_print_format: CSS text appended to <head> in its own <style> element.

    Returns:
        str: The serialized HTML document.
    """
    # The name `html` is bound to lxml.html in this notebook, so the stdlib
    # escaper is imported locally under a distinct name.
    from html import escape as escape_text

    safe_title = escape_text(str(title), quote=False)

    base_html = f"""<!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <title>{safe_title}</title>
        <style>
            .url-text {{
                color: blue;
            }}
        </style>
    </head>
    <body>
        <h1 class="main-heading">{safe_title}</h1>
        {content}
    </body>
    </html>"""

    tree = html.fromstring(base_html)
    head = tree.find("head")

    # Defensive: create <head> if the parser did not produce one
    if head is None:
        head = html.Element("head")
        tree.insert(0, head)

    # Set the CSS as element text (not markup) so it is not parsed as HTML
    style = html.Element("style")
    style.text = css_print_format
    head.append(style)
    return html.tostring(tree, pretty_print=True, encoding="unicode")


def generate_pdf(full_html, output_pdf="output.pdf", generate_html=True, output_html="output.html"):
    """
    Render ``full_html`` to ``output_pdf`` with WeasyPrint.

    When ``generate_html`` is true, the same markup is also saved as UTF-8
    text to ``output_html``.
    """
    document = HTML(string=full_html)
    document.write_pdf(output_pdf)

    if not generate_html:
        return

    with open(output_html, "w", encoding="utf-8") as html_file:
        html_file.write(full_html)

def main(url):
    """
    Turn one web page into output.pdf (and output.html).

    Builds the print CSS, extracts the article, and renders it. Nothing is
    written when extraction yields no title or no content.
    """
    css = css_format()
    title, content = extract_main_content(url)

    # Extraction failed or returned an empty article: nothing to render
    if not (title and content):
        return

    document = build_full_html_with_print_css(title, content, url, css)
    generate_pdf(document, output_pdf="output.pdf", output_html="output.html")

## Setting up Configuration

- Heading sizes and the page margin both scale with the font size.
- margin (mm) = font size (pt) × `margin_ratio`

In [5]:
# Settings shared by DEFAULT_CONFIG and every print mode. Each mode below
# lists only the values it overrides.
_BASE_CONFIG = {
    "font_size": 11,
    "heading_emphasizing_factor": 2.2,
    "background_color": "white",
    "text_alignment": "left",
    "orientation": "portrait",
    "font_color": "black",
    "font_family": "Arial, sans-serif",
    "page_size": "A4",
    "line_height": 1.5,
    "show_hyperlink_url": False,
    "remove_images": False,
    "remove_tables": False,
    "checklist_of_titles": "All selected",
    "column_count": 1,
    "summarize": False,
    "summarizing_factor": 0.4,
    "show_page_no": False,
    "space_between_paragraphs": 0,
    "image_or_table_alignment": "Center",
    "image_or_table_size": "Normal",
}

# Default configuration dictionary
DEFAULT_CONFIG = {**_BASE_CONFIG, "print_mode": "default", "image_size": "large"}

# Default configuration dictionary for different print modes.
# Every entry's "print_mode" equals its key, so
# CONFIG_FOR_MODE[CONFIG["print_mode"]] stays a valid lookup after an update.
CONFIG_FOR_MODE = {
    "default": {**_BASE_CONFIG, "print_mode": "default"},
    "save_pages": {
        **_BASE_CONFIG,
        "print_mode": "save_pages",
        "font_size": 9,
        "heading_emphasizing_factor": 1.9,
        "font_family": "Helvetica Neue, Arial, sans-serif",
        "line_height": 1.2,
        "image_or_table_alignment": "Side",
        "image_or_table_size": "Small",
    },
    "save_ink": {
        **_BASE_CONFIG,
        "print_mode": "save_ink",
        "font_size": 9,
        "heading_emphasizing_factor": 1.9,
        "font_color": "dimgray",
        "font_family": "Calibri, Arial, sans-serif",
        "image_or_table_alignment": "Side",
        "image_or_table_size": "Small",
    },
    "high_quality": {
        **_BASE_CONFIG,
        "print_mode": "high_quality",
        "font_size": 12,
        "heading_emphasizing_factor": 2.4,
        "line_height": 1.7,
        "summarizing_factor": 0.8,
        "space_between_paragraphs": "Increase",
    },
}

# Start from a copy of DEFAULT_CONFIG so that user overrides never mutate the defaults
CONFIG = dict(DEFAULT_CONFIG)

# Take input of CONFIG_CHANGE from user or chatbot
CONFIG_CHANGE = {"font_size": 11, "column_count": 1, "image_size": "large", "orientation": "portrait", "remove_images": False}
CONFIG.update(CONFIG_CHANGE)

# To be run only if print_mode is changed
# CONFIG.update(CONFIG_FOR_MODE[CONFIG["print_mode"]])

# Sample article URLs for experimentation; URLS[0] is the target of the
# single-output run and the whole list is passed to generate_history() for
# the timing experiment.
URLS = ["https://www.spiceworks.com/tech/artificial-intelligence/articles/what-is-ml/",
        "https://sallysbakingaddiction.com/easy-healthy-dinner-baked-pineapple-teriyaki-chicken/",
        "https://sallysbakingaddiction.com/skillet-chicken-with-creamy-cilantro-lime-sauce/",
        "https://en.wikipedia.org/wiki/Interstellar_(film)", 
        "https://www.vegrecipesofindia.com/paneer-bhurji-scrambled-cottage-cheese-with-spices/#h-about-paneer-bhurji",
        "https://www.geeksforgeeks.org/web-browser/"]

# A second set of sample URLs (news, finance and travel sites).
# NOTE(review): URLS2 is not referenced anywhere else in the visible notebook.
URLS2 = ["https://www.forbes.com/sites/anuraghunathan/2025/05/14/indian-fintech-billionaire-harshil-mathur-races-to-stay-ahead-of-payments-giants/",
        "https://www.nationalgeographic.com/history/article/diet-advice-food-health-ancient-greeks-romans",
        "https://www.tripadvisor.com/Articles-lXyum7xL94Ms-Canada_national_parks.html",
        "https://finance.yahoo.com/personal-finance/banking/article/budget-categories-list-161542933.html",
        "https://www.ndtv.com/world-news/explained-how-mexican-navy-ship-collided-with-brooklyn-bridge-in-new-york-nypd-8443689",
        "https://www.usatoday.com/story/news/nation/2025/05/17/tornado-deaths-kentucky-missouri/83691762007/",
        "https://www.vice.com/en/article/this-is-what-working-too-much-does-to-your-brain/"]

# Publish every CONFIG entry as a module-level variable of the same name, so
# the functions above can read settings such as font_size directly.
globals().update(CONFIG)

# Page margin in mm is derived from the font size in pt
margin_ratio = 1.25
margin = margin_ratio * font_size

## Generating PDF output

In [6]:
# Target of the single-output run: the first sample URL
url = URLS[0]

# Generating a single output: main() writes output.pdf and output.html
if __name__ == "__main__":
    main(url)

In [None]:
# Display the URL that was rendered together with the overrides applied to CONFIG
url, CONFIG_CHANGE

## Output for multiple urls

In [None]:
# Larger test set. The labels document what each page is; only the URLs are
# kept, in insertion order, so URLS3 ends up as a plain list.
URLS3 = list({
  "guardian_israel_gaza": "https://www.theguardian.com/world/2025/may/17/israel-gaza-ethnic-cleansing-palestinian-death-toll",
  "wikipedia_printfriendly": "https://en.wikipedia.org/wiki/PrintFriendly?utm_source=chatgpt.com",
  "bbc_news_article": "https://www.bbc.com/news/articles/ceqgjpl84pvo",
  "cnn_trump_putin": "https://edition.cnn.com/2025/05/17/politics/trump-putin-ukraine-zelensky-talks",
  "stackoverflow_chucknorris_color": "https://stackoverflow.com/questions/8318911/why-does-html-think-chucknorris-is-a-color",
  "mozilla_view_transition_api": "https://developer.mozilla.org/en-US/docs/Web/API/View_Transition_API/Using",
  "nature_article": "https://www.nature.com/articles/s41586-025-09000-3",
  "allrecipes_pasta_salad": "https://www.allrecipes.com/recipe/52734/awesome-pasta-salad/",
  "foodnetwork_esquites_pasta_salad": "https://www.foodnetwork.com/recipes/food-network-kitchen/esquites-inspired-summer-pasta-salad-18685921",
  "wikihow_make_money_teenagers": "https://www.wikihow.com/Make-Money-(for-Teenagers)",
  "instructables_looping_animations_python": "https://www.instructables.com/Creating-Perfectly-Looping-Animations-With-Python/",
  "makethings_on_off_tariffs": "https://makethings.make.co/p/on-off-tariffs-are-hurting-makers-and-small-businesses",
  "webmd_breast_cancer_smoking_drinking": "https://www.webmd.com/breast-cancer/smoking-drinking-breast-cancer",
  "healthline_simone_biles_pets": "https://www.healthline.com/health-news/simone-biles-mental-health-benefits-of-pets",
  "investopedia_sports_betting": "https://www.investopedia.com/how-sports-betting-is-taking-over-young-mens-time-and-finances-8788230",
  "forbes_indian_fintech_billionaire": "https://www.forbes.com/sites/anuraghunathan/2025/05/14/indian-fintech-billionaire-harshil-mathur-races-to-stay-ahead-of-payments-giants/",
  "nationalgeographic_ancient_diet": "https://www.nationalgeographic.com/history/article/diet-advice-food-health-ancient-greeks-romans",
  "tripadvisor_canada_national_parks": "https://www.tripadvisor.com/Articles-lXyum7xL94Ms-Canada_national_parks.html",
  "yahoo_finance_budget_categories": "https://finance.yahoo.com/personal-finance/banking/article/budget-categories-list-161542933.html",
  "ndtv_mexican_navy_brooklyn_bridge": "https://www.ndtv.com/world-news/explained-how-mexican-navy-ship-collided-with-brooklyn-bridge-in-new-york-nypd-8443689",
  "usatoday_tornado_deaths": "https://www.usatoday.com/story/news/nation/2025/05/17/tornado-deaths-kentucky-missouri/83691762007/",
  "vice_working_too_much": "https://www.vice.com/en/article/this-is-what-working-too-much-does-to-your-brain/",
  "url_1": "https://sallysbakingaddiction.com/easy-healthy-dinner-baked-pineapple-teriyaki-chicken/",
  "url_2": "https://www.spiceworks.com/tech/artificial-intelligence/articles/what-is-ml/",
  "url_3": "https://sallysbakingaddiction.com/skillet-chicken-with-creamy-cilantro-lime-sauce/",
  "url_4": "https://www.usatoday.com/story/news/nation/2025/05/17/tornado-deaths-kentucky-missouri/83691762007/",
  "url_5": "https://www.tripadvisor.com/Tourism-g186338-London_England-Vacations.html"
}.values())

In [None]:
# Different configs and urls for experimentation:
# a 2x2 grid of orientation x column count, each derived from CONFIG.
CONFIGS = [
    dict(CONFIG, orientation="portrait", column_count=1),
    dict(CONFIG, orientation="portrait", column_count=2),
    dict(CONFIG, orientation="landscape", column_count=1),
    dict(CONFIG, orientation="landscape", column_count=2)
]
# NOTE(review): this immediately replaces the 4-config grid above with the
# single default config, so the grid is never used. The plotting cells further
# down slice results in groups of 4; confirm which assignment is intended.
CONFIGS = [DEFAULT_CONFIG]

In [None]:
#Generating multiple outputs
def generate_history(CONFIGS, URLS):
    """
    Render every URL under every config, keeping each result as
    ``output_<n>.pdf`` / ``output_<n>.html``.

    main() takes its settings from module-level variables, so each config is
    published to those variables (and the font-size-derived ``margin`` is
    refreshed) before rendering. The output number advances once per
    (URL, config) render, so multi-config runs never overwrite one another.

    Args:
        CONFIGS: iterable of config dicts to render with.
        URLS: iterable of page URLs.

    Returns:
        tuple: ``(next_unused_output_number, list_of_urls_processed)``.
    """
    settings = globals()
    output_number = 1
    generated_for = []

    for URL in URLS:
        print("Generating for url:",URL)
        for CONFIG in CONFIGS:
            # Make this config the active one for css_format() and
            # extract_main_content()
            settings.update(CONFIG)
            if "font_size" in CONFIG:
                settings["margin"] = CONFIG["font_size"] * settings.get("margin_ratio", 1.25)

            # Guard kept from the original; it is always true when run as a notebook cell
            if __name__ == "__main__":
                main(URL)

            # Rename output.pdf / output.html to output_<number>.*; a failed
            # render leaves nothing to rename
            output_pdf = f"output_{output_number}.pdf"
            output_html = f"output_{output_number}.html"
            if os.path.exists("output.pdf"):
                os.rename("output.pdf", output_pdf)

            if os.path.exists("output.html"):
                os.rename("output.html", output_html)

            output_number += 1
        generated_for.append(URL)
    return output_number, generated_for

In [None]:
# Render the whole URLS3 test set with the configs in CONFIGS
generate_history(CONFIGS, URLS3)

In [None]:
# Hard-coded list of URLs, in the same order as URLS3.
# NOTE(review): presumably pasted from the output of the generate_history()
# run above — confirm.
generated_for = ['https://www.theguardian.com/world/2025/may/17/israel-gaza-ethnic-cleansing-palestinian-death-toll',
  'https://en.wikipedia.org/wiki/PrintFriendly?utm_source=chatgpt.com',
  'https://www.bbc.com/news/articles/ceqgjpl84pvo',
  'https://edition.cnn.com/2025/05/17/politics/trump-putin-ukraine-zelensky-talks',
  'https://stackoverflow.com/questions/8318911/why-does-html-think-chucknorris-is-a-color',
  'https://developer.mozilla.org/en-US/docs/Web/API/View_Transition_API/Using',
  'https://www.nature.com/articles/s41586-025-09000-3',
  'https://www.allrecipes.com/recipe/52734/awesome-pasta-salad/',
  'https://www.foodnetwork.com/recipes/food-network-kitchen/esquites-inspired-summer-pasta-salad-18685921',
  'https://www.wikihow.com/Make-Money-(for-Teenagers)',
  'https://www.instructables.com/Creating-Perfectly-Looping-Animations-With-Python/',
  'https://makethings.make.co/p/on-off-tariffs-are-hurting-makers-and-small-businesses',
  'https://www.webmd.com/breast-cancer/smoking-drinking-breast-cancer',
  'https://www.healthline.com/health-news/simone-biles-mental-health-benefits-of-pets',
  'https://www.investopedia.com/how-sports-betting-is-taking-over-young-mens-time-and-finances-8788230',
  'https://www.forbes.com/sites/anuraghunathan/2025/05/14/indian-fintech-billionaire-harshil-mathur-races-to-stay-ahead-of-payments-giants/',
  'https://www.nationalgeographic.com/history/article/diet-advice-food-health-ancient-greeks-romans',
  'https://www.tripadvisor.com/Articles-lXyum7xL94Ms-Canada_national_parks.html',
  'https://finance.yahoo.com/personal-finance/banking/article/budget-categories-list-161542933.html',
  'https://www.ndtv.com/world-news/explained-how-mexican-navy-ship-collided-with-brooklyn-bridge-in-new-york-nypd-8443689',
  'https://www.usatoday.com/story/news/nation/2025/05/17/tornado-deaths-kentucky-missouri/83691762007/',
  'https://www.vice.com/en/article/this-is-what-working-too-much-does-to-your-brain/',
  'https://sallysbakingaddiction.com/easy-healthy-dinner-baked-pineapple-teriyaki-chicken/',
  'https://www.spiceworks.com/tech/artificial-intelligence/articles/what-is-ml/',
  'https://sallysbakingaddiction.com/skillet-chicken-with-creamy-cilantro-lime-sauce/',
  'https://www.usatoday.com/story/news/nation/2025/05/17/tornado-deaths-kentucky-missouri/83691762007/',
  'https://www.tripadvisor.com/Tourism-g186338-London_England-Vacations.html']

In [None]:
# Number of URLs in the recorded list
len(generated_for)

In [None]:
#Generating multiple outputs
def generate_history(CONFIGS, URLS):
    """
    Benchmark variant: time each (URL, config) render and count the pages produced.

    main() accepts only a URL and takes its settings from module-level
    variables, so each config is published to those variables (and the
    font-size-derived ``margin`` is refreshed) before rendering.

    A render that produces no output.pdf is still recorded, with ``num_pages``
    set to NaN and ``output_filename`` set to None, so the table keeps one row
    per (URL, config) pair in iteration order.

    Args:
        CONFIGS: iterable of config dicts to render with.
        URLS: iterable of page URLs.

    Returns:
        pandas.DataFrame with columns
        ``url, time_taken, num_pages, config, output_filename``.
    """
    columns = ["url", "time_taken", "num_pages", "config", "output_filename"]
    settings = globals()
    records = []
    output_number = 1

    for URL in URLS:
        print("Generating for url:",URL)
        for CONFIG in CONFIGS:
            # Make this config the active one for css_format() and
            # extract_main_content()
            settings.update(CONFIG)
            if "font_size" in CONFIG:
                settings["margin"] = CONFIG["font_size"] * settings.get("margin_ratio", 1.25)

            start_time = time.time()
            main(URL)
            time_taken = time.time() - start_time

            if os.path.exists("output.pdf"):
                # Rename output.pdf to output_<number>.pdf, then count its pages
                output_filename = f"output_{output_number}.pdf"
                os.rename("output.pdf", output_filename)
                output_number += 1
                num_pages = len(PdfReader(output_filename).pages)
            else:
                # Extraction or rendering failed: no file to rename or inspect
                output_filename = None
                num_pages = float("nan")

            records.append({"url": URL, "time_taken": time_taken, "num_pages": num_pages, "config": CONFIG, "output_filename": output_filename})

    # Build the frame once rather than concatenating a row per iteration
    return pd.DataFrame(records, columns=columns)

In [None]:
# Benchmark every URL in URLS under every config in CONFIGS
history = generate_history(CONFIGS, URLS)
# Divide render time by page count to get a per-page cost
history['time_per_page'] = history['time_taken'] / history['num_pages']
history

In [None]:
# Save the benchmark table to history.xlsx without the index column
history.to_excel('history.xlsx', index=False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Slice the time_per_page column into 6 consecutive groups of 4 rows each
groups = [
    history['time_per_page'].iloc[start:start + 4].reset_index(drop=True)
    for start in range(0, 24, 4)
]

# X-axis labels: position of each row within its group
configs = ['config1', 'config2', 'config3', 'config4']

# One line per group, using matplotlib's default colour cycle
plt.figure(figsize=(10, 6))
for group_number, series in enumerate(groups, start=1):
    plt.plot(configs, series, label=f'Group {group_number}')

plt.xlabel('Configuration')
plt.ylabel('Time per Page')
plt.title('Time per Page across Configurations (6 Groups)')
plt.legend()
plt.grid(True)
plt.show()

- portrait orientation takes more time per page than landscape
- 2-column format takes more time per page than 1-column format

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Slice the time_taken column into 6 consecutive groups of 4 rows each
groups = [
    history['time_taken'].iloc[start:start + 4].reset_index(drop=True)
    for start in range(0, 24, 4)
]

# X-axis labels: position of each row within its group
configs = ['config1', 'config2', 'config3', 'config4']

# One line per group, using matplotlib's default colour cycle
plt.figure(figsize=(10, 6))
for group_number, series in enumerate(groups, start=1):
    plt.plot(configs, series, label=f'Group {group_number}')

plt.xlabel('Configuration')
plt.ylabel('Time taken')
plt.legend()
plt.grid(True)
plt.show()

- [portrait, 1-column format] is taking more time

In [None]:
import streamlit as st
import subprocess
import json
from bs4 import BeautifulSoup
from lxml import html
from weasyprint import HTML
import tempfile
import os

# --- Your functions (css_format, extract_main_content, build_full_html_with_print_css, generate_pdf) ---
# I will adapt css_format to take config as input for easier use

def css_format(config):
    """
    Build the print stylesheet for one configuration.

    Args:
        config: mapping of print settings. Any missing key falls back to the
            default used below (for example font_size 11, margin 15,
            orientation "portrait").

    Returns:
        str: a CSS ``@media print`` block.
    """
    def setting(key, fallback):
        # Single lookup point: value from config, else the given default
        return config.get(key, fallback)

    show_hyperlink_url = setting("show_hyperlink_url", False)
    show_page_no = setting("show_page_no", False)
    font_size = setting("font_size", 11)
    heading_emphasizing_factor = setting("heading_emphasizing_factor", 2.2)
    background_color = setting("background_color", "white")
    font_color = setting("font_color", "black")
    font_family = setting("font_family", "Arial, sans-serif")
    page_size = setting("page_size", "A4")
    orientation = setting("orientation", "portrait")
    margin = setting("margin", 15)
    line_height = setting("line_height", 1.5)
    column_count = setting("column_count", 1)
    text_alignment = setting("text_alignment", "left")
    image_size = setting("image_size", "large")

    # Heading sizes in pt, all multiples of the body font size
    title_pt = font_size * heading_emphasizing_factor * 1.25
    h1_pt = font_size * heading_emphasizing_factor
    h2_pt = font_size * heading_emphasizing_factor * 0.8
    h3_pt = font_size * heading_emphasizing_factor * 0.6
    footer_pt = font_size * 0.8

    # Optional rule that prints each link's target after its text
    hyperlink_css = """
    /* Ensure links show their URLs */
    a {
      text-decoration: none;
    }
    a[href]:after {
        content: " (" attr(href) ")";
    }
    """ if show_hyperlink_url else ""

    # Optional "page / pages" footer, nested inside the @page rule below
    page_number_css = f"""
        @bottom-center {{
            content: counter(page) " / " counter(pages);
            font-size: {footer_pt}pt;
        }}
        """ if show_page_no else ""

    if image_size == "small":
        # Half-width images floated to the right of the text
        image_positioning_css = """
    img {
        max-width: 50% !important;
        height: auto !important;
        float: right !important;
        page-break-inside: avoid;
        margin-left: 15px !important;
        margin-right: 0px !important;
        margin-bottom: 4px !important;
        display: block;
        clear: both;
    }
    """
    else:
        # Centred block images capped at the container width
        image_positioning_css = """
    img {
        max-width: 100% !important;
        height: auto !important;
        page-break-inside: avoid;
        display: block;
        margin: auto;
    }
    """

    return f"""
@media print {{
    body {{
        background-color: {background_color} !important;
        color: {font_color} !important;
        font-size: {font_size}pt;
        font-family: {font_family};
        line-height: {line_height};
        margin: 0;
        padding: 0;
        column-count: {column_count};
        column-gap: 15px;
        text-align: {text_alignment};
    }}
    
    @page {{
        size: {page_size} {orientation};
        margin: {margin}mm;
        {page_number_css}
    }}

    hr {{
        display: none;
    }}

    {image_positioning_css}

    {hyperlink_css}

    .main-heading {{
        font-size: {title_pt}pt !important;
        font-weight: bold !important;
        text-align: left;
        display: block !important;
        margin-top: 20px !important;
        color: {font_color} !important;
        column-span: all;
        width: 100%;
        break-before: always;
    }}

    h1 {{
        font-size: {h1_pt}pt !important;
        font-weight: bold !important;
        text-align: left;
        display: block !important;
        margin-top: 20px !important;
        color: {font_color} !important;
    }}

    h2 {{
        font-size: {h2_pt}pt !important;
        font-weight: bold !important;
        text-align: left;
        margin-top: 15px !important;
        color: {font_color} !important;
    }}

    h3 {{
        font-size: {h3_pt}pt !important;
        font-weight: bold;
        text-align: left;
        margin-top: 10px;
        color: {font_color};
    }}
}}
"""

def extract_main_content(url, remove_images=False):
    """
    Run the Node.js Readability helper on ``url`` and tidy the returned HTML.

    Args:
        url: Page to extract.
        remove_images: When true, strip figures, images and image-only wrappers.

    Returns:
        tuple: ``(title, prettified_html)``, or ``(None, None)`` when the
        Node.js process exits with a non-zero status (the error is shown via
        ``st.error``).
    """
    node_command = ['node', 'extract_readability.js', url]
    try:
        completed = subprocess.run(
            node_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as e:
        st.error(f"Node.js error when processing URL {url}: {e.stderr}")
        return None, None

    # The helper prints one JSON object with "title" and "content"
    payload = json.loads(completed.stdout)
    title = payload.get('title', 'No Title')
    soup = BeautifulSoup(payload.get('content', ''), 'html.parser')

    # Non-content elements are always dropped
    for junk in soup.find_all(['script', 'style', 'noscript']):
        junk.decompose()

    if remove_images:
        image_tags = ['img', 'picture', 'figcaption', 'cite']

        # Whole <figure> blocks go first, then any loose image-related tags
        for figure in soup.find_all('figure'):
            figure.decompose()
        for loose in soup.find_all(image_tags):
            loose.decompose()

        # Finally drop <div>s whose named children are all image-related
        wrapper_tags = image_tags + ['source']
        for div in soup.find_all('div'):
            named_children = [node for node in div.contents if hasattr(node, 'name')]
            if all(node.name in wrapper_tags for node in named_children):
                div.decompose()

    return title, soup.prettify()

def build_full_html_with_print_css(title, content, css_print_format):
    """
    Build a full HTML document with the title, content, and print CSS embedded.

    Args:
        title: Plain-text page title. It is HTML-escaped before insertion
            because it comes from the scraped page and may contain ``<`` or ``&``.
        content: Article body as an HTML fragment, inserted as-is.
        css_print_format: CSS text appended to <head> in its own <style> element.

    Returns:
        str: The serialized HTML document.
    """
    # The name `html` is bound to lxml.html in this cell, so the stdlib
    # escaper is imported locally under a distinct name.
    from html import escape as escape_text

    safe_title = escape_text(str(title), quote=False)

    base_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>{safe_title}</title>
    <style>
        .url-text {{
            color: blue;
        }}
    </style>
</head>
<body>
    <h1 class="main-heading">{safe_title}</h1>
    {content}
</body>
</html>"""

    tree = html.fromstring(base_html)
    head = tree.find("head")
    # Defensive: create <head> if the parser did not produce one
    if head is None:
        head = html.Element("head")
        tree.insert(0, head)

    # Set the CSS as element text (not markup) so it is not parsed as HTML
    style = html.Element("style")
    style.text = css_print_format
    head.append(style)

    return html.tostring(tree, pretty_print=True, encoding="unicode")

def generate_pdf(full_html, output_pdf_path):
    """Render the HTML string ``full_html`` to a PDF at ``output_pdf_path`` with WeasyPrint."""
    document = HTML(string=full_html)
    document.write_pdf(output_pdf_path)

# --------- Streamlit app starts here ---------

st.title("Custom PDF Generator from URLs")

st.markdown("""
Enter one or more URLs (one per line). Configure your print settings below and then click **Generate PDF**.
""")

# One URL per line; blank lines and surrounding whitespace are discarded
urls_input = st.text_area("Enter URLs (one per line)", height=150)
urls = [url.strip() for url in urls_input.splitlines() if url.strip()]

st.sidebar.header("Print Configuration")

# Basic config inputs in sidebar.
# NOTE(review): these assignments rebind module-level names (font_size,
# remove_images, margin, ...) that the notebook's configuration cell also
# sets — confirm this cell is meant to run as a separate Streamlit script
# rather than in the same kernel.
font_size = st.sidebar.slider("Font Size (pt)", 6, 20, 11)
heading_emphasizing_factor = st.sidebar.slider("Heading Emphasizing Factor", 1.0, 3.0, 2.2, 0.1)
background_color = st.sidebar.color_picker("Background Color", "#FFFFFF")
font_color = st.sidebar.color_picker("Font Color", "#000000")
font_family = st.sidebar.selectbox("Font Family", ["Arial, sans-serif", "Helvetica Neue, Arial, sans-serif", "Calibri, Arial, sans-serif", "Times New Roman, serif"])
page_size = st.sidebar.selectbox("Page Size", ["A4", "Letter", "Legal"])
orientation = st.sidebar.selectbox("Orientation", ["portrait", "landscape"])
line_height = st.sidebar.slider("Line Height", 1.0, 2.5, 1.5, 0.1)
show_hyperlink_url = st.sidebar.checkbox("Show Hyperlink URLs", value=False)
show_page_no = st.sidebar.checkbox("Show Page Numbers", value=False)
remove_images = st.sidebar.checkbox("Remove Images", value=False)
column_count = st.sidebar.slider("Column Count", 1, 3, 1)
text_alignment = st.sidebar.selectbox("Text Alignment", ["left", "justify", "center", "right"])
# NOTE(review): "small" is listed first here, whereas css_format() falls back
# to "large" when the key is missing — presumably the selectbox preselects the
# first option; confirm the intended default.
image_size = st.sidebar.selectbox("Image Size", ["small", "large"])

# Margin calculated as font_size * ratio (like your original code)
margin_ratio = 1.25
margin = font_size * margin_ratio

# Bundle the widget values into the dict that css_format(config) reads
config = {
    "font_size": font_size,
    "heading_emphasizing_factor": heading_emphasizing_factor,
    "background_color": background_color,
    "font_color": font_color,
    "font_family": font_family,
    "page_size": page_size,
    "orientation": orientation,
    "line_height": line_height,
    "show_hyperlink_url": show_hyperlink_url,
    "show_page_no": show_page_no,
    "remove_images": remove_images,
    "column_count": column_count,
    "text_alignment": text_alignment,
    "image_size": image_size,
    "margin": margin
}

# Button handler: extract every URL, join the results and offer one combined PDF
if st.button("Generate PDF"):
    if not urls:
        st.error("Please enter at least one URL.")
    else:
        all_html = ""
        for url in urls:
            st.write(f"Processing: {url}")
            title, content = extract_main_content(url, remove_images=remove_images)
            if title and content:
                # css depends only on config, so it is identical for every URL
                css = css_format(config)
                full_html = build_full_html_with_print_css(title, content, css)
                # NOTE(review): each full_html is a complete document
                # (<!DOCTYPE html> ... </html>), so all_html is several whole
                # documents joined by a page-break paragraph — confirm the PDF
                # renderer handles this as intended.
                all_html += full_html + "<p style='page-break-after: always;'></p>"
            else:
                st.warning(f"Could not extract content for URL: {url}")

        if all_html:
            # Generate combined PDF
            # delete=False: the temp file is reopened by name below and removed explicitly
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                generate_pdf(all_html, tmp_pdf.name)
                st.success("PDF generated successfully!")
                # Provide download link
                with open(tmp_pdf.name, "rb") as f:
                    btn = st.download_button(
                        label="Download PDF",
                        data=f,
                        file_name="output.pdf",
                        mime="application/pdf"
                    )
                # NOTE(review): not reached if generate_pdf() or the download
                # button raises, in which case the temp file is left behind.
                os.unlink(tmp_pdf.name)
        else:
            st.error("No content extracted from provided URLs.")


In [None]:
!pip install streamlit

In [None]:
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py