In [None]:
!pip install playwright requests
!playwright install

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

In [None]:
import os
import time
from urllib.parse import urljoin
from playwright.sync_api import sync_playwright
import requests
from IPython.display import display, HTML

In [None]:
# Directory the generated PDF is written to (created on demand by process_single_url).
OUTPUT_DIR = "/content/articles"  # Using /content in Colab
# CSV report that process_single_url writes when the conversion fails.
FAILED_CSV = "/content/failed_urls.csv"
# Site root that validate_url uses to resolve paths starting with "/".
BASE_URL = "https://economictimes.indiatimes.com"
# The single article this notebook converts.
TARGET_URL = "http://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/pcis-mdr-plea-indias-ai-startups-accelerated/articleshow/119449885.cms"


In [None]:
def validate_url(url):
    """Return ``url`` as an absolute web address.

    Paths with a leading ``/`` are resolved against ``BASE_URL``; URLs that
    already start with ``http://`` or ``https://`` are returned unchanged.

    Raises:
        ValueError: if ``url`` is neither of the above.
    """
    if url.startswith('/'):
        absolute = urljoin(BASE_URL, url)
        return absolute

    accepted_schemes = ('http://', 'https://')
    if url.startswith(accepted_schemes):
        return url
    raise ValueError(f"Invalid URL format: {url}")

def create_clean_article_pdf(url, pdf_path):
    """Save a reader-mode version of the article at ``url`` as a PDF.

    The page is opened in Chromium, page furniture (ads, header, footer,
    related stories, scripts) is removed, Mozilla's Readability.js extracts
    the article, and the extracted markup is printed to ``pdf_path`` as an
    A4 document.

    Args:
        url: Absolute ``http(s)`` URL, or a path starting with ``/`` that
            ``validate_url`` resolves against ``BASE_URL``.
        pdf_path: File path the PDF is written to.

    Returns:
        bool: ``True`` when the PDF was written, ``False`` on any failure.
        Failures are reported with ``print``; no exception reaches the caller.
    """
    # Function-scope import: the notebook's import cell does not provide html.escape.
    from html import escape

    try:
        url = validate_url(url)
        with sync_playwright() as p:
            browser = p.chromium.launch()
            try:
                page = browser.new_page()

                # Load Readability.js
                try:
                    response = requests.get(
                        "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                        timeout=10
                    )
                    response.raise_for_status()
                    readability_script = response.text
                except Exception as e:
                    print(f"Readability.js load failed: {str(e)}")
                    return False

                # Navigation
                try:
                    page.goto(url, timeout=60000)
                    # Wait for content to load
                    page.wait_for_load_state("networkidle")
                except Exception as e:
                    print(f"Navigation failed: {str(e)}")
                    return False

                # Remove unwanted elements. Best effort: a failure is reported
                # but does not abort the conversion.
                # NOTE(review): '[class*="ad"]' matches every class name that
                # merely contains the letters "ad" (e.g. "header", "loading"),
                # so it may strip more than adverts - confirm on target pages.
                try:
                    page.evaluate("""() => {
                        const safeRemove = selector => {
                            document.querySelectorAll(selector).forEach(el => el.remove());
                        };
                        [
                            '.recommendedStories', '.socialShares', '.newsletter',
                            '.discoverTheStory', '.et_related', '[class*="ad"]',
                            'header', 'footer', 'iframe', 'script'
                        ].forEach(safeRemove);
                    }""")
                except Exception as e:
                    print(f"Element removal failed: {str(e)}")

                # Extract content. The Readability source is inlined into the
                # evaluated function; any JS error (including parse() yielding
                # no article) is turned into a null result.
                try:
                    article_json = page.evaluate(f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline
                            }};
                        }} catch (e) {{
                            return null;
                        }}
                    }}""")
                    if not article_json:
                        raise Exception("Content extraction failed")
                except Exception as e:
                    print(f"Content error: {str(e)}")
                    return False

                # Generate PDF
                try:
                    # Title and byline are inserted as text, so they are
                    # HTML-escaped; `content` is the extracted article markup
                    # and is inserted unchanged.
                    title_html = escape(article_json.get('title') or '')
                    # A missing byline arrives as None; omit the element
                    # instead of printing the literal text "None".
                    byline_text = article_json.get('byline')
                    byline_html = (
                        f'<div class="byline">{escape(byline_text)}</div>'
                        if byline_text else ''
                    )
                    body_html = article_json.get('content') or ''

                    page.set_content(f"""
                        <html>
                            <head>
                                <style>
                                    body {{
                                        max-width: 800px;
                                        margin: 0 auto;
                                        padding: 20px;
                                        font-family: Arial;
                                        line-height: 1.6;
                                    }}
                                    img {{ max-width: 100%; }}
                                    .byline {{ color: #666; margin-bottom: 20px; }}
                                </style>
                            </head>
                            <body>
                                <h1>{title_html}</h1>
                                {byline_html}
                                {body_html}
                            </body>
                        </html>
                    """)

                    # Fixed PDF margin configuration
                    page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '15mm',
                            'right': '15mm',
                            'bottom': '15mm',
                            'left': '15mm'
                        },
                        print_background=False
                    )
                    return True

                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False
            finally:
                # Runs on every exit path, including the early returns above.
                browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

In [None]:
def process_single_url():
    """Convert ``TARGET_URL`` to ``article.pdf`` inside ``OUTPUT_DIR``.

    On success a download link is displayed. On failure the URL and the error
    message are written to ``FAILED_CSV``, replacing any previous report.

    Returns:
        None
    """
    # Function-scope import: the notebook's import cell does not import csv,
    # so the failure path would otherwise depend on a later cell having run.
    import csv

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    url = TARGET_URL
    pdf_path = os.path.join(OUTPUT_DIR, "article.pdf")

    print(f"Processing URL: {url}")

    try:
        if not create_clean_article_pdf(url, pdf_path):
            raise Exception("Processing failed")
        print("\nSuccessfully created PDF!")

        # Display download link in Colab
        # NOTE(review): the href is a local filesystem path - confirm that the
        # notebook front end actually serves it.
        display(HTML(f'<a href="{pdf_path}" download>Download PDF</a>'))
    except Exception as e:
        print(f"Failed: {str(e)}")
        # The csv module expects files opened with newline='' so that it
        # controls line endings itself.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerow({'url': url, 'error': str(e)})

In [None]:
# process_single_url() uses csv.DictWriter when it records a failure, so csv
# has to be importable by the time the call below runs.
import csv
process_single_url()


Processing URL: http://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/pcis-mdr-plea-indias-ai-startups-accelerated/articleshow/119449885.cms
Processing failed: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.
Failed: Processing failed


In [None]:
# Step 1: Install required packages
!pip install playwright requests
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio

# Step 3: Configuration
OUTPUT_DIR = "/content/articles"  # Using /content in Colab
FAILED_CSV = "/content/failed_urls.csv"
BASE_URL = "https://economictimes.indiatimes.com"
TARGET_URL = "http://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/pcis-mdr-plea-indias-ai-startups-accelerated/articleshow/119449885.cms"

# Step 4: Define functions
def validate_url(url):
    """Normalise ``url`` to an absolute address or reject it.

    A leading ``/`` means "relative to ``BASE_URL``"; ``http://`` and
    ``https://`` URLs pass through untouched; every other value raises
    ``ValueError``.
    """
    is_site_relative = url.startswith('/')
    has_web_scheme = url.startswith(('http://', 'https://'))

    if is_site_relative:
        return urljoin(BASE_URL, url)
    if has_web_scheme:
        return url
    raise ValueError(f"Invalid URL format: {url}")

async def create_clean_article_pdf(url, pdf_path):
    """Save a reader-mode version of the article at ``url`` as a PDF (async).

    The page is opened in Chromium, page furniture (ads, header, footer,
    related stories, scripts) is removed, Mozilla's Readability.js extracts
    the article, and the extracted markup is printed to ``pdf_path`` as an
    A4 document.

    Args:
        url: Absolute ``http(s)`` URL, or a path starting with ``/`` that
            ``validate_url`` resolves against ``BASE_URL``.
        pdf_path: File path the PDF is written to.

    Returns:
        bool: ``True`` when the PDF was written, ``False`` on any failure.
        Failures are reported with ``print``; no exception reaches the caller.
    """
    # Function-scope import: the cell's import section does not provide html.escape.
    from html import escape

    try:
        url = validate_url(url)
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()

                # Load Readability.js
                # NOTE: requests.get is a blocking call made from a coroutine.
                try:
                    response = requests.get(
                        "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                        timeout=10
                    )
                    response.raise_for_status()
                    readability_script = response.text
                except Exception as e:
                    print(f"Readability.js load failed: {str(e)}")
                    return False

                # Navigation
                try:
                    await page.goto(url, timeout=60000)
                    await page.wait_for_load_state("networkidle")
                except Exception as e:
                    print(f"Navigation failed: {str(e)}")
                    return False

                # Remove unwanted elements. Best effort: a failure is reported
                # but does not abort the conversion.
                # NOTE(review): '[class*="ad"]' matches every class name that
                # merely contains the letters "ad" (e.g. "header", "loading"),
                # so it may strip more than adverts - confirm on target pages.
                try:
                    await page.evaluate("""() => {
                        const safeRemove = selector => {
                            document.querySelectorAll(selector).forEach(el => el.remove());
                        };
                        [
                            '.recommendedStories', '.socialShares', '.newsletter',
                            '.discoverTheStory', '.et_related', '[class*="ad"]',
                            'header', 'footer', 'iframe', 'script'
                        ].forEach(safeRemove);
                    }""")
                except Exception as e:
                    print(f"Element removal failed: {str(e)}")

                # Extract content. Any JS error (including parse() yielding no
                # article) is turned into a null result.
                try:
                    article_json = await page.evaluate(f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline
                            }};
                        }} catch (e) {{
                            return null;
                        }}
                    }}""")
                    if not article_json:
                        raise Exception("Content extraction failed")
                except Exception as e:
                    print(f"Content error: {str(e)}")
                    return False

                # Generate PDF
                try:
                    # Title and byline are inserted as text, so they are
                    # HTML-escaped; `content` is the extracted article markup
                    # and is inserted unchanged.
                    title_html = escape(article_json.get('title') or '')
                    # A missing byline arrives as None; omit the element
                    # instead of printing the literal text "None".
                    byline_text = article_json.get('byline')
                    byline_html = (
                        f'<div class="byline">{escape(byline_text)}</div>'
                        if byline_text else ''
                    )
                    body_html = article_json.get('content') or ''

                    await page.set_content(f"""
                        <html>
                            <head>
                                <style>
                                    body {{
                                        max-width: 800px;
                                        margin: 0 auto;
                                        padding: 20px;
                                        font-family: Arial;
                                        line-height: 1.6;
                                    }}
                                    img {{ max-width: 100%; }}
                                    .byline {{ color: #666; margin-bottom: 20px; }}
                                </style>
                            </head>
                            <body>
                                <h1>{title_html}</h1>
                                {byline_html}
                                {body_html}
                            </body>
                        </html>
                    """)

                    # Fixed PDF margin configuration
                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '15mm',
                            'right': '15mm',
                            'bottom': '15mm',
                            'left': '15mm'
                        },
                        print_background=False
                    )
                    return True

                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False
            finally:
                # Runs on every exit path, including the early returns above.
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process the single URL
async def process_single_url():
    """Convert ``TARGET_URL`` to ``article.pdf`` inside ``OUTPUT_DIR`` (async).

    On success a download link is displayed. On failure the URL and the error
    message are written to ``FAILED_CSV``, replacing any previous report.

    Returns:
        None
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    url = TARGET_URL
    pdf_path = os.path.join(OUTPUT_DIR, "article.pdf")

    print(f"Processing URL: {url}")

    try:
        converted = await create_clean_article_pdf(url, pdf_path)
        if not converted:
            raise Exception("Processing failed")
        print("\nSuccessfully created PDF!")

        # Display download link in Colab
        # NOTE(review): the href is a local filesystem path - confirm that the
        # notebook front end actually serves it.
        display(HTML(f'<a href="{pdf_path}" download>Download PDF</a>'))
    except Exception as e:
        print(f"Failed: {str(e)}")
        # The csv module expects files opened with newline='' so that it
        # controls line endings itself.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerow({'url': url, 'error': str(e)})

# Step 6: Run the process
# A notebook kernel already runs an asyncio event loop in the main thread, and
# a second loop cannot be driven from that same thread ("Cannot run the event
# loop while another loop is running"). In that case the coroutine is run on a
# worker thread that owns its own loop.
def run_async_code():
    """Run ``process_single_url()`` to completion and return its result.

    Works both from a plain script (no loop running) and from inside a
    running event loop such as a Jupyter/Colab kernel. Exceptions raised by
    the coroutine propagate to the caller.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: asyncio.run creates and closes one.
        return asyncio.run(process_single_url())

    # A loop is already running here; block on a worker thread instead.
    # NOTE(review): print/display output produced on the worker thread is
    # expected to show up in the calling cell - confirm in the target front end.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(lambda: asyncio.run(process_single_url()))
        return future.result()

run_async_code()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

RuntimeError: Cannot run the event loop while another loop is running

In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries with nest-asyncio for Colab compatibility
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
OUTPUT_DIR = "/content/articles"  # Using /content in Colab
FAILED_CSV = "/content/failed_urls.csv"
BASE_URL = "https://economictimes.indiatimes.com"
TARGET_URL = "http://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/pcis-mdr-plea-indias-ai-startups-accelerated/articleshow/119449885.cms"

# Step 4: Define functions
def validate_url(url):
    """Resolve or validate ``url`` so it can be passed to the browser.

    Returns ``url`` joined onto ``BASE_URL`` when it starts with ``/``,
    returns it unchanged when it starts with ``http://`` or ``https://``,
    and raises ``ValueError`` for anything else.
    """
    if not url.startswith('/'):
        if url.startswith(('http://', 'https://')):
            return url
        raise ValueError(f"Invalid URL format: {url}")
    return urljoin(BASE_URL, url)

async def create_clean_article_pdf(url, pdf_path):
    """Save a reader-mode version of the article at ``url`` as a PDF (async).

    The page is opened in Chromium, page furniture (ads, header, footer,
    related stories, scripts) is removed, Mozilla's Readability.js extracts
    the article, and the extracted markup is printed to ``pdf_path`` as an
    A4 document. The browser is closed on every exit path.

    Args:
        url: Absolute ``http(s)`` URL, or a path starting with ``/`` that
            ``validate_url`` resolves against ``BASE_URL``.
        pdf_path: File path the PDF is written to.

    Returns:
        bool: ``True`` when the PDF was written, ``False`` on any failure.
        Failures are reported with ``print``; no exception reaches the caller.
    """
    # Function-scope import: the cell's import section does not provide html.escape.
    from html import escape

    try:
        url = validate_url(url)
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            # One try/finally replaces a close() call on each return path and
            # also covers exceptions raised outside the inner handlers.
            try:
                page = await browser.new_page()

                # Load Readability.js
                # NOTE: requests.get is a blocking call made from a coroutine.
                try:
                    response = requests.get(
                        "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                        timeout=10
                    )
                    response.raise_for_status()
                    readability_script = response.text
                except Exception as e:
                    print(f"Readability.js load failed: {str(e)}")
                    return False

                # Navigation
                try:
                    await page.goto(url, timeout=60000)
                    await page.wait_for_load_state("networkidle")
                except Exception as e:
                    print(f"Navigation failed: {str(e)}")
                    return False

                # Remove unwanted elements. Best effort: a failure is reported
                # but does not abort the conversion.
                # NOTE(review): '[class*="ad"]' matches every class name that
                # merely contains the letters "ad" (e.g. "header", "loading"),
                # so it may strip more than adverts - confirm on target pages.
                try:
                    await page.evaluate("""() => {
                        const safeRemove = selector => {
                            document.querySelectorAll(selector).forEach(el => el.remove());
                        };
                        [
                            '.recommendedStories', '.socialShares', '.newsletter',
                            '.discoverTheStory', '.et_related', '[class*="ad"]',
                            'header', 'footer', 'iframe', 'script'
                        ].forEach(safeRemove);
                    }""")
                except Exception as e:
                    print(f"Element removal failed: {str(e)}")

                # Extract content. Any JS error (including parse() yielding no
                # article) is turned into a null result.
                try:
                    article_json = await page.evaluate(f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline
                            }};
                        }} catch (e) {{
                            return null;
                        }}
                    }}""")
                    if not article_json:
                        raise Exception("Content extraction failed")
                except Exception as e:
                    print(f"Content error: {str(e)}")
                    return False

                # Generate PDF
                try:
                    # Title and byline are inserted as text, so they are
                    # HTML-escaped; `content` is the extracted article markup
                    # and is inserted unchanged.
                    title_html = escape(article_json.get('title') or '')
                    # A missing byline arrives as None; omit the element
                    # instead of printing the literal text "None".
                    byline_text = article_json.get('byline')
                    byline_html = (
                        f'<div class="byline">{escape(byline_text)}</div>'
                        if byline_text else ''
                    )
                    body_html = article_json.get('content') or ''

                    await page.set_content(f"""
                        <html>
                            <head>
                                <style>
                                    body {{
                                        max-width: 800px;
                                        margin: 0 auto;
                                        padding: 20px;
                                        font-family: Arial;
                                        line-height: 1.6;
                                    }}
                                    img {{ max-width: 100%; }}
                                    .byline {{ color: #666; margin-bottom: 20px; }}
                                </style>
                            </head>
                            <body>
                                <h1>{title_html}</h1>
                                {byline_html}
                                {body_html}
                            </body>
                        </html>
                    """)

                    # Fixed PDF margin configuration
                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '15mm',
                            'right': '15mm',
                            'bottom': '15mm',
                            'left': '15mm'
                        },
                        print_background=False
                    )
                    return True

                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process the single URL
async def process_single_url():
    """Convert ``TARGET_URL`` to ``article.pdf`` inside ``OUTPUT_DIR`` (async).

    Shows a download link when the conversion succeeds; otherwise records the
    URL and error message in ``FAILED_CSV`` (the file is overwritten).

    Returns:
        None
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    url = TARGET_URL
    pdf_path = os.path.join(OUTPUT_DIR, "article.pdf")

    print(f"Processing URL: {url}")

    try:
        succeeded = await create_clean_article_pdf(url, pdf_path)
        if not succeeded:
            raise Exception("Processing failed")
        print("\nSuccessfully created PDF!")

        # Display download link in Colab
        # NOTE(review): the href is a local filesystem path - confirm that the
        # notebook front end actually serves it.
        display(HTML(f'<a href="{pdf_path}" download>Download PDF</a>'))
    except Exception as e:
        print(f"Failed: {str(e)}")
        # The csv module expects files opened with newline='' so that it
        # controls line endings itself.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerow({'url': url, 'error': str(e)})

# Step 6: Run the process in Colab's environment
# The coroutine is awaited directly at the top level of the cell, which
# IPython-based notebooks support; this is not valid in a plain .py script.
await process_single_url()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"
BASE_URL = "https://economictimes.indiatimes.com"
TARGET_URL = "http://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/pcis-mdr-plea-indias-ai-startups-accelerated/articleshow/119449885.cms"

# Increased timeouts
NAVIGATION_TIMEOUT = 120000  # 120 seconds
REQUEST_TIMEOUT = 30000  # 30 seconds

# Step 4: Define functions
def validate_url(url):
    """Produce the absolute URL to load for ``url``.

    ``/path`` values are joined onto ``BASE_URL``, ``http://`` / ``https://``
    values are kept as they are, and any other value raises ``ValueError``.
    """
    if url.startswith('/'):
        resolved = urljoin(BASE_URL, url)
    elif url.startswith(('http://', 'https://')):
        resolved = url
    else:
        raise ValueError(f"Invalid URL format: {url}")
    return resolved

async def create_clean_article_pdf(url, pdf_path):
    """Save a reader-mode version of the article at ``url`` as a PDF (async).

    The page is opened in Chromium with ``NAVIGATION_TIMEOUT`` as the default
    timeout, page furniture is removed selector by selector, Mozilla's
    Readability.js extracts the article, and the extracted markup is printed
    to ``pdf_path`` as an A4 document (one retry on failure). The browser is
    closed on every exit path.

    Args:
        url: Absolute ``http(s)`` URL, or a path starting with ``/`` that
            ``validate_url`` resolves against ``BASE_URL``.
        pdf_path: File path the PDF is written to.

    Returns:
        bool: ``True`` when the PDF was written, ``False`` on any failure.
        Failures are reported with ``print``; no exception reaches the caller.
    """
    # Function-scope import: the cell's import section does not provide html.escape.
    from html import escape

    try:
        url = validate_url(url)
        async with async_playwright() as p:
            # Launch browser with increased timeout
            browser = await p.chromium.launch(timeout=NAVIGATION_TIMEOUT)
            # One try/finally replaces a close() call on each return path and
            # also covers exceptions raised outside the inner handlers.
            try:
                context = await browser.new_context()
                page = await context.new_page()

                # Set default timeout for all actions
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                # Load Readability.js
                try:
                    response = requests.get(
                        "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                        # REQUEST_TIMEOUT is in milliseconds (30000 = 30 s) but
                        # requests expects seconds; passing it unconverted
                        # would mean a timeout of roughly 8.3 hours.
                        timeout=REQUEST_TIMEOUT / 1000
                    )
                    response.raise_for_status()
                    readability_script = response.text
                except Exception as e:
                    print(f"Readability.js load failed: {str(e)}")
                    return False

                # Navigation with multiple fallbacks
                try:
                    print("Attempting to load page...")
                    await page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded")

                    # Wait for either networkidle or a specific element
                    try:
                        await page.wait_for_load_state("networkidle", timeout=30000)
                    except Exception:
                        # `except Exception` rather than a bare `except`, so
                        # task cancellation and KeyboardInterrupt still propagate.
                        print("Falling back to element-based waiting")
                        await page.wait_for_selector("body", state="attached", timeout=30000)

                    print("Page loaded successfully")
                except Exception as e:
                    print(f"Navigation failed: {str(e)}")
                    return False

                # Remove unwanted elements one selector at a time, so a failure
                # on one selector does not prevent removing the others.
                # NOTE(review): '[class*="ad"]' matches every class name that
                # merely contains the letters "ad" (e.g. "header", "loading"),
                # so it may strip more than adverts - confirm on target pages.
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script'
                ]

                for selector in selectors_to_remove:
                    try:
                        # The selector is passed as an argument, not
                        # interpolated, so the script is a plain string.
                        await page.evaluate("""selector => {
                            document.querySelectorAll(selector).forEach(el => el.remove());
                        }""", selector)
                    except Exception as e:
                        print(f"Could not remove {selector}: {str(e)}")

                # Extract content. Any JS error (including parse() yielding no
                # article) is logged to the page console and becomes null.
                try:
                    print("Extracting article content...")
                    article_json = await page.evaluate(f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""")

                    if not article_json:
                        raise Exception("Content extraction failed")

                    print("Content extracted successfully")
                except Exception as e:
                    print(f"Content error: {str(e)}")
                    return False

                # Generate PDF
                try:
                    print("Generating PDF...")
                    # Title and byline are inserted as text, so they are
                    # HTML-escaped; `content` is the extracted article markup
                    # and is inserted unchanged.
                    title_html = escape(article_json.get('title') or '')
                    # A missing byline arrives as None; omit the element
                    # instead of printing the literal text "None".
                    byline_text = article_json.get('byline')
                    byline_html = (
                        f'<div class="byline">{escape(byline_text)}</div>'
                        if byline_text else ''
                    )
                    body_html = article_json.get('content') or ''

                    await page.set_content(f"""
                        <html>
                            <head>
                                <style>
                                    body {{
                                        max-width: 800px;
                                        margin: 0 auto;
                                        padding: 20px;
                                        font-family: Arial;
                                        line-height: 1.6;
                                    }}
                                    img {{ max-width: 100%; }}
                                    .byline {{ color: #666; margin-bottom: 20px; }}
                                </style>
                            </head>
                            <body>
                                <h1>{title_html}</h1>
                                {byline_html}
                                {body_html}
                            </body>
                        </html>
                    """)

                    # Generate PDF with retry: both attempts share one options
                    # dict, and a second failure falls through to the handler
                    # below.
                    pdf_options = {
                        'path': pdf_path,
                        'format': 'A4',
                        'margin': {
                            'top': '15mm',
                            'right': '15mm',
                            'bottom': '15mm',
                            'left': '15mm'
                        },
                        'print_background': False,
                    }
                    try:
                        await page.pdf(**pdf_options)
                    except Exception as e:
                        print(f"First PDF attempt failed, retrying: {str(e)}")
                        await page.pdf(**pdf_options)

                    return True

                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process the single URL
async def process_single_url():
    """Convert ``TARGET_URL`` to ``article.pdf`` inside ``OUTPUT_DIR`` (async).

    A download link is displayed after a successful conversion. If the
    conversion fails, ``FAILED_CSV`` is rewritten with the URL and the error
    message.

    Returns:
        None
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    url = TARGET_URL
    pdf_path = os.path.join(OUTPUT_DIR, "article.pdf")

    print(f"Processing URL: {url}")

    try:
        pdf_written = await create_clean_article_pdf(url, pdf_path)
        if not pdf_written:
            raise Exception("Processing failed")
        print("\nSuccessfully created PDF!")

        # Display download link in Colab
        # NOTE(review): the href is a local filesystem path - confirm that the
        # notebook front end actually serves it.
        display(HTML(f'<a href="{pdf_path}" download>Download PDF</a>'))
    except Exception as e:
        print(f"Failed: {str(e)}")
        # The csv module expects files opened with newline='' so that it
        # controls line endings itself.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerow({'url': url, 'error': str(e)})

# Step 6: Run the process
# The coroutine is awaited directly at the top level of the cell, which
# IPython-based notebooks support; this is not valid in a plain .py script.
await process_single_url()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"
BASE_URL = "https://economictimes.indiatimes.com"

# List of URLs to process
URLS = [
    "https://economictimes.indiatimes.com/tech/newsletters/tech-top-5/payments-body-seeks-mdr-return-turkey-opposition-x-ed-out/articleshow/119428423.cms",
    "https://economictimes.indiatimes.com/tech/technology/payments-body-writes-to-pmo-seeks-return-of-0-3-mdr-on-upi-for-large-merchants-and-rupay-debit-cards/articleshow/119423280.cms",
    "https://economictimes.indiatimes.com/news/new-updates/cheating-but-still-filing-fake-rape-case-rippling-cofounder-prasanna-sankar-makes-sensational-allegation-against-wife/articleshow/119410129.cms",
    "https://economictimes.indiatimes.com/tech/newsletters/ettech-unwrapped/top-tech-and-startup-stories-this-week/articleshow/119324661.cms",
    "https://economictimes.indiatimes.com/markets/stocks/news/sebi-removes-over-70000-misleading-social-media-posts-and-handles-since-october-last-year/articleshow/119325989.cms"
]

# Timeout settings
NAVIGATION_TIMEOUT = 120000  # 120 seconds
REQUEST_TIMEOUT = 30000  # 30 seconds

# Step 4: Define functions
def validate_url(url):
    """Return an absolute http(s) URL for ``url``.

    A value beginning with '/' is treated as site-relative and joined onto
    BASE_URL. Any other value must already start with http:// or https://;
    otherwise ValueError is raised.
    """
    if url.startswith('/'):
        absolute = urljoin(BASE_URL, url)
        return absolute

    has_http_scheme = url.startswith('http://') or url.startswith('https://')
    if has_http_scheme:
        return url
    raise ValueError(f"Invalid URL format: {url}")

def sanitize_filename(title):
    """Reduce an article title to characters that are safe in a file name.

    Letters, digits, spaces, dots, underscores and hyphens are kept; every
    other character is dropped and trailing whitespace is trimmed.
    """
    allowed_extras = " ._-"
    kept = [ch for ch in title if ch.isalnum() or ch in allowed_extras]
    return "".join(kept).rstrip()

def _render_article_html(article, url):
    """Build the standalone HTML page that is printed to PDF.

    ``article`` is the dict produced by the in-page extraction script (keys
    ``title``, ``content``, ``byline``, ``excerpt``). ``content`` is markup
    and is inserted verbatim; the plain-text fields and the source URL are
    HTML-escaped so characters such as ``<`` or ``&`` cannot break the page.
    """
    from html import escape  # function-scope import: html is not imported at file level

    css = """
        body {
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            font-family: Arial, sans-serif;
            line-height: 1.6;
            color: #333;
        }
        h1 {
            font-size: 24px;
            margin-bottom: 10px;
            color: #222;
        }
        .byline {
            color: #666;
            margin-bottom: 20px;
            font-style: italic;
        }
        .excerpt {
            font-weight: bold;
            margin-bottom: 20px;
            color: #444;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 10px 0;
        }
        a {
            color: #0066cc;
            text-decoration: none;
        }
        @media print {
            body { padding: 0; }
        }
    """
    title = escape(str(article.get('title') or ''))
    byline = article.get('byline')
    excerpt = article.get('excerpt')
    byline_html = f'<div class="byline">{escape(str(byline))}</div>' if byline else ''
    excerpt_html = f'<div class="excerpt">{escape(str(excerpt))}</div>' if excerpt else ''
    safe_url = escape(url, quote=True)

    return f"""
        <html>
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
                <style>{css}</style>
            </head>
            <body>
                <h1>{title}</h1>
                {byline_html}
                {excerpt_html}
                {article['content']}
                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                    Source: <a href="{safe_url}">{safe_url}</a>
                </div>
            </body>
        </html>
    """


async def create_clean_article_pdf(url, pdf_path):
    """Render the article at ``url`` as a decluttered PDF written to ``pdf_path``.

    Steps: download Readability.js, open the page in headless Chromium, strip
    known clutter elements, extract the article with Readability, then print a
    minimal HTML rendering of it to an A4 PDF.

    Args:
        url: Absolute http(s) URL, or a site-relative path (see validate_url).
        pdf_path: Destination file path for the PDF.

    Returns:
        True when the PDF was written; False on any failure. Failures are
        printed, never raised.
    """
    try:
        url = validate_url(url)

        # Fetch Readability.js before starting a browser so a failed download
        # costs nothing else.
        # NOTE(review): this pulls the unpinned master branch and executes it
        # in the page; consider pinning a commit.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                # REQUEST_TIMEOUT is configured in milliseconds (Playwright's
                # unit); requests expects seconds.
                timeout=REQUEST_TIMEOUT / 1000
            )
            response.raise_for_status()
            readability_script = response.text
        except Exception as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        async with async_playwright() as p:
            # Launch browser with increased timeout and headless mode
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # The finally below closes the browser on every exit path.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080}
                )
                page = await context.new_page()

                # Set default timeout for all actions
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                # Navigation with a fallback readiness check
                try:
                    print(f"\nLoading: {url}")
                    await page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded")

                    # Prefer network idle; if that never happens, settle for an
                    # article-like container being attached to the DOM.
                    try:
                        await page.wait_for_load_state("networkidle", timeout=30000)
                    except Exception:
                        print("Using fallback waiting strategy")
                        await page.wait_for_selector("article, .article, .content, .story, .main-content",
                                                  state="attached", timeout=30000)

                    print("Page loaded successfully")
                except Exception as e:
                    print(f"Navigation failed: {str(e)}")
                    return False

                # Remove unwanted elements.
                # NOTE(review): '[class*="ad"]' is a substring match, so it also
                # hits classes such as "headline" or "lead".
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup'
                ]
                try:
                    # One round trip for all selectors; each selector is guarded
                    # in-page so a bad one cannot stop the rest.
                    await page.evaluate("""selectors => {
                        for (const selector of selectors) {
                            try {
                                document.querySelectorAll(selector).forEach(el => el.remove());
                            } catch (err) {
                                // best effort: skip selectors the page rejects
                            }
                        }
                    }""", selectors_to_remove)
                except Exception as e:
                    # Clean-up is best effort; extraction can still succeed.
                    print(f"Element clean-up skipped: {str(e)}")

                # Extract content
                try:
                    print("Extracting article content...")
                    article_json = await page.evaluate(f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) {{
                                console.error('Readability returned null');
                                return null;
                            }}
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""")

                    if not article_json or not article_json.get('content'):
                        raise Exception("Content extraction failed")

                    print(f"Extracted: {(article_json.get('title') or '')[:50]}...")
                except Exception as e:
                    print(f"Content error: {str(e)}")
                    return False

                # Generate PDF
                try:
                    print("Generating PDF...")
                    await page.set_content(_render_article_html(article_json, url))

                    # Generate PDF with retry
                    max_attempts = 2
                    for attempt in range(max_attempts):
                        try:
                            await page.pdf(
                                path=pdf_path,
                                format='A4',
                                margin={
                                    'top': '20mm',
                                    'right': '20mm',
                                    'bottom': '20mm',
                                    'left': '20mm'
                                },
                                print_background=False,
                                scale=0.9
                            )
                            break
                        except Exception:
                            if attempt == max_attempts - 1:
                                raise
                            print(f"PDF attempt {attempt + 1} failed, retrying...")
                            await asyncio.sleep(2)

                    return True

                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process multiple URLs
async def process_urls():
    """Convert every URL in URLS to a PDF inside a fresh timestamped batch folder.

    For each URL the page title is read first (it becomes part of the file
    name), then create_clean_article_pdf() does the conversion. URLs that fail
    are collected and written to FAILED_CSV; a summary is printed at the end.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(OUTPUT_DIR, f"batch_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    failed = []
    success_count = 0

    for index, url in enumerate(URLS, 1):
        print(f"\n{'='*50}")
        print(f"Processing URL {index} of {len(URLS)}")

        try:
            # First get the title to use in filename. A failure here only
            # costs the nice file name, so fall back to a generic one instead
            # of giving up on the URL.
            title = f"Article_{index}"
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                    title = await page.title()
                except Exception as e:
                    print(f"Could not read page title, using '{title}': {str(e)}")
                finally:
                    await browser.close()

            # Limit title length for filename; an empty result would give "N_.pdf".
            safe_title = sanitize_filename(title[:50]) or f"Article_{index}"
            pdf_name = f"{index}_{safe_title}.pdf"
            pdf_path = os.path.join(output_dir, pdf_name)

            if await create_clean_article_pdf(url, pdf_path):
                print(f"Successfully created: {pdf_name}")
                success_count += 1
                # Display download link for the first successful PDF
                # NOTE(review): href is a filesystem path; confirm the notebook
                # frontend can actually serve it.
                if success_count == 1:
                    display(HTML(f'<a href="{pdf_path}" download>Download First PDF: {pdf_name}</a>'))
            else:
                raise Exception("PDF creation failed")

        except Exception as e:
            print(f"Failed to process {url}: {str(e)}")
            failed.append({'url': url, 'error': str(e)})
            continue

        # Small delay between requests
        await asyncio.sleep(2)

    # Save failed URLs to CSV
    if failed:
        # newline='' is what the csv module requires to avoid doubled line
        # endings; utf-8 keeps non-ASCII error text intact.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        # NOTE(review): this href points at a directory; confirm it is usable
        # as a download link.
        display(HTML(f'<a href="{output_dir}" download>Download All PDFs</a>'))

# Step 6: Run the process
# Top-level await: this line is only valid when executed as a notebook cell
# (IPython runs it on the kernel's event loop), not as a plain script.
await process_urls()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing URL 3 of 5

Loading: https://economictimes.indiatimes.com/news/new-updates/cheating-but-still-filing-fake-rape-case-rippling-cofounder-prasanna-sankar-makes-sensational-allegation-against-wife/articleshow/119410129.cms
Using fallback waiting strategy
Page loaded successfully
Extracting article content...
Extracted: 'Cheating, but still filing fake rape case': Rippl...
Generating PDF...
Successfully created: 3_Cheating but still filing fake rape case Rippl.pdf

Processing URL 4 of 5

Loading: https://economictimes.indiatimes.com/tech/newsletters/ettech-unwrapped/top-tech-and-startup-stories-this-week/articleshow/119324661.cms
Using fallback waiting strategy
Navigation failed: Page.wait_for_selector: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("article, .article, .content, .story, .main-content")

Failed to process https://economictimes.indiatimes.com/tech/newsletters/ettech-unwrapped/top-tech-and-startup-stories-this-week/articleshow/119324661.cms: PDF creati

In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
# OUTPUT_DIR is the parent folder; each run writes into its own timestamped
# "batch_..." subfolder. FAILED_CSV receives one row per URL that failed.
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"
# Site root that validate_url() joins site-relative paths onto.
BASE_URL = "https://economictimes.indiatimes.com"

# List of URLs to process
URLS = [
    "https://economictimes.indiatimes.com/tech/newsletters/tech-top-5/payments-body-seeks-mdr-return-turkey-opposition-x-ed-out/articleshow/119428423.cms",
    "https://economictimes.indiatimes.com/tech/technology/payments-body-writes-to-pmo-seeks-return-of-0-3-mdr-on-upi-for-large-merchants-and-rupay-debit-cards/articleshow/119423280.cms",
    "https://economictimes.indiatimes.com/news/new-updates/cheating-but-still-filing-fake-rape-case-rippling-cofounder-prasanna-sankar-makes-sensational-allegation-against-wife/articleshow/119410129.cms",
    "https://economictimes.indiatimes.com/tech/newsletters/ettech-unwrapped/top-tech-and-startup-stories-this-week/articleshow/119324661.cms",
    "https://economictimes.indiatimes.com/markets/stocks/news/sebi-removes-over-70000-misleading-social-media-posts-and-handles-since-october-last-year/articleshow/119325989.cms"
]

# Timeout settings (increased from previous version)
# All values are expressed in milliseconds, the unit Playwright's timeout
# arguments use.
NAVIGATION_TIMEOUT = 180000  # 180 seconds
# NOTE(review): REQUEST_TIMEOUT is also handed to requests.get(timeout=...),
# which interprets its argument as seconds, not milliseconds.
REQUEST_TIMEOUT = 45000  # 45 seconds
# Upper bound for each readiness check in try_loading_page().
SELECTOR_TIMEOUT = 45000  # 45 seconds

# Step 4: Define functions
def validate_url(url):
    """Normalise ``url`` to an absolute address.

    Site-relative paths (leading '/') are resolved against BASE_URL; values
    that already carry an http:// or https:// scheme are returned untouched.
    Anything else raises ValueError.
    """
    if not url.startswith('/'):
        if url.startswith(('http://', 'https://')):
            return url
        raise ValueError(f"Invalid URL format: {url}")
    return urljoin(BASE_URL, url)

def sanitize_filename(title):
    """Turn an article title into a string usable as a file name.

    Only alphanumeric characters plus space, '.', '_' and '-' survive;
    whitespace at the end of the result is removed.
    """
    def is_safe(ch):
        return ch.isalnum() or ch in (' ', '.', '_', '-')

    return "".join(filter(is_safe, title)).rstrip()

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until content appears.

    Navigation strategies are tried in order. After each navigation that does
    not raise, a series of readiness checks runs; the first check that passes
    ends the search. If no check passes, the next navigation strategy is
    tried, so the reload and "commit" fallbacks are reached even when the
    first navigation itself succeeded.

    Args:
        page: Playwright page (async API) to drive.
        url: Absolute URL to open.

    Returns:
        True as soon as one navigation + readiness check combination
        succeeds, False when every combination has failed.
    """
    strategies = [
        # Strategy 1: Standard navigation
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        # Strategy 2: Reload if needed
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        # Strategy 3: Bypass potential blockers
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit")
    ]

    # Readiness checks, from strictest to most lenient. They do not depend on
    # the navigation strategy, so they are built once.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT)
    ]

    for i, strategy in enumerate(strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await strategy()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_strategy in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_strategy()
                print("Page loaded successfully")
                return True
            except Exception:
                # Catch Exception only, so task cancellation and
                # KeyboardInterrupt still propagate.
                continue

        # None of the readiness checks passed: fall through to the next
        # navigation strategy instead of giving up.
        print(f"Strategy {i} loaded the page but no content appeared")

    return False

def _render_article_html(article, url):
    """Build the standalone HTML page that is printed to PDF.

    ``article`` is the dict produced by the in-page extraction script (keys
    ``title``, ``content``, ``byline``, ``excerpt``). ``content`` is markup
    and is inserted verbatim; the plain-text fields and the source URL are
    HTML-escaped so characters such as ``<`` or ``&`` cannot break the page.
    """
    from html import escape  # function-scope import: html is not imported at file level

    css = """
        body {
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            font-family: Arial, sans-serif;
            line-height: 1.6;
            color: #333;
        }
        h1 {
            font-size: 24px;
            margin-bottom: 10px;
            color: #222;
        }
        .byline {
            color: #666;
            margin-bottom: 20px;
            font-style: italic;
        }
        .excerpt {
            font-weight: bold;
            margin-bottom: 20px;
            color: #444;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 10px 0;
        }
        a {
            color: #0066cc;
            text-decoration: none;
        }
        @media print {
            body { padding: 0; }
        }
    """
    title = escape(str(article.get('title') or ''))
    byline = article.get('byline')
    excerpt = article.get('excerpt')
    byline_html = f'<div class="byline">{escape(str(byline))}</div>' if byline else ''
    excerpt_html = f'<div class="excerpt">{escape(str(excerpt))}</div>' if excerpt else ''
    safe_url = escape(url, quote=True)

    return f"""
        <html>
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
                <style>{css}</style>
            </head>
            <body>
                <h1>{title}</h1>
                {byline_html}
                {excerpt_html}
                {article['content']}
                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                    Source: <a href="{safe_url}">{safe_url}</a>
                </div>
            </body>
        </html>
    """


async def create_clean_article_pdf(url, pdf_path):
    """Render the article at ``url`` as a decluttered PDF written to ``pdf_path``.

    Steps: download Readability.js, open the page in headless Chromium via
    try_loading_page(), strip known clutter elements, extract the article
    (Readability first, a selector-based fallback second), then print a
    minimal HTML rendering of it to an A4 PDF.

    Args:
        url: Absolute http(s) URL, or a site-relative path (see validate_url).
        pdf_path: Destination file path for the PDF.

    Returns:
        True when the PDF was written; False on any failure. Failures are
        printed, never raised.
    """
    try:
        url = validate_url(url)

        # Fetch Readability.js before starting a browser so a failed download
        # costs nothing else.
        # NOTE(review): this pulls the unpinned master branch and executes it
        # in the page; consider pinning a commit.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                # REQUEST_TIMEOUT is configured in milliseconds (Playwright's
                # unit); requests expects seconds.
                timeout=REQUEST_TIMEOUT / 1000
            )
            response.raise_for_status()
            readability_script = response.text
        except Exception as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        async with async_playwright() as p:
            # Launch browser with increased timeout and headless mode
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # The finally below closes the browser on every exit path.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()

                # Set default timeout for all actions
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                # Navigation with multiple fallbacks
                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Remove unwanted elements.
                # NOTE(review): '[class*="ad"]' is a substring match, so it also
                # hits classes such as "headline" or "lead".
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section'
                ]
                try:
                    # One round trip for all selectors; each selector is guarded
                    # in-page so a bad one cannot stop the rest.
                    await page.evaluate("""selectors => {
                        for (const selector of selectors) {
                            try {
                                document.querySelectorAll(selector).forEach(el => el.remove());
                            } catch (err) {
                                // best effort: skip selectors the page rejects
                            }
                        }
                    }""", selectors_to_remove)
                except Exception as e:
                    # Clean-up is best effort; extraction can still succeed.
                    print(f"Element clean-up skipped: {str(e)}")

                # Extract content with multiple fallbacks
                article_json = None
                extraction_attempts = [
                    # Attempt 1: Standard Readability.js
                    f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) return null;
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""",

                    # Attempt 2: Fallback to article content extraction
                    """() => {
                        const article = document.querySelector('article, .article, .articleContent') ||
                                      document.querySelector('.content, .story, .main-content, .Normal');
                        if (!article) return null;

                        return {
                            title: document.title.replace(' - The Economic Times', ''),
                            content: article.innerHTML,
                            byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
                            excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
                        };
                    }"""
                ]

                for attempt, extraction_script in enumerate(extraction_attempts, 1):
                    try:
                        print(f"Attempting extraction method {attempt}...")
                        article_json = await page.evaluate(extraction_script)
                        if article_json and article_json.get('content'):
                            print(f"Extracted: {(article_json.get('title') or '')[:50]}...")
                            break
                    except Exception as e:
                        print(f"Extraction attempt {attempt} failed: {str(e)}")
                        continue

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # The document is the same for every attempt, so build it once.
                article_html = _render_article_html(article_json, url)

                # Generate PDF with multiple attempts
                max_pdf_attempts = 3
                for attempt in range(max_pdf_attempts):
                    try:
                        print(f"PDF generation attempt {attempt + 1}...")
                        await page.set_content(article_html)

                        await page.pdf(
                            path=pdf_path,
                            format='A4',
                            margin={
                                'top': '20mm',
                                'right': '20mm',
                                'bottom': '20mm',
                                'left': '20mm'
                            },
                            print_background=False,
                            scale=0.9
                        )
                        break
                    except Exception as e:
                        if attempt == max_pdf_attempts - 1:
                            print(f"PDF generation failed after {max_pdf_attempts} attempts: {str(e)}")
                            return False
                        print(f"PDF attempt {attempt + 1} failed, retrying...")
                        await asyncio.sleep(2)  # Small delay before retry

                return True
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process multiple URLs with retries
async def process_urls_with_retry():
    """Convert every URL in URLS to a PDF, retrying each URL once on failure.

    Each run writes into a fresh timestamped batch folder under OUTPUT_DIR.
    For every attempt the page title is read first (it becomes part of the
    file name), then create_clean_article_pdf() does the conversion. URLs
    that still fail after the last attempt are written to FAILED_CSV and
    listed in the printed summary.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(OUTPUT_DIR, f"batch_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    failed = []
    success_count = 0

    for index, url in enumerate(URLS, 1):
        print(f"\n{'='*50}")
        print(f"Processing URL {index} of {len(URLS)}")
        print(url)

        max_retries = 2

        for retry in range(max_retries):
            try:
                # First get the title to use in filename; fall back to a
                # generic name when the page cannot be read.
                title = f"Article_{index}"
                async with async_playwright() as p:
                    browser = await p.chromium.launch()
                    try:
                        page = await browser.new_page()
                        try:
                            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                            title = await page.title()
                        except Exception:
                            # Catch Exception only, so task cancellation and
                            # KeyboardInterrupt still propagate.
                            title = f"Article_{index}"
                    finally:
                        await browser.close()

                # Limit title length for filename; an empty result would give "N_.pdf".
                safe_title = sanitize_filename(title[:50]) or f"Article_{index}"
                pdf_name = f"{index}_{safe_title}.pdf"
                pdf_path = os.path.join(output_dir, pdf_name)

                if await create_clean_article_pdf(url, pdf_path):
                    print(f"Successfully created: {pdf_name}")
                    success_count += 1
                    # Display download link for the first successful PDF
                    # NOTE(review): href is a filesystem path; confirm the
                    # notebook frontend can actually serve it.
                    if success_count == 1:
                        display(HTML(f'<a href="{pdf_path}" download>Download First PDF: {pdf_name}</a>'))
                    break  # Success, no need to retry
                else:
                    if retry == max_retries - 1:
                        raise Exception("PDF creation failed after retries")
                    print(f"Attempt {retry + 1} failed, retrying...")
                    await asyncio.sleep(5)  # Wait before retry

            except Exception as e:
                if retry == max_retries - 1:
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    # Make the retry visible instead of looping silently.
                    print(f"Attempt {retry + 1} raised {str(e)}, retrying...")
                continue

        # Small delay between requests
        await asyncio.sleep(3)

    # Save failed URLs to CSV
    if failed:
        # newline='' is what the csv module requires to avoid doubled line
        # endings; utf-8 keeps non-ASCII error text intact.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        # NOTE(review): this href points at a directory; confirm it is usable
        # as a download link.
        display(HTML(f'<a href="{output_dir}" download>Download All PDFs</a>'))

    # Print failed URLs for reference
    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 6: Run the process
# Top-level await: this line is only valid when executed as a notebook cell
# (IPython runs it on the kernel's event loop), not as a plain script.
await process_urls_with_retry()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing URL 2 of 5
https://economictimes.indiatimes.com/tech/technology/payments-body-writes-to-pmo-seeks-return-of-0-3-mdr-on-upi-for-large-merchants-and-rupay-debit-cards/articleshow/119423280.cms
Attempting strategy 1...
Trying wait strategy 1...
Trying wait strategy 2...
Page loaded successfully
Attempting extraction method 1...
Extracted: Payments body writes to PMO, seeks return of MDR o...
PDF generation attempt 1...
Successfully created: 2_Payments body writes to PMO seeks return of MDR o.pdf

Processing URL 3 of 5
https://economictimes.indiatimes.com/news/new-updates/cheating-but-still-filing-fake-rape-case-rippling-cofounder-prasanna-sankar-makes-sensational-allegation-against-wife/articleshow/119410129.cms
Attempting strategy 1...
Trying wait strategy 1...
Trying wait strategy 2...
Page loaded successfully
Attempting extraction method 1...
Extracted: 'Cheating, but still filing fake rape case': Rippl...
PDF generation attempt 1...
Successfully created: 3_Cheating but sti

In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
# BASE_URL resolves site-relative links; each run writes its PDFs into a
# timestamped folder under OUTPUT_DIR and records failures in FAILED_CSV.
BASE_URL = "https://economictimes.indiatimes.com"
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"

# Articles to convert in this run (currently a single URL).
URLS = ["https://economictimes.indiatimes.com/tech/technology/payu-acquires-payment-solutions-company-mindgate-for-200-250-million/articleshow/119254923.cms"]

# Timeouts are millisecond counts (seconds * 1000).
# NOTE(review): requests.get() reads its timeout in seconds, so convert before
# handing one of these values to it.
NAVIGATION_TIMEOUT = 180 * 1000  # 180 seconds
REQUEST_TIMEOUT = 45 * 1000  # 45 seconds
SELECTOR_TIMEOUT = 45 * 1000  # 45 seconds

# Step 4: Define functions
def validate_url(url):
    """Return an absolute http(s) URL for ``url``.

    Site-relative paths (leading ``/``) are resolved against ``BASE_URL``;
    URLs that already start with ``http://`` or ``https://`` are returned
    unchanged.

    Raises:
        ValueError: if ``url`` is neither site-relative nor http(s).
    """
    is_site_relative = url.startswith('/')
    if is_site_relative:
        return urljoin(BASE_URL, url)

    has_http_scheme = url.startswith(('http://', 'https://'))
    if has_http_scheme:
        return url

    raise ValueError(f"Invalid URL format: {url}")

def sanitize_filename(title):
    """Reduce an article title to characters that are safe in a filename.

    Keeps alphanumeric characters plus space, dot, underscore and hyphen,
    drops everything else, and strips trailing whitespace from the result.
    """
    allowed_punctuation = {' ', '.', '_', '-'}
    kept = []
    for ch in title:
        if ch.isalnum() or ch in allowed_punctuation:
            kept.append(ch)
    return "".join(kept).rstrip()

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until content appears.

    Navigation strategies are tried in order: a normal ``goto``, a reload of
    the current page, and a ``goto`` that only waits for the navigation to
    commit. After each navigation that does not raise, the wait strategies are
    tried in order: network idle, a known article container, then any
    heading or paragraph. The first wait that completes ends the search.

    A navigation that succeeds but yields no content signal now falls through
    to the next navigation strategy instead of giving up immediately, so the
    reload and "commit" fallbacks are actually reachable.

    Args:
        page: Playwright page (async API) to drive.
        url: absolute URL to load.

    Returns:
        True as soon as one wait strategy completes, False if every
        navigation/wait combination failed.
    """
    navigation_strategies = [
        # Strategy 1: Standard navigation
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        # Strategy 2: Reload if needed
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        # Strategy 3: Bypass potential blockers
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit"),
    ]

    # The wait strategies do not depend on which navigation ran, so build them once.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT),
    ]

    for i, navigate in enumerate(navigation_strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await navigate()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_for_content in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_for_content()
            except Exception:
                # Best effort: a timed-out wait just means "try the next signal".
                # `Exception` (not a bare except) lets task cancellation and
                # KeyboardInterrupt propagate.
                continue
            print("Page loaded successfully")
            return True

        # Navigation worked but no content signal appeared; try the next strategy.

    return False

def _build_article_html(article_json, url):
    """Render the extracted article as a self-contained, print-friendly page.

    Args:
        article_json: dict with ``title`` and ``content`` and optionally
            ``byline`` / ``excerpt``, as returned by the in-page extraction
            scripts.
        url: source URL shown in the footer link.

    Returns:
        The HTML document as a string. ``title``, ``byline``, ``excerpt`` and
        ``url`` are expected to be plain text and are HTML-escaped;
        ``content`` is article markup and is embedded verbatim.
    """
    from html import escape  # function-scope import: the cell's import list stays untouched

    title = escape(article_json.get('title') or '')
    byline = article_json.get('byline')
    excerpt = article_json.get('excerpt')
    byline_html = f'<div class="byline">{escape(byline)}</div>' if byline else ''
    excerpt_html = f'<div class="excerpt">{escape(excerpt)}</div>' if excerpt else ''
    source_url = escape(url)

    return f"""
        <html>
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
                <style>
                    body {{
                        max-width: 800px;
                        margin: 0 auto;
                        padding: 20px;
                        font-family: Arial, sans-serif;
                        line-height: 1.6;
                        color: #333;
                    }}
                    h1 {{
                        font-size: 24px;
                        margin-bottom: 10px;
                        color: #222;
                    }}
                    .byline {{
                        color: #666;
                        margin-bottom: 20px;
                        font-style: italic;
                    }}
                    .excerpt {{
                        font-weight: bold;
                        margin-bottom: 20px;
                        color: #444;
                    }}
                    img {{
                        max-width: 100%;
                        height: auto;
                        margin: 10px 0;
                    }}
                    a {{
                        color: #0066cc;
                        text-decoration: none;
                    }}
                    @media print {{
                        body {{ padding: 0; }}
                    }}
                </style>
            </head>
            <body>
                <h1>{title}</h1>
                {byline_html}
                {excerpt_html}
                {article_json['content']}
                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                    Source: <a href="{source_url}">{source_url}</a>
                </div>
            </body>
        </html>
    """


async def create_clean_article_pdf(url, pdf_path):
    """Load an article, strip page clutter, extract it and save it as a PDF.

    Steps: validate the URL, download Readability.js, load the page in
    headless Chromium, remove known clutter elements, extract the article
    (Readability first, then a plain DOM-selector fallback), re-render it with
    a minimal stylesheet and print that page to ``pdf_path``.

    Args:
        url: article URL; site-relative URLs are resolved by ``validate_url``.
        pdf_path: destination path of the PDF file.

    Returns:
        True if the PDF was written, False on any failure. Failures are
        printed, never raised.
    """
    try:
        url = validate_url(url)

        # Fetch Readability.js before starting a browser, so a failed download
        # does not cost a browser launch.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                # REQUEST_TIMEOUT is a millisecond count; requests expects seconds.
                timeout=REQUEST_TIMEOUT / 1000
            )
            response.raise_for_status()
            readability_script = response.text
        except Exception as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        async with async_playwright() as p:
            # Launch browser with increased timeout and headless mode
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # try/finally closes the browser on every exit path below.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()

                # Set default timeout for all actions
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                # Navigation with multiple fallbacks
                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Remove unwanted elements.
                # NOTE(review): '[class*="ad"]' matches any class name containing
                # the substring "ad" (e.g. "header", "loaded"), not only
                # advertising containers - confirm this breadth is intended.
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section'
                ]

                # One round-trip for all selectors; a selector that throws is
                # skipped inside the page so the remaining ones still run.
                try:
                    await page.evaluate("""selectors => {
                        for (const selector of selectors) {
                            try {
                                document.querySelectorAll(selector).forEach(el => el.remove());
                            } catch (e) {
                                // invalid selector for this document: skip it
                            }
                        }
                    }""", selectors_to_remove)
                except Exception as e:
                    # Cleanup is best effort; extraction can still succeed without it.
                    print(f"Element cleanup skipped: {str(e)}")

                # Extract content with multiple fallbacks
                article_json = None
                extraction_attempts = [
                    # Attempt 1: Standard Readability.js
                    f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) return null;
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""",

                    # Attempt 2: Fallback to article content extraction
                    """() => {
                        const article = document.querySelector('article, .article, .articleContent') ||
                                      document.querySelector('.content, .story, .main-content, .Normal');
                        if (!article) return null;

                        return {
                            title: document.title.replace(' - The Economic Times', ''),
                            content: article.innerHTML,
                            byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
                            excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
                        };
                    }"""
                ]

                for attempt, extraction_script in enumerate(extraction_attempts, 1):
                    try:
                        print(f"Attempting extraction method {attempt}...")
                        article_json = await page.evaluate(extraction_script)
                        if article_json and article_json.get('content'):
                            # A missing title must not discard a good extraction.
                            print(f"Extracted: {(article_json.get('title') or '')[:50]}...")
                            break
                    except Exception as e:
                        print(f"Extraction attempt {attempt} failed: {str(e)}")
                        continue

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # The markup does not change between attempts, so build it once.
                article_html = _build_article_html(article_json, url)

                # Generate PDF with multiple attempts
                max_pdf_attempts = 3
                for attempt in range(max_pdf_attempts):
                    try:
                        print(f"PDF generation attempt {attempt + 1}...")
                        await page.set_content(article_html)

                        await page.pdf(
                            path=pdf_path,
                            format='A4',
                            margin={
                                'top': '20mm',
                                'right': '20mm',
                                'bottom': '20mm',
                                'left': '20mm'
                            },
                            print_background=False,
                            scale=0.9
                        )
                        break
                    except Exception as e:
                        if attempt == max_pdf_attempts - 1:
                            print(f"PDF generation failed after {max_pdf_attempts} attempts: {str(e)}")
                            return False
                        print(f"PDF attempt {attempt + 1} failed, retrying...")
                        await asyncio.sleep(2)  # Small delay before retry

                return True
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process multiple URLs with retries
async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to a PDF, retrying each URL once.

    Creates a timestamped batch folder under ``OUTPUT_DIR``. For each URL it
    reads the page title to build the filename, calls
    ``create_clean_article_pdf`` and retries on failure. URLs that still fail
    are written to ``FAILED_CSV`` with the error text. Progress, a summary and
    download links are printed/displayed; nothing is returned.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(OUTPUT_DIR, f"batch_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    failed = []
    success_count = 0
    max_retries = 2

    for index, url in enumerate(URLS, 1):
        print(f"\n{'='*50}")
        print(f"Processing URL {index} of {len(URLS)}")
        print(url)

        fallback_title = f"Article_{index}"

        for retry in range(max_retries):
            is_last_attempt = retry == max_retries - 1
            try:
                # First get the title to use in filename
                async with async_playwright() as p:
                    browser = await p.chromium.launch()
                    try:
                        page = await browser.new_page()
                        try:
                            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                            title = await page.title()
                        except Exception:
                            # The title is cosmetic; fall back rather than fail the URL.
                            title = fallback_title
                    finally:
                        await browser.close()

                # An empty title (or one with no usable characters) would give
                # a name like "1_.pdf", so fall back in both cases.
                # Limit title length for filename.
                safe_title = sanitize_filename((title or fallback_title)[:50]) or fallback_title
                pdf_name = f"{index}_{safe_title}.pdf"
                pdf_path = os.path.join(output_dir, pdf_name)

                if await create_clean_article_pdf(url, pdf_path):
                    print(f"Successfully created: {pdf_name}")
                    success_count += 1
                    # Display download link for the first successful PDF
                    # NOTE(review): href is a local filesystem path; whether the
                    # notebook front end can serve it is not established here.
                    if success_count == 1:
                        display(HTML(f'<a href="{pdf_path}" download>Download First PDF: {pdf_name}</a>'))
                    break  # Success, no need to retry

                if is_last_attempt:
                    raise Exception("PDF creation failed after retries")
                print(f"Attempt {retry + 1} failed, retrying...")
                await asyncio.sleep(5)  # Wait before retry

            except Exception as e:
                if is_last_attempt:
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    print(f"Attempt {retry + 1} raised: {str(e)}; retrying...")

        # Small delay between requests
        await asyncio.sleep(3)

    # Save failed URLs to CSV
    if failed:
        # newline='' is what the csv module expects for its file objects;
        # explicit UTF-8 keeps non-ASCII error text writable on any platform.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        # NOTE(review): this href points at a directory, not a file - confirm
        # the link is usable in the target notebook environment.
        display(HTML(f'<a href="{output_dir}" download>Download All PDFs</a>'))

    # Print failed URLs for reference
    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 6: Run the process
# Uses a cell-level ``await``, so this line needs a notebook/IPython kernel
# that can await at the top level; it is not valid in a plain Python script.
await process_urls_with_retry()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing complete!
Successfully processed: 1/1
Failed: 0


In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
# BASE_URL resolves site-relative links; each run writes its PDFs into a
# timestamped folder under OUTPUT_DIR and records failures in FAILED_CSV.
BASE_URL = "https://economictimes.indiatimes.com"
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"

# Articles to convert in this run (currently a single URL).
URLS = ["https://economictimes.indiatimes.com/news/economy/policy/nclat-reduces-googles-936-crore-penalty-to-217-crore-over-competition-law-violation/articleshow/119671499.cms"]

# Timeouts are millisecond counts (seconds * 1000).
# NOTE(review): requests.get() reads its timeout in seconds, so convert before
# handing one of these values to it.
NAVIGATION_TIMEOUT = 180 * 1000  # 180 seconds
REQUEST_TIMEOUT = 45 * 1000  # 45 seconds
SELECTOR_TIMEOUT = 45 * 1000  # 45 seconds

# Step 4: Define functions
def validate_url(url):
    """Normalise ``url`` to an absolute http(s) URL.

    A path starting with ``/`` is joined onto ``BASE_URL``; an ``http://`` or
    ``https://`` URL is passed through as is.

    Raises:
        ValueError: for any other input.
    """
    if url.startswith('/'):
        absolute = urljoin(BASE_URL, url)
    elif url.startswith(('http://', 'https://')):
        absolute = url
    else:
        raise ValueError(f"Invalid URL format: {url}")
    return absolute

def sanitize_filename(title):
    """Build a filename-safe string from an article title.

    Only alphanumeric characters and space, dot, underscore and hyphen
    survive; trailing whitespace is stripped from the result.
    """
    allowed_extras = frozenset((' ', '.', '_', '-'))

    def _is_allowed(ch):
        return ch.isalnum() or ch in allowed_extras

    return "".join(filter(_is_allowed, title)).rstrip()

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until content appears.

    Navigation strategies are tried in order: a normal ``goto``, a reload of
    the current page, and a ``goto`` that only waits for the navigation to
    commit. After each navigation that does not raise, the wait strategies are
    tried in order: network idle, a known article container, then any
    heading or paragraph. The first wait that completes ends the search.

    A navigation that succeeds but yields no content signal now falls through
    to the next navigation strategy instead of giving up immediately, so the
    reload and "commit" fallbacks are actually reachable.

    Args:
        page: Playwright page (async API) to drive.
        url: absolute URL to load.

    Returns:
        True as soon as one wait strategy completes, False if every
        navigation/wait combination failed.
    """
    navigation_strategies = [
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit"),
    ]

    # The wait strategies do not depend on which navigation ran, so build them once.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT),
    ]

    for i, navigate in enumerate(navigation_strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await navigate()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_for_content in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_for_content()
            except Exception:
                # Best effort: a timed-out wait just means "try the next signal".
                # `Exception` (not a bare except) lets task cancellation and
                # KeyboardInterrupt propagate.
                continue
            print("Page loaded successfully")
            return True

        # Navigation worked but no content signal appeared; try the next strategy.

    return False

def _build_article_html(article_json, url):
    """Render the extracted article as a self-contained, print-friendly page.

    Args:
        article_json: dict with ``title`` and ``content`` and optionally
            ``byline`` / ``excerpt``, as returned by the in-page extraction
            scripts.
        url: source URL shown in the footer link.

    Returns:
        The HTML document as a string. ``title``, ``byline``, ``excerpt`` and
        ``url`` are expected to be plain text and are HTML-escaped;
        ``content`` is article markup and is embedded verbatim.
    """
    from html import escape  # function-scope import: the cell's import list stays untouched

    title = escape(article_json.get('title') or '')
    byline = article_json.get('byline')
    excerpt = article_json.get('excerpt')
    byline_html = f'<div class="byline">{escape(byline)}</div>' if byline else ''
    excerpt_html = f'<div class="excerpt">{escape(excerpt)}</div>' if excerpt else ''
    source_url = escape(url)

    return f"""
        <html>
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
                <style>
                    body {{
                        max-width: 800px;
                        margin: 0 auto;
                        padding: 20px;
                        font-family: Arial, sans-serif;
                        line-height: 1.6;
                        color: #333;
                    }}
                    h1 {{
                        font-size: 24px;
                        margin-bottom: 10px;
                        color: #222;
                    }}
                    .byline {{
                        color: #666;
                        margin-bottom: 20px;
                        font-style: italic;
                    }}
                    .excerpt {{
                        font-weight: bold;
                        margin-bottom: 20px;
                        color: #444;
                    }}
                    img {{
                        max-width: 100%;
                        height: auto;
                        margin: 10px 0;
                    }}
                    a {{
                        color: #0066cc;
                        text-decoration: none;
                    }}
                    @media print {{
                        body {{ padding: 0; }}
                    }}
                </style>
            </head>
            <body>
                <h1>{title}</h1>
                {byline_html}
                {excerpt_html}
                {article_json['content']}
                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                    Source: <a href="{source_url}">{source_url}</a>
                </div>
            </body>
        </html>
    """


async def create_clean_article_pdf(url, pdf_path):
    """Load an article, strip clutter and promo text, and save it as a PDF.

    Steps: validate the URL, download Readability.js, load the page in
    headless Chromium, remove known clutter elements and promotional text,
    extract the article (Readability first, then a plain DOM-selector
    fallback), clean leftover promo text from the extracted markup, re-render
    it with a minimal stylesheet and print that page to ``pdf_path``.

    Args:
        url: article URL; site-relative URLs are resolved by ``validate_url``.
        pdf_path: destination path of the PDF file.

    Returns:
        True if the PDF was written, False on any failure. Failures are
        printed, never raised.
    """
    try:
        url = validate_url(url)

        # Fetch Readability.js before starting a browser, so a failed download
        # does not cost a browser launch.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                # REQUEST_TIMEOUT is a millisecond count; requests expects seconds.
                timeout=REQUEST_TIMEOUT / 1000
            )
            response.raise_for_status()
            readability_script = response.text
        except Exception as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # try/finally closes the browser on every exit path below.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Remove unwanted elements with specific focus on "Discover the stories" sections.
                # The former 'div:has-text(...)' entries were dropped: :has-text()
                # is not a selector document.querySelectorAll accepts, so they
                # always threw and never removed anything. Promo text is handled
                # by the text sweep further down.
                # NOTE(review): '[class*="ad"]' matches any class name containing
                # the substring "ad" (e.g. "header", "loaded"), not only
                # advertising containers - confirm this breadth is intended.
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section',
                    # Specific selectors for "Discover the stories" and similar
                    'div[data-ga*="Discover"]', 'div[data-ga*="discover"]',
                    'div[class*="discover"]', 'div[class*="Discover"]',
                    'div[data-testid*="discover"]', 'div[id*="discover"]'
                ]

                # One round-trip for all selectors; a selector that throws is
                # skipped inside the page so the remaining ones still run.
                try:
                    await page.evaluate("""selectors => {
                        for (const selector of selectors) {
                            try {
                                document.querySelectorAll(selector).forEach(el => el.remove());
                            } catch (e) {
                                // invalid selector for this document: skip it
                            }
                        }
                    }""", selectors_to_remove)
                except Exception as e:
                    # Cleanup is best effort; extraction can still succeed without it.
                    print(f"Element cleanup skipped: {str(e)}")

                # Additional text content removal
                await page.evaluate("""() => {
                    // Remove elements containing specific text patterns
                    const unwantedTextPatterns = [
                        'Discover the stories of your interest',
                        'discover the stories of your interest',
                        'Stay on top of technology and startup news',
                        'ETPrime stories of the day'
                    ];

                    function containsUnwantedText(node) {
                        if (node.nodeType === Node.TEXT_NODE) {
                            return unwantedTextPatterns.some(pattern =>
                                node.textContent.includes(pattern));
                        }
                        return false;
                    }

                    function walkDOM(node) {
                        if (node.nodeType === Node.ELEMENT_NODE) {
                            // Check if element itself contains unwanted text
                            if (unwantedTextPatterns.some(pattern =>
                                node.textContent.includes(pattern))) {
                                // Check if this is a small element we can remove
                                if (node.textContent.length < 500) {
                                    node.remove();
                                    return;
                                }
                            }

                            // Check child nodes
                            const children = Array.from(node.childNodes);
                            for (const child of children) {
                                if (containsUnwantedText(child)) {
                                    // Remove the entire paragraph if it contains the unwanted text
                                    if (node.tagName === 'P' || node.classList.contains('Normal')) {
                                        node.remove();
                                        break;
                                    } else {
                                        child.remove();
                                    }
                                } else {
                                    walkDOM(child);
                                }
                            }
                        }
                    }

                    walkDOM(document.body);
                }""")

                article_json = None
                extraction_attempts = [
                    # Attempt 1: Readability.js
                    f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) return null;
                            return {{
                                title: reader.title.replace(' - The Economic Times', ''),
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""",

                    # Attempt 2: first matching article container
                    """() => {
                        const article = document.querySelector('article, .article, .articleContent') ||
                                      document.querySelector('.content, .story, .main-content, .Normal');
                        if (!article) return null;

                        return {
                            title: document.title.replace(' - The Economic Times', ''),
                            content: article.innerHTML,
                            byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
                            excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
                        };
                    }"""
                ]

                for attempt, extraction_script in enumerate(extraction_attempts, 1):
                    try:
                        print(f"Attempting extraction method {attempt}...")
                        article_json = await page.evaluate(extraction_script)
                        if article_json and article_json.get('content'):
                            # A missing title must not discard a good extraction.
                            print(f"Extracted: {(article_json.get('title') or '')[:50]}...")
                            break
                    except Exception as e:
                        print(f"Extraction attempt {attempt} failed: {str(e)}")
                        continue

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # Clean the content of any remaining promotional text.
                # Removing every element whose textContent contains a phrase
                # would also remove all of its ancestors, including the
                # outermost wrapper, i.e. the whole article. Instead, walk
                # top-down: drop small matching elements whole, descend into
                # large ones, and drop matching bare text nodes.
                clean_content = await page.evaluate("""(content) => {
                    const temp = document.createElement('div');
                    temp.innerHTML = content;

                    const unwantedTexts = [
                        'Discover the stories of your interest',
                        'Stay on top of technology and startup news',
                        'ETPrime stories of the day',
                        'BLACK FRIDAY IS LIVE'
                    ];
                    // Same size cap as the pre-extraction sweep above.
                    const MAX_PROMO_LENGTH = 500;
                    const isUnwanted = text =>
                        unwantedTexts.some(pattern => text.includes(pattern));

                    function prune(parent) {
                        for (const node of Array.from(parent.childNodes)) {
                            if (node.nodeType === Node.TEXT_NODE) {
                                if (isUnwanted(node.textContent)) node.remove();
                            } else if (node.nodeType === Node.ELEMENT_NODE &&
                                       isUnwanted(node.textContent)) {
                                if (node.textContent.length < MAX_PROMO_LENGTH) {
                                    node.remove();
                                } else {
                                    prune(node);
                                }
                            }
                        }
                    }

                    prune(temp);
                    return temp.innerHTML;
                }""", article_json['content'])

                # Keep the uncleaned markup rather than emit an empty article.
                if clean_content and clean_content.strip():
                    article_json['content'] = clean_content

                # The markup does not change between attempts, so build it once.
                article_html = _build_article_html(article_json, url)

                max_pdf_attempts = 3
                for attempt in range(max_pdf_attempts):
                    try:
                        print(f"PDF generation attempt {attempt + 1}...")
                        await page.set_content(article_html)

                        await page.pdf(
                            path=pdf_path,
                            format='A4',
                            margin={
                                'top': '20mm',
                                'right': '20mm',
                                'bottom': '20mm',
                                'left': '20mm'
                            },
                            print_background=False,
                            scale=0.9
                        )
                        break
                    except Exception as e:
                        if attempt == max_pdf_attempts - 1:
                            print(f"PDF generation failed after {max_pdf_attempts} attempts: {str(e)}")
                            return False
                        print(f"PDF attempt {attempt + 1} failed, retrying...")
                        await asyncio.sleep(2)

                return True
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process multiple URLs with retries
async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to a PDF inside a timestamped batch folder.

    For each URL the page title is looked up first (best effort) to build the
    file name ``<index>_<title>.pdf``; ``create_clean_article_pdf`` is then
    attempted up to ``max_retries`` times. URLs that still fail are collected
    together with their error text and written to ``FAILED_CSV``. Progress, a
    summary and download links are printed/displayed; nothing is returned.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(OUTPUT_DIR, f"batch_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    failed = []
    success_count = 0

    for index, url in enumerate(URLS, 1):
        print(f"\n{'='*50}")
        print(f"Processing URL {index} of {len(URLS)}")
        print(url)

        max_retries = 2
        fallback_title = f"Article_{index}"

        for retry in range(max_retries):
            try:
                # Best-effort title lookup, used only to name the output file.
                title = fallback_title
                async with async_playwright() as p:
                    browser = await p.chromium.launch()
                    try:
                        page = await browser.new_page()
                        await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                        title = await page.title()
                    except Exception as title_error:
                        # `Exception` (not a bare except) so that cancellation and
                        # Ctrl-C still interrupt the notebook.
                        print(f"Title lookup failed, using '{fallback_title}': {title_error}")
                        title = fallback_title
                    finally:
                        await browser.close()

                # An empty or punctuation-only title would otherwise yield "N_.pdf".
                safe_title = sanitize_filename(title[:50]) or fallback_title
                pdf_name = f"{index}_{safe_title}.pdf"
                pdf_path = os.path.join(output_dir, pdf_name)

                if await create_clean_article_pdf(url, pdf_path):
                    print(f"Successfully created: {pdf_name}")
                    success_count += 1
                    if success_count == 1:
                        display(HTML(f'<a href="{pdf_path}" download>Download First PDF: {pdf_name}</a>'))
                    break

                if retry == max_retries - 1:
                    raise Exception("PDF creation failed after retries")
                print(f"Attempt {retry + 1} failed, retrying...")
                await asyncio.sleep(5)

            except Exception as e:
                if retry == max_retries - 1:
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    print(f"Attempt {retry + 1} raised an error, retrying: {str(e)}")

        # Short pause between articles to avoid hammering the site.
        await asyncio.sleep(3)

    if failed:
        # newline='' is what the csv module requires so that embedded newlines in
        # error messages are quoted correctly and no blank rows are inserted.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        display(HTML(f'<a href="{output_dir}" download>Download All PDFs</a>'))

    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 6: Run the process
# Top-level `await` is supported in IPython/Colab cells, so no asyncio.run() wrapper is needed.
await process_urls_with_retry()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing complete!
Successfully processed: 1/1
Failed: 0


Variant with the publication date shown in the PDF


In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
# (it patches asyncio so an event loop can be re-entered while the notebook's own loop is running)
nest_asyncio.apply()

# Step 3: Configuration
# Root folder for generated PDFs; process_urls_with_retry() creates a batch_<timestamp> subfolder in it.
OUTPUT_DIR = "/content/articles"
# CSV (columns: url, error) written only when at least one URL could not be processed.
FAILED_CSV = "/content/failed_urls.csv"
# Site root that validate_url() uses to resolve site-relative paths.
BASE_URL = "https://economictimes.indiatimes.com"

# List of URLs to process
URLS = [
    "https://economictimes.indiatimes.com/news/economy/policy/nclat-reduces-googles-936-crore-penalty-to-217-crore-over-competition-law-violation/articleshow/119671499.cms"
]

# Timeout settings
# All values are in milliseconds, the unit Playwright expects. Note that
# requests.get() interprets its `timeout` in seconds, so REQUEST_TIMEOUT must be
# converted before being handed to requests.
NAVIGATION_TIMEOUT = 180000  # 180 seconds (browser launch, goto/reload, page default timeout)
REQUEST_TIMEOUT = 45000  # 45 seconds (download of Readability.js)
SELECTOR_TIMEOUT = 45000  # 45 seconds (wait_for_load_state / wait_for_selector)

# Step 4: Define functions
def validate_url(url):
    """Return an absolute URL for ``url``.

    Site-relative paths (leading ``/``) are resolved against ``BASE_URL``;
    http(s) URLs are returned untouched. Anything else raises ``ValueError``.
    """
    is_site_relative = url.startswith('/')
    if is_site_relative:
        return urljoin(BASE_URL, url)

    accepted_schemes = ('http://', 'https://')
    if url.startswith(accepted_schemes):
        return url

    raise ValueError(f"Invalid URL format: {url}")

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until article content is present.

    Navigation strategies are tried in order: a normal load, a reload, and a
    load that only waits for the navigation to commit. After every navigation
    that succeeds, a series of readiness checks is tried; the first one that
    passes ends the search. If no readiness check passes, the next navigation
    strategy is attempted (previously the function gave up at that point, so
    the reload/commit strategies were never used for a page that loaded but
    stayed empty).

    Args:
        page: Playwright page to drive.
        url: Absolute URL to open.

    Returns:
        True as soon as one readiness check succeeds, otherwise False.
    """
    strategies = [
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit")
    ]

    # Readiness checks, from strictest to most lenient.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleWrap, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT)
    ]

    for i, strategy in enumerate(strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await strategy()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_strategy in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_strategy()
            except Exception as wait_error:
                # `Exception` rather than a bare except, so cancellation and
                # Ctrl-C are not swallowed while waiting on a slow page.
                print(f"Wait strategy {j} failed: {str(wait_error)}")
                continue
            print("Page loaded successfully")
            return True

        print(f"No content detected after strategy {i}")

    return False

async def create_clean_article_pdf(url, pdf_path):
    """Render a cleaned-up, print-friendly PDF of the article at ``url``.

    The page is opened in headless Chromium, page chrome (ads, share bars,
    recommendation widgets, ...) is removed, the article is extracted and
    placed in a minimal HTML template, and that template is printed to
    ``pdf_path`` as an A4 PDF.

    Extraction order: (1) a known article container, (2) Mozilla Readability
    (only if Readability.js could be downloaded), (3) the whole ``<body>``.

    Args:
        url: Absolute http(s) URL, or a site-relative path (resolved by
            ``validate_url``).
        pdf_path: Destination file for the generated PDF.

    Returns:
        True when the PDF was written, False on any failure (the reason is
        printed rather than raised).
    """
    import html  # local import: only used to escape text placed in the template

    try:
        url = validate_url(url)
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # try/finally closes the browser on every exit path, including errors.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                # Readability.js is only the second-choice extractor, so a failed
                # download degrades gracefully instead of aborting the article.
                readability_script = ""
                try:
                    response = requests.get(
                        "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                        # REQUEST_TIMEOUT is in milliseconds (Playwright's unit);
                        # requests expects seconds.
                        timeout=REQUEST_TIMEOUT / 1000
                    )
                    response.raise_for_status()
                    readability_script = response.text
                except Exception as e:
                    print(f"Readability.js load failed: {str(e)}")

                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Remove unwanted elements. Only standard CSS selectors are listed:
                # Playwright-only pseudo-classes such as :has-text() are rejected by
                # document.querySelectorAll and therefore never had any effect here.
                # NOTE(review): '[class*="ad"]' also matches classes like "header"
                # or "read-more" - confirm that this breadth is intended.
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section',
                    'div[data-ga*="Discover"]', 'div[data-ga*="discover"]',
                    'div[class*="discover"]', 'div[class*="Discover"]',
                    'div[data-testid*="discover"]', 'div[id*="discover"]'
                ]

                try:
                    await page.evaluate("""selectors => {
                        for (const selector of selectors) {
                            try {
                                document.querySelectorAll(selector).forEach(el => el.remove());
                            } catch (e) {
                                console.warn('Skipping selector', selector, e);
                            }
                        }
                    }""", selectors_to_remove)
                except Exception as e:
                    # Clean-up is best effort; extraction can still succeed without it.
                    print(f"Element clean-up failed: {str(e)}")

                # Method 1: take the article container directly.
                article_data = await page.evaluate("""() => {
                    const articleElement = document.querySelector('article, .article, .articleWrap, .articleContent, .content, .story, .main-content, .Normal');
                    if (!articleElement) return null;
                    return {
                        title: document.title.replace(' - The Economic Times', ''),
                        content: articleElement.innerHTML,
                        date: document.querySelector('.publish_on, .publish-date, .date')?.textContent.trim() || 'Date not available',
                        byline: document.querySelector('.byline, .author')?.textContent.trim() || ''
                    };
                }""")

                # Method 2: Readability. The library source is inlined into the
                # evaluated function so that `Readability` is actually defined.
                if not (article_data and article_data.get('content')) and readability_script:
                    try:
                        article_data = await page.evaluate("() => {\n" + readability_script + """
                            try {
                                const article = new Readability(document.cloneNode(true)).parse();
                                if (!article) return null;
                                return {
                                    title: (article.title || document.title).replace(' - The Economic Times', ''),
                                    content: article.content,
                                    date: article.publishedTime || 'Date not available',
                                    byline: article.byline || ''
                                };
                            } catch (e) {
                                return null;
                            }
                        }""")
                    except Exception as e:
                        print(f"Readability extraction failed: {str(e)}")
                        article_data = None

                # Method 3: fall back to the whole body.
                if not (article_data and article_data.get('content')):
                    article_data = await page.evaluate("""() => ({
                        title: document.title.replace(' - The Economic Times', ''),
                        content: document.body.innerHTML,
                        date: 'Date not available',
                        byline: ''
                    })""")

                if not article_data or not article_data.get('content'):
                    print("Content extraction failed")
                    return False

                # Plain-text fields are escaped before being placed in the template;
                # `content` is already HTML and is inserted as-is.
                title_html = html.escape(article_data.get('title') or '')
                date_html = html.escape(article_data.get('date') or 'Date not available')
                byline_html = html.escape(article_data.get('byline') or '')
                source_href = html.escape(url, quote=True)
                author_block = f'<div><strong>Author:</strong> {byline_html}</div>' if byline_html else ''

                # Generate PDF
                await page.set_content(f"""
                    <html>
                        <head>
                            <meta charset="UTF-8">
                            <title>{title_html}</title>
                            <style>
                                body {{
                                    max-width: 800px;
                                    margin: 0 auto;
                                    padding: 20px;
                                    font-family: Arial, sans-serif;
                                    line-height: 1.6;
                                    color: #333;
                                }}
                                h1 {{
                                    font-size: 24px;
                                    margin-bottom: 5px;
                                    color: #222;
                                }}
                                .article-meta {{
                                    color: #666;
                                    margin-bottom: 20px;
                                    font-size: 14px;
                                    border-bottom: 1px solid #eee;
                                    padding-bottom: 15px;
                                }}
                                .article-meta div {{
                                    margin-bottom: 5px;
                                }}
                                .article-content {{
                                    margin-top: 20px;
                                }}
                                img {{
                                    max-width: 100%;
                                    height: auto;
                                    margin: 10px 0;
                                }}
                                @media print {{
                                    body {{
                                        padding: 0;
                                        font-size: 12pt;
                                    }}
                                    h1 {{
                                        font-size: 18pt;
                                    }}
                                }}
                            </style>
                        </head>
                        <body>
                            <h1>{title_html}</h1>
                            <div class="article-meta">
                                <div><strong>Published:</strong> {date_html}</div>
                                {author_block}
                                <div><strong>Source:</strong> <a href="{source_href}">Economic Times</a></div>
                            </div>
                            <div class="article-content">
                                {article_data['content']}
                            </div>
                        </body>
                    </html>
                """)

                await page.pdf(
                    path=pdf_path,
                    format='A4',
                    margin={'top': '20mm', 'right': '20mm', 'bottom': '20mm', 'left': '20mm'},
                    print_background=False,
                    scale=0.9
                )
                return True
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

# Step 5: Process URLs with retries
async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to ``article_<index>.pdf`` in a new batch folder.

    Each URL is attempted up to ``max_retries`` times via
    ``create_clean_article_pdf``. URLs that never succeed are recorded with the
    last error text and written to ``FAILED_CSV``. Progress, a summary and
    download links are printed/displayed; nothing is returned.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(OUTPUT_DIR, f"batch_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    failed = []
    success_count = 0

    for index, url in enumerate(URLS, 1):
        print(f"\n{'='*50}")
        print(f"Processing URL {index} of {len(URLS)}")
        print(url)

        max_retries = 2
        pdf_name = f"article_{index}.pdf"
        pdf_path = os.path.join(output_dir, pdf_name)
        last_error = "PDF creation failed after retries"

        for retry in range(max_retries):
            try:
                created = await create_clean_article_pdf(url, pdf_path)
            except Exception as e:
                created = False
                last_error = str(e)

            if created:
                print(f"Successfully created: {pdf_name}")
                success_count += 1
                if success_count == 1:
                    display(HTML(f'<a href="{pdf_path}" download>Download First PDF: {pdf_name}</a>'))
                break

            if retry < max_retries - 1:
                print(f"Attempt {retry + 1} failed, retrying...")
                await asyncio.sleep(5)
        else:
            # Reached only when no attempt hit `break`, i.e. every attempt failed.
            print(f"Failed to process {url}: {last_error}")
            failed.append({'url': url, 'error': last_error})

        # Short pause between articles to avoid hammering the site.
        await asyncio.sleep(3)

    if failed:
        # newline='' is what the csv module requires so that embedded newlines in
        # error messages are quoted correctly and no blank rows are inserted.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        display(HTML(f'<a href="{output_dir}" download>Download All PDFs</a>'))

    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 6: Run the process
# Top-level `await` is supported in IPython/Colab cells, so no asyncio.run() wrapper is needed.
await process_urls_with_retry()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing complete!
Successfully processed: 1/1
Failed: 0


In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
# (it patches asyncio so an event loop can be re-entered while the notebook's own loop is running)
nest_asyncio.apply()

# Step 3: Configuration
# Root folder for generated PDFs; process_urls_with_retry() creates a batch_<timestamp> subfolder in it.
OUTPUT_DIR = "/content/articles"
# CSV (columns: url, error) written only when at least one URL could not be processed.
FAILED_CSV = "/content/failed_urls.csv"
# Site root that validate_url() uses to resolve site-relative paths.
BASE_URL = "https://economictimes.indiatimes.com"

# List of URLs to process
URLS = [
    "https://economictimes.indiatimes.com/news/india/next-upi-infosys-co-founder-nandan-nilekani-says-this-new-tech-on-roofs-will-create-millions-of-entrepreneurs/articleshow/119642817.cms"
]

# Timeout settings
# All values are in milliseconds, the unit Playwright expects. Note that
# requests.get() interprets its `timeout` in seconds, so REQUEST_TIMEOUT must be
# converted before being handed to requests.
NAVIGATION_TIMEOUT = 180000  # 180 seconds (browser launch, goto/reload, page default timeout)
REQUEST_TIMEOUT = 45000  # 45 seconds (download of Readability.js)
SELECTOR_TIMEOUT = 45000  # 45 seconds (wait_for_load_state / wait_for_selector)

# Step 4: Define functions
def validate_url(url):
    """Normalise ``url`` to an absolute address.

    A path starting with ``/`` is joined onto ``BASE_URL``; an http(s) URL is
    passed through unchanged; every other input raises ``ValueError``.
    """
    if url.startswith('/'):
        absolute = urljoin(BASE_URL, url)
    elif url.startswith('http://') or url.startswith('https://'):
        absolute = url
    else:
        raise ValueError(f"Invalid URL format: {url}")
    return absolute

def sanitize_filename(title):
    """Reduce ``title`` to characters that are safe in a file name.

    Alphanumeric characters plus space, dot, underscore and hyphen are kept;
    everything else is dropped and trailing whitespace is trimmed.
    """
    allowed_punctuation = {' ', '.', '_', '-'}
    kept = [ch for ch in title if ch in allowed_punctuation or ch.isalnum()]
    return "".join(kept).rstrip()

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until article content is present.

    Navigation strategies are tried in order: a normal load, a reload, and a
    load that only waits for the navigation to commit. After every navigation
    that succeeds, a series of readiness checks is tried; the first one that
    passes ends the search. If no readiness check passes, the next navigation
    strategy is attempted (previously the function gave up at that point, so
    the reload/commit strategies were never used for a page that loaded but
    stayed empty).

    Args:
        page: Playwright page to drive.
        url: Absolute URL to open.

    Returns:
        True as soon as one readiness check succeeds, otherwise False.
    """
    strategies = [
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit")
    ]

    # Readiness checks, from strictest to most lenient.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT)
    ]

    for i, strategy in enumerate(strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await strategy()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_strategy in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_strategy()
            except Exception as wait_error:
                # `Exception` rather than a bare except, so cancellation and
                # Ctrl-C are not swallowed while waiting on a slow page.
                print(f"Wait strategy {j} failed: {str(wait_error)}")
                continue
            print("Page loaded successfully")
            return True

        print(f"No content detected after strategy {i}")

    return False

async def extract_article_metadata(page):
    """Collect title, article id and publication date from a loaded article page.

    The date is looked up in this order:
      1. ``data-dt`` (epoch milliseconds) on the ``time.jsdtTime`` element,
      2. a ``Mon DD, YYYY`` date in that element's text
         (e.g. ``Last Updated: Mar 29, 2025, 10:15:00 AM IST``),
      3. the ``article:published_time`` meta tag,
      4. today's date as a last resort.

    Args:
        page: Playwright page that has already navigated to the article.

    Returns:
        dict with ``title``, ``csm_number`` (the numeric id taken from the last
        URL segment) and ``published_date`` (``YYYYMMDD``), or ``None`` if even
        the title/URL could not be read.
    """
    import re  # local import: the notebook's import cell does not pull in `re`

    try:
        # Extract title
        title = await page.title()
        title = title.replace(' - The Economic Times', '').strip()

        # Extract the article id from the URL, ignoring any query string,
        # fragment or trailing slash.
        url = page.url
        path = url.split('#')[0].split('?')[0].rstrip('/')
        csm_number = path.split('/')[-1].split('.')[0]

        def build(published):
            return {
                'title': title,
                'csm_number': csm_number,
                'published_date': published.strftime('%Y%m%d')
            }

        # Fast extraction from time.jsdtTime element
        try:
            time_element = await page.query_selector('time.jsdtTime')
            if time_element:
                # First try to get epoch timestamp from data-dt.
                # NOTE(review): fromtimestamp() uses the host's local timezone, so
                # the calendar day can differ from the site's - confirm if that matters.
                epoch_ms = await time_element.get_attribute('data-dt')
                if epoch_ms:
                    return build(datetime.fromtimestamp(int(epoch_ms) / 1000))

                # Fallback to text content parsing. Splitting on the first comma
                # would drop the year ("Mar 29, 2025" -> "Mar 29"), so the month,
                # day and year are captured explicitly instead.
                date_text = await time_element.text_content()
                match = re.search(r'\b([A-Za-z]{3})[A-Za-z]*\.?\s+(\d{1,2}),?\s+(\d{4})', date_text or '')
                if match:
                    return build(datetime.strptime(' '.join(match.groups()), '%b %d %Y'))
        except Exception as e:
            print(f"Time element extraction failed: {str(e)}")

        # Fallback to meta tags. query_selector returns immediately when the tag
        # is absent, whereas page.get_attribute(selector, ...) would wait for it
        # until the page's default timeout expires.
        try:
            meta_element = await page.query_selector('meta[property="article:published_time"]')
            date_str = await meta_element.get_attribute('content') if meta_element else None
            if date_str:
                return build(datetime.strptime(date_str.split('T')[0], '%Y-%m-%d'))
        except Exception as e:
            print(f"Meta tag date extraction failed: {str(e)}")

        # Final fallback to current date
        return build(datetime.now())

    except Exception as e:
        print(f"Metadata extraction error: {str(e)}")
        return None

async def create_clean_article_pdf(url, pdf_path):
    """Render a cleaned-up PDF of the article at ``url`` and return its metadata.

    The page is opened in headless Chromium, metadata is read via
    ``extract_article_metadata``, page chrome (ads, share bars, promo boxes,
    ...) is removed, the article is extracted (Readability first, then a plain
    DOM lookup) and printed to ``pdf_path`` through a minimal HTML template.

    Args:
        url: Absolute http(s) URL, or a site-relative path (resolved by
            ``validate_url``).
        pdf_path: Destination file for the generated PDF.

    Returns:
        The metadata dict from ``extract_article_metadata`` on success, or
        False on any failure (the reason is printed rather than raised).
    """
    import html  # local import: only used to escape text placed in the template

    try:
        url = validate_url(url)
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # try/finally closes the browser on every exit path, including errors.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                # Load Readability.js. A failed download only disables the
                # Readability attempt; the DOM-based extractor is still tried.
                readability_script = ""
                try:
                    response = requests.get(
                        "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                        # REQUEST_TIMEOUT is in milliseconds (Playwright's unit);
                        # requests expects seconds.
                        timeout=REQUEST_TIMEOUT / 1000
                    )
                    response.raise_for_status()
                    readability_script = response.text
                except Exception as e:
                    print(f"Readability.js load failed: {str(e)}")

                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Extract metadata first, before the clean-up below removes
                # elements it may need.
                metadata = await extract_article_metadata(page)
                if not metadata:
                    print("Failed to extract article metadata")
                    return False

                # Remove unwanted elements. Only standard CSS selectors are listed:
                # Playwright-only pseudo-classes such as :has-text() are rejected by
                # document.querySelectorAll and never had any effect here; the
                # text-based clean-up below covers those phrases.
                # NOTE(review): '[class*="ad"]' also matches classes like "header"
                # or "read-more" - confirm that this breadth is intended.
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section',
                    'div[data-ga*="Discover"]', 'div[data-ga*="discover"]',
                    'div[class*="discover"]', 'div[class*="Discover"]',
                    'div[data-testid*="discover"]', 'div[id*="discover"]'
                ]

                try:
                    await page.evaluate("""selectors => {
                        for (const selector of selectors) {
                            try {
                                document.querySelectorAll(selector).forEach(el => el.remove());
                            } catch (e) {
                                console.warn('Skipping selector', selector, e);
                            }
                        }
                    }""", selectors_to_remove)
                except Exception as e:
                    # Clean-up is best effort; extraction can still succeed without it.
                    print(f"Element clean-up failed: {str(e)}")

                # Additional text content removal: drop small elements (under 500
                # characters of text) that contain one of the promo phrases.
                await page.evaluate("""() => {
                    const unwantedTextPatterns = [
                        'Discover the stories of your interest',
                        'discover the stories of your interest',
                        'Stay on top of technology and startup news',
                        'ETPrime stories of the day'
                    ];

                    function walkDOM(node) {
                        if (node.nodeType === Node.ELEMENT_NODE) {
                            if (unwantedTextPatterns.some(pattern =>
                                node.textContent.includes(pattern))) {
                                if (node.textContent.length < 500) {
                                    node.remove();
                                    return;
                                }
                            }
                            Array.from(node.childNodes).forEach(walkDOM);
                        }
                    }
                    walkDOM(document.body);
                }""")

                # Extract article content: Readability first (when available),
                # then a direct lookup of common article containers.
                extraction_attempts = []
                if readability_script:
                    extraction_attempts.append(f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) return null;
                            return {{
                                title: reader.title,
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""")
                extraction_attempts.append("""() => {
                    const article = document.querySelector('article, .article, .articleContent') ||
                                  document.querySelector('.content, .story, .main-content, .Normal');
                    if (!article) return null;
                    return {
                        title: document.title,
                        content: article.innerHTML,
                        byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
                        excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
                    };
                }""")

                article_json = None
                for attempt, extraction_script in enumerate(extraction_attempts, 1):
                    try:
                        article_json = await page.evaluate(extraction_script)
                    except Exception as e:
                        print(f"Extraction method {attempt} failed: {str(e)}")
                        continue
                    if article_json and article_json.get('content'):
                        break

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # Plain-text fields are escaped before being placed in the template;
                # `content` is already HTML and is inserted as-is.
                title_html = html.escape(article_json.get('title') or '')
                byline_html = html.escape(article_json.get('byline') or '')
                excerpt_html = html.escape(article_json.get('excerpt') or '')
                source_html = html.escape(url, quote=True)
                byline_block = f'<div class="byline">{byline_html}</div>' if byline_html else ''
                excerpt_block = f'<div class="excerpt">{excerpt_html}</div>' if excerpt_html else ''

                # Generate PDF
                try:
                    await page.set_content(f"""
                        <html>
                            <head>
                                <meta charset="UTF-8">
                                <title>{title_html}</title>
                                <style>
                                    body {{
                                        max-width: 800px;
                                        margin: 0 auto;
                                        padding: 20px;
                                        font-family: Arial, sans-serif;
                                        line-height: 1.6;
                                        color: #333;
                                    }}
                                    h1 {{
                                        font-size: 24px;
                                        margin-bottom: 10px;
                                        color: #222;
                                    }}
                                    .byline {{
                                        color: #666;
                                        margin-bottom: 20px;
                                        font-style: italic;
                                    }}
                                    .excerpt {{
                                        font-weight: bold;
                                        margin-bottom: 20px;
                                        color: #444;
                                    }}
                                    img {{
                                        max-width: 100%;
                                        height: auto;
                                        margin: 10px 0;
                                    }}
                                    a {{
                                        color: #0066cc;
                                        text-decoration: none;
                                    }}
                                    @media print {{
                                        body {{ padding: 0; }}
                                    }}
                                </style>
                            </head>
                            <body>
                                <h1>{title_html}</h1>
                                {byline_block}
                                {excerpt_block}
                                {article_json['content']}
                                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                                    Source: <a href="{source_html}">{source_html}</a>
                                </div>
                            </body>
                        </html>
                    """)

                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '20mm',
                            'right': '20mm',
                            'bottom': '20mm',
                            'left': '20mm'
                        },
                        print_background=False,
                        scale=0.9
                    )
                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False

                return metadata
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to a PDF inside a fresh timestamped batch folder.

    Each URL gets up to two attempts, with a 3 second pause between them.
    A successful attempt writes ``temp.pdf`` in the batch folder and renames
    it to ``<published_date>_<csm_number>_<sanitized title>.pdf``. URLs whose
    final attempt fails are collected and written to ``FAILED_CSV``.
    A summary is printed at the end; nothing is returned.
    """
    max_attempts = 2  # total tries per URL, i.e. one retry after the first failure

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    batch_dir = os.path.join(OUTPUT_DIR, f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    os.makedirs(batch_dir, exist_ok=True)

    failed = []
    success_count = 0

    for url in URLS:
        print(f"\n{'='*50}")
        print(f"Processing URL: {url}")

        for attempt in range(1, max_attempts + 1):
            try:
                # The final name depends on metadata that is only known after
                # the page was processed, so render to a temp path first.
                temp_pdf = os.path.join(batch_dir, "temp.pdf")

                # Process article and get metadata
                result = await create_clean_article_pdf(url, temp_pdf)
                if not result:
                    raise Exception("PDF creation failed")

                # Generate final filename
                safe_title = sanitize_filename(result['title'][:50])  # Limit title length
                pdf_name = f"{result['published_date']}_{result['csm_number']}_{safe_title}.pdf"
                final_path = os.path.join(batch_dir, pdf_name)

                # Rename temp file to final filename
                os.rename(temp_pdf, final_path)

                print(f"Successfully created: {pdf_name}")
                success_count += 1
                if success_count == 1:
                    # NOTE(review): href is a local filesystem path; confirm the
                    # notebook frontend can actually serve it.
                    display(HTML(f'<a href="{final_path}" download>Download First PDF: {pdf_name}</a>'))
                break
            except Exception as e:
                if attempt == max_attempts:  # Last attempt failed
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    print(f"Attempt {attempt} failed, retrying...")
                    await asyncio.sleep(3)

    # Save failed URLs if any
    if failed:
        # newline='' is what the csv module requires of the file object (it
        # writes its own line endings); utf-8 keeps non-ASCII error text intact.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        display(HTML(f'<a href="{batch_dir}" download>Download All PDFs</a>'))

    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 6: Run the process
# Top-level `await` relies on the notebook kernel's running event loop;
# in a plain script this would be asyncio.run(process_urls_with_retry()).
await process_urls_with_retry()



ERROR:asyncio:Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed\nCall log:\n  - waiting for locator("meta[property=\\"article:published_time\\"]")\n')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Call log:
  - waiting for locator("meta[property=\"article:published_time\"]")



╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing complete!
Successfully processed: 1/1
Failed: 0


BEST OF BEST


In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
# Site root used to resolve site-relative article links.
BASE_URL = "https://economictimes.indiatimes.com"
# Output locations.
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"

# Articles to convert, one absolute URL per entry.
URLS = [
    "https://economictimes.indiatimes.com/news/india/next-upi-infosys-co-founder-nandan-nilekani-says-this-new-tech-on-roofs-will-create-millions-of-entrepreneurs/articleshow/119642817.cms",
]

# Timeouts, all expressed in MILLISECONDS.
# NOTE: requests.get() expects seconds - convert before passing REQUEST_TIMEOUT to it.
NAVIGATION_TIMEOUT = 180 * 1000  # 180 s
REQUEST_TIMEOUT = 45 * 1000      # 45 s
SELECTOR_TIMEOUT = 45 * 1000     # 45 s

# Step 4: Define functions
def validate_url(url):
    """Return an absolute URL for ``url`` or reject it.

    Site-relative paths (anything starting with ``/``) are resolved against
    ``BASE_URL``. Strings that already start with ``http://`` or ``https://``
    are returned unchanged.

    Raises:
        ValueError: if ``url`` is neither site-relative nor http(s).
    """
    if url.startswith('/'):
        absolute = urljoin(BASE_URL, url)
        return absolute

    has_http_scheme = url.startswith(('http://', 'https://'))
    if has_http_scheme:
        return url

    raise ValueError(f"Invalid URL format: {url}")

def sanitize_filename(title):
    """Reduce ``title`` to characters that are safe in a file name.

    Keeps alphanumeric characters plus space, dot, underscore and hyphen,
    drops everything else, and strips trailing whitespace from the result.
    """
    allowed_punctuation = {' ', '.', '_', '-'}
    kept = []
    for ch in title:
        if ch in allowed_punctuation or ch.isalnum():
            kept.append(ch)
    return "".join(kept).rstrip()

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until some content is present.

    Navigation attempts are tried in order (goto/domcontentloaded, reload,
    goto/commit); the next one is used only if the previous one raised.
    After the first navigation that does not raise, three wait conditions are
    tried in order and the first one that succeeds makes the function return
    True. If none of them succeeds the function returns False without trying
    further navigations.

    Only ordinary errors (``Exception``) are treated as a failed attempt, so
    task cancellation and a kernel interrupt still stop the coroutine.

    Returns:
        bool: True once a wait condition was met, otherwise False.
    """
    strategies = [
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit")
    ]

    # Built once: the wait conditions do not depend on which navigation ran.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT)
    ]

    for i, strategy in enumerate(strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await strategy()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_strategy in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_strategy()
                print("Page loaded successfully")
                return True
            except Exception as e:
                # Report why this condition was not met, then try the next one.
                print(f"Wait strategy {j} failed: {str(e)}")

        return False

    return False

async def extract_article_metadata(page):
    """Collect title, article id and publication date from a loaded article page.

    The date is looked up in three places and the first hit wins:
      1. the ``data-dt`` attribute of ``time.jsdtTime`` (epoch milliseconds),
      2. the visible text of that element ("Last Updated: Mar 28, 2025, ..."),
      3. ``<meta property="article:published_time">``.
    If none of them yields a date, today's date is used.

    Returns:
        dict with ``title``, ``csm_number`` and ``published_date``
        (``YYYYMMDD``), or None if reading the title/URL itself failed.
    """
    try:
        # Get title and CSM number first (fast operations)
        title = (await page.title()).replace(' - The Economic Times', '').strip()
        # Last URL path segment up to its first dot, e.g. "119642817" for ".../119642817.cms"
        csm_number = page.url.split('/')[-1].split('.')[0]

        def build(dt):
            # Single place that defines the shape of the returned metadata.
            return {
                'title': title,
                'csm_number': csm_number,
                'published_date': dt.strftime('%Y%m%d')
            }

        # 1. FIRST TRY: Directly access the time element's data-dt attribute (fastest)
        try:
            epoch_ms = await page.evaluate('''() => {
                const el = document.querySelector('time.jsdtTime');
                return el ? el.getAttribute('data-dt') : null;
            }''')
            if epoch_ms:
                # NOTE(review): fromtimestamp() converts using the kernel's local
                # time zone - confirm that matches the site's publication zone.
                return build(datetime.fromtimestamp(int(epoch_ms) / 1000))
        except Exception:
            pass  # best effort: fall through to the next source

        # 2. SECOND TRY: Direct text extraction from time element (fast)
        try:
            date_text = await page.evaluate('''() => {
                const el = document.querySelector('time.jsdtTime');
                return el ? el.textContent : null;
            }''')

            if date_text and "Last Updated:" in date_text:
                # "Last Updated: Mar 28, 2025, 10:15 AM IST" -> "Mar 28 2025".
                # The year sits after the first comma, so the first TWO
                # comma-separated pieces are needed to match '%b %d %Y'.
                pieces = [piece.strip() for piece in date_text.split("Last Updated:")[1].split(",")]
                date_part = " ".join(pieces[:2])
                return build(datetime.strptime(date_part, '%b %d %Y'))
        except Exception:
            pass  # best effort: fall through to the next source

        # 3. THIRD TRY: Check for common meta tags (still relatively fast)
        try:
            meta_date = await page.evaluate('''() => {
                const el = document.querySelector('meta[property="article:published_time"]');
                return el ? el.content : null;
            }''')
            if meta_date:
                # Only the date part before the "T" of the timestamp is used.
                return build(datetime.strptime(meta_date.split('T')[0], '%Y-%m-%d'))
        except Exception:
            pass  # best effort: fall through to the final fallback

        # Final fallback: no date found on the page, use the current date.
        return build(datetime.now())

    except Exception as e:
        print(f"Metadata error: {str(e)}")
        return None

async def create_clean_article_pdf(url, pdf_path):
    """Render a reader-mode PDF of the article at ``url`` into ``pdf_path``.

    Steps: validate the URL, download Readability.js, open the page in
    headless Chromium, read the metadata, strip promo/ad elements, extract the
    article body (Readability first, then a plain selector fallback), rebuild
    a minimal HTML page around it and print that page to an A4 PDF.

    Args:
        url: article URL; passed through validate_url().
        pdf_path: destination file path for the PDF.

    Returns:
        The metadata dict from extract_article_metadata() on success,
        False on any failure (the reason is printed).
    """
    import html  # local import: only used to escape text placed into the PDF page

    try:
        url = validate_url(url)

        # Fetch Readability.js before starting a browser so a failed download
        # costs nothing. REQUEST_TIMEOUT is a millisecond value, but requests
        # interprets `timeout` in seconds, hence the division.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                timeout=REQUEST_TIMEOUT / 1000
            )
            response.raise_for_status()
            readability_script = response.text
        except Exception as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # Everything below runs under try/finally so the browser is closed
            # on every exit path (early return, success, or exception).
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Extract metadata first, while the original DOM is still intact.
                metadata = await extract_article_metadata(page)
                if not metadata:
                    print("Failed to extract article metadata")
                    return False

                # Remove unwanted elements
                # NOTE(review): '[class*="ad"]' matches any class containing the
                # substring "ad" - confirm it does not remove wanted content.
                # The ':has-text(...)' entries are not standard CSS, so
                # querySelectorAll rejects them and they are skipped below.
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section',
                    'div[data-ga*="Discover"]', 'div[data-ga*="discover"]',
                    'div[class*="discover"]', 'div[class*="Discover"]',
                    'div[data-testid*="discover"]', 'div[id*="discover"]',
                    'div:has-text("Discover the stories")',
                    'div:has-text("discover the stories")',
                    'div:has-text("Stay on top")',
                    'div:has-text("stay on top")'
                ]

                for selector in selectors_to_remove:
                    try:
                        await page.evaluate("""selector => {
                            const elements = document.querySelectorAll(selector);
                            elements.forEach(el => el.remove());
                        }""", selector)
                    except Exception:
                        # Best effort: a selector that fails is simply skipped.
                        pass

                # Additional text content removal: drop short elements (< 500
                # characters of text) that contain one of the promo phrases.
                await page.evaluate("""() => {
                    const unwantedTextPatterns = [
                        'Discover the stories of your interest',
                        'discover the stories of your interest',
                        'Stay on top of technology and startup news',
                        'ETPrime stories of the day'
                    ];

                    function walkDOM(node) {
                        if (node.nodeType === Node.ELEMENT_NODE) {
                            if (unwantedTextPatterns.some(pattern =>
                                node.textContent.includes(pattern))) {
                                if (node.textContent.length < 500) {
                                    node.remove();
                                    return;
                                }
                            }
                            Array.from(node.childNodes).forEach(walkDOM);
                        }
                    }
                    walkDOM(document.body);
                }""")

                # Extract article content: Readability on a clone of the
                # document first, then a plain container-selector fallback.
                article_json = None
                extraction_attempts = [
                    f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) return null;
                            return {{
                                title: reader.title,
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""",
                    """() => {
                        const article = document.querySelector('article, .article, .articleContent') ||
                                      document.querySelector('.content, .story, .main-content, .Normal');
                        if (!article) return null;
                        return {
                            title: document.title,
                            content: article.innerHTML,
                            byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
                            excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
                        };
                    }"""
                ]

                for extraction_script in extraction_attempts:
                    try:
                        article_json = await page.evaluate(extraction_script)
                        if article_json and article_json.get('content'):
                            break
                    except Exception:
                        # This extractor failed; try the next one.
                        continue

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # Title, byline, excerpt and URL are plain text, so they are
                # HTML-escaped before being embedded; 'content' is already HTML
                # and is inserted as-is.
                title_html = html.escape(str(article_json.get('title') or ''))
                byline = article_json.get('byline')
                excerpt = article_json.get('excerpt')
                byline_html = f'<div class="byline">{html.escape(str(byline))}</div>' if byline else ''
                excerpt_html = f'<div class="excerpt">{html.escape(str(excerpt))}</div>' if excerpt else ''
                source_html = html.escape(url, quote=True)

                # Generate PDF
                try:
                    await page.set_content(f"""
                        <html>
                            <head>
                                <meta charset="UTF-8">
                                <title>{title_html}</title>
                                <style>
                                    body {{
                                        max-width: 800px;
                                        margin: 0 auto;
                                        padding: 20px;
                                        font-family: Arial, sans-serif;
                                        line-height: 1.6;
                                        color: #333;
                                    }}
                                    h1 {{
                                        font-size: 24px;
                                        margin-bottom: 10px;
                                        color: #222;
                                    }}
                                    .byline {{
                                        color: #666;
                                        margin-bottom: 20px;
                                        font-style: italic;
                                    }}
                                    .excerpt {{
                                        font-weight: bold;
                                        margin-bottom: 20px;
                                        color: #444;
                                    }}
                                    img {{
                                        max-width: 100%;
                                        height: auto;
                                        margin: 10px 0;
                                    }}
                                    a {{
                                        color: #0066cc;
                                        text-decoration: none;
                                    }}
                                    @media print {{
                                        body {{ padding: 0; }}
                                    }}
                                </style>
                            </head>
                            <body>
                                <h1>{title_html}</h1>
                                {byline_html}
                                {excerpt_html}
                                {article_json['content']}
                                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                                    Source: <a href="{source_html}">{source_html}</a>
                                </div>
                            </body>
                        </html>
                    """)

                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '20mm',
                            'right': '20mm',
                            'bottom': '20mm',
                            'left': '20mm'
                        },
                        print_background=False,
                        scale=0.9
                    )
                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False

                return metadata
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to a PDF inside a fresh timestamped batch folder.

    Each URL gets up to two attempts, with a 3 second pause between them.
    A successful attempt writes ``temp.pdf`` in the batch folder and renames
    it to ``<published_date>_<csm_number>_<sanitized title>.pdf``. URLs whose
    final attempt fails are collected and written to ``FAILED_CSV``.
    A summary is printed at the end; nothing is returned.
    """
    max_attempts = 2  # total tries per URL, i.e. one retry after the first failure

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    batch_dir = os.path.join(OUTPUT_DIR, f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    os.makedirs(batch_dir, exist_ok=True)

    failed = []
    success_count = 0

    for url in URLS:
        print(f"\n{'='*50}")
        print(f"Processing URL: {url}")

        for attempt in range(1, max_attempts + 1):
            try:
                # The final name depends on metadata that is only known after
                # the page was processed, so render to a temp path first.
                temp_pdf = os.path.join(batch_dir, "temp.pdf")

                # Process article and get metadata
                result = await create_clean_article_pdf(url, temp_pdf)
                if not result:
                    raise Exception("PDF creation failed")

                # Generate final filename
                safe_title = sanitize_filename(result['title'][:50])  # Limit title length
                pdf_name = f"{result['published_date']}_{result['csm_number']}_{safe_title}.pdf"
                final_path = os.path.join(batch_dir, pdf_name)

                # Rename temp file to final filename
                os.rename(temp_pdf, final_path)

                print(f"Successfully created: {pdf_name}")
                success_count += 1
                if success_count == 1:
                    # NOTE(review): href is a local filesystem path; confirm the
                    # notebook frontend can actually serve it.
                    display(HTML(f'<a href="{final_path}" download>Download First PDF: {pdf_name}</a>'))
                break
            except Exception as e:
                if attempt == max_attempts:  # Last attempt failed
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    print(f"Attempt {attempt} failed, retrying...")
                    await asyncio.sleep(3)

    # Save failed URLs if any
    if failed:
        # newline='' is what the csv module requires of the file object (it
        # writes its own line endings); utf-8 keeps non-ASCII error text intact.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        display(HTML(f'<a href="{batch_dir}" download>Download All PDFs</a>'))

    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 5: Run the process
# Top-level `await` relies on the notebook kernel's running event loop;
# in a plain script this would be asyncio.run(process_urls_with_retry()).
await process_urls_with_retry()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing complete!
Successfully processed: 1/1
Failed: 0


In [None]:
# Step 1: Install required packages
!pip install playwright requests nest-asyncio
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab
nest_asyncio.apply()

# Step 3: Configuration
# Site root used to resolve site-relative article links.
BASE_URL = "https://economictimes.indiatimes.com"
# Output locations.
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"

# Articles to convert, one absolute URL per entry (empty until filled in).
URLS = []

# Timeouts, all expressed in MILLISECONDS.
# NOTE: requests.get() expects seconds - convert before passing REQUEST_TIMEOUT to it.
NAVIGATION_TIMEOUT = 180 * 1000  # 180 s
REQUEST_TIMEOUT = 45 * 1000      # 45 s
SELECTOR_TIMEOUT = 45 * 1000     # 45 s

# Step 4: Define functions
def validate_url(url):
    """Return an absolute URL for ``url`` or reject it.

    Site-relative paths (anything starting with ``/``) are resolved against
    ``BASE_URL``. Strings that already start with ``http://`` or ``https://``
    are returned unchanged.

    Raises:
        ValueError: if ``url`` is neither site-relative nor http(s).
    """
    if url.startswith('/'):
        absolute = urljoin(BASE_URL, url)
        return absolute

    has_http_scheme = url.startswith(('http://', 'https://'))
    if has_http_scheme:
        return url

    raise ValueError(f"Invalid URL format: {url}")

def sanitize_filename(title):
    """Reduce ``title`` to characters that are safe in a file name.

    Keeps alphanumeric characters plus space, dot, underscore and hyphen,
    drops everything else, and strips trailing whitespace from the result.
    """
    allowed_punctuation = {' ', '.', '_', '-'}
    kept = []
    for ch in title:
        if ch in allowed_punctuation or ch.isalnum():
            kept.append(ch)
    return "".join(kept).rstrip()

async def try_loading_page(page, url):
    """Navigate ``page`` to ``url`` and wait until some content is present.

    Navigation attempts are tried in order (goto/domcontentloaded, reload,
    goto/commit); the next one is used only if the previous one raised.
    After the first navigation that does not raise, three wait conditions are
    tried in order and the first one that succeeds makes the function return
    True. If none of them succeeds the function returns False without trying
    further navigations.

    Only ordinary errors (``Exception``) are treated as a failed attempt, so
    task cancellation and a kernel interrupt still stop the coroutine.

    Returns:
        bool: True once a wait condition was met, otherwise False.
    """
    strategies = [
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit")
    ]

    # Built once: the wait conditions do not depend on which navigation ran.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT)
    ]

    for i, strategy in enumerate(strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await strategy()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_strategy in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_strategy()
                print("Page loaded successfully")
                return True
            except Exception as e:
                # Report why this condition was not met, then try the next one.
                print(f"Wait strategy {j} failed: {str(e)}")

        return False

    return False

async def extract_article_metadata(page):
    """Collect title, article id and publication date from a loaded article page.

    The date is looked up in three places and the first hit wins:
      1. the ``data-dt`` attribute of ``time.jsdtTime`` (epoch milliseconds),
      2. the visible text of that element ("Last Updated: Mar 28, 2025, ..."),
      3. ``<meta property="article:published_time">``.
    If none of them yields a date, today's date is used.

    Returns:
        dict with ``title``, ``csm_number``, ``published_date`` (``YYYYMMDD``)
        and ``display_date`` (e.g. ``28 March 2025``), or None if reading the
        title/URL itself failed.
    """
    try:
        # Get title and CSM number first (fast operations)
        title = (await page.title()).replace(' - The Economic Times', '').strip()
        # Last URL path segment up to its first dot, e.g. "119642817" for ".../119642817.cms"
        csm_number = page.url.split('/')[-1].split('.')[0]

        def build(dt):
            # Single place that defines the shape of the returned metadata.
            return {
                'title': title,
                'csm_number': csm_number,
                'published_date': dt.strftime('%Y%m%d'),
                'display_date': dt.strftime('%d %B %Y')
            }

        # 1. FIRST TRY: Directly access the time element's data-dt attribute (fastest)
        try:
            epoch_ms = await page.evaluate('''() => {
                const el = document.querySelector('time.jsdtTime');
                return el ? el.getAttribute('data-dt') : null;
            }''')
            if epoch_ms:
                # NOTE(review): fromtimestamp() converts using the kernel's local
                # time zone - confirm that matches the site's publication zone.
                return build(datetime.fromtimestamp(int(epoch_ms) / 1000))
        except Exception:
            pass  # best effort: fall through to the next source

        # 2. SECOND TRY: Direct text extraction from time element (fast)
        try:
            date_text = await page.evaluate('''() => {
                const el = document.querySelector('time.jsdtTime');
                return el ? el.textContent : null;
            }''')

            if date_text and "Last Updated:" in date_text:
                # "Last Updated: Mar 28, 2025, 10:15 AM IST" -> "Mar 28 2025".
                # The year sits after the first comma, so the first TWO
                # comma-separated pieces are needed to match '%b %d %Y'.
                pieces = [piece.strip() for piece in date_text.split("Last Updated:")[1].split(",")]
                date_part = " ".join(pieces[:2])
                return build(datetime.strptime(date_part, '%b %d %Y'))
        except Exception:
            pass  # best effort: fall through to the next source

        # 3. THIRD TRY: Check for common meta tags (still relatively fast)
        try:
            meta_date = await page.evaluate('''() => {
                const el = document.querySelector('meta[property="article:published_time"]');
                return el ? el.content : null;
            }''')
            if meta_date:
                # Only the date part before the "T" of the timestamp is used.
                return build(datetime.strptime(meta_date.split('T')[0], '%Y-%m-%d'))
        except Exception:
            pass  # best effort: fall through to the final fallback

        # Final fallback: no date found on the page, use the current date.
        return build(datetime.now())

    except Exception as e:
        print(f"Metadata error: {str(e)}")
        return None

def _render_article_html(article_json, metadata, url):
    """Build the minimal standalone HTML document that gets printed to PDF.

    Args:
        article_json: Dict with ``title``, ``content`` and optional ``byline``
            / ``excerpt`` keys, as returned by the in-page extraction scripts.
        metadata: Dict providing ``display_date``.
        url: Source URL shown (and linked) in the footer.

    Returns:
        The HTML document as a string.
    """
    import html  # local import keeps this block self-contained

    def _text(value):
        # These fields are plain text, so they must be escaped before being
        # placed into markup; otherwise a '<', '&' or '"' in a headline or URL
        # would corrupt the generated page.
        return html.escape(str(value)) if value else ''

    title = _text(article_json.get('title'))
    display_date = _text(metadata['display_date'])
    byline = _text(article_json.get('byline'))
    excerpt = _text(article_json.get('excerpt'))
    safe_url = _text(url)

    byline_html = f'<div class="byline">{byline}</div>' if byline else ''
    excerpt_html = f'<div class="excerpt">{excerpt}</div>' if excerpt else ''

    # 'content' is the extracted article markup and is inserted verbatim.
    return f"""
        <html>
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
                <style>
                    body {{
                        max-width: 800px;
                        margin: 0 auto;
                        padding: 20px;
                        font-family: Arial, sans-serif;
                        line-height: 1.6;
                        color: #333;
                    }}
                    h1 {{
                        font-size: 24px;
                        margin-bottom: 5px;
                        color: #222;
                    }}
                    .article-date {{
                        color: #666;
                        margin-bottom: 15px;
                        font-size: 14px;
                    }}
                    .byline {{
                        color: #666;
                        margin-bottom: 20px;
                        font-style: italic;
                    }}
                    .excerpt {{
                        font-weight: bold;
                        margin-bottom: 20px;
                        color: #444;
                    }}
                    img {{
                        max-width: 100%;
                        height: auto;
                        margin: 10px 0;
                    }}
                    a {{
                        color: #0066cc;
                        text-decoration: none;
                    }}
                    @media print {{
                        body {{ padding: 0; }}
                    }}
                </style>
            </head>
            <body>
                <h1>{title}</h1>
                <div class="article-date">{display_date}</div>
                {byline_html}
                {excerpt_html}
                {article_json['content']}
                <div style="margin-top: 30px; font-size: 12px; color: #999;">
                    Source: <a href="{safe_url}">{safe_url}</a>
                </div>
            </body>
        </html>
    """


async def create_clean_article_pdf(url, pdf_path):
    """Render a decluttered copy of the article at ``url`` into a PDF file.

    The function downloads Readability.js, opens the page in headless
    Chromium, reads the article metadata, strips promotional and navigation
    elements, extracts the article body (Readability first, then a
    CSS-selector fallback) and prints a minimal HTML page to ``pdf_path``.

    Args:
        url: Article URL; it is passed through ``validate_url`` first.
        pdf_path: Destination path of the generated PDF.

    Returns:
        The metadata dict produced by ``extract_article_metadata`` on success,
        or ``False`` on any failure (the reason is printed).
    """
    selectors_to_remove = [
        '.recommendedStories', '.socialShares', '.newsletter',
        '.discoverTheStory', '.et_related', '[class*="ad"]',
        'header', 'footer', 'iframe', 'script', '.comments',
        '.related-news', '.recommendations', '.signup-promo',
        '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
        '.ad-container', '.teaser', '.promo', '.newsletter-signup',
        '.social-media', '.sharing', '.recommended', '.trending',
        '.most-popular', '.also-read', '.more-from-section',
        'div[data-ga*="Discover"]', 'div[data-ga*="discover"]',
        'div[class*="discover"]', 'div[class*="Discover"]',
        'div[data-testid*="discover"]', 'div[id*="discover"]',
        'div:has-text("Discover the stories")',
        'div:has-text("discover the stories")',
        'div:has-text("Stay on top")',
        'div:has-text("stay on top")'
    ]

    # One round-trip for all selectors. Each selector has its own try/catch
    # because querySelectorAll throws on selectors it cannot parse (the
    # :has-text() entries are not standard CSS), and one bad selector must
    # not stop the others from being applied.
    remove_selectors_js = """selectors => {
        for (const selector of selectors) {
            try {
                document.querySelectorAll(selector).forEach(el => el.remove());
            } catch (e) {
                // selector not supported by querySelectorAll - skip it
            }
        }
    }"""

    # Removes small elements (< 500 chars of text) containing known promo text.
    remove_promo_text_js = """() => {
        const unwantedTextPatterns = [
            'Discover the stories of your interest',
            'discover the stories of your interest',
            'Stay on top of technology and startup news',
            'ETPrime stories of the day'
        ];

        function walkDOM(node) {
            if (node.nodeType === Node.ELEMENT_NODE) {
                if (unwantedTextPatterns.some(pattern =>
                    node.textContent.includes(pattern))) {
                    if (node.textContent.length < 500) {
                        node.remove();
                        return;
                    }
                }
                Array.from(node.childNodes).forEach(walkDOM);
            }
        }
        walkDOM(document.body);
    }"""

    # Fallback extraction used when Readability yields nothing.
    selector_extraction_js = """() => {
        const article = document.querySelector('article, .article, .articleContent') ||
                      document.querySelector('.content, .story, .main-content, .Normal');
        if (!article) return null;
        return {
            title: document.title,
            content: article.innerHTML,
            byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
            excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
        };
    }"""

    try:
        url = validate_url(url)

        # Load Readability.js before launching the browser so a download
        # failure does not cost a browser start-up.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
            readability_script = response.text
        except Exception as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        readability_extraction_js = f"""() => {{
            {readability_script}
            try {{
                const reader = new Readability(document.cloneNode(true)).parse();
                if (!reader) return null;
                return {{
                    title: reader.title,
                    content: reader.content,
                    byline: reader.byline,
                    excerpt: reader.excerpt
                }};
            }} catch (e) {{
                console.error('Readability error:', e);
                return null;
            }}
        }}"""
        extraction_attempts = [readability_extraction_js, selector_extraction_js]

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # A single finally closes the browser on every exit path,
            # including unexpected exceptions.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Metadata is read before the DOM is stripped below.
                metadata = await extract_article_metadata(page)
                if not metadata:
                    print("Failed to extract article metadata")
                    return False

                # Remove unwanted elements (best effort: a failure here should
                # not abort the whole conversion).
                try:
                    await page.evaluate(remove_selectors_js, selectors_to_remove)
                except Exception as e:
                    print(f"Element clean-up skipped: {e}")

                # Additional text content removal
                await page.evaluate(remove_promo_text_js)

                # Extract article content, stopping at the first strategy
                # that returns a non-empty 'content'.
                article_json = None
                for attempt, extraction_script in enumerate(extraction_attempts, 1):
                    try:
                        article_json = await page.evaluate(extraction_script)
                    except Exception as e:
                        # 'Exception' (not a bare except) so task cancellation
                        # and KeyboardInterrupt still propagate.
                        print(f"Extraction attempt {attempt} failed: {e}")
                        continue
                    if article_json and article_json.get('content'):
                        break

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # Generate PDF with date under heading
                try:
                    await page.set_content(
                        _render_article_html(article_json, metadata, url)
                    )
                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '20mm',
                            'right': '20mm',
                            'bottom': '20mm',
                            'left': '20mm'
                        },
                        print_background=False,
                        scale=0.9
                    )
                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False

                return metadata
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to a PDF inside a new timestamped batch folder.

    Each URL gets up to two attempts with a 3 second pause in between. A
    successful PDF is first written to a temporary file and then moved to
    ``<published_date>_<csm_number>_<title>.pdf``. URLs that fail on the last
    attempt are collected and written to ``FAILED_CSV``. A summary is printed
    at the end.
    """
    max_attempts = 2          # total tries per URL (first try + one retry)
    retry_delay_seconds = 3   # pause before the retry

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    batch_dir = os.path.join(OUTPUT_DIR, f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    os.makedirs(batch_dir, exist_ok=True)

    # The PDF is rendered here first because the final name depends on
    # metadata that is only known after the article has been processed.
    temp_pdf = os.path.join(batch_dir, "temp.pdf")

    failed = []
    success_count = 0

    for url in URLS:
        print(f"\n{'='*50}")
        print(f"Processing URL: {url}")

        for attempt in range(1, max_attempts + 1):
            try:
                # Process article and get metadata
                result = await create_clean_article_pdf(url, temp_pdf)
                if not result:
                    raise Exception("PDF creation failed")

                # Generate final filename
                safe_title = sanitize_filename(result['title'][:50])
                pdf_name = f"{result['published_date']}_{result['csm_number']}_{safe_title}.pdf"
                final_path = os.path.join(batch_dir, pdf_name)

                # os.replace overwrites an existing target on every platform,
                # whereas os.rename raises on Windows if the target exists.
                os.replace(temp_pdf, final_path)

                print(f"Successfully created: {pdf_name}")
                success_count += 1
                if success_count == 1:
                    display(HTML(f'<a href="{final_path}" download>Download First PDF: {pdf_name}</a>'))
                break
            except Exception as e:
                if attempt == max_attempts:  # Last attempt failed
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    print(f"Attempt {attempt} failed, retrying...")
                    await asyncio.sleep(retry_delay_seconds)
            finally:
                # Do not leave a stray temp.pdf behind when something failed
                # after the PDF had already been written.
                try:
                    if os.path.exists(temp_pdf):
                        os.remove(temp_pdf)
                except OSError:
                    pass

    # Save failed URLs if any
    if failed:
        # newline='' is what the csv module requires of files it writes to;
        # without it, extra blank rows appear on platforms using \r\n.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        display(HTML(f'<a href="{batch_dir}" download>Download All PDFs</a>'))

    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 6: Run the process
# NOTE: a bare top-level `await` relies on the notebook kernel running the cell
# inside an event loop; in a plain Python script this line is a SyntaxError.
await process_urls_with_retry()

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._


Processing complete!
Successfully processed: 1/1
Failed: 0
