In [None]:
# Step 1: Install required packages
# The leading "!" makes the notebook run each line as a shell command.
!pip install playwright requests nest-asyncio
# Download the browser binaries that Playwright drives (only Chromium is
# launched by the code below).
!playwright install

# Step 2: Import libraries
import os
import csv
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import requests
from IPython.display import display, HTML
import asyncio
import nest_asyncio
from datetime import datetime

# Apply nest_asyncio to make async work in Colab: it patches asyncio so an
# event loop can be re-entered while the notebook's own loop is running.
nest_asyncio.apply()

# Step 3: Configuration
# Site root used to resolve site-relative article paths.
BASE_URL = "https://economictimes.indiatimes.com"

# Where generated PDFs and the failure report are written.
OUTPUT_DIR = "/content/articles"
FAILED_CSV = "/content/failed_urls.csv"

# Article URLs to convert; populate before running.
URLS = []

# Timeouts, expressed in milliseconds.
# NOTE(review): REQUEST_TIMEOUT is handed to requests.get(), whose `timeout`
# argument is in seconds, so the consumer has to convert it.
NAVIGATION_TIMEOUT = 180 * 1000  # 180 seconds
REQUEST_TIMEOUT = 45 * 1000  # 45 seconds
SELECTOR_TIMEOUT = 45 * 1000  # 45 seconds

# Step 4: Define functions
def validate_url(url):
    """Return *url* as an absolute URL.

    A path beginning with ``/`` is resolved against ``BASE_URL``. Any other
    value must already start with ``http://`` or ``https://`` and is returned
    unchanged.

    Raises:
        ValueError: if *url* is neither site-relative nor an http(s) URL.
    """
    is_site_relative = url.startswith('/')
    if is_site_relative:
        return urljoin(BASE_URL, url)

    has_http_scheme = url.startswith('http://') or url.startswith('https://')
    if has_http_scheme:
        return url

    raise ValueError(f"Invalid URL format: {url}")

def sanitize_filename(title):
    """Reduce *title* to characters that are safe in a filename.

    Alphanumeric characters plus space, dot, underscore and hyphen are kept;
    everything else is dropped, and trailing whitespace is trimmed.
    """
    allowed_punctuation = {' ', '.', '_', '-'}
    safe_chars = [
        ch for ch in title
        if ch.isalnum() or ch in allowed_punctuation
    ]
    return "".join(safe_chars).rstrip()

async def try_loading_page(page, url):
    """Navigate *page* to *url* and wait until content appears.

    Navigation strategies are tried in order until one completes without
    raising. After the first successful navigation, each wait strategy is
    tried in order; the first that succeeds ends the call.

    Args:
        page: Playwright page to drive.
        url: Absolute URL to load.

    Returns:
        True once a navigation and one wait strategy have both succeeded.
        False if every navigation raised, or if a navigation succeeded but
        no wait strategy did (later navigation strategies are not tried in
        that case).
    """
    navigation_strategies = [
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.reload(timeout=NAVIGATION_TIMEOUT, wait_until="domcontentloaded"),
        lambda: page.goto(url, timeout=NAVIGATION_TIMEOUT, wait_until="commit")
    ]

    # Built once: these do not depend on which navigation strategy succeeded.
    wait_strategies = [
        lambda: page.wait_for_load_state("networkidle", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("article, .article, .articleContent, .content, .story, .main-content, .Normal",
                                    state="attached", timeout=SELECTOR_TIMEOUT),
        lambda: page.wait_for_selector("h1, h2, p", state="attached", timeout=SELECTOR_TIMEOUT)
    ]

    for i, navigate in enumerate(navigation_strategies, 1):
        try:
            print(f"Attempting strategy {i}...")
            await navigate()
        except Exception as e:
            print(f"Strategy {i} failed: {str(e)}")
            continue

        for j, wait_strategy in enumerate(wait_strategies, 1):
            try:
                print(f"Trying wait strategy {j}...")
                await wait_strategy()
            except Exception as e:
                # Catch Exception rather than everything so task cancellation
                # and KeyboardInterrupt still propagate.
                print(f"Wait strategy {j} failed: {e}")
                continue
            print("Page loaded successfully")
            return True

        return False

    return False

def _build_metadata(title, csm_number, dt):
    """Assemble the metadata dict for an article dated *dt*."""
    return {
        'title': title,
        'csm_number': csm_number,
        'published_date': dt.strftime('%Y%m%d'),
        'display_date': dt.strftime('%d %B %Y'),
    }


def _parse_last_updated(date_text):
    """Parse the date from text like 'Last Updated: Mar 28, 2025, 10:30:00 AM IST'.

    Returns a datetime for the "Mar 28, 2025" part, or None when the text is
    missing, lacks the "Last Updated:" marker, or does not parse.
    """
    if not date_text or "Last Updated:" not in date_text:
        return None

    tail = date_text.split("Last Updated:", 1)[1]
    month_day, _, rest = tail.partition(",")
    year_tokens = rest.replace(",", " ").split()
    # The year is the first token after the first comma; without a comma,
    # try the whole remainder as-is.
    candidate = f"{month_day.strip()} {year_tokens[0]}" if year_tokens else month_day.strip()
    try:
        return datetime.strptime(candidate, '%b %d %Y')
    except ValueError:
        return None


async def extract_article_metadata(page):
    """Extract title, article id and publication date from a loaded page.

    The date is looked up from three sources in order, falling back to the
    current date when none yields a value:
      1. the ``data-dt`` attribute (epoch milliseconds) of ``time.jsdtTime``
      2. the "Last Updated: ..." text of ``time.jsdtTime``
      3. the ``article:published_time`` meta tag

    Args:
        page: Playwright page that has already been navigated to the article.

    Returns:
        dict with keys ``title``, ``csm_number``, ``published_date``
        (``YYYYMMDD``) and ``display_date`` (e.g. ``28 March 2025``), or
        None if the title or URL could not be read.
    """
    try:
        title = (await page.title()).replace(' - The Economic Times', '').strip()
        # Last path segment without its extension, e.g. ".../12345.cms" -> "12345".
        csm_number = page.url.split('/')[-1].split('.')[0]

        # 1. FIRST TRY: data-dt attribute of the time element.
        try:
            epoch_ms = await page.evaluate('''() => {
                const el = document.querySelector('time.jsdtTime');
                return el ? el.getAttribute('data-dt') : null;
            }''')
            if epoch_ms:
                # NOTE(review): fromtimestamp() converts to the machine's local
                # timezone — confirm that matches the date shown on the site.
                dt = datetime.fromtimestamp(int(epoch_ms) / 1000)
                return _build_metadata(title, csm_number, dt)
        except Exception as e:
            print(f"Date lookup via data-dt failed: {e}")

        # 2. SECOND TRY: visible text of the time element.
        try:
            date_text = await page.evaluate('''() => {
                const el = document.querySelector('time.jsdtTime');
                return el ? el.textContent : null;
            }''')
            dt = _parse_last_updated(date_text)
            if dt is not None:
                return _build_metadata(title, csm_number, dt)
        except Exception as e:
            print(f"Date lookup via time text failed: {e}")

        # 3. THIRD TRY: article:published_time meta tag.
        try:
            meta_date = await page.evaluate('''() => {
                const el = document.querySelector('meta[property="article:published_time"]');
                return el ? el.content : null;
            }''')
            if meta_date:
                dt = datetime.strptime(meta_date.split('T')[0], '%Y-%m-%d')
                return _build_metadata(title, csm_number, dt)
        except Exception as e:
            print(f"Date lookup via meta tag failed: {e}")

        # Final fallback: no date found on the page, use today's date.
        return _build_metadata(title, csm_number, datetime.now())

    except Exception as e:
        print(f"Metadata error: {str(e)}")
        return None

def _render_article_html(article_json, display_date, url):
    """Build the standalone HTML document that is printed to PDF.

    Title, byline, excerpt and URL are HTML-escaped because they are plain
    text; ``content`` is inserted as-is because it is already HTML.
    """
    from html import escape  # local import keeps this block self-contained

    title = escape(str(article_json.get('title') or ''))
    byline = article_json.get('byline')
    excerpt = article_json.get('excerpt')
    byline_html = f'<div class="byline">{escape(str(byline))}</div>' if byline else ''
    excerpt_html = f'<div class="excerpt">{escape(str(excerpt))}</div>' if excerpt else ''
    safe_url = escape(url, quote=True)

    return f"""
                    <html>
                        <head>
                            <meta charset="UTF-8">
                            <title>{title}</title>
                            <style>
                                body {{
                                    max-width: 800px;
                                    margin: 0 auto;
                                    padding: 20px;
                                    font-family: Arial, sans-serif;
                                    line-height: 1.6;
                                    color: #333;
                                }}
                                h1 {{
                                    font-size: 24px;
                                    margin-bottom: 5px;
                                    color: #222;
                                }}
                                .article-date {{
                                    color: #666;
                                    margin-bottom: 15px;
                                    font-size: 14px;
                                }}
                                .byline {{
                                    color: #666;
                                    margin-bottom: 20px;
                                    font-style: italic;
                                }}
                                .excerpt {{
                                    font-weight: bold;
                                    margin-bottom: 20px;
                                    color: #444;
                                }}
                                img {{
                                    max-width: 100%;
                                    height: auto;
                                    margin: 10px 0;
                                }}
                                a {{
                                    color: #0066cc;
                                    text-decoration: none;
                                }}
                                @media print {{
                                    body {{ padding: 0; }}
                                }}
                            </style>
                        </head>
                        <body>
                            <h1>{title}</h1>
                            <div class="article-date">{escape(str(display_date))}</div>
                            {byline_html}
                            {excerpt_html}
                            {article_json['content']}
                            <div style="margin-top: 30px; font-size: 12px; color: #999;">
                                Source: <a href="{safe_url}">{safe_url}</a>
                            </div>
                        </body>
                    </html>
                """


async def create_clean_article_pdf(url, pdf_path):
    """Render the article at *url* as a cleaned-up PDF written to *pdf_path*.

    Steps: download Readability.js, load the page in headless Chromium,
    read its metadata, strip promotional/navigation elements, extract the
    article body (Readability first, CSS-selector fallback second), then
    print a minimal HTML rendering of it to PDF.

    Args:
        url: Absolute or site-relative article URL.
        pdf_path: Destination path of the PDF file.

    Returns:
        The metadata dict from ``extract_article_metadata`` on success,
        or False on any failure (the reason is printed).
    """
    try:
        url = validate_url(url)

        # Fetch Readability.js before launching a browser so a network failure
        # does not cost a browser start-up.
        try:
            response = requests.get(
                "https://raw.githubusercontent.com/mozilla/readability/master/Readability.js",
                # REQUEST_TIMEOUT is in milliseconds; requests expects seconds.
                timeout=REQUEST_TIMEOUT / 1000
            )
            response.raise_for_status()
            readability_script = response.text
        except requests.RequestException as e:
            print(f"Readability.js load failed: {str(e)}")
            return False

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                timeout=NAVIGATION_TIMEOUT,
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--disable-setuid-sandbox',
                    '--no-sandbox'
                ]
            )
            # Single close point: every exit path below goes through finally.
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True
                )
                page = await context.new_page()
                page.set_default_timeout(NAVIGATION_TIMEOUT)

                if not await try_loading_page(page, url):
                    print("All loading strategies failed")
                    return False

                # Extract metadata first, before the DOM is modified below.
                metadata = await extract_article_metadata(page)
                if not metadata:
                    print("Failed to extract article metadata")
                    return False

                # Remove unwanted elements
                selectors_to_remove = [
                    '.recommendedStories', '.socialShares', '.newsletter',
                    '.discoverTheStory', '.et_related', '[class*="ad"]',
                    'header', 'footer', 'iframe', 'script', '.comments',
                    '.related-news', '.recommendations', '.signup-promo',
                    '.subscribe', '.hidden', '.modal', '.popup', '.leaderboard',
                    '.ad-container', '.teaser', '.promo', '.newsletter-signup',
                    '.social-media', '.sharing', '.recommended', '.trending',
                    '.most-popular', '.also-read', '.more-from-section',
                    'div[data-ga*="Discover"]', 'div[data-ga*="discover"]',
                    'div[class*="discover"]', 'div[class*="Discover"]',
                    'div[data-testid*="discover"]', 'div[id*="discover"]',
                    'div:has-text("Discover the stories")',
                    'div:has-text("discover the stories")',
                    'div:has-text("Stay on top")',
                    'div:has-text("stay on top")'
                ]

                # One round trip for the whole list. Selectors that
                # querySelectorAll rejects (e.g. the :has-text() forms) are
                # skipped inside the page, as the per-selector try did before.
                await page.evaluate("""selectors => {
                    for (const selector of selectors) {
                        try {
                            document.querySelectorAll(selector).forEach(el => el.remove());
                        } catch (e) {
                            // invalid selector for querySelectorAll: skip it
                        }
                    }
                }""", selectors_to_remove)

                # Additional text content removal
                await page.evaluate("""() => {
                    const unwantedTextPatterns = [
                        'Discover the stories of your interest',
                        'discover the stories of your interest',
                        'Stay on top of technology and startup news',
                        'ETPrime stories of the day'
                    ];

                    function walkDOM(node) {
                        if (node.nodeType === Node.ELEMENT_NODE) {
                            if (unwantedTextPatterns.some(pattern =>
                                node.textContent.includes(pattern))) {
                                if (node.textContent.length < 500) {
                                    node.remove();
                                    return;
                                }
                            }
                            Array.from(node.childNodes).forEach(walkDOM);
                        }
                    }
                    walkDOM(document.body);
                }""")

                # Extract article content: Readability first, then a plain
                # CSS-selector fallback.
                article_json = None
                extraction_attempts = [
                    f"""() => {{
                        {readability_script}
                        try {{
                            const reader = new Readability(document.cloneNode(true)).parse();
                            if (!reader) return null;
                            return {{
                                title: reader.title,
                                content: reader.content,
                                byline: reader.byline,
                                excerpt: reader.excerpt
                            }};
                        }} catch (e) {{
                            console.error('Readability error:', e);
                            return null;
                        }}
                    }}""",
                    """() => {
                        const article = document.querySelector('article, .article, .articleContent') ||
                                      document.querySelector('.content, .story, .main-content, .Normal');
                        if (!article) return null;
                        return {
                            title: document.title,
                            content: article.innerHTML,
                            byline: document.querySelector('.byline, .author, .publish-date')?.textContent || '',
                            excerpt: document.querySelector('.excerpt, .summary, .synopsis')?.textContent || ''
                        };
                    }"""
                ]

                for attempt, extraction_script in enumerate(extraction_attempts, 1):
                    try:
                        article_json = await page.evaluate(extraction_script)
                    except Exception as e:
                        print(f"Extraction attempt {attempt} failed: {e}")
                        continue
                    if article_json and article_json.get('content'):
                        break

                if not article_json or not article_json.get('content'):
                    print("All extraction methods failed")
                    return False

                # Generate PDF with date under heading
                try:
                    await page.set_content(
                        _render_article_html(article_json, metadata['display_date'], url)
                    )

                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        margin={
                            'top': '20mm',
                            'right': '20mm',
                            'bottom': '20mm',
                            'left': '20mm'
                        },
                        print_background=False,
                        scale=0.9
                    )
                except Exception as e:
                    print(f"PDF generation failed: {str(e)}")
                    return False

                return metadata
            finally:
                await browser.close()

    except Exception as e:
        print(f"Processing failed: {str(e)}")
        return False

async def process_urls_with_retry():
    """Convert every URL in ``URLS`` to a PDF, trying each one up to twice.

    PDFs are written into a fresh timestamped ``batch_*`` directory under
    ``OUTPUT_DIR``. Each article is first rendered to ``temp.pdf`` and then
    moved to ``<published_date>_<csm_number>_<title>.pdf``. URLs that fail
    on the final attempt are written to ``FAILED_CSV`` and listed at the end.
    """
    max_attempts = 2  # total tries per URL, i.e. one retry

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    batch_dir = os.path.join(OUTPUT_DIR, f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    os.makedirs(batch_dir, exist_ok=True)

    # The PDF is rendered here first because the final name depends on
    # metadata that is only known once rendering has succeeded.
    temp_pdf = os.path.join(batch_dir, "temp.pdf")

    failed = []
    success_count = 0

    for url in URLS:
        print(f"\n{'='*50}")
        print(f"Processing URL: {url}")

        for attempt in range(1, max_attempts + 1):
            try:
                # Process article and get metadata
                result = await create_clean_article_pdf(url, temp_pdf)
                if not result:
                    raise Exception("PDF creation failed")

                # Generate final filename
                safe_title = sanitize_filename(result['title'][:50])
                pdf_name = f"{result['published_date']}_{result['csm_number']}_{safe_title}.pdf"
                final_path = os.path.join(batch_dir, pdf_name)

                # Move temp file to its final name (overwrites an existing
                # file of the same name on every platform).
                os.replace(temp_pdf, final_path)

                print(f"Successfully created: {pdf_name}")
                success_count += 1
                if success_count == 1:
                    display(HTML(f'<a href="{final_path}" download>Download First PDF: {pdf_name}</a>'))
                break
            except Exception as e:
                if attempt == max_attempts:  # Last attempt failed
                    print(f"Failed to process {url}: {str(e)}")
                    failed.append({'url': url, 'error': str(e)})
                else:
                    print(f"Attempt {attempt} failed, retrying...")
                    await asyncio.sleep(3)

        # Do not leave a half-finished temp file behind in the batch directory.
        if os.path.exists(temp_pdf):
            os.remove(temp_pdf)

    # Save failed URLs if any
    if failed:
        # newline='' is what the csv module requires to avoid blank rows on
        # some platforms; explicit UTF-8 keeps non-ASCII error text intact.
        with open(FAILED_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'error'])
            writer.writeheader()
            writer.writerows(failed)

    print("\nProcessing complete!")
    print(f"Successfully processed: {success_count}/{len(URLS)}")
    print(f"Failed: {len(failed)}")

    if success_count > 0:
        display(HTML(f'<a href="{batch_dir}" download>Download All PDFs</a>'))

    if failed:
        print("\nFailed URLs:")
        for item in failed:
            print(f"- {item['url']}")
            print(f"  Reason: {item['error']}")

# Step 5: Run the process
# Top-level `await` is valid here because this runs as a notebook cell; in a
# plain .py script this call would need to go through asyncio.run(...).
await process_urls_with_retry()