In [10]:
import httpx
from bs4 import BeautifulSoup
import os
import time
import logging
import asyncio
from datetime import datetime
from typing import Optional, Tuple

Получение индекса рекомендаций

In [17]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NCCN Website Scraper
Scrapes NCCN guideline category pages and generates YAML index documents
Supports intelligent caching mechanism for MCP Server automation scenarios
"""

import asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import logging
import yaml
import os
from datetime import datetime

In [None]:
# Ensure logs directory exists.
# `__file__` is only defined when this code runs as a script/module; inside
# Jupyter/Colab it does not exist, so fall back to the current working
# directory instead of raising NameError.
_BASE_DIR = (
    os.path.dirname(os.path.abspath(__file__))
    if '__file__' in globals()
    else os.getcwd()
)
LOGS_DIR = os.path.join(_BASE_DIR, 'logs')
os.makedirs(LOGS_DIR, exist_ok=True)

# Configure logging for this module specifically
logger = logging.getLogger(__name__)

# Only configure handlers if they haven't been added yet, so re-running the
# cell (or re-importing the module) does not duplicate every log line.
if not logger.handlers:
    logger.setLevel(logging.INFO)

    # One shared format for console and file output
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # File handler, written inside LOGS_DIR
    file_handler = logging.FileHandler(os.path.join(LOGS_DIR, 'nccn_get_index.log'))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Prevent propagation to root logger to avoid duplicate logs
    logger.propagate = False

# Constants
DEFAULT_OUTPUT_FILE = 'nccn_guidelines_index.yaml'
CACHE_MAX_AGE_DAYS = 7  # Default maximum cache file validity period (days)


async def fetch_page(client: httpx.AsyncClient, url: str, max_retries: int = 3) -> str:
    """
    Asynchronously fetch single page content with retry mechanism

    Args:
        client: httpx async client
        url: URL to scrape
        max_retries: Maximum retry attempts after the first try, default 3.
            Negative values are treated as 0 (a single attempt).

    Returns:
        Page HTML content, or an empty string if every attempt failed
    """
    # A negative retry budget would skip the loop entirely and fall through to
    # an implicit ``None``; clamp it so at least one attempt is always made.
    retries = max(max_retries, 0)

    for attempt in range(retries + 1):
        try:
            response = await client.get(url, timeout=30.0)
            response.raise_for_status()
            return response.text
        except Exception as e:
            # Deliberately broad: this helper is best-effort and reports any
            # failure to its callers as an empty string rather than raising.
            if attempt < retries:
                logger.warning(f"Failed to fetch page {url} (attempt {attempt + 1}): {e}, retrying in 1 second...")
                await asyncio.sleep(1)  # Wait 1 second before retry
            else:
                logger.error(f"Final failure to fetch page {url} (after {retries} retries): {e}")

    return ""


async def get_page_title(client: httpx.AsyncClient, url: str) -> str:
    """
    Download a page and return the text of its <title> element

    Args:
        client: httpx async client
        url: Page URL

    Returns:
        Stripped title text; empty string when the page could not be
        fetched or has no <title> tag
    """
    # Fetched with up to 3 retries
    page_source = await fetch_page(client, url, max_retries=3)
    if page_source:
        title_node = BeautifulSoup(page_source, 'html.parser').find('title')
        if title_node:
            return title_node.get_text(strip=True)
    return ""


async def extract_item_links(client: httpx.AsyncClient, url: str) -> list:
    """
    Collect the first link inside every <div class="item-name"> of a page

    Args:
        client: httpx async client
        url: Page URL (also the base for resolving relative hrefs)

    Returns:
        List of {'title': ..., 'url': ...} dictionaries with absolute URLs;
        empty list when the page could not be fetched
    """
    # This fetch is given a larger retry budget (5) than the other helpers
    page_source = await fetch_page(client, url, max_retries=5)
    if not page_source:
        return []

    document = BeautifulSoup(page_source, 'html.parser')
    collected = []

    for container in document.find_all('div', class_='item-name'):
        anchor = container.find('a')
        if not anchor:
            continue

        target = anchor.get('href')
        label = anchor.get_text(strip=True)
        # Entries lacking either an href or visible text are skipped
        if not (target and label):
            continue

        collected.append({
            'title': label,
            'url': urljoin(url, target)  # resolve relative links against the page URL
        })

    return collected


async def find_nccn_guideline_link(client: httpx.AsyncClient, url: str) -> str:
    """
    Find the hyperlink whose text is exactly 'NCCN guidelines' on a third-level page

    Args:
        client: httpx async client
        url: Third-level page URL (also the base for resolving a relative href)

    Returns:
        Absolute NCCN guidelines link, returns empty string if not found
    """
    html = await fetch_page(client, url, max_retries=3)
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')

    # Only <a> elements carrying an href can ever produce a result, so scan
    # anchors alone. Walking every span/div/p as well (and extracting the full
    # text of each container) cost a lot of work and never changed the outcome.
    for link in soup.find_all('a'):
        href = link.get('href')
        # Case-insensitive exact match on the anchor's stripped text
        if href and link.get_text(strip=True).lower() == "nccn guidelines":
            return urljoin(url, href)
    return ""


async def process_single_item(client: httpx.AsyncClient, item: dict) -> dict:
    """
    Look up the NCCN guidelines link for one item and attach it

    Args:
        client: httpx async client
        item: Dictionary containing 'title' and 'url'

    Returns:
        New dictionary with 'title', 'url' and 'guideline_link'
        ('guideline_link' is an empty string when no link was found)
    """
    link = await find_nccn_guideline_link(client, item['url'])
    enriched = dict(title=item['title'], url=item['url'])
    enriched['guideline_link'] = link
    return enriched


async def process_category(client: httpx.AsyncClient, category_num: int) -> dict:
    """
    Scrape one category page together with all of its third-level pages

    Args:
        client: httpx async client
        category_num: Category number (1-4)

    Returns:
        Dictionary with 'category_num', 'title', 'url' and 'items'
        ('items' is empty when the category page listed nothing)
    """
    category_url = f"https://www.nccn.org/guidelines/category_{category_num}"
    logger.info(f"Processing category page: {category_url}")

    title = await get_page_title(client, category_url)
    items = await extract_item_links(client, category_url)

    processed = []
    if items:
        logger.info(f"Category {category_num} found {len(items)} items, starting concurrent processing of third-level pages...")

        # Fan out over every third-level page; failures come back as exception objects
        outcomes = await asyncio.gather(
            *(process_single_item(client, entry) for entry in items),
            return_exceptions=True
        )

        for entry, outcome in zip(items, outcomes):
            if not isinstance(outcome, Exception):
                processed.append(outcome)
                continue

            logger.error(f"Failed to process item {entry['url']}: {outcome}")
            # A failed item is still listed, just without a guideline link
            processed.append({
                'title': entry['title'],
                'url': entry['url'],
                'guideline_link': ''
            })

        logger.info(f"Category {category_num} third-level page processing completed")
    else:
        logger.warning(f"Category {category_num} found no items")

    return {
        'category_num': category_num,
        'title': title,
        'url': category_url,
        'items': processed
    }


async def scrape_all_categories() -> list:
    """
    Scrape all category pages

    Opens one shared httpx client and processes categories 1-4 concurrently.
    A category whose processing raised is logged and left out of the result.

    Returns:
        List of category dictionaries (one per successfully processed category)
    """
    async with httpx.AsyncClient(
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        follow_redirects=True,
        timeout=httpx.Timeout(30.0, connect=10.0),  # Set connection and read timeout
        limits=httpx.Limits(max_keepalive_connections=10, max_connections=20)  # Limit connections
    ) as client:

        # Process 4 category pages concurrently
        category_numbers = range(1, 5)
        results = await asyncio.gather(
            *(process_category(client, num) for num in category_numbers),
            return_exceptions=True
        )

        valid_results = []
        for num, result in zip(category_numbers, results):
            # BaseException also covers a cancelled task surfacing as a result,
            # which must not be passed on as if it were category data.
            if isinstance(result, BaseException):
                # Report the failure instead of dropping the category silently
                logger.error(f"Failed to process category {num}: {result}")
            else:
                valid_results.append(result)

        return valid_results


def generate_yaml(categories_data: list) -> str:
    """
    Generate YAML format guideline index

    Args:
        categories_data: List of category dictionaries; each is read for
            'title', 'category_num' and 'items' (items carry 'title' and
            'guideline_link')

    Returns:
        YAML document string with a single top-level 'nccn_guidelines' list.
        Items without a guideline link, and categories left with no
        guidelines, are omitted.
    """
    categories = []

    for category in categories_data:
        # The 'title' key is normally present but may be an empty string when
        # the title fetch failed, so fall back on falsiness rather than on a
        # missing key. The fallback is only built when it is actually needed.
        category_title = category.get('title') or f"Category {category.get('category_num', 'unknown')}"

        # Only keep items with guideline_link
        guidelines = [
            {'title': item['title'], 'url': item['guideline_link']}
            for item in category.get('items', [])
            if item.get('guideline_link')
        ]

        # Only add category if it has valid guidelines
        if guidelines:
            categories.append({
                'category': category_title,
                'guidelines': guidelines
            })

    yaml_data = {
        'nccn_guidelines': categories
    }

    return yaml.dump(yaml_data, default_flow_style=False, allow_unicode=True, sort_keys=False)


def check_cache_file(output_file: str = DEFAULT_OUTPUT_FILE, max_age_days: int = CACHE_MAX_AGE_DAYS) -> dict:
    """
    Check cache file status

    Args:
        output_file: Output file path
        max_age_days: Maximum age (whole days) for the file to count as valid;
            defaults to CACHE_MAX_AGE_DAYS

    Returns:
        Dictionary with 'exists', 'file_path', 'size', 'created_time',
        'age_days' and 'is_valid'. Note that 'created_time' is taken from the
        file's modification time (st_mtime).
    """
    cache_info = {
        'exists': False,
        'file_path': output_file,
        'size': 0,
        'created_time': None,
        'age_days': 0,
        'is_valid': False
    }

    # Stat directly instead of exists()-then-stat(): this avoids a second
    # filesystem call and the window in which the file could vanish between
    # the two. The caught exceptions are the ones os.path.exists() treats as
    # "does not exist".
    try:
        stat = os.stat(output_file)
    except (OSError, ValueError):
        return cache_info

    cache_info['exists'] = True
    cache_info['size'] = stat.st_size
    cache_info['created_time'] = datetime.fromtimestamp(stat.st_mtime)

    # Calculate file age in whole days
    age_delta = datetime.now() - cache_info['created_time']
    cache_info['age_days'] = age_delta.days

    # Check if within validity period and file is not empty
    cache_info['is_valid'] = cache_info['age_days'] < max_age_days and cache_info['size'] > 0

    return cache_info


def load_cached_data(output_file: str = DEFAULT_OUTPUT_FILE) -> dict:
    """
    Load cached YAML data

    Args:
        output_file: Output file path

    Returns:
        Parsed YAML mapping; an empty dict if the file cannot be read or
        parsed, or if its top-level content is not a mapping
    """
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Failed to read cache file: {e}")
        return {}

    # safe_load() yields None for an empty document and may yield a list or a
    # scalar; callers are promised a dict, so normalise anything else to {}.
    if not isinstance(data, dict):
        logger.error(f"Cache file {output_file} does not contain a YAML mapping")
        return {}

    return data


async def ensure_nccn_index(output_file: str = DEFAULT_OUTPUT_FILE, max_age_days: int = CACHE_MAX_AGE_DAYS) -> dict:
    """
    Ensure NCCN guideline index exists and is valid
    This is the main interface for MCP Server calls

    Uses the cached YAML file when it is non-empty and younger than
    max_age_days; otherwise re-scrapes and rewrites it. If scraping fails or
    finds no guideline links, an existing cache file (even an expired one) is
    returned instead and is left untouched.

    Args:
        output_file: Output file path
        max_age_days: Maximum cache file validity period (days)

    Returns:
        Parsed guideline index data, or an empty dict when no index is available
    """
    import time

    # Check cache file
    cache_info = check_cache_file(output_file)

    # Judge freshness against the caller's max_age_days here, so the argument
    # is actually honoured rather than only appearing in the log message.
    cache_is_valid = (
        cache_info['exists']
        and cache_info['size'] > 0
        and cache_info['age_days'] < max_age_days
    )
    should_scrape = not cache_is_valid

    if cache_info['exists']:
        if cache_is_valid:
            logger.info(f"Using valid cache file: {output_file} (created at {cache_info['created_time'].strftime('%Y-%m-%d %H:%M:%S')}, {cache_info['age_days']} days ago)")
        else:
            logger.info(f"Cache file expired ({cache_info['age_days']} days > {max_age_days} days) or corrupted, starting re-scraping...")
    else:
        logger.info("Cache file not found, starting NCCN guideline index scraping...")

    if should_scrape:
        start_time = time.perf_counter()

        try:
            # Scrape all category data
            categories_data = await scrape_all_categories()

            # Calculate statistics up front: they decide whether the result is usable
            total_guidelines = sum(len(cat.get('items', [])) for cat in categories_data)
            successful_guidelines = sum(
                1
                for cat in categories_data
                for item in cat.get('items', [])
                if item.get('guideline_link')
            )

            # Category entries come back even when every page fetch failed, so
            # a non-empty list alone does not mean success. Treat "no guideline
            # links at all" as a failed scrape and never overwrite a usable
            # cache file with an empty index.
            if not categories_data or successful_guidelines == 0:
                logger.error("Scraping failed, no data retrieved")
                if cache_info['exists']:
                    logger.info("Scraping failed, attempting to use existing cache file")
                    return load_cached_data(output_file)
                return {}

            # Generate YAML document
            yaml_content = generate_yaml(categories_data)

            # Save to file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(yaml_content)

            elapsed_time = time.perf_counter() - start_time

            logger.info(f"Scraping completed! Index saved to {output_file}")
            logger.info(f"Processed {len(categories_data)} categories, found {successful_guidelines}/{total_guidelines} valid guideline links")
            logger.info(f"Scraping time: {elapsed_time:.2f} seconds")

        except Exception as e:
            logger.error(f"Error during scraping process: {e}")
            # If scraping fails but cache exists, use cache
            if cache_info['exists']:
                logger.info("Scraping failed, using existing cache file")
                return load_cached_data(output_file)
            return {}

    # Load and return data
    cached_data = load_cached_data(output_file)
    if cached_data and 'nccn_guidelines' in cached_data:
        total_categories = len(cached_data['nccn_guidelines'])
        total_guidelines = sum(len(cat.get('guidelines', [])) for cat in cached_data['nccn_guidelines'])
        logger.info(f"NCCN guideline index ready: {total_categories} categories, {total_guidelines} total guidelines")
    else:
        logger.warning("Guideline index file format is abnormal")

    return cached_data


async def main():
    """
    Entry point for running the index scraper directly: builds or loads the
    index and logs whether anything was obtained
    """
    index = await ensure_nccn_index()
    if not index:
        logger.error("Failed to retrieve guideline index")
        return
    logger.info("Guideline index retrieved successfully")


if __name__ == "__main__":
    # asyncio.run() refuses to start inside an already-running event loop,
    # which is the situation in Jupyter/Colab. Detect that case and schedule
    # main() on the existing loop instead.
    try:
        asyncio.get_running_loop()
        _loop_is_running = True
    except RuntimeError:
        _loop_is_running = False

    if _loop_is_running:
        # Keep a reference so the task is not garbage-collected before it finishes
        _main_task = asyncio.ensure_future(main())
    else:
        asyncio.run(main())

## Скачивание рекомендаций в папку

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NCCN Automatic Login and PDF Downloader
✅ Fixed for Jupyter/Colab (no __file__ dependency)
"""


# === Universal script directory (works in .py, Jupyter, Colab) ===
# `__file__` is undefined in notebooks, so fall back to the working directory.
SCRIPT_DIR = (
    os.path.dirname(os.path.abspath(__file__))
    if '__file__' in globals()
    else os.getcwd()
)

# Ensure directories exist
LOGS_DIR = os.path.join(SCRIPT_DIR, 'logs')
PDFS_DIR = os.path.join(SCRIPT_DIR, 'pdfs')
os.makedirs(LOGS_DIR, exist_ok=True)
os.makedirs(PDFS_DIR, exist_ok=True)

# === Credentials ===
# Taken from the environment when set, so real credentials need not be written
# into the source; the placeholders are kept as the fallback values.
EMAIL = os.environ.get("NCCN_EMAIL", "#####")
PASSWORD = os.environ.get("NCCN_PASSWORD", "#####")

# === Logging configuration ===
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# First-time setup only: re-running must not stack a second console handler.
if not logger.handlers:
    logger.setLevel(logging.INFO)

    # Console handler
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(formatter)
    logger.addHandler(console)

    logger.propagate = False

# File handler. Checked separately from the block above: when this logger was
# already configured elsewhere under the same name (e.g. an earlier notebook
# cell), an "any handlers present?" test alone would skip the downloader log
# file entirely. FileHandler stores its target as an absolute path in
# `baseFilename`, so compare against the absolute path.
_DOWNLOADER_LOG_PATH = os.path.abspath(os.path.join(LOGS_DIR, 'nccn_downloader.log'))
if not any(
    isinstance(handler, logging.FileHandler) and handler.baseFilename == _DOWNLOADER_LOG_PATH
    for handler in logger.handlers
):
    file_handler = logging.FileHandler(_DOWNLOADER_LOG_PATH)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

# Constants
PDF_CACHE_MAX_AGE_DAYS = 7


def check_pdf_cache_age(file_path: str, max_age_days: int = PDF_CACHE_MAX_AGE_DAYS) -> dict:
    """
    Describe a cached PDF on disk and decide whether it may be reused.

    Args:
        file_path: Path of the cached PDF
        max_age_days: Age limit in whole days for the file to count as valid

    Returns:
        Dictionary with 'exists', 'file_path', 'size', 'modified_time',
        'age_days' and 'is_valid'. A file is valid when it is non-empty and
        younger than max_age_days. Nothing is logged for a missing file.
    """
    report = {
        'exists': False,
        'file_path': file_path,
        'size': 0,
        'modified_time': None,
        'age_days': 0,
        'is_valid': False
    }

    # Missing file: hand back the defaults unchanged
    if not os.path.exists(file_path):
        return report

    file_stat = os.stat(file_path)
    modified = datetime.fromtimestamp(file_stat.st_mtime)
    age_in_days = (datetime.now() - modified).days
    size_in_bytes = file_stat.st_size

    report.update(
        exists=True,
        size=size_in_bytes,
        modified_time=modified,
        age_days=age_in_days,
        is_valid=(age_in_days < max_age_days) and (size_in_bytes > 0),
    )

    logger.info(f"PDF cache check: {file_path}")
    logger.info(f"  - Size: {report['size']} bytes")
    logger.info(f"  - Age: {report['age_days']} days")
    logger.info(f"  - Valid: {report['is_valid']}")
    return report


class NCCNDownloader:
    """
    Downloads NCCN guideline PDFs over one shared httpx.AsyncClient, logging
    in through the site's login form when a download is answered with a
    login page.

    Can be used as an async context manager; the HTTP client is closed on exit.
    """

    # Outcomes of a single download attempt (see _fetch_and_save)
    _SAVED = 'saved'
    _LOGIN_REQUIRED = 'login_required'
    _FAILED = 'failed'

    def __init__(self, username=None, password=None):
        """
        Initializes the NCCN Downloader.

        Args:
            username (str, optional): Username (email address).
            password (str, optional): Password.
        """
        self.session = httpx.AsyncClient()
        self.username = username
        self.password = password
        # Set request headers to simulate a browser visit
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    async def login(self, username, password, target_url="https://www.nccn.org/professionals/physician_gls/pdf/all.pdf"):
        """
        Logs into the NCCN website.

        Requests target_url (following redirects to the login page), copies
        the login form's hidden fields, adds the credentials and posts the form.

        Args:
            username (str): Username (email address).
            password (str): Password.
            target_url (str): The target URL to access after login.

        Returns:
            bool: True if login is successful, False otherwise (including on
            any exception, which is logged rather than raised).
        """
        try:
            logger.info("Accessing login page...")

            # First, access the target URL, which will redirect to the login page
            login_response = await self.session.get(target_url, follow_redirects=True)

            logger.info(f"Login page response status: {login_response.status_code}")
            logger.info(f"Login page final URL: {login_response.url}")

            if login_response.status_code != 200:
                logger.error(f"Failed to access login page, status code: {login_response.status_code}")
                return False

            # Parse the login page
            soup = BeautifulSoup(login_response.text, 'html.parser')

            # Find the login form
            form = soup.find('form', {'action': '/login/Index/'})
            if not form:
                logger.error("Login form not found.")
                logger.debug(f"Page content preview: {login_response.text[:1000]}...")
                return False

            # Carry over every named hidden field of the form
            form_data = {}
            for input_field in form.find_all('input', {'type': 'hidden'}):
                name = input_field.get('name')
                if name:
                    form_data[name] = input_field.get('value', '')

            logger.info(f"Found {len(form_data)} hidden form fields")

            # Add login credentials
            form_data.update({
                'Username': username,
                'Password': password,
                'RememberMe': 'false',  # Do not remember by default
            })

            logger.info("Submitting login information...")

            # NOTE(review): the form is located by action '/login/Index/' but
            # the POST goes to '/login/' - confirm this is the intended endpoint.
            login_url = "https://www.nccn.org/login/"

            # Set specific headers for the login request
            login_headers = {
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': str(login_response.url),
                'Origin': 'https://www.nccn.org',
            }

            login_result = await self.session.post(
                login_url,
                data=form_data,
                headers=login_headers,
                follow_redirects=True
            )

            logger.info(f"Login result status: {login_result.status_code}")
            logger.info(f"Login result final URL: {login_result.url}")

            if login_result.status_code != 200:
                logger.error(f"Login request failed, status code: {login_result.status_code}")
                return False

            # Still on the login page (or a page offering "Log in") means the login failed
            if '/login' in str(login_result.url) or 'Log in' in login_result.text:
                logger.error("Login failed: Incorrect username or password.")
                return False

            logger.info("Login successful!")
            return True

        except Exception as e:
            logger.error(f"An error occurred during login: {str(e)}")
            return False

    async def _fetch_and_save(self, pdf_url, save_path):
        """
        Make one download attempt and write the PDF to save_path if one is returned.

        Returns:
            str: _SAVED when a PDF was written, _LOGIN_REQUIRED when the
            response looks like a login page, _FAILED otherwise.
        """
        logger.info(f"Downloading PDF: {pdf_url}")

        # Set request headers for PDF download
        pdf_headers = {
            'Accept': 'application/pdf,*/*',
            'Referer': 'https://www.nccn.org/',
        }

        response = await self.session.get(pdf_url, headers=pdf_headers, follow_redirects=True)

        logger.info(f"Response status: {response.status_code}")
        logger.info(f"Final URL: {response.url}")

        if response.status_code == 302:
            # Redirects are followed automatically, so a 302 surfacing here is
            # unusual; keep the explicit handling for that case.
            redirect_url = response.headers.get('Location')
            logger.info(f"Received redirect to: {redirect_url}")
            if redirect_url and 'login' in redirect_url.lower():
                return self._LOGIN_REQUIRED
            logger.error(f"Unexpected redirect to: {redirect_url}")
            return self._FAILED

        if response.status_code != 200:
            logger.error(f"Download failed, status code: {response.status_code}")
            return self._FAILED

        content_type = response.headers.get('Content-Type', '')
        logger.info(f"Content-Type: {content_type}")

        if 'application/pdf' in content_type:
            # This is a PDF, save it directly
            with open(save_path, 'wb') as f:
                f.write(response.content)

            file_size = os.path.getsize(save_path)
            logger.info(f"PDF file saved to: {save_path}")
            logger.info(f"File size: {file_size} bytes")
            return self._SAVED

        if 'text/html' in content_type:
            # HTML in place of a PDF: most likely the login page
            page_text = response.text.lower()
            if 'login' in page_text or 'log in' in page_text:
                return self._LOGIN_REQUIRED
            logger.warning("Received HTML response but no login form detected.")
            logger.debug(f"Response preview: {response.text[:500]}...")
            return self._FAILED

        logger.warning(f"Unexpected content type: {content_type}")
        return self._FAILED

    async def download_pdf(self, pdf_url, download_dir=None, username=None, password=None, skip_if_exists=True, max_cache_age_days=PDF_CACHE_MAX_AGE_DAYS):
        """
        Downloads a PDF file, automatically logging in if required.

        Args:
            pdf_url (str): URL of the PDF file.
            download_dir (str, optional): Directory to save the PDF. Defaults to current directory.
            username (str, optional): Username (email address), required if not already logged in.
            password (str, optional): Password, required if not already logged in.
            skip_if_exists (bool): Whether to skip download if the file already exists. Defaults to True.
            max_cache_age_days (int): Maximum cache file validity period (days). Defaults to PDF_CACHE_MAX_AGE_DAYS.

        Returns:
            tuple: (success (bool), saved_filename (str))
        """
        # Bound before the try block so the except handler can always return it,
        # even when the failure happens while deriving the name from the URL.
        filename = 'nccn_guideline.pdf'
        try:
            # Automatically extract filename from URL
            candidate = os.path.basename(pdf_url)
            if candidate and candidate.endswith('.pdf'):
                filename = candidate

            if download_dir:
                os.makedirs(download_dir, exist_ok=True)
            else:
                download_dir = os.getcwd()  # Use current working directory if not specified

            save_path = os.path.join(download_dir, filename)

            # Check if file already exists and is still valid (not too old)
            if skip_if_exists:
                cache_info = check_pdf_cache_age(save_path, max_cache_age_days)
                if cache_info['exists']:
                    if cache_info['is_valid']:
                        logger.info(f"Using valid cached PDF: {save_path}")
                        logger.info(f"File size: {cache_info['size']} bytes, age: {cache_info['age_days']} days")
                        return True, filename
                    logger.info(f"PDF cache expired ({cache_info['age_days']} days > {max_cache_age_days} days) or corrupted, re-downloading...")
                else:
                    logger.info(f"PDF not found in cache, downloading: {save_path}")

            outcome = await self._fetch_and_save(pdf_url, save_path)

            if outcome == self._LOGIN_REQUIRED:
                logger.info("Login required detected, attempting automatic login...")

                login_username = username or self.username
                login_password = password or self.password

                if not (login_username and login_password):
                    logger.error("Login required but username and password not provided.")
                    return False, filename

                if not await self.login(login_username, login_password, pdf_url):
                    logger.error("Automatic login failed.")
                    return False, filename

                logger.info("Login successful, re-downloading PDF...")
                # Non-blocking pause: time.sleep() here would stall the whole event loop
                await asyncio.sleep(1)  # Wait for login state to stabilize

                # Exactly one retry after logging in. Re-entering the whole
                # method could loop forever when the site keeps serving the
                # login page and instance credentials are set.
                outcome = await self._fetch_and_save(pdf_url, save_path)
                if outcome == self._LOGIN_REQUIRED:
                    logger.error("Login page returned again after a successful login; giving up.")

            return outcome == self._SAVED, filename

        except Exception as e:
            logger.error(f"An error occurred during download: {str(e)}")
            return False, filename

    async def __aenter__(self):
        """Asynchronous context manager entry point."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Asynchronous context manager exit point: closes the HTTP client."""
        await self.session.aclose()



# === Main ===
async def main(pdf_urls=None):
    """Download NCCN guideline PDFs sequentially and log the outcome of each.

    Every URL is passed to ``NCCNDownloader.download_pdf`` together with the
    module-level ``EMAIL`` / ``PASSWORD`` credentials and the ``PDFS_DIR``
    target directory, with ``skip_if_exists=True``. No explicit login is
    performed first; the credentials are handed to ``download_pdf`` so it can
    log in on its own if the server redirects to the login page.

    Args:
        pdf_urls: Optional iterable of PDF URLs to download. When ``None``
            (the default), the built-in list of NCCN guideline PDFs is used.

    Returns:
        None. Results are reported through ``logger``: one line per file and
        a final summary that distinguishes full success from partial failure.
    """
    if pdf_urls is None:
        # All guideline PDFs live under one directory; only the file stem differs.
        base_url = "https://www.nccn.org/professionals/physician_gls/pdf/"
        guideline_slugs = (
            "all",
            "aml",
            "ampullary",
            "anal",
            "appendiceal",
            "nmsc",
            "b-cell",
            "btc",
            "bladder",
            "bone",
            "breast",
            "castleman",
            "cns",
            "cervical",
            "cll",
            "cml",
            "colon",
            "cutaneous_lymphomas",
            "dfsp",
            "esophageal",
            "gastric",
            "gist",
            "gtn",
            "hairy_cell",
            "head-and-neck",
            "hepatobiliary",
            "hcc",
            "histiocytic_neoplasms",
            "hodgkins",
            "kaposi",
            "kidney",
            "cutaneous_melanoma",
            "uveal",
            "mcc",
            "meso_peritoneal",
            "meso_pleural",
            "myeloma",
            "mds",
            "mlne",
            "mpn",
            "neuroblastoma",
            "neuroendocrine",
            "nscl",
            "occult",
            "ovarian",
            "pancreatic",
            "ped_all",
            "ped_b-cell",
            "ped_cns",
            "ped_hodgkin",
            "ped_sts",
            "penile",
            "prostate",
            "rectal",
            "small_bowel",
            "sclc",
            "sarcoma",
            "squamous",
            "amyloidosis",
            "mastocytosis",
            "t-cell",
            "testicular",
            "thymic",
            "thyroid",
            "uterine",
            "vaginal",
            "vulvar",
            "waldenstroms",
            "wilms_tumor",
        )
        pdf_urls = [f"{base_url}{slug}.pdf" for slug in guideline_slugs]
    else:
        # Materialize once so a generator argument can be both iterated and counted.
        pdf_urls = list(pdf_urls)

    failed_urls = []

    async with NCCNDownloader() as downloader:
        for url in pdf_urls:
            ok, filename = await downloader.download_pdf(
                url,
                username=EMAIL, password=PASSWORD,
                download_dir=PDFS_DIR,
                skip_if_exists=True,
            )
            status = "✅" if ok else "❌"
            logger.info(f"{status} {filename}")
            if not ok:
                failed_urls.append(url)

        # Only claim success when nothing failed; otherwise name the failures
        # so they can be retried without scanning the whole log.
        if failed_urls:
            logger.warning(
                f"{len(failed_urls)} of {len(pdf_urls)} downloads failed: "
                f"{', '.join(failed_urls)}"
            )
        else:
            logger.info("🎉 All downloads completed.")


# === Run safely for both .py and Jupyter/Colab ===
if __name__ == "__main__":
    # Detect a running event loop up front rather than calling asyncio.run()
    # and catching RuntimeError afterwards: that catch also matched any
    # RuntimeError raised inside main() and then started the whole download
    # run a second time, and it left the first main() coroutine un-awaited.
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        active_loop = None

    if active_loop is None:
        # Regular Python script: no loop exists yet, asyncio.run() creates one.
        asyncio.run(main())
    else:
        # Jupyter/Colab: a loop is already running in this thread, so it has
        # to be made re-entrant before main() can be driven to completion.
        logger.warning("Running in interactive mode → using nest_asyncio fallback")
        import nest_asyncio
        nest_asyncio.apply()
        active_loop.run_until_complete(main())