In [1]:
!pip install selenium

Defaulting to user installation because normal site-packages is not writeable


In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import json
import time

def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--start-maximized')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def main():
    driver = setup_driver()
    try:
        print("Loading BBC Technology page...")
        driver.get("https://www.bbc.com/news/technology")
        
        # Wait for either cookie consent or main content
        try:
            WebDriverWait(driver, 10).until(
                lambda d: d.find_elements(By.CSS_SELECTOR, 'div[data-entityid="container-top-stories#1"]') or 
                          d.find_elements(By.ID, 'cookiePrompt')
            )
        except Exception as e:
            print(f"Initial wait failed: {e}")
            driver.save_screenshot("initial_load_failed.png")
            return

        # Handle cookie consent if present
        try:
            cookie_btn = driver.find_element(By.XPATH, '//button[contains(., "agree")]')
            cookie_btn.click()
            print("Cookie consent dismissed")
            time.sleep(1)
        except:
            print("No cookie consent found")

        # Get the first article with basic checks
        articles = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-entityid^="container-top-stories"] .gs-c-promo'))
        )
        print(f"Found {len(articles)} potential articles")

        if not articles:
            print("No articles found - trying alternative selector")
            articles = driver.find_elements(By.CSS_SELECTOR, 'div.gs-c-promo')
            print(f"Found {len(articles)} with fallback selector")

        if articles:
            article = articles[0]
            print("\nAttempting to extract article data...")
            
            try:
                headline = article.find_element(By.CSS_SELECTOR, 'h3, h2, span.gs-c-promo-heading__title').text.strip()
                print(f"Headline: {headline}")
                
                url = article.find_element(By.CSS_SELECTOR, 'a[href^="https://www.bbc.com/news/"]').get_attribute('href')
                print(f"URL: {url}")
                
                try:
                    summary = article.find_element(By.CSS_SELECTOR, 'p').text.strip()
                except:
                    summary = "No summary available"
                print(f"Summary: {summary[:100]}...")
                
                result = [{
                    "Headline": headline,
                    "URL": url,
                    "Summary": summary,
                    "Timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
                }]
                
                with open("bbc_article.json", "w", encoding="utf-8") as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)
                print("\nSuccessfully saved to bbc_article.json")
                
                return result
            
            except Exception as e:
                print(f"Failed to extract article: {e}")
                driver.save_screenshot("extraction_failed.png")
                return None
        else:
            print("No articles found at all")
            driver.save_screenshot("no_articles.png")
            return None

    except Exception as e:
        print(f"Main error: {e}")
        driver.save_screenshot("main_error.png")
        return None
    finally:
        driver.quit()
        print("Browser closed")

if __name__ == "__main__":
    result = main()
    if result:
        print("\nFinal result:")
        print(json.dumps(result, indent=2, ensure_ascii=False))

Loading BBC Technology page...
Initial wait failed: Message: 

Browser closed
