In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

In [30]:
chrome_options = Options()
# Remove headless option for debugging
# chrome_options.add_argument("--headless")  # Run headless if you don't want a browser window
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36")

service = Service('/usr/local/bin/chromedriver/chromedriver')  # Adjust this path to your chromedriver location
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the CBC Toronto news page
url = 'https://www.cbc.ca/news/canada/toronto'

try:
    driver.get(url)
    print(f"Loaded URL: {driver.current_url}")
    print(f"Page title: {driver.title}")

    # Check for specific elements to ensure the page is loaded correctly
    body = driver.find_element(By.TAG_NAME, 'body')
    print("Body element found.")

    # Wait for the main content to load
    main_content = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.card-content'))
    )
    time.sleep(5)  # Additional wait for the content to fully load
    print("Main content loaded.")

except Exception as e:
    print("Error: Main content did not load.")
    # print(e)
    # driver.save_screenshot('error_screenshot.png')  # Take a screenshot for further investigation

finally:
    # Check if the page source is loaded correctly
    page_source = driver.page_source
    print("Page source length:", len(page_source))
    with open('page_source.html', 'w', encoding='utf-8') as f:
        f.write(page_source)

print("Body content:   ")
print(body)


Loaded URL: https://www.cbc.ca/news/canada/toronto
Page title: Toronto - CBC News
Body element found.
Main content loaded.
Page source length: 766760
Body content:   
<selenium.webdriver.remote.webelement.WebElement (session="ab5ed33fdfade012720b865377ec7b94", element="f.9F0ABEC7306875EBC00D581002998BF7.d.59BE45241E2B30F3A8E664663965F66B.e.1348")>


In [24]:

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Example: Extract article titles and links
articles = soup.select('div.card-content h3 a')
for article in articles:
    title = article.get_text()
    link = article['href'] if 'href' in article.attrs else None
    print(f"Title: {title}")
    print(f"Link: {link}")
    print("-" * 20)

In [20]:
# Function to parse and extract articles
def parse_articles(page_source):
    soup = BeautifulSoup(page_source, 'html.parser')
    articles = soup.find_all('div', class_='card-content')
    for article in articles:
        headline = article.find('h3', class_='headline').get_text(strip=True)
        summary = article.find('div', class_='description').get_text(strip=True) if article.find('div', class_='description') else 'No summary available'
        link = article.find('a', class_='card-link')['href']
        metadata_div = article.find('div', class_='metadata')
        category = metadata_div.find('span', class_='category').get_text(strip=True) if metadata_div and metadata_div.find('span', class_='category') else 'No category available'
        
        print(f'Headline: {headline}')
        print(f'Summary: {summary}')
        print(f'Link: https://www.cbc.ca{link}')
        print(f'Category: {category}')
        print('-' * 80)

# Initial parse
parse_articles(driver.page_source)


MaxRetryError: HTTPConnectionPool(host='localhost', port=52201): Max retries exceeded with url: /session/fa58846ae2ddcaa761a6346e322af5f4/source (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11844b4d0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
# load more articles
while True:
    try:
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-testid="loadMoreButton"]'))  # Adjust the selector based on inspection
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
        load_more_button.click()
        time.sleep(3)  # Wait for more articles to load
        parse_articles(driver.page_source)
    except Exception as e:
        print("No more articles to load or an error occurred.")
        print(e)
        break

driver.quit()

In [33]:

# Chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36")

# Chrome driver path (adjust this to your chromedriver location)
service = Service('/usr/local/bin/chromedriver/chromedriver')

# Initialize Chrome webdriver
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the CBC Toronto news page
url = 'https://www.cbc.ca/news/canada/toronto'

try:
    # Load the page
    driver.get(url)
    print(f"Loaded URL: {driver.current_url}")
    print(f"Page title: {driver.title}")

    # Wait for the main content to load
    main_content = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.card-content'))
    )
    time.sleep(5)  # Additional wait for the content to fully load
    print("Main content loaded.")

    # Get page source
    page_source = driver.page_source
    print(f"Page source length: {len(page_source)}")

  # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    print(f"Soup: {soup}")

    # Example: Extract article titles and links
    articles = soup.select('div.card-content h3 a')
    print(f"Articles: {articles}")
    for article in articles:
        title = article.get_text()
        link = article['href'] if 'href' in article.attrs else None
        print(f"Title: {title}")
        print(f"Link: {link}")
        print("-" * 20)

except Exception as e:
    print(f"Error: {e}")

finally:
    driver.quit()


Loaded URL: https://www.cbc.ca/news/canada/toronto
Page title: Toronto - CBC News
Main content loaded.
Page source length: 764778
Soup: <html class="hydrated" lang="en"><head><script async="" src="https://script.4dex.io/localstore.js" type="text/javascript"></script><style id="vf-font-size-override">.viafoura { --base-font-size: 0.625rem; }</style><meta content="Az520Inasey3TAyqLyojQa8MnmCALSEU29yQFW8dePZ7xQTvSt73pHazLFTK5f7SyLUJSo2uKLesEtEa9aUYcgMAAACPeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZS5jb206NDQzIiwiZmVhdHVyZSI6IkRpc2FibGVUaGlyZFBhcnR5U3RvcmFnZVBhcnRpdGlvbmluZyIsImV4cGlyeSI6MTcyNTQwNzk5OSwiaXNTdWJkb21haW4iOnRydWUsImlzVGhpcmRQYXJ0eSI6dHJ1ZX0=" http-equiv="origin-trial"/><script async="" crossorigin="anonymous" integrity="sha384-hPyFRPtmIlZ/h96KCtxJTBIBP3zd59DIienuUS73AMSvQ3kZQSEjuK4A+Lw/BPMn" src="https://www.gstatic.com/recaptcha/releases/TqxSU0dsOd2Q9IbI7CpFnJLD/recaptcha__en.js" type="text/javascript"></script><script src="//static.chartbeat.com/js/chartbeat_video.js"></script><scrip

In [5]:
chrome_options = Options()
# Remove headless option for debugging
# chrome_options.add_argument("--headless")  # Run headless if you don't want a browser window
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36")

service = Service('/usr/local/bin/chromedriver/chromedriver')  # Adjust this path to your chromedriver location
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the CBC Toronto news page
url = 'https://www.cbc.ca/news/canada/toronto'

try:
    driver.get(url)
    print(f"Loaded URL: {driver.current_url}")
    print(f"Page title: {driver.title}")

    # Check for specific elements to ensure the page is loaded correctly
    body = driver.find_element(By.TAG_NAME, 'body')
    print("Body element found.")
    print(f"Body content: {body.text}...")  # Print first 500 characters of the body content for verification
    print(f"body html: {body.get_dom_attribute('class')}")

    # Wait for the main content to load
    main_content = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.card-content'))
    )
    time.sleep(5)  # Additional wait for the content to fully load
    print("Main content loaded.")

except Exception as e:
    print("Error: Main content did not load.")
    print(e)
    driver.save_screenshot('error_screenshot.png')  # Take a screenshot for further investigation

finally:
    # Check if the page source is loaded correctly
    page_source = driver.page_source
    print("Page source length:", len(page_source))
    with open('page_source.html', 'w', encoding='utf-8') as f:
        f.write(page_source)

    driver.quit()


Loaded URL: https://www.cbc.ca/news/canada/toronto
Page title: Toronto - CBC News
Body element found.
Body content: Content
MENU
Search
Sign In
Toronto
Currently Selected:
Home
Community
Programs
Contact Us
Metro Morning
Weather
Toronto - CBC News Loaded
Toronto - CBC News
Toronto Video
Carousel
Page 1 of 3
Video
LIVE
CBC Toronto News
Video
0:41
Dramatic Toronto house fire captured by doorbell camera
Video
4:28
Toronto needs more money to rename Yonge-Dundas Square
Video
2:49
3 people dead after shooting inside North York business
Video
3:56
Temperatures in the GTA could feel as hot as 45 C this week
Video
2:52
Video
2:32
What is a heat dome? A climatologist breaks it down
Video
4:08
Woman who was sexually assaulted at work waited 6 years for human rights hearing
Video
2:26
'It's hard to breathe': students describe heat inside Toronto classrooms as unbearable
Video
2:26
Early polls open in Toronto-St. Paul's byelection
Video
2:27
St. Anne's Church hosts outdoor service after losing bui