In [1]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_bbc_news(url="https://www.bbc.com/news", output_path="bbc_news.json",
                    limit=2000, timeout=10):
    """Scrape headline/link pairs from a BBC News page and save them as JSON.

    The page is fetched with ``requests`` (no JavaScript is executed), every
    ``div.gs-c-promo-body`` element is treated as one article, and each
    article is stored as ``{'link', 'headline', 'sarcastic'}`` with
    ``sarcastic`` fixed to 0.

    Args:
        url: Page to scrape. Defaults to the BBC News front page.
        output_path: Destination JSON file; overwritten if it exists.
        limit: Maximum number of matched article blocks to process.
        timeout: Seconds to wait for the HTTP request before
            ``requests`` raises a timeout error instead of hanging.

    Returns:
        None. On success the data is written to ``output_path`` and a
        confirmation is printed; on a non-200 response only the status
        code is printed and no file is written.
    """
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)

    # Guard clause: anything other than 200 is reported and nothing is saved.
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status Code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): this selector is tied to the page markup; if it matches
    # nothing, an empty list is written -- confirm against the live page.
    articles = soup.find_all('div', class_='gs-c-promo-body')

    data = []
    for article in articles[:limit]:
        # href=True selects the first anchor that actually carries an href;
        # blocks without one are skipped instead of raising TypeError/KeyError.
        anchor = article.find('a', href=True)
        if anchor is None:
            continue

        headline_tag = article.find('h3')
        headline = headline_tag.get_text(strip=True) if headline_tag else "No headline available"

        data.append({
            'link': anchor['href'],
            'headline': headline,
            # BBC articles are always labelled as non-sarcastic.
            'sarcastic': 0,
        })

    # json.dump escapes non-ASCII by default; utf-8 is set so the result does
    # not depend on the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

    print(f"Scraping complete. Data saved to {output_path}.")

# Run the scraper. On an HTTP 200 response this writes the collected articles
# to bbc_news.json; otherwise it prints the failing status code.
scrape_bbc_news()


Scraping complete. Data saved to bbc_news.json.


In [1]:
import time
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import json

def scrape_bbc_news(url="https://www.bbc.com/news", output_path="bbc_news1.json",
                    max_scrolls=None, scroll_pause=2):
    """Scrape headline/link pairs from a BBC News page using headless Chrome.

    The page is loaded in a Selenium-driven headless Chrome instance and
    scrolled to the bottom repeatedly until the document height stops
    growing. The rendered HTML is then parsed with BeautifulSoup; every
    ``div.gs-c-promo-body`` element is treated as one article and stored as
    ``{'link', 'headline', 'sarcastic'}`` with ``sarcastic`` fixed to 0.

    Args:
        url: Page to scrape. Defaults to the BBC News front page.
        output_path: Destination JSON file; overwritten if it exists.
        max_scrolls: Optional cap on the number of scroll iterations.
            ``None`` (the default) scrolls until the height stops changing.
        scroll_pause: Seconds to sleep after each scroll so content can load.

    Returns:
        None. The data is written to ``output_path`` and a confirmation is
        printed.
    """
    # Initialize a headless browser
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    # Everything that touches the browser sits in try/finally so the Chrome
    # process is shut down even when navigation or scripting raises.
    try:
        driver.get(url)

        # Scroll until the page height stops growing (or the cap is reached).
        last_height = driver.execute_script("return document.body.scrollHeight")
        scrolls = 0
        while max_scrolls is None or scrolls < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)  # Wait for page to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            scrolls += 1

        # Take the rendered HTML before the browser is closed.
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # NOTE(review): this selector is tied to the page markup; if it matches
    # nothing, an empty list is written -- confirm against the live page.
    articles = soup.find_all('div', class_='gs-c-promo-body')

    data = []
    for article in articles:
        # href=True selects the first anchor that actually carries an href;
        # blocks without one are skipped instead of raising TypeError/KeyError.
        anchor = article.find('a', href=True)
        if anchor is None:
            continue

        headline_tag = article.find('h3')
        headline = headline_tag.get_text(strip=True) if headline_tag else "No headline available"

        data.append({
            'link': anchor['href'],
            'headline': headline,
            # BBC articles are always labelled as non-sarcastic.
            'sarcastic': 0,
        })

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

    # Report the file that was actually written (the original message named
    # bbc_news.json while writing bbc_news1.json).
    print(f"Scraping complete. Data saved to {output_path}.")

# Run the Selenium-based scraper. This launches headless Chrome, scrolls the
# page until its height stops growing, and writes the results to bbc_news1.json.
# NOTE(review): this definition reuses the name scrape_bbc_news from the earlier
# cell, so the requests-based version is shadowed once this cell has run.
scrape_bbc_news()


Scraping complete. Data saved to bbc_news.json.
