In [3]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_sky_news():
    """Scrape article links and headlines from the Sky News front page.

    Fetches https://news.sky.com/, selects every ``div`` whose class is
    ``sdc-news-ticker__item`` or ``sdc-site-tile__headline``, and writes one
    record per element to ``sky_news.json`` in the current working directory.
    Each record has the keys ``link``, ``headline`` and ``sarcastic``
    (``sarcastic`` is always 0 for this source).

    A status message is printed in both the success and the failure case.
    On a non-200 response no file is written.

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urljoin

    url = "https://news.sky.com/"

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status Code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Two element kinds are collected; ticker items first, then tile headlines.
    articles = (
        soup.find_all('div', class_='sdc-news-ticker__item')
        + soup.find_all('div', class_='sdc-site-tile__headline')
    )

    data = []
    for article in articles:
        link_tag = article.find('a')
        # .get() tolerates an <a> with no href attribute (['href'] would raise KeyError).
        href = link_tag.get('href') if link_tag else None
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the page URL, so every stored link is directly usable.
        link = urljoin(url, href) if href else "No link available"

        headline_tag = article.find('span', class_='sdc-site-tile__headline-text')
        headline = (
            headline_tag.get_text(strip=True)
            if headline_tag
            else "No headline available"
        )

        data.append({
            'link': link,
            'headline': headline,
            # Sky News is treated as a non-sarcastic source.
            'sarcastic': 0,
        })

    # Explicit encoding so the file does not depend on the platform default.
    with open('sky_news.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

    print("Scraping complete. Data saved to sky_news.json.")

# Run the scraper now: on an HTTP 200 response it writes sky_news.json to the
# current working directory and prints a confirmation; otherwise it prints the
# status code it received.
scrape_sky_news()


Scraping complete. Data saved to sky_news.json.


In [4]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_rte_news():
    """Scrape article links and headlines from the RTE News landing page.

    Fetches https://www.rte.ie/news/, selects every ``div`` with class
    ``listing``, and writes one record per element to ``rte_news.json`` in
    the current working directory. Each record has the keys ``link``,
    ``headline`` and ``sarcastic`` (``sarcastic`` is always 0 for this source).

    A status message is printed in both the success and the failure case.
    On a non-200 response no file is written.

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urljoin

    url = "https://www.rte.ie/news/"

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status Code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('div', class_='listing')

    data = []
    for article in articles:
        link_tag = article.find('a')
        # .get() tolerates an <a> with no href attribute (['href'] would raise KeyError).
        href = link_tag.get('href') if link_tag else None
        # Blind string concatenation with the host produced malformed URLs for
        # hrefs that were already absolute or lacked a leading slash; urljoin
        # handles root-relative, page-relative and absolute hrefs correctly.
        link = urljoin(url, href) if href else "No link available"

        headline_tag = article.find('h3', class_='title')
        headline = (
            headline_tag.get_text(strip=True)
            if headline_tag
            else "No headline available"
        )

        data.append({
            'link': link,
            'headline': headline,
            # RTE News is treated as a non-sarcastic source.
            'sarcastic': 0,
        })

    # Explicit encoding so the file does not depend on the platform default.
    with open('rte_news.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

    print("Scraping complete. Data saved to rte_news.json.")

# Run the scraper now: on an HTTP 200 response it writes rte_news.json to the
# current working directory and prints a confirmation; otherwise it prints the
# status code it received.
scrape_rte_news()


Scraping complete. Data saved to rte_news.json.


In [6]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_bbc_news(max_articles=2000):
    """Scrape article links and headlines from the BBC News front page.

    Fetches the BBC News page, selects every ``div`` with class
    ``gs-c-promo-body``, and writes one record per element (up to
    ``max_articles``) to ``bbc_news.json`` in the current working directory.
    Each record has the keys ``link``, ``headline`` and ``sarcastic``
    (``sarcastic`` is always 0 for this source).

    A status message is printed in both the success and the failure case.
    On a non-200 response no file is written.

    Args:
        max_articles: Upper bound on the number of matched elements that are
            processed. Defaults to 2000; ``None`` disables the cap.

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urljoin

    # This function previously requested https://www.rte.ie/ (a copy-paste
    # slip), so the BBC selector below could not match BBC content.
    # NOTE(review): 'gs-c-promo-body' is presumably taken from BBC News markup;
    # confirm it still matches the live page.
    url = "https://www.bbc.com/news"

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status Code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('div', class_='gs-c-promo-body')

    data = []
    # Slicing with None as the bound yields the whole list, i.e. no cap.
    for article in articles[:max_articles]:
        link_tag = article.find('a')
        # A promo without an anchor (or an anchor without href) gets the same
        # placeholder the other scrapers use instead of raising.
        href = link_tag.get('href') if link_tag else None
        # Resolve site-relative hrefs against the page URL; absolute ones pass through.
        link = urljoin(url, href) if href else "No link available"

        headline_tag = article.find('h3')
        headline = (
            headline_tag.get_text(strip=True)
            if headline_tag
            else "No headline available"
        )

        data.append({
            'link': link,
            'headline': headline,
            # BBC News is treated as a non-sarcastic source.
            'sarcastic': 0,
        })

    # Explicit encoding so the file does not depend on the platform default.
    with open('bbc_news.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

    print("Scraping complete. Data saved to bbc_news.json.")

# Run the scraper now: on an HTTP 200 response it writes bbc_news.json to the
# current working directory and prints a confirmation; otherwise it prints the
# status code it received.
scrape_bbc_news()


Scraping complete. Data saved to bbc_news.json.
