In [11]:
# Define required functions

import requests
from bs4 import BeautifulSoup
import json
from pathlib import Path


def has_placeholder_twitter(tag):
    """
    Checks if the class attribute contains a SquareSocialConsent
    """
    if tag.name != "div":
        return False

    if not tag.has_attr("class"):
        return False

    return any(
        "SocialConsentPlaceholder-SquareSocialConsentPlaceholder" in c
        for c in tag["class"]
    )


def has_placeholder_related_topics(tag):
    """
    Checks if the class attribute contains a LinksComponentWrappe
    """
    if tag.name != "div":
        return False

    if not tag.has_attr("class"):
        return False

    return any("LinksComponentWrappe" in c for c in tag["class"])

def get_article_urls(
    base_url:str,
    section:str,
):
    response = requests.get(base_url + section)
    html = response.text

    soup = BeautifulSoup(html, "html.parser")

    # Find all links to articles on web page
    return soup.find_all("a", class_="gs-c-promo-heading")

def scrap_title_body(
        base_url:str,
        link, 
        exclude_subsections:list,
    ):
    href = link["href"]
    # Find the parent div of the link
    parent = link.find_parent("div", role="region")
    # Check if parent has an aria-labelledby attribute that matches any of the exclude subsections
    if parent and parent.get("aria-labelledby"):
        label = parent["aria-labelledby"]
        if any(sub in label for sub in exclude_subsections):
            return None, None

    # Construct full URL by adding base URL and href
    full_url = base_url + href

    article_response = requests.get(full_url)
    article_html = article_response.content

    article_soup = BeautifulSoup(article_html, "html.parser")
    
    # Find the title element and get its text
    title = article_soup.find("title", {"data-rh": "true"}).text

    # Find the article element to get article body
    article = article_soup.find("article")

    # Try to remove the figure element
    try:
        article.find("figure").decompose()
    except AttributeError:
        # print("Figure element not found")
        pass

    # Try to remove the aside element
    try:
        article.find("aside").decompose()
    except AttributeError:
        # print("Aside element not found")
        pass

    # Try to remove the Twitter content element
    try:
        article.find(has_placeholder_twitter).decompose()
    except AttributeError:
        # print("Twitter content not found")
        pass

    # Try to remove the related topics element
    try:
        divs = article.find_all_next(has_placeholder_related_topics)
        # Remove each div element from the soup object
        for div in divs:
            div.decompose()
    except AttributeError:
        # print("Related topics not found")
        pass

    # Try to remove the footer element
    try:
        article.find("footer").decompose()
    except AttributeError:
        # print("Footer element not found")
        pass

    # Find all the p elements inside the article
    paragraphs = article.find_all("p")

    # Join their text together with a newline character
    body = "\n".join(p.text for p in paragraphs)
    return title, body


# Case to save into folder
# ARTICLES_FOLDER = "articles"
# ARTICLES_BUSSINESS_FOLDER = "bussiness"
# ARTICLES_TECHNOLOGY_FOLDER = "technology"

# Simulate Folder with dict for test purpouses
articles_business_dict = dict()
articles_technology_dict = dict()
print("OK")


OK


In [12]:
# Scrap business

# Define base URL
base_url = "https://www.bbc.com"
section = "/news/business"

# Define the subsections to exclude
exclude = [
    # "Features & Analysis"
    "nw-c-Features&Analysis__title",
    # Watch/Listen
    "nw-c-Watch/Listen__title",
    # Special Reports
    "nw-c-Specialreports__title",
    # Around The BBC
    "nw-c-around-the-bbc-heading__title",
]

# Get all article links
links = get_article_urls(
        base_url=base_url,
        section=section,
)

# Loop through links
for link in links:
    # Extract info from links
    title, body = scrap_title_body(
        base_url=base_url,
        link=link, 
        exclude_subsections=exclude,
    )
    # If None means content has a exclusion rule
    if (title is None or body is None):
        continue
    
    
    # Save title and body as JSON object in separate file
    # Create JSON object with title and body keys
    json_object = {"title": title, "body": body}
    
    # Use title as file name and add .json extension
    file_name = f"{title}.json"
    
    # Virtual case, example case
    if (file_name in articles_business_dict):
        print(f"article \"{title}\" Already saved")
    else:
        articles_business_dict[file_name]=json_object

    # File case
    # file_path = Path(ARTICLES_FOLDER, ARTICLES_BUSSINESS_FOLDER, file_name)
    # if (file_path.is_file()):
    #    print(f"article \"{title}\" Already saved")
    # else:
    #    # Open file in write mode and dump JSON object into it
    #    with open(file_path, "w") as f:
    #        json.dump(json_object, f)

    
for file_name, file_content in articles_business_dict.items():
    print(f"title: {file_content.get('title')}")
    print(f"content: {file_content.get('body')[:60]}")



article "Facebook owner Meta plans to create Twitter rival - BBC News" Already saved
article "Facebook owner Meta plans to create Twitter rival - BBC News" Already saved
article "Reddit blackout: Subreddits to go private on Monday - BBC News" Already saved
article "Windfall tax to end if energy prices drop - BBC News" Already saved
article "Rishi Sunak and Joe Biden announce green funding agreement - BBC News" Already saved
article "Qantas: Australian airline relaxes gender-based uniform rules - BBC News" Already saved
article "The divisive debate over California's anti-caste bill - BBC News" Already saved
article "Reddit blackout: Subreddits to go private on Monday - BBC News" Already saved
article "Windfall tax to end if energy prices drop - BBC News" Already saved
article "Rishi Sunak and Joe Biden announce green funding agreement - BBC News" Already saved
article "Qantas: Australian airline relaxes gender-based uniform rules - BBC News" Already saved
article "The divisive debate ov

In [14]:
# Scrap technology

# Define base URL
base_url = "https://www.bbc.com"
section = "/news/technology"

# Define the subsections to exclude
exclude = [
    # "Features & Analysis"
    "nw-c-Features&Analysis__title",
    # Watch/Listen
    "nw-c-Watch/Listen__title",
    # Special Reports
    # "nw-c-Specialreports__title",
    # Around The BBC
    "nw-c-around-the-bbc-heading__title",
]

# Get all article links
links = get_article_urls(
        base_url=base_url,
        section=section,
)

# Loop through links
for link in links:
    # Extract info from links
    title, body = scrap_title_body(
        base_url=base_url,
        link=link, 
        exclude_subsections=exclude,
    )
    # If None means content has a exclusion rule
    if (title is None or body is None):
        continue
    
    
    # Save title and body as JSON object in separate file
    # Create JSON object with title and body keys
    json_object = {"title": title, "body": body}
    
    # Use title as file name and add .json extension
    file_name = f"{title}.json"
    
    # Virtual case, example case
    if (file_name in articles_technology_dict):
        print(f"article \"{title}\" Already saved")
    else:
        articles_technology_dict[file_name]=json_object

    # File case
    # file_path = Path(ARTICLES_FOLDER, ARTICLES_TECHNOLOGY_FOLDER, file_name)
    # if (file_path.is_file()):
    #    print(f"article \"{title}\" Already saved")
    # else:
    #    # Open file in write mode and dump JSON object into it
    #    with open(file_path, "w") as f:
    #        json.dump(json_object, f)

    
for file_name, file_content in articles_technology_dict.items():
    print(f"title: {file_content.get('title')}")
    print(f"content: {file_content.get('body')[:60]}")



article "Reddit blackout: Subreddits to go private on Monday - BBC News" Already saved
article "Reddit blackout: Subreddits to go private on Monday - BBC News" Already saved
article "Facebook owner Meta plans to create Twitter rival - BBC News" Already saved
article "Aberdeen AI trial helps doctors spot breast cancers - BBC News" Already saved
article "Twitch scraps ad changes after streamers leave platform - BBC News" Already saved
article "Crypto: 24-hour cooling-off period included in ad overhaul - BBC News" Already saved
article "Stay ahead in AI race, tech boss urges West - BBC News" Already saved
article "Facebook owner Meta plans to create Twitter rival - BBC News" Already saved
article "Aberdeen AI trial helps doctors spot breast cancers - BBC News" Already saved
article "Twitch scraps ad changes after streamers leave platform - BBC News" Already saved
article "Crypto: 24-hour cooling-off period included in ad overhaul - BBC News" Already saved
article "Stay ahead in AI race, t

In [17]:
# Inspect dictionaries

print("Business")
for file_name, file_content in articles_business_dict.items():
    print(f"file name: {file_name}")
    print(f"title: {file_content.get('title')}")
    print(f"content: {file_content.get('body')}")
    break

print("-"*100)

print("Technology")
for file_name, file_content in articles_technology_dict.items():
    print(f"file name: {file_name}")
    print(f"title: {file_content.get('title')}")
    print(f"content: {file_content.get('body')}")
    break

Business
file name: Facebook owner Meta plans to create Twitter rival - BBC News.json
title: Facebook owner Meta plans to create Twitter rival - BBC News
content: Meta has shown staff plans for a text-based social network designed to compete with Twitter, sources have told the BBC.
It could allow users to follow accounts they already follow on Instagram, Meta's image-sharing app.
And it could potentially allow them to bring over followers from decentralised platforms such as Mastodon.
A Meta spokesperson confirmed to the BBC that the platform was in development.
"We're exploring a standalone decentralised social network for sharing text updates," they said.
"We believe there's an opportunity for a separate space where creators and public figures can share timely updates about their interests."
Meta's chief product officer Chris Cox said coding was under way on the platform. The tech giant aims to release it soon, although no date was given. There is some speculation that it could be as