In [62]:
import csv
import json
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

## BBC Sport Scraper

In [54]:
def load_sport_categories():
    """Scrape the BBC "all sports" index and write the category slugs to ``sport_categories.csv``.

    Every ``<a>`` element of the index page is inspected. Only site-relative
    links containing ``/sport`` are kept, a fixed number of leading and
    trailing links is trimmed, and the part of each path after the second
    ``/`` is written as one CSV row per distinct category.

    Anchors without an ``href`` attribute are skipped: ``link.get('href')``
    returns ``None`` for them, which would make the ``'/sport' in link`` test
    raise ``TypeError``. Rows are written in sorted order so the generated
    file is identical from run to run.

    Returns:
        None. The result is the CSV file written to the working directory.
    """
    root_url = 'https://www.bbc.com/sport/all-sports'
    response = requests.get(root_url)
    doc = BeautifulSoup(response.text, 'html.parser')

    hrefs = [link.get('href') for link in doc.find_all('a')]

    # Keep only site-relative sport links; absolute bbc.com / bbc.co.uk URLs are dropped.
    categories_links = [
        href for href in hrefs
        if href is not None
        and '/sport' in href
        and 'https://www.bbc.com' not in href
        and 'https://www.bbc.co.uk' not in href
    ]
    # NOTE(review): the 3 / 7 offsets presumably strip navigation and footer
    # links surrounding the category list -- confirm against the live page.
    categories_links = categories_links[3:-7]
    # '/sport/<slug>[/...]' -> '<slug>[/...]'
    categories_links = ["/".join(link.split('/')[2:]) for link in categories_links]
    categories_links = sorted(set(categories_links))

    with open('sport_categories.csv', mode='w', newline='', encoding='utf-8') as datei:
        csv_writer = csv.writer(datei)
        csv_writer.writerows([element] for element in categories_links)

# load_sport_categories() # Uncomment to run. Only run this if you really want to reload the categories directly from the BBC website. This should only be necessary if the categories on the website change.

In [55]:
def get_sport_categories(filename: str) -> set:
    """Read the sport category slugs stored in the first column of a CSV file.

    The file is opened as UTF-8 with ``newline=''``, matching how
    ``load_sport_categories`` writes it and what the ``csv`` module expects.
    Blank rows are ignored instead of raising ``IndexError``.

    Args:
        filename: Path of a CSV file with one category per row.

    Returns:
        The set of distinct values found in the first column.
    """
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        # csv.reader yields [] for an empty line, hence the `if row` guard.
        return {row[0] for row in csv.reader(file) if row}

In [56]:
# Scrape every article linked from each BBC Sport category page into `text_list`
# (one dict per article), then keep the ones with non-empty text in `text_list_clean`.
# Missing heading / publication date / description are stored as the string '0'.
categories = get_sport_categories('sport_categories.csv')
# categories = {'football','cycling','formula1','tennis'} #,'golf','athletics','basketball','boxing','darts'
root_url = 'https://www.bbc.com/sport'
modified_url = root_url[:root_url.rfind("/")] # site root only: article hrefs already start with /sport
text_list = []

for category in categories:
    list_url = set()
    response = requests.get(root_url + r"/" + category)
    doc = BeautifulSoup(response.text, 'html.parser')
    newsAll = doc.find_all('div', { 'class':"ssrcss-1f3bvyz-Stack e1y4nx260" }) # find all news articles on the selected sport category page

    for news in newsAll:
        article_path = news.find('a') # first link inside the article container
        # .get() covers both a container without any <a> and an <a> without href;
        # indexing with ['href'] would raise KeyError for the latter.
        href = article_path.get('href') if article_path is not None else None
        if href is not None and href.startswith('/sport'): # filter out links that are not articles
            list_url.add(modified_url + href)

    for article_url in list_url:
        article = {}
        response = requests.get(article_url)
        doc = BeautifulSoup(response.text, 'html.parser')

        article['url'] = article_url
        article['category'] = category

        # get article heading
        # heading = doc.find('h1', {'id':"main-heading"})
        heading = doc.find('h1')
        article['heading'] = heading.text if heading is not None else '0'

        # get article publication date; a <time> tag without a datetime attribute counts as missing
        publication_date = doc.find('time')
        if publication_date is not None:
            article['publication_date'] = publication_date.get('datetime', '0')
        else:
            article['publication_date'] = '0'

        # get article description (football pages use a different markup)
        if category == 'football':
            description = doc.find('p', {'role':'introduction'})
        else:
            description = doc.find('b', {'class':"ssrcss-1xjjfut-BoldText e5tfeyi3"})
        article['description'] = description.text if description is not None else '0'

        # get article text (football pages use a different markup)
        if category == 'football':
            textAll = doc.find_all('article')
            paragraphs = [
                paragraph
                for text in textAll
                for paragraph in text.find_all('p')
            ]
        else:
            textAll = doc.find_all('div',{'class':"ssrcss-7uxr49-RichTextContainer e5tfeyi1"})
            paragraphs = [
                paragraph
                for text in textAll
                for paragraph in text.find_all('p', { "class":"ssrcss-1q0x1qg-Paragraph e1jhz7w10" })
            ]

        # Each paragraph is prefixed with one space (so non-empty text starts with a space);
        # join() avoids rebuilding the whole string for every paragraph.
        article_text = "".join(" " + paragraph.text for paragraph in paragraphs)
        article['text'] = article_text

        text_list.append(article)
        print('Article added: ', article['heading'])

# Articles for which no paragraph was found have text == "" and are dropped here.
text_list_clean = [article for article in text_list if article['text'] != ""]

print('No of articles found: ',len(text_list_clean))

Article added:  Crystal Palace 5-2 West Ham United: Michael Olise and Eberechi Eze star in brilliant Palace win
Article added:  Nottingham Forest question VAR official in Everton defeat and 'consider options'
Article added:  Arsenal Women 3-0 Leicester City Women: Gunners secure Champions League place
Article added:  Fulham 1-3 Liverpool: Trent Alexander-Arnold sets Reds on way to reigniting Premier League title bid
Article added:  Women's Football
Article added:  Huddersfield Town 0-4 Swansea City: Lowe, Ronald, Yates and Walsh on target
Article added:  Manchester City 5-0 West Ham: City cruise to victory to go top of Women's Super League
Article added:  Barcelona 0-1 Chelsea: Irene Paredes says Barca can 'turn it around' in Champions League semi-final second leg
Article added:  Premier League quiz: Can you name this current or former player?
Article added:  European Football
Article added:  Everton 2-0 Nottingham Forest: Hosts win Premier League relegation six-pointer
Article added: 

In [80]:
# Persist the non-empty articles as indented JSON; the file name is prefixed with today's date.
now = datetime.now().date()

with open(f'{now}_bbc_sport.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(text_list_clean, indent=4))

In [None]:
# Re-derive the articles whose text is non-empty from text_list.
text_list_clean = list(filter(lambda entry: entry['text'] != "", text_list))

In [69]:
# Corpus statistics over the articles that have text.
print(len(text_list_clean), len(text_list))

# Concatenate all article texts, each one preceded by a single space.
text = ''.join(' ' + article['text'] for article in text_list_clean)

# Whitespace-separated tokens
word_count = len(text.split())

# Total number of characters, separators included
character_count = len(text)

# Number of runs of sentence-ending punctuation (. ! ?) in the corpus
sentence_count = len(re.findall(r'[.!?]+', text))

print(f"Word Count: {word_count}")
print(f"Character Count: {character_count}")
print(f"Sentence Count: {sentence_count}")


386 15
Word Count: 248076
Character Count: 1426545
Sentence Count: 13176


## Other stuff

In [None]:
# BBC home page: `root_url` is the prefix used below to build section and article
# URLs, and `doc` is the parsed page that bbc_scraper() reads as a global.
# NOTE(review): root_url / response / doc rebind names that the sport-scraper
# cell above also assigns, so the result depends on cell execution order.
root_url = 'https://www.bbc.com'
response = requests.get(root_url)
doc = BeautifulSoup(response.text, 'html.parser')

def find_bbc_article(url):
    """Fetch one BBC article page and extract its body text, image URL and timestamp.

    Args:
        url: Absolute URL of the article page. It is printed before the request.

    Returns:
        tuple: ``(description, img_url, time)`` where

        * ``description`` is the text of every ``<p>`` inside the element with
          ``property="articleBody"``, joined with newlines (``''`` if there
          are no paragraphs, so the value is always a string);
        * ``img_url`` is the ``src`` of the first ``<img>`` with class
          ``js-image-replace``, or ``None`` when the page has no such image;
        * ``time`` is the ``data-seconds`` attribute of the first element with
          class ``date``.

    Raises:
        AttributeError: if the page has no ``articleBody`` element or no
            element with class ``date``.
        KeyError: if the ``date`` element has no ``data-seconds`` attribute.
    """
    print(url)
    # BBC article page
    response_article = requests.get(url)
    soup = BeautifulSoup(response_article.text, 'html.parser')

    # Find article description
    body = soup.find(property="articleBody")
    description = '\n'.join(p.text for p in body.find_all("p"))

    # Find article image. A missing image is reported but is not an error:
    # img_url must still be bound, otherwise the return statement below would
    # raise UnboundLocalError and the caller would discard the whole article.
    image = soup.find('img', {'class': "js-image-replace"})
    if image is None:
        print("No Image")
        img_url = None
    else:
        img_url = image.get('src')

    # Find article time
    time = soup.find(class_="date").attrs['data-seconds']

    return description, img_url, time

# All articles list
news_list = []

def bbc_scraper(catagory):
    """Append one record per article teaser of the currently loaded page to ``news_list``.

    Reads the module-level ``doc`` (parsed listing page) and ``root_url``
    (site prefix). For every ``div`` with ``data-testid="edinburgh-article"``
    the linked article is fetched with ``find_bbc_article`` and stored as a
    dict. Teasers without a usable link or headline, and articles whose page
    cannot be fetched or parsed, are reported with ``Url error`` and skipped
    instead of aborting the run.

    Args:
        catagory: Label stored in the ``"type"`` field of every record.

    Returns:
        None. Records are appended to the module-level ``news_list``.
    """
    # Find all news or articles
    newsAll = doc.find_all('div', { 'data-testid': "edinburgh-article" })

    # Traverse all news or articles
    for news in newsAll:
        headline = news.find('h3')
        article_path = news.find('a')

        # A teaser may have no <a> at all, or an <a> without href.
        href = article_path.get('href') if article_path is not None else None
        if not href:
            print("Url error")
            continue

        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the site root; plain concatenation would corrupt the former.
        article_url = urljoin(root_url, href)

        try:
            title = headline.text
            description, img_url, time = find_bbc_article(article_url)
        except Exception:
            # Best effort: one broken article must not stop the scrape.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            print("Url error")
            continue

        # Add the article to our list
        news_list.append({
            "id": href,
            "source": "BBC",
            "type": catagory,
            "author": "Null",
            "title": title,
            "description": description,
            "url": article_url,
            "image_url": img_url,
            "published_at": time,
            "updated_at": time
        })

# Section slugs to scrape from the BBC site.
catagories = ["sport","business","innovation"]#,"science_and_environment","entertainment_and_arts","health"]
# Section slugs whose articles are all stored under the single type "politics".
politics_catagories = ["world","asia","uk"]

for catagory in catagories:
    # bbc_scraper() reads the listing page from the global `doc`, so it is
    # rebound for every section before the call.
    response = requests.get(root_url +r"/"+ catagory)
    doc = BeautifulSoup(response.text, 'html.parser')

    # Map the politics sections onto one label without rebinding the loop variable.
    article_type = 'politics' if catagory in politics_catagories else catagory

    bbc_scraper(article_type)

print("Total news:",len(news_list))

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 output file.
with open('bbcNews.json', 'w', encoding='utf-8') as file:
    json.dump(news_list , file, ensure_ascii=False, indent=4)


