In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

prem_teams = [
    "afc-bournemouth", "arsenal", "aston-villa", "brentford", "brighton-and-hove-albion",
    "burnley", "chelsea", "crystal-palace", "everton", "fulham",
    "leeds-united", "liverpool", "manchester-city", "manchester-united",
    "newcastle-united", "nottingham-forest", "sunderland",
    "tottenham-hotspur", "west-ham-united", "wolverhampton-wanderers"
]
base = "https://www.bbc.com/sport/football/teams/{team_slug}?page={i}"

all_articles_data = []

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

for team_slug in prem_teams:
    print(f"\nStarting scrape for: **{team_slug}**")
    page_number = 1
    
    while True:
        team_url = base.format(team_slug=team_slug, i=page_number)
        
        try:
            response = requests.get(team_url, headers=headers)
            if response.status_code == 404:
                print(f"Page {page_number}: **404 Not Found**. Stopping for this team.")
                break
            response.raise_for_status() 
            html = response.content
        except requests.exceptions.HTTPError as e:
            print(f"Page {page_number}: **HTTP Error {response.status_code}**. Error: {e}")
            break
        except requests.exceptions.RequestException as e:
            print(f"Page {page_number}: **Connection Error**. Error: {e}")
            break

        soup = BeautifulSoup(html, 'html.parser')
        articles = soup.find_all('article')
        if not articles:
            print(f"Page {page_number}: No articles found. Stopping for this team.")
            break 
        
        print(f"Page {page_number}: Found **{len(articles)}** articles.")

        for a in articles:
            header_tag = a.find('header')
            header_text = header_tag.text.strip() if header_tag else 'No Header Found'
            
            paragraphs = a.find_all('p')
            article_text = '\n'.join([p.text.strip() for p in paragraphs])
            
            #Extract date
            date_tag = a.find('span', {'data-testid': 'accessible-timestamp'})
            article_date = date_tag.text.strip() if date_tag else 'No Date Found'
            
            article_data = {
                'Team': team_slug,
                'Page': page_number,
                'Header': header_text,
                'Article_Text': article_text,
                'Date': article_date
            }
            all_articles_data.append(article_data)

        page_number += 1
        time.sleep(2)

print("\nSaving Data")
df = pd.DataFrame(all_articles_data)
output_file = 'bbc_football_articles_full.csv'
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"Successfully scraped {len(df)} articles across all teams and saved to **{output_file}**.")


ðŸš€ Starting scrape for: **afc-bournemouth**
   âœ… Page 1: Found **20** articles.
   âœ… Page 2: Found **19** articles.
   âœ… Page 3: Found **18** articles.
   âœ… Page 4: Found **19** articles.
   âœ… Page 5: Found **21** articles.
   âœ… Page 6: Found **21** articles.
   âœ… Page 7: Found **17** articles.
   âœ… Page 8: Found **21** articles.
   âœ… Page 9: Found **19** articles.
   âœ… Page 10: Found **22** articles.
   âœ… Page 11: Found **20** articles.
   âœ… Page 12: Found **17** articles.
   âœ… Page 13: Found **17** articles.
   âœ… Page 14: Found **19** articles.
   âœ… Page 15: Found **23** articles.
   âœ… Page 16: Found **17** articles.
   âœ… Page 17: Found **16** articles.
   âœ… Page 18: Found **22** articles.
   âœ… Page 19: Found **22** articles.
   âœ… Page 20: Found **16** articles.
   âœ… Page 21: Found **15** articles.
   âœ… Page 22: Found **19** articles.
   âœ… Page 23: Found **18** articles.
   âœ… Page 24: Found **18** articles.
   âœ… Page 25: Found **15

In [5]:
from datetime import datetime

def parse_bbc_date(date_str):
    try:
        date_str = date_str.replace('published at ', '')
        current_year = datetime.now().year
        #Remove GMT
        date_str = date_str.replace('GMT', '').strip()
        #Append year if missing
        if str(current_year) not in date_str:
            date_str = f"{date_str} {current_year}"
        
        #Parse date
        dt = datetime.strptime(date_str, "%H:%M %d %B %Y")
        return dt.strftime("%d-%m-%Y %H:%M")
    except Exception as e:
        return date_str

df['Date'] = df['Date'].apply(parse_bbc_date)
df.head(10)

Unnamed: 0,Team,Page,Header,Article_Text,Date
0,afc-bournemouth,1,"Gossip: Liverpool, Man Utd & Spurs all in Seme...",Liverpool are leading the race to sign Bournem...,07:35 2025
1,afc-bournemouth,1,Â£65m 'far too low' - your views on Semenyo's r...,We asked for your views on Bournemouth winger ...,18-11-2025 15:01
2,afc-bournemouth,1,Bournemouth's best Premier League XI?published...,"Over the past week, we have been asking you to...",18-11-2025 12:32
3,afc-bournemouth,1,Bournemouth 'pretty powerless' if Semenyo rele...,BBC Radio Solent's Jordan Clark has said repor...,18-11-2025 10:44
4,afc-bournemouth,1,Gossip: Bournemouth unwilling to lose Semenyo ...,"Liverpool's hopes of signing Antoine Semenyo, ...",18-11-2025 07:59
5,afc-bournemouth,1,Your Bournemouth Premier League XIpublished at...,"We have, roughly, run the numbers and here is ...",17-11-2025 16:40
6,afc-bournemouth,1,'Know how to use the noise and scrutiny'publis...,Nicola PearsonBBC Sport journalist\nThe statis...,17-11-2025 12:56
7,afc-bournemouth,1,Gossip: Liverpool linked to Semenyo in a winte...,With Mohamed Salah due to play for Egypt at th...,17-11-2025 08:20
8,afc-bournemouth,1,Stadium or state of mind? Psychologist on home...,"Nicola PearsonBBC Sport journalist\n""Home adva...",16-11-2025 15:28
9,afc-bournemouth,1,'Defend is his middle name' - your Premier Lea...,We wanted your suggestions for Bournemouth's a...,15-11-2025 09:15


In [7]:
df.to_csv(output_file, index=False, encoding='utf-8')