In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import pandas as pd
import time
from webdriver_manager.chrome import ChromeDriverManager

# Setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.bbc.com/news/topics/cjxv13v27dyt"
driver.get(url)
time.sleep(2)

all_articles = []

def extract_articles(soup):
    articles = soup.find_all("div", {"data-testid": "liverpool-card"})
    results = []
    for article in articles:
        try:
            link_tag = article.find("a", {"data-testid": "internal-link"})
            href = link_tag['href']
            full_link = "https://www.bbc.com" + href

            title = article.find("h2", {"data-testid": "card-headline"}).get_text(strip=True)

            region_tag = article.find("span", {"data-testid": "card-metadata-tag"})
            region = region_tag.get_text(strip=True) if region_tag else None

            date_tag = article.find("span", {"data-testid": "card-metadata-lastupdated"})
            date = date_tag.get_text(strip=True) if date_tag else None

            results.append({
                "title": title,
                "link": full_link,
                "region": region,
                "date": date
            })
        except Exception as e:
            print("Error parsing article:", e)
    return results

In [2]:
# Pagination loop
page = 1
while True:
    print(f"Scraping page {page}...")
    soup = BeautifulSoup(driver.page_source, "html.parser")
    all_articles.extend(extract_articles(soup))

    try:
        next_button = driver.find_element("xpath", "//button[@data-testid='pagination-next-button']")
        # Check if it's disabled
        if next_button.get_attribute("disabled"):
            print("Reached last page.")
            break
        next_button.click()
        time.sleep(2)
        page += 1
    except (NoSuchElementException, ElementClickInterceptedException):
        print("No more pages or click failed.")
        break

driver.quit()

# Create DataFrame
df = pd.DataFrame(all_articles).drop_duplicates(subset="link")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Reached last page.


In [3]:
df.shape

(276, 4)

In [4]:
df

Unnamed: 0,title,link,region,date
0,'Google AI presented my April Fools' story as ...,https://www.bbc.com/news/articles/cly12egqq5ko,Wales,3 Apr 2025
1,Woman sentenced in case that sparked Springfie...,https://www.bbc.com/news/articles/cy890gpqw1po,US & Canada,3 Dec 2024
2,The Onion buys Alex Jones's Infowars at auction,https://www.bbc.com/news/articles/c30p1p0j0ddo,US & Canada,14 Nov 2024
3,How US election fraud claims changed as Trump won,https://www.bbc.com/news/articles/cy9j8r8gg0do,US & Canada,8 Nov 2024
4,Whirlwind of misinformation sows distrust ahea...,https://www.bbc.com/news/articles/czj7eex29r3o,Technology,3 Nov 2024
...,...,...,...,...
271,What claims do you want BBC Reality Check to i...,https://www.bbc.com/news/uk-41928747,UK,17 Jan 2020
272,Russia bans 'disrespect' of government,https://www.bbc.com/news/world-europe-47488267,Europe,7 Mar 2019
273,QAnon: What's the truth behind a pro-Trump con...,https://www.bbc.com/news/blogs-trending-45040614,BBC Trending,2 Aug 2018
274,"BBC game challenges young people to spot ""fake...",https://www.bbc.com/news/school-report-43391188,Family & Education,14 Mar 2018


In [5]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # for progress bar

def extract_article_text(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        
        text_blocks = soup.find_all("div", {"data-component": "text-block"})
        paragraphs = []
        for block in text_blocks:
            for p in block.find_all("p"):
                paragraphs.append(p.get_text(strip=True))
        
        return " ".join(paragraphs)
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None

In [6]:
extract_article_text('https://www.bbc.com/news/articles/cly12egqq5ko')

'Every year, journalist Ben Black publishes a playful fake story on his community news site Cwmbran Life for April Fools\' Day. Since 2018 the 48-year-old has spun yarns including a Hollywood-style sign on a mountain to a nudist cold-water swimming club at a lake. In 2020, Mr Black published afake story claiming Cwmbran had been recognised by Guinness World Records for having the most roundabouts per square kilometre. Despite altering the wording of his article that afternoon, when he searched for it on 1 April he said he was "shocked" and "worried" to find the false information being used by Google\'s AI tool and presented as real information. Google said it was looking into the matter. Mr Black decided to begin writing fake stories for April Fools\' Day for "a bit of fun" and said his wife usually helped him come up with the ideas. The concept for his story in 2020 came from Cwmbran being a new town, where "often linking houses with roundabouts is the easiest way to build". "I made u

In [7]:
extract_article_text('https://www.bbc.com/news/articles/cy9j8r8gg0do')

'In the build-up to Tuesday’s US election, claims of voter fraud flooded social media - but as Donald Trump’s victory crystallised, the chatter largely subsided. The claims didn’t stop entirely, however. A number of right-wing influencers and organisations pushing stories about “cheating” and a “rigged” vote pointed to incomplete vote totals and continued to repeat discredited theories about the 2020 election. And disappointed Democratic Party supporters developed their own unsubstantiated voter fraud theories, some of which went viral on X, formerly Twitter, and other platforms. The reach of the posts is nowhere near the deluge of content that circulated after Trump lost the 2020 election. And with no support from losing candidate Kamala Harris or other Democratic Party officials, the chances seem slim of a large-scale movement developing along the lines of the “Stop the Steal” drive four years ago, which culminated in a riot at the US Capitol. The BBCtracked a huge wave of pre-electi

In [8]:
# Complete the dataframe by adding full article content
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import json

In [9]:
def extract_article_text(url):
    """Extract full article text from BBC article URL"""
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Try multiple selectors for BBC article content
        text_blocks = soup.find_all("div", {"data-component": "text-block"})
        paragraphs = []
        
        if text_blocks:
            for block in text_blocks:
                for p in block.find_all("p"):
                    text = p.get_text(strip=True)
                    if text:
                        paragraphs.append(text)
        else:
            # Fallback to other common BBC selectors
            article_body = soup.find("div", {"class": "story-body"}) or soup.find("main")
            if article_body:
                for p in article_body.find_all("p"):
                    text = p.get_text(strip=True)
                    if text and len(text) > 20:  # Filter out very short paragraphs
                        paragraphs.append(text)
        
        return " ".join(paragraphs) if paragraphs else None
        
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None


In [12]:
def create_complete_dataset(df):
    """Add full article content to the existing dataframe"""
    print(f"Processing {len(df)} articles...")
    
    # Add content column
    df_complete = df.copy()
    df_complete['content'] = None
    df_complete['content_length'] = 0
    df_complete['scraping_status'] = 'pending'
    
    successful_scrapes = 0
    failed_scrapes = 0
    
    for idx, row in tqdm(df_complete.iterrows(), total=len(df_complete), desc="Scraping articles"):
        try:
            content = extract_article_text(row['link'])
            
            if content and len(content.strip()) > 100:  # Ensure meaningful content
                df_complete.at[idx, 'content'] = content
                df_complete.at[idx, 'content_length'] = len(content)
                df_complete.at[idx, 'scraping_status'] = 'success'
                successful_scrapes += 1
            else:
                df_complete.at[idx, 'scraping_status'] = 'failed_no_content'
                failed_scrapes += 1
            
            # Be respectful to the server
            time.sleep(0.5)
            
        except Exception as e:
            df_complete.at[idx, 'scraping_status'] = f'failed_error: {str(e)[:50]}'
            failed_scrapes += 1
            print(f"Error processing {row['link']}: {e}")
    
    print(f"\nScraping completed:")
    print(f"Successful: {successful_scrapes}")
    print(f"Failed: {failed_scrapes}")
    print(f"Success rate: {successful_scrapes/(successful_scrapes+failed_scrapes)*100:.1f}%")
    
    # Filter to only successful scrapes
    df_successful = df_complete[df_complete['scraping_status'] == 'success'].copy()
    print(f"Final dataset: {len(df_successful)} articles with content")
    
    return df_successful

# Load your existing dataframe (assuming it's saved or recreate from your notebook)
# df = pd.read_csv('bbc_articles.csv')  # if you saved it
# OR recreate it from your notebook data

# For now, I'll assume you have your df from the notebook
# Run this after loading your df:
df_with_content = create_complete_dataset(df)

# Save the complete dataset
df_with_content.to_csv('bbc_articles_complete.csv', index=False)
df_with_content.to_json('bbc_articles_complete.json', orient='records', indent=2)

print("Complete dataset saved to 'bbc_articles_complete.csv' and 'bbc_articles_complete.json'")

# Display statistics
print(f"\nDataset Statistics:")
print(f"Total articles: {len(df_with_content)}")
print(f"Average content length: {df_with_content['content_length'].mean():.0f} characters")
print(f"Date range: {df_with_content['date'].min()} to {df_with_content['date'].max()}")
print(f"Regions covered: {df_with_content['region'].nunique()}")

# Show sample
print(f"\nSample article:")
sample = df_with_content.iloc[0]
print(f"Title: {sample['title']}")
print(f"Region: {sample['region']}")
print(f"Date: {sample['date']}")
print(f"Content preview: {sample['content'][:200]}...")

Processing 276 articles...


Scraping articles: 100%|██████████| 276/276 [04:37<00:00,  1.00s/it]


Scraping completed:
Successful: 276
Failed: 0
Success rate: 100.0%
Final dataset: 276 articles with content
Complete dataset saved to 'bbc_articles_complete.csv' and 'bbc_articles_complete.json'

Dataset Statistics:
Total articles: 276
Average content length: 5266 characters
Date range: 1 Feb 2021 to 9 Sep 2024
Regions covered: 35

Sample article:
Title: 'Google AI presented my April Fools' story as real news'
Region: Wales
Date: 3 Apr 2025
Content preview: Every year, journalist Ben Black publishes a playful fake story on his community news site Cwmbran Life for April Fools' Day. Since 2018 the 48-year-old has spun yarns including a Hollywood-style sign...



