In [3]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse
import pandas as pd

In [2]:
base_url = "https://www.bbc.com/"

In [3]:
def get_unique_anchor_hrefs(base_url):
    # Send a request to the base URL
    response = requests.get(base_url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve content: {response.status_code}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags with the specified structure
    anchor_tags = soup.find_all('a', {'data-testid': 'internal-link'})

    unique_hrefs = set()  # Use a set to store unique links

    for anchor in anchor_tags:
        if 'href' in anchor.attrs:
            href = anchor['href']
            
            # Handle full URLs and relative URLs
            full_url = urljoin(base_url, href)  # Join base_url with relative hrefs

            # Parse the full URL and normalize it
            parsed_url = urlparse(full_url)
            normalized_url = parsed_url._replace(path=parsed_url.path.replace('//', '/')).geturl()

            unique_hrefs.add(normalized_url)  # Add to set to avoid duplicates

    return unique_hrefs

In [4]:
def save_urls_to_file(links, filename):
    # Write the unique links to a file
    with open(filename, 'w') as file:
        for link in links:
            file.write(f"{link}\n")
    print(f"Saved {len(links)} unique urls to {filename}")

In [5]:
unique_hrefs = get_unique_anchor_hrefs(base_url)

# Save unique links to a file
save_urls_to_file(unique_hrefs, 'urls.txt')

Saved 82 unique urls to urls.txt


In [1]:
def scrape(unique_links_file):
    # Initialize lists to store the scraped data
    headlines = []
    descriptions = []
    subcategories = []
    categories = []
    publication_dates = []

    # Get current time for reference
    current_time = datetime.now()

    # Read the unique links from the file
    with open(unique_links_file, 'r') as file:
        links = file.readlines()
    
    # Loop through each link to scrape data
    for link in links:
        url = link.strip()  # Clean up the URL
        
        # Send a GET request to the webpage
        try:
            response = requests.get(url)
            
            # Check if the request was successful
            if response.status_code != 200:
                print(f"Failed to retrieve content from {url}: {response.status_code}")
                continue

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")

            # Find the first <h1> tag as the category for each unique URL page
            category_tag = soup.find('h1', {'data-testid': 'nested-navigation-page-title'})
            category = category_tag.text.strip() if category_tag else 'NA'

            # Find the main container holding all articles
            articles = soup.find_all('div', {'data-testid': 'card-text-wrapper'})

            # Loop through each article to extract the relevant data
            for article in articles:
                # Extract headline
                headline_tag = article.find('h2', {'data-testid': 'card-headline'})
                headline = headline_tag.text.strip() if headline_tag else 'NA'
                headlines.append(headline)

                # Extract description
                description_tag = article.find('p', {'data-testid': 'card-description'})
                description = description_tag.text.strip() if description_tag else 'NA'
                descriptions.append(description)

                # Extract subcategory
                subcategory_tag = article.find('span', {'data-testid': 'card-metadata-tag'})
                subcategory = subcategory_tag.text.strip() if subcategory_tag else 'NA'
                subcategories.append(subcategory)

                # Extract publication date
                date_tag = article.find('span', {'data-testid': 'card-metadata-lastupdated'})
                relative_time = date_tag.text.strip() if date_tag else 'NA'
                
                # Convert relative time to timestamp
                if relative_time != 'NA':
                    try:
                        # Split the relative time string
                        time_parts = relative_time.split()
                        if len(time_parts) == 3:
                            num = int(time_parts[0])  # the number of hours/days
                            unit = time_parts[1]  # the unit (hrs/days)
                            if 'hr' in unit:
                                publication_time = current_time - timedelta(hours=num)
                            elif 'day' in unit:
                                publication_time = current_time - timedelta(days=num)
                            else:
                                publication_time = current_time  # fallback
                            publication_date = publication_time.strftime("%Y-%m-%d %H:%M:%S")
                        else:
                            publication_date = 'NA'  # in case the format is unexpected
                    except ValueError:
                        publication_date = 'NA'  # in case of conversion issues
                else:
                    publication_date = 'NA'
                
                publication_dates.append(publication_date)

                # Append the category extracted from the page to the corresponding article
                categories.append(category)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            continue

    # Create a DataFrame using the scraped data
    data = {
        "Headline": headlines,
        "Context": descriptions,
        "Category": categories,
        "Subcategory": subcategories,
        "Publication Date": publication_dates
    }

    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv("data.csv", index=False)

    print("Data successfully scraped and saved to data.csv")

In [7]:
unique_links_file = "urls.txt"
scrape(unique_links_file)

Data successfully scraped and saved to ata.csv


### Data Analysis

In [62]:
import matplotlib.pyplot as plt
import seaborn as sns

In [63]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Headline,Context,Category,Subcategory,Publication Date
0,Global trade up despite growing conflicts — 10...,The World Trade Organisation's chief economist...,,Opening Bell,2024-10-10 22:11:09
1,The threat of solar storms explained,The US has issued a solar storm watch. So what...,,Science & Health,2024-10-10 15:11:09
2,How to reset your brain with your breath,With each inhalation and exhalation we have th...,,Health Decoded,2024-10-10 09:11:09
3,Renewable energy is growing due to low prices ...,IEA head says world is adding as much renewabl...,,Opening Bell,2024-10-09 22:11:09
4,Uber boss: Chinese EVs good for environment - ...,Uber CEO Dara Khosrowshahi says electric cars ...,,Opening Bell,2024-10-08 22:11:09


In [64]:
df.describe

<bound method NDFrame.describe of                                                Headline  \
0     Global trade up despite growing conflicts — 10...   
1                  The threat of solar storms explained   
2              How to reset your brain with your breath   
3     Renewable energy is growing due to low prices ...   
4     Uber boss: Chinese EVs good for environment - ...   
...                                                 ...   
1579  Colin Farrell describes how he transformed int...   
1580  Vienna Philharmonic: See the magic of the grea...   
1581          Reviving Madeira's stunning walking paths   
1582  Earth tides: Why our planet's crust has tides too   
1583             Is this the future of electric racing?   

                                                Context Category  \
0     The World Trade Organisation's chief economist...      NaN   
1     The US has issued a solar storm watch. So what...      NaN   
2     With each inhalation and exhalation we have th.

In [65]:
df.shape

(1584, 5)

In [66]:
df.drop_duplicates(inplace=True)

In [67]:
df.shape

(1047, 5)

In [68]:
df.dtypes

Headline            object
Context             object
Category            object
Subcategory         object
Publication Date    object
dtype: object

In [69]:
df['Publication Date'] = pd.to_datetime(df['Publication Date'])

In [70]:
df.dtypes

Headline                    object
Context                     object
Category                    object
Subcategory                 object
Publication Date    datetime64[ns]
dtype: object

In [71]:
df.isna().sum()

Headline              0
Context             185
Category            588
Subcategory          74
Publication Date     35
dtype: int64

In [72]:
df.drop(columns=['Category'], inplace=True)

In [73]:
df['Headline'].fillna('Not Available', inplace=True)
df['Context'].fillna('Not Available', inplace=True)
df['Subcategory'].fillna('Unknown', inplace=True)
df['Publication Date'].fillna(pd.Timestamp('2024-01-01'), inplace=True)

In [74]:
df.isna().sum()

Headline            0
Context             0
Subcategory         0
Publication Date    0
dtype: int64

In [75]:
df.head()

Unnamed: 0,Headline,Context,Subcategory,Publication Date
0,Global trade up despite growing conflicts — 10...,The World Trade Organisation's chief economist...,Opening Bell,2024-10-10 22:11:09
1,The threat of solar storms explained,The US has issued a solar storm watch. So what...,Science & Health,2024-10-10 15:11:09
2,How to reset your brain with your breath,With each inhalation and exhalation we have th...,Health Decoded,2024-10-10 09:11:09
3,Renewable energy is growing due to low prices ...,IEA head says world is adding as much renewabl...,Opening Bell,2024-10-09 22:11:09
4,Uber boss: Chinese EVs good for environment - ...,Uber CEO Dara Khosrowshahi says electric cars ...,Opening Bell,2024-10-08 22:11:09


In [76]:
df.shape

(1047, 4)

In [80]:
df.to_csv("cleaned_data.csv")