In [1]:
import requests
from bs4 import BeautifulSoup
import time

# Define the sitemap index URL
sitemap_index_url = "https://businessday.ng/sitemap_index.xml"

# Use a header to mimic a real browser (helps avoid blocking)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# requests applies no timeout by default, so a stalled connection would hang
# this cell indefinitely. Values are (connect, read) seconds.
sitemap_index_timeout = (10, 30)

# Download the sitemap index. On any request failure (including a timeout,
# which is a RequestException subclass) fall back to an empty string so the
# next cell can detect that there is nothing to parse.
try:
    response = requests.get(
        sitemap_index_url,
        headers=headers,
        timeout=sitemap_index_timeout,
    )
    response.raise_for_status()  # Raise an error if the request failed
    sitemap_index_xml = response.text
    print("Sitemap index fetched successfully!")
except requests.RequestException as e:
    print(f"Failed to fetch sitemap index: {e}")
    sitemap_index_xml = ""

Sitemap index fetched successfully!


In [2]:
# Build the list of article ("post") sitemap URLs from the sitemap index.
# Only URLs containing "post-sitemap" or "posts-sitemap" are kept, which leaves
# out the sitemaps for categories, tags, authors, etc.
article_sitemaps = []
if sitemap_index_xml:
    soup_index = BeautifulSoup(sitemap_index_xml, features="xml")
    # Strip surrounding whitespace/newlines from each <loc> (as the per-sitemap
    # cell does for article URLs) and drop entries that end up empty.
    sitemap_locs = [
        loc_text
        for loc_text in (loc.get_text(strip=True) for loc in soup_index.find_all("loc"))
        if loc_text
    ]
    article_sitemaps = [
        loc_url
        for loc_url in sitemap_locs
        if "post-sitemap" in loc_url or "posts-sitemap" in loc_url
    ]
    print(f"Found {len(article_sitemaps)} article sitemap(s).")
else:
    print("No sitemap index XML to parse.")

Found 250 article sitemap(s).


In [3]:
# Walk every article sitemap and collect the URLs whose <lastmod> year falls in
# 2021–2025. Note that <lastmod> is the last-modified date reported by the
# sitemap, which is used here as a stand-in for the article date.
article_urls = []  # article URLs (strings) that passed the year filter
seen_urls = set()  # guards against the same URL being listed more than once

# requests applies no timeout by default; without one a single stalled sitemap
# download would block the whole loop. Values are (connect, read) seconds.
sitemap_timeout = (10, 60)

for sitemap_url in article_sitemaps:
    try:
        resp = requests.get(sitemap_url, headers=headers, timeout=sitemap_timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Warning: Could not retrieve {sitemap_url} ({e}) – skipping.")
        time.sleep(1)
        continue

    soup_sitemap = BeautifulSoup(resp.text, "xml")
    for entry in soup_sitemap.find_all("url"):
        loc_tag = entry.find("loc")
        if loc_tag is None:
            continue
        url = loc_tag.text.strip()
        if not url or url in seen_urls:
            continue

        # Take the year from the first four characters of <lastmod>,
        # e.g. "2022-05-10T14:30:00+00:00" -> 2022. Entries without a usable
        # <lastmod> are skipped (the sitemap is assumed to provide it).
        year = None
        lastmod_tag = entry.find("lastmod")
        if lastmod_tag is not None:
            year_str = lastmod_tag.text.strip()[:4]
            if len(year_str) == 4 and year_str.isdigit():
                year = int(year_str)

        if year and 2021 <= year <= 2025:
            seen_urls.add(url)
            article_urls.append(url)

    # Pause between sitemap requests
    time.sleep(1)

print(f"Total articles found for 2021–2025: {len(article_urls)}")

Total articles found for 2021–2025: 113225


In [None]:
from datetime import datetime
import csv
import requests
from bs4 import BeautifulSoup
import time

# Scrape every URL in `article_urls` and append the results to a CSV file.
# Requires `article_urls` and `headers` to be defined by the earlier cells.

output_filename = "businessday_articles_2021_2025.csv"
fieldnames = ["Title", "Author", "Date", "Content"]

# requests applies no timeout by default; without one a single stalled article
# download would block the whole run. Values are (connect, read) seconds.
article_timeout = (10, 30)
flush_every = 100  # write buffered rows to disk after this many URLs
byline_separators = "-–| "  # punctuation that surrounds the date in a byline


def _clean_byline_date(byline_text):
    """Trim leading/trailing separators and the whole word "by" from a byline.

    The word is removed as a token rather than via ``str.strip("byBy")``,
    because ``strip`` treats its argument as a set of characters and would
    also eat a trailing "y"/"b" from the date itself (e.g. "10 May" -> "10 Ma").
    """
    tokens = byline_text.split()

    def is_noise(token):
        return token.lower() == "by" or not token.strip(byline_separators)

    while tokens and is_noise(tokens[0]):
        tokens.pop(0)
    while tokens and is_noise(tokens[-1]):
        tokens.pop()
    return " ".join(tokens).strip(byline_separators)


def _parse_article(page_html):
    """Extract one CSV row (dict keyed by `fieldnames`) from an article page.

    Returns None when the page has no title or no body text.
    """
    soup_page = BeautifulSoup(page_html, "html.parser")

    # Title: first <h1> on the page
    title_tag = soup_page.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Author: text of the first link pointing at an /author/ page
    author = ""
    author_link = soup_page.find('a', href=lambda href: href and "/author/" in href)
    if author_link:
        author = author_link.get_text(strip=True)

    # Date: text of the first <time> tag; otherwise whatever remains of the
    # author link's parent element once the author name is removed
    date_text = ""
    time_tag = soup_page.find('time')
    if time_tag:
        date_text = time_tag.get_text(strip=True)
    elif author_link:
        parent = author_link.find_parent()
        if parent:
            combined = parent.get_text(" ", strip=True)
            date_text = _clean_byline_date(combined.replace(author, ""))

    # Content: the entry-content <div>, falling back to the <article> element
    content_text = ""
    content_container = soup_page.find('div', class_='entry-content')
    if content_container:
        content_text = content_container.get_text(" ", strip=True)
    else:
        article_tag = soup_page.find('article')
        if article_tag:
            content_text = article_tag.get_text(" ", strip=True)
    content_text = content_text.replace("\n", " ")

    if not (title and content_text):
        return None
    return {
        "Title": title,
        "Author": author,
        "Date": date_text,
        "Content": content_text,
    }


def _append_rows(rows):
    """Append the given row dicts to the output CSV (header already written)."""
    with open(output_filename, mode="a", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(rows)


# Create the CSV file and write the header ONCE before scraping.
# NOTE: mode "w" truncates the file, so re-running this cell discards any
# previously scraped rows.
with open(output_filename, mode="w", encoding="utf-8", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

articles_data = []  # rows buffered in memory until the next flush

try:
    for idx, url in enumerate(article_urls):
        try:
            res = requests.get(url, headers=headers, timeout=article_timeout)
            res.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching article {url} ({e}) – skipping.")
        else:
            article = _parse_article(res.text)
            if article is not None:
                articles_data.append(article)

        # Flush on every `flush_every`-th URL, including ones that failed to
        # download, so a fetch error can never skip a scheduled save.
        if (idx + 1) % flush_every == 0:
            _append_rows(articles_data)
            print(f"Saved {len(articles_data)} articles at index {idx+1}...")
            articles_data = []  # Clear buffer after saving

        # Sleep to avoid overloading server
        time.sleep(1)
finally:
    # Write whatever is still buffered: the final partial batch on a normal
    # finish, or the in-progress batch if the run is interrupted or crashes.
    if articles_data:
        _append_rows(articles_data)
        print(f"Saved {len(articles_data)} articles at index {idx+1}...")
        articles_data = []

print("Scraping completed.")

Saved 100 articles at index 100...
Saved 100 articles at index 200...
Saved 100 articles at index 300...
Saved 100 articles at index 400...
Error fetching article https://businessday.ng/events/article/ez37-solutions-to-host-a-free-webinar-on-becoming-a-certified-coach/ (404 Client Error: Not Found for url: https://businessday.ng/events/article/ez37-solutions-to-host-a-free-webinar-on-becoming-a-certified-coach/) – skipping.
Saved 99 articles at index 500...
Saved 100 articles at index 600...
Error fetching article https://businessday.ng/events/article/businessday-joins-netplusdotcom-to-offer-free-digital-transformation-webinar-series-for-local-smes-in-2021/ (404 Client Error: Not Found for url: https://businessday.ng/events/article/businessday-joins-netplusdotcom-to-offer-free-digital-transformation-webinar-series-for-local-smes-in-2021/) – skipping.
Saved 99 articles at index 700...
Saved 100 articles at index 800...
Saved 100 articles at index 900...
Saved 100 articles at index 1000.

In [None]:
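# NOTE: Legacy one-shot CSV export, kept commented out for reference only.
# Do not re-enable as-is: the scraping cell above already writes the header row
# and clears `articles_data` after each flush, so this would append a duplicate
# header row and, after a completed run, no article rows.
# import csv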

# output_filename = "businessday_articles_2021_2025.csv"
# fieldnames = ["Title", "Author", "Date", "Content"]

# try:
#     with open(output_filename, mode="a", encoding="utf-8", newline="") as csvfile:
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#         writer.writeheader()
#         for article in articles_data:
#             writer.writerow(article)
#     print(f"CSV file '{output_filename}' written with {len(articles_data)} articles.")
# except Exception as e:
#     print(f"Error writing to CSV: {e}")