In [None]:
import requests
import csv
import time
from datetime import datetime
import os


In [None]:
def fetch_news_articles(api_key, start_date, end_date, news_source, language="en", page_size=100, max_pages=10,
                        timeout=30, max_rate_limit_retries=5):
    """
    Fetch news articles from WorldNewsApi with offset-based pagination.

    Pages are requested newest-first until one of the following happens:
    `max_pages` pages have been retrieved, a page comes back empty, the
    offset reaches the `available` count reported in the response, or an
    error occurs. On error the articles collected so far are returned
    instead of raising.

    Args:
        api_key (str): Your WorldNewsApi API key
        start_date (str): Earliest publish date in format YYYY-MM-DD
        end_date (str): Latest publish date in format YYYY-MM-DD
        news_source (str): URL of the news source to restrict the search to;
            pass None or an empty string to search all sources
        language (str): Language code (default: 'en')
        page_size (int): Number of results per page (default: 100, max allowed)
        max_pages (int): Maximum number of pages to fetch
        timeout (float): Seconds to wait for each HTTP request (default: 30)
        max_rate_limit_retries (int): Maximum consecutive HTTP 429 responses
            to wait out before giving up (default: 5)

    Returns:
        list: List of news article dictionaries (possibly partial or empty)
    """
    base_url = "https://api.worldnewsapi.com/search-news"

    # Fallback wait (seconds) when Retry-After is absent or not an integer
    default_retry_wait = 60

    headers = {
        'x-api-key': api_key
    }

    all_articles = []
    current_offset = 0
    pages_fetched = 0
    consecutive_rate_limits = 0

    # A while loop (rather than range(max_pages)) so that rate-limited
    # attempts do not use up the page budget.
    while pages_fetched < max_pages:
        # Build the query parameters
        params = {
            'language': language,
            'earliest-publish-date': start_date,
            'latest-publish-date': end_date,
            'number': page_size,
            'offset': current_offset,
            'sort': 'publish-time',
            'sort-direction': 'DESC'
        }
        # Only filter by source when one was actually supplied
        if news_source:
            params['news-sources'] = news_source

        print(f"Fetching page {pages_fetched + 1} with offset {current_offset}...")

        try:
            # Without a timeout a stalled connection would block forever
            response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"Error fetching data: {str(e)}")
            break

        # Handle rate limiting: wait, then retry the same offset
        if response.status_code == 429:
            consecutive_rate_limits += 1
            if consecutive_rate_limits > max_rate_limit_retries:
                print(f"Rate limit still in effect after {max_rate_limit_retries} retries. Giving up.")
                break
            try:
                wait_time = max(0, int(response.headers.get('Retry-After', default_retry_wait)))
            except (TypeError, ValueError):
                # Retry-After may also be an HTTP date; fall back to the default wait
                wait_time = default_retry_wait
            print(f"Rate limit reached. Waiting for {wait_time} seconds...")
            time.sleep(wait_time)
            continue
        consecutive_rate_limits = 0

        # Handle other errors
        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        try:
            data = response.json()
        except ValueError as e:
            # Body was not valid JSON
            print(f"Error fetching data: {str(e)}")
            break
        if not isinstance(data, dict):
            print("Error fetching data: unexpected response format")
            break

        articles = data.get('news') or []

        # If no more articles, break the loop
        if not articles:
            print("No more articles found.")
            break

        all_articles.extend(articles)
        pages_fetched += 1
        print(f"Retrieved {len(articles)} articles. Total so far: {len(all_articles)}")

        # Advance by what was actually returned so a short page never skips results
        current_offset += len(articles)

        # Stop at the reported total; a missing 'available' field must not end pagination
        total_news = data.get('available')
        if isinstance(total_news, int) and current_offset >= total_news:
            print(f"Reached the end of available news ({total_news} total).")
            break

        # Short delay to avoid hammering the API (skipped after the final page)
        if pages_fetched < max_pages:
            time.sleep(1)

    return all_articles


In [None]:
def save_to_csv(articles, filename):
    """
    Write a list of article dictionaries to a CSV file.

    The header row is the alphabetically sorted union of every key found
    across all articles. If `articles` is empty, a message is printed and
    no file is written.

    Args:
        articles (list): List of article dictionaries
        filename (str): Output CSV filename
    """
    if not articles:
        print("No articles to save.")
        return

    # Union of keys over every record, sorted for a stable column order
    columns = sorted({key for record in articles for key in record.keys()})

    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in articles:
            csv_writer.writerow(record)

    print(f"Saved {len(articles)} articles to {filename}")


In [None]:
def main():
    """
    Fetch BBC news articles for a fixed date range and save them to a CSV file.

    The WorldNewsApi key is read from the WORLDNEWSAPI_KEY environment
    variable so the secret is never stored in the notebook itself.

    Raises:
        RuntimeError: If WORLDNEWSAPI_KEY is not set or is empty.
    """
    # API key: taken from the environment, never hard-coded in the notebook
    api_key = os.environ.get("WORLDNEWSAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "WORLDNEWSAPI_KEY environment variable is not set; "
            "export your WorldNewsApi key before running."
        )

    # Date range (March 5, 2025 to April 1, 2025)
    start_date = "2025-03-05"
    end_date = "2025-04-01"

    # News source
    news_source = "https://www.bbc.co.uk"

    # Language
    language = "en"

    # Number of results per page (maximum allowed is 100)
    page_size = 100

    # Maximum number of pages to fetch (adjust based on your API limits).
    # Each page is one request returning up to page_size articles, so
    # 100 pages of 100 results allows up to 10,000 articles.
    max_pages = 100

    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"bbc_news_{start_date}_to_{end_date}_{timestamp}.csv"

    print(f"Fetching BBC news articles from {start_date} to {end_date}...")
    articles = fetch_news_articles(api_key, start_date, end_date, news_source,
                                   language, page_size, max_pages)

    print(f"Total articles retrieved: {len(articles)}")
    save_to_csv(articles, filename)

    print("Done!")

if __name__ == "__main__":
    main()

Fetching BBC news articles from 2025-03-05 to 2025-04-01...
Fetching page 1 with offset 0...
Retrieved 100 articles. Total so far: 100
Fetching page 2 with offset 100...
Retrieved 100 articles. Total so far: 200
Fetching page 3 with offset 200...
Retrieved 100 articles. Total so far: 300
Fetching page 4 with offset 300...
Retrieved 100 articles. Total so far: 400
Fetching page 5 with offset 400...
Retrieved 100 articles. Total so far: 500
Fetching page 6 with offset 500...
Retrieved 100 articles. Total so far: 600
Fetching page 7 with offset 600...
Retrieved 100 articles. Total so far: 700
Fetching page 8 with offset 700...
Retrieved 100 articles. Total so far: 800
Fetching page 9 with offset 800...
Retrieved 100 articles. Total so far: 900
Fetching page 10 with offset 900...
Retrieved 100 articles. Total so far: 1000
Fetching page 11 with offset 1000...
Retrieved 100 articles. Total so far: 1100
Fetching page 12 with offset 1100...
Retrieved 100 articles. Total so far: 1200
Fetching p