In [None]:
import requests
import csv
import time
from datetime import datetime, timedelta
import os


In [None]:
def date_range_chunks(start_date_str, end_date_str, num_chunks=10):
    """
    Split a date range into consecutive, non-overlapping chunks.

    Chunk boundaries are computed with whole-day integer arithmetic, so each
    chunk starts on the day after the previous chunk ends. No calendar day
    is covered twice, even when the range does not divide evenly.

    Args:
        start_date_str (str): Start date in format 'YYYY-MM-DD'
        end_date_str (str): End date in format 'YYYY-MM-DD'
        num_chunks (int): Number of chunks to create. It is capped at the
            number of days between the two dates.

    Returns:
        list: List of date range tuples (start_date, end_date) as strings.
            When the effective chunk count is 1 or less (including an empty
            or reversed range), the input range is returned unchanged as a
            single tuple.

    Raises:
        ValueError: If either date string does not match 'YYYY-MM-DD'.
    """
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

    # Calculate the total number of days
    total_days = (end_date - start_date).days

    # Adjust num_chunks if it's more than the total days, which guarantees
    # that every chunk below spans at least one day.
    num_chunks = min(num_chunks, total_days)

    if num_chunks <= 1:
        return [(start_date_str, end_date_str)]

    chunks = []
    for i in range(num_chunks):
        # Integer floor division keeps boundaries on whole days. Fractional
        # day offsets would be truncated by strftime and make the end of one
        # chunk land on the same date as the start of the next.
        start_offset = (i * total_days) // num_chunks
        chunk_start = start_date + timedelta(days=start_offset)

        # If it's the last chunk, use the exact end date
        if i == num_chunks - 1:
            chunk_end = end_date
        else:
            next_offset = ((i + 1) * total_days) // num_chunks
            # End the day before the next chunk begins.
            chunk_end = start_date + timedelta(days=next_offset - 1)

        chunks.append((chunk_start.strftime("%Y-%m-%d"), chunk_end.strftime("%Y-%m-%d")))

    return chunks

def _retry_after_seconds(response, default=60):
    """Return the non-negative wait (in seconds) from a response's Retry-After header, or ``default``."""
    try:
        wait_time = int(response.headers.get('Retry-After', default))
    except (TypeError, ValueError):
        # Retry-After may also be an HTTP-date; fall back instead of failing.
        return default
    return max(wait_time, 0)


def fetch_news_articles(api_key, start_date, end_date, news_sources, language="en", page_size=100,
                       max_articles_per_source_chunk=25, date_chunks_count=5,
                       request_timeout=30, max_rate_limit_retries=5):
    """
    Fetch news articles from WorldNewsApi with date range chunking and multiple news sources.

    The date range is split with ``date_range_chunks``. For every
    (date chunk, news source) pair the search endpoint is paged through until
    the per-pair cap is reached, the API returns no more articles, or an
    error occurs. Errors are reported with ``print`` and only abort the
    current pair; the remaining pairs are still processed.

    Args:
        api_key (str): Your WorldNewsApi API key
        start_date (str): Earliest publish date in format YYYY-MM-DD
        end_date (str): Latest publish date in format YYYY-MM-DD
        news_sources (list): List of news source URLs
        language (str): Language code (default: 'en')
        page_size (int): Number of results per page (default: 100, max allowed)
        max_articles_per_source_chunk (int): Maximum articles to fetch per source per date chunk
        date_chunks_count (int): Number of date chunks to divide the time period into
        request_timeout (float): Seconds to wait for each HTTP request before
            giving up on the current source/chunk (default: 30)
        max_rate_limit_retries (int): Maximum consecutive HTTP 429 responses
            to wait out before skipping the current source/chunk (default: 5)

    Returns:
        list: List of news article dictionaries. Each dictionary gets an
            extra 'fetched_from' key holding the news source it was requested for.
    """
    base_url = "https://api.worldnewsapi.com/search-news"

    all_articles = []

    # Calculate date range chunks
    date_chunks = date_range_chunks(start_date, end_date, num_chunks=date_chunks_count)

    headers = {
        'x-api-key': api_key
    }

    # Keep track of API calls for reporting
    total_api_calls = 0

    # Cycle through each date chunk and each news source
    for idx, (chunk_start, chunk_end) in enumerate(date_chunks):
        print(f"\nProcessing date chunk {idx+1}/{len(date_chunks)}: {chunk_start} to {chunk_end}")

        for source_idx, news_source in enumerate(news_sources):
            print(f"  News source {source_idx+1}/{len(news_sources)}: {news_source}")

            current_offset = 0
            chunk_source_articles = []
            rate_limit_retries = 0

            # Fetch up to max_articles_per_source_chunk articles per source per date chunk
            while len(chunk_source_articles) < max_articles_per_source_chunk:
                # Calculate remaining articles to fetch for this source/chunk
                remaining = max_articles_per_source_chunk - len(chunk_source_articles)
                current_page_size = min(page_size, remaining)

                # Build the query parameters
                params = {
                    'language': language,
                    'earliest-publish-date': chunk_start,
                    'latest-publish-date': chunk_end,
                    'news-sources': news_source,
                    'number': current_page_size,
                    'offset': current_offset,
                    'sort': 'publish-time',
                    'sort-direction': 'DESC'
                }

                print(f"    Fetching up to {current_page_size} articles with offset {current_offset}...")

                try:
                    # A timeout keeps a stalled connection from blocking the whole run.
                    response = requests.get(base_url, headers=headers, params=params,
                                            timeout=request_timeout)
                except requests.RequestException as e:
                    print(f"    Error fetching data: {str(e)}")
                    break
                total_api_calls += 1

                # Handle rate limiting, but never wait forever on a persistent 429
                if response.status_code == 429:
                    rate_limit_retries += 1
                    if rate_limit_retries > max_rate_limit_retries:
                        print(f"    Rate limit still in effect after {max_rate_limit_retries} retries. "
                              "Skipping this source/date range.")
                        break
                    wait_time = _retry_after_seconds(response)
                    print(f"    Rate limit reached. Waiting for {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                rate_limit_retries = 0

                # Handle other errors
                if response.status_code != 200:
                    print(f"    Error: {response.status_code} - {response.text}")
                    break

                try:
                    data = response.json()
                except ValueError as e:
                    # The body was not valid JSON.
                    print(f"    Error fetching data: {str(e)}")
                    break
                if not isinstance(data, dict):
                    print("    Error fetching data: unexpected response format")
                    break

                articles = data.get('news') or []

                # If no more articles, break the loop
                if not articles:
                    print("    No more articles found for this source in this date range.")
                    break

                # Add source identifier to help with analysis
                for article in articles:
                    article['fetched_from'] = news_source

                chunk_source_articles.extend(articles)
                print(f"    Retrieved {len(articles)} articles. Total for this source/chunk: {len(chunk_source_articles)}")

                # Update offset for the next page
                current_offset += len(articles)

                # Check if we've reached the total number of available news.
                # Only trust the field when it is present; a missing value
                # must not be read as "0 available" and end paging early.
                total_news = data.get('available')
                if isinstance(total_news, int) and current_offset >= total_news:
                    print(f"    Reached the end of available news for this source/date range ({total_news} total).")
                    break

                # Short delay to avoid hammering the API
                time.sleep(1)

            all_articles.extend(chunk_source_articles)
            print(f"  Completed source {source_idx+1}/{len(news_sources)}. Articles from this source: {len(chunk_source_articles)}")

        print(f"Completed date chunk {idx+1}/{len(date_chunks)}. Total articles so far: {len(all_articles)}")

    print(f"\nTotal API calls made: {total_api_calls}")
    return all_articles

def analyze_distribution(articles):
    """
    Analyze and print the distribution of articles by date and source.

    Only articles with a non-empty 'publish_date' are counted, in the
    per-source totals as well as the per-date ones. The source is read from
    the 'fetched_from' key and defaults to 'unknown' when absent.

    Args:
        articles (list): List of article dictionaries

    Returns:
        None. All results are printed to standard output.
    """
    from collections import Counter

    date_counts = Counter()
    source_counts = Counter()
    # Keyed by (source, date) tuples rather than a joined string, so a source
    # that happens to contain the separator character cannot break reporting.
    source_date_counts = Counter()

    for article in articles:
        publish_date = article.get('publish_date', '')
        if not publish_date:
            continue

        source = article.get('fetched_from', 'unknown')

        # Extract just the date part (YYYY-MM-DD); both 'T' and space
        # separated timestamps are accepted.
        separator = 'T' if 'T' in publish_date else ' '
        date_only = publish_date.split(separator)[0]

        date_counts[date_only] += 1
        source_counts[source] += 1
        source_date_counts[(source, date_only)] += 1

    # Print source distribution
    print("\nArticle Distribution by Source:")
    for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{source}: {count} articles")

    # Print date distribution
    print("\nArticle Distribution by Date:")
    for date, count in sorted(date_counts.items()):
        print(f"{date}: {count} articles")

    # Print source-date distribution (top 10)
    print("\nTop Source-Date Combinations:")
    top_source_dates = sorted(source_date_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    for (source, date), count in top_source_dates:
        print(f"{date} from {source}: {count} articles")

    # Calculate statistics (skipped when no article had a publish date)
    if date_counts:
        total_dates = len(date_counts)
        total_sources = len(source_counts)
        avg_articles_per_date = sum(date_counts.values()) / total_dates
        avg_articles_per_source = sum(source_counts.values()) / total_sources

        print("\nDistribution Statistics:")
        print(f"- Total sources covered: {total_sources}")
        print(f"- Total dates covered: {total_dates}")
        print(f"- Avg articles per source: {avg_articles_per_source:.2f}")
        print(f"- Avg articles per date: {avg_articles_per_date:.2f}")

def save_to_csv(articles, filename):
    """
    Write a list of article dictionaries to a UTF-8 encoded CSV file.

    The header row is the alphabetically sorted union of every key found in
    any article; articles lacking a key get an empty cell for that column.
    Nothing is written when the list is empty.

    Args:
        articles (list): List of article dictionaries
        filename (str): Output CSV filename
    """
    if len(articles) == 0:
        print("No articles to save.")
        return

    # Union of the keys across all rows, in a stable (sorted) column order.
    columns = sorted({key for row in articles for key in row.keys()})

    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for row in articles:
            csv_writer.writerow(row)

    print(f"Saved {len(articles)} articles to {filename}")


In [None]:

def main():
    """
    Fetch articles for a fixed date range and set of sources, print their
    distribution, and save them to a timestamped CSV file.

    The API key is read from the WORLD_NEWS_API_KEY environment variable so
    that no credential is stored in the source. If the variable is unset or
    empty, an error is printed and nothing is fetched.
    """
    # Read the key from the environment instead of hard-coding a secret.
    api_key = os.environ.get("WORLD_NEWS_API_KEY")
    if not api_key:
        print("Error: set the WORLD_NEWS_API_KEY environment variable to your WorldNewsApi API key.")
        return

    # Date range to fetch (YYYY-MM-DD)
    # NOTE(review): a previous run was rejected by the API with "you cannot
    # look back further than 1 month" on the free and starter plan. Confirm
    # the plan in use allows a start date this far back.
    start_date = "2024-03-20"
    end_date = "2025-01-01"

    # News sources
    news_sources = [
        "https://www.bbc.co.uk",
        "https://politicalwire.com",
        "https://www.nytimes.com"
    ]

    # Language
    language = "en"

    # Number of results per page (maximum allowed is 100)
    page_size = 100

    # Maximum articles to fetch per source per date chunk
    # Lower number means more even distribution across sources and dates
    max_articles_per_source_chunk = 100

    # Number of date chunks to divide the time period into
    date_chunks_count = 8

    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"news_articles_{start_date}_to_{end_date}_{timestamp}.csv"

    print(f"Fetching news articles from {start_date} to {end_date} from {len(news_sources)} sources...")
    articles = fetch_news_articles(api_key, start_date, end_date, news_sources,
                                  language, page_size, max_articles_per_source_chunk, date_chunks_count)

    print(f"\nTotal articles retrieved: {len(articles)}")

    # Analyze distribution
    analyze_distribution(articles)

    # Save to CSV
    save_to_csv(articles, filename)

    print("Done!")

if __name__ == "__main__":
    main()



Fetching news articles from 2024-03-20 to 2025-01-01 from 4 sources...

Processing date chunk 1/8: 2024-03-20 to 2024-04-24
  News source 1/4: https://www.bbc.co.uk
    Fetching up to 100 articles with offset 0...
    Error: 400 - {"status":"failure", "code":400,"message":"On the free and starter plan, you cannot look back further than 1 month, please set 'earliest-publish-date' to a date within the last 30 days."}
  Completed source 1/4. Articles from this source: 0
  News source 2/4: https://politicalwire.com
    Fetching up to 100 articles with offset 0...
    Error: 400 - {"status":"failure", "code":400,"message":"On the free and starter plan, you cannot look back further than 1 month, please set 'earliest-publish-date' to a date within the last 30 days."}
  Completed source 2/4. Articles from this source: 0
  News source 3/4: https://www.foxnews.com
    Fetching up to 100 articles with offset 0...
    Error: 400 - {"status":"failure", "code":400,"message":"On the free and starter 

KeyboardInterrupt: 