In [1]:
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta

from newspaper import Article

# List of API keys (rotate through these when rate limits are hit)
api_keys = ["f1bf348b43b7bcd01e85bb48b9004e35","0bb2cf2c78a4bcaa1031ce94399e43e0","0bb2cf2c78a4bcaa1031ce94399e43e0","2cf193679204ff0f7ed459f0802065b4","e803fa0f00d10d33aed884f4817c442f","46a6b1e8bd760246885ce254afcc8bc4","8c0eb0635a7ff3a28bcc03a518b84afa","bb2e490326b68d657ee20a5037addbba","cd8ed833857837a5cf16ffda435e21fa","524b14c974275d30ca0568c4ef583cdd","9da39a8e80ed9ff08652d5f8c705ecbc","807680e5ca3a74d0b94fb64fb78a0817","b42b7a967e6bfbd10e1abb294f46a9ea","73fcd25c9bd186b6836cdbd0eddfb0c4","aaff285de31e9ce79aa4e64bba17e24c","7e12b47a497698b7813bd1460760632e","d468c8fab5d1baf744566358025b1d41","7a44e177c34b7ce71aa79105ae2f724e"]  # Add more keys as needed
api_index = 0  # Start with the first API key

BASE_URL = "https://gnews.io/api/v4/search"

# Define the date range for the query
start_date = '2023-04-01'
end_date = '2024-04-01'

# Dictionary to store articles sorted by month
articles_by_month = {}

# Function to fetch articles from GNews API
def fetch_articles(query, from_date, to_date, api_key, page=1):
    from_date_formatted = f"{from_date}T00:00:00Z"
    to_date_formatted = f"{to_date}T23:59:59Z"

    url = f"{BASE_URL}?q={query}&from={from_date_formatted}&to={to_date_formatted}&lang=en&apikey={api_key}&page={page}&max=100"
    
    try:
        with urllib.request.urlopen(url) as response:
            data = json.loads(response.read().decode("utf-8"))
            return data.get("articles", [])
    except urllib.error.HTTPError as e:
        if e.code in [403, 429]:  # Rate limit exceeded or forbidden
            print(f"API Key {api_index + 1} hit the limit (HTTP {e.code}). Switching to next API key...")
            return "RATE_LIMIT"
        print(f"HTTP error occurred: {e}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Function to extract full content using newspaper3k
def extract_full_content(article_url):
    try:
        article = Article(article_url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"Error extracting content from {article_url}: {e}")
        return None

# Loop through the date range to collect articles
current_date = datetime.strptime(start_date, '%Y-%m-%d')

while current_date < datetime.strptime(end_date, '%Y-%m-%d'):
    next_date = current_date + timedelta(days=1)
    
    # Get current year and month to store in the correct file
    year_month = current_date.strftime('%Y_%m')
    if year_month not in articles_by_month:
        articles_by_month[year_month] = []

    attempts = 0  # Keep track of API key attempts

    while attempts < len(api_keys):  # Ensure we don't exceed available API keys
        api_key = api_keys[api_index]  # Use the current API key
        print(f"Fetching articles for {current_date.strftime('%Y-%m-%d')} using API Key {api_index + 1}...")

        articles_for_day = fetch_articles('HCL', current_date.strftime('%Y-%m-%d'), next_date.strftime('%Y-%m-%d'), api_key)

        if articles_for_day == "RATE_LIMIT":
            api_index = (api_index + 1) % len(api_keys)  # Switch to the next API key
            attempts += 1  # Count this attempt
            if attempts == len(api_keys):  # If all keys are exhausted, stop execution
                print("All API keys exhausted! Stopping execution.")
                exit()
            continue  # Retry with the next API key
        else:
            break  # Exit loop if request is successful

    # Process and add articles to the monthly storage
    for article in articles_for_day:
        full_content = extract_full_content(article["url"])
        if full_content:
            article["full_content"] = full_content  # Add full content to the article

        articles_by_month[year_month].append(article)

    print(f"Collected {len(articles_for_day)} articles for {current_date.strftime('%Y-%m-%d')}")

    time.sleep(1)  # Delay to avoid hitting rate limits
    current_date = next_date


Fetching articles for 2023-04-01 using API Key 1...
API Key 1 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 2...
API Key 2 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 3...
API Key 3 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 4...
API Key 4 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 5...
API Key 5 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 6...
API Key 6 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 7...
API Key 7 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 8...
API Key 8 hit the limit (HTTP 403). Switching to next API key...
Fetching articles for 2023-04-01 using API Key 9...
API Key 9 hi



Collected 1 articles for 2024-03-27
Fetching articles for 2024-03-28 using API Key 17...
Collected 2 articles for 2024-03-28
Fetching articles for 2024-03-29 using API Key 17...
Collected 1 articles for 2024-03-29
Fetching articles for 2024-03-30 using API Key 17...
Collected 1 articles for 2024-03-30
Fetching articles for 2024-03-31 using API Key 17...
Collected 1 articles for 2024-03-31


In [2]:

# Save sorted articles into a single JSON file per month
for year_month, articles in articles_by_month.items():
    articles.sort(key=lambda x: x['publishedAt'])  # Sort by published date
    filename = f'hcl_articles_{year_month}.json'
    with open(filename, 'w') as f:
        json.dump(articles, f, indent=4)

    print(f"Saved {len(articles)} articles to {filename}")

print("Data collection complete.")


Saved 75 articles to hcl_articles_2023_04.json
Saved 16 articles to hcl_articles_2023_05.json
Saved 16 articles to hcl_articles_2023_06.json
Saved 57 articles to hcl_articles_2023_07.json
Saved 15 articles to hcl_articles_2023_08.json
Saved 22 articles to hcl_articles_2023_09.json
Saved 55 articles to hcl_articles_2023_10.json
Saved 6 articles to hcl_articles_2023_11.json
Saved 20 articles to hcl_articles_2023_12.json
Saved 78 articles to hcl_articles_2024_01.json
Saved 22 articles to hcl_articles_2024_02.json
Saved 22 articles to hcl_articles_2024_03.json
Data collection complete.
