In [1]:
import requests
import json
import pandas as pd
import os
import logging
import random
from datetime import datetime

# Set up logging to track errors
log_directory = "logs"
os.makedirs(log_directory, exist_ok=True)

# Configure the root logger to log to both file and console
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

log_format = "%(asctime)s - %(levelname)s - %(message)s"
# FileHandler stores its target as an absolute path, so compare like with like
log_file_path = os.path.abspath(f"{log_directory}/debug_log.txt")

# Handlers stay attached to the root logger for the life of the process, so
# running this code a second time (e.g. re-executing a notebook cell) would
# attach another copy and emit every record twice. Reuse the handlers from an
# earlier run instead of stacking new ones.

# File handler for logging to a file
file_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == log_file_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(log_file_path)
    logger.addHandler(file_handler)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(log_format))

# Console handler for logging to the terminal; it is tagged with a name
# because FileHandler is itself a StreamHandler, so a type check alone could
# not tell the two apart.
console_handler_name = "gdelt_console"
console_handler = next(
    (
        handler
        for handler in logger.handlers
        if handler.get_name() == console_handler_name
    ),
    None,
)
if console_handler is None:
    console_handler = logging.StreamHandler()
    console_handler.set_name(console_handler_name)
    logger.addHandler(console_handler)
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(logging.Formatter(log_format))

# Output folder for saved files, created up front if it does not exist yet
save_directory = "data"
os.makedirs(save_directory, exist_ok=True)

# GDELT API endpoint that the article requests are sent to
url = "https://api.gdeltproject.org/api/v2/doc/doc"

# Lowercase search terms; one request is issued per entry
topics = [
    "election",
    "business",
    "finance",
    "health",
    "environment",
]

# Accumulator for the processed article records of every topic
all_articles = list()

# Fetch articles for each topic and append one flattened record per article
# to all_articles. A failure on one topic is logged and the loop moves on to
# the next topic.

# IDs handed out so far, so that every record really gets a distinct ID
used_ids = {record['id'] for record in all_articles}

for topic in topics:
    # Parameters for the request
    params = {
        "query": f"{topic} sourcecountry:unitedstates sourcelang:english",  # Topic in lowercase for query
        "startdatetime": "20241006000000",  # Start date (6th Oct, 2024)
        "enddatetime": "20241105235959",    # End date (5th Nov, end of the day)
        "mode": "artlist",                  # Get article list mode
        "format": "json",                   # Get JSON format
        "maxrecords": 250,                  # Maximum number of records to fetch (adjust as needed)
    }

    # Send the request to the GDELT API
    try:
        # Without a timeout requests waits indefinitely, so one stalled
        # connection would hang the whole run. A timeout raises
        # requests.exceptions.Timeout, which the handler below already covers.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Raise an HTTPError for 4xx/5xx responses
        logging.info(f"Data fetched successfully for {topic} from GDELT API.")

        # Parse the JSON response
        data = response.json()

        # Extract articles from the data
        articles = data.get('articles', [])

        # If no articles are found, log and skip
        if not articles:
            logging.warning(f"No articles found for topic: {topic}")
            continue

        # Process each article
        for article in articles:
            # Generate a random 9-digit ID, redrawing on the rare collision
            # so the ID is unique across all collected articles
            article_id = random.randint(100000000, 999999999)
            while article_id in used_ids:
                article_id = random.randint(100000000, 999999999)
            used_ids.add(article_id)

            # Get 'fromdate' and 'todate' if the record carries them
            # NOTE(review): it is not evident that artlist records include
            # these keys; if they never do, the defaults below always apply.
            fromdate = article.get('fromdate')
            todate = article.get('todate')

            # Format dates if they are available, else default to start and end date in range
            if fromdate:
                fromdate = f"{fromdate[:4]}/{fromdate[4:6]}/{fromdate[6:]}"
            else:
                fromdate = "2024/10/06"

            if todate:
                todate = f"{todate[:4]}/{todate[4:6]}/{todate[6:]}"
            else:
                todate = "2024/11/05"

            processed_article = {
                'id': article_id,               # Unique random ID
                'category': topic.capitalize(),  # Capitalized category name for output
                'fromdate': fromdate,           # Use actual 'fromdate' or default with updated format
                'todate': todate,               # Use actual 'todate' or default with updated format
                'sourcename': article.get('domain', 'N/A'),
                'title': article.get('title', 'N/A'),
                'url': article.get('url', 'N/A'),
                'seendate': article.get('seendate', 'N/A'),
                'sourcecountry': article.get('sourcecountry', 'N/A'),
                'language': article.get('language', 'N/A'),
            }
            all_articles.append(processed_article)

    except requests.exceptions.RequestException as e:
        # Network, HTTP-status and timeout failures from requests
        logging.error(f"Error while fetching data for {topic}: {e}")
        print(f"Error while fetching data for {topic}: {e}")
    except Exception as e:
        # Anything else (e.g. an unexpected response shape); keep going with
        # the remaining topics rather than aborting the run
        logging.error(f"Unexpected error while fetching data for {topic}: {e}")
        print(f"Unexpected error while fetching data for {topic}: {e}")

# Build a DataFrame from the collected records and write it out as one CSV
# file (without the index column) inside the save directory
output_csv_path = f"{save_directory}/all_articles_2024-11-05.csv"
df = pd.DataFrame(data=all_articles)
df.to_csv(output_csv_path, index=False)

logging.info("CSV data saved successfully for all topics.")

2024-12-01 22:47:13,194 - DEBUG - Starting new HTTPS connection (1): api.gdeltproject.org:443
2024-12-01 22:47:14,157 - DEBUG - https://api.gdeltproject.org:443 "GET /api/v2/doc/doc?query=election+sourcecountry%3Aunitedstates+sourcelang%3Aenglish&startdatetime=20241006000000&enddatetime=20241105235959&mode=artlist&format=json&maxrecords=250 HTTP/11" 200 None
2024-12-01 22:47:14,243 - INFO - Data fetched successfully for election from GDELT API.
2024-12-01 22:47:14,273 - DEBUG - Starting new HTTPS connection (1): api.gdeltproject.org:443
2024-12-01 22:47:15,054 - DEBUG - https://api.gdeltproject.org:443 "GET /api/v2/doc/doc?query=business+sourcecountry%3Aunitedstates+sourcelang%3Aenglish&startdatetime=20241006000000&enddatetime=20241105235959&mode=artlist&format=json&maxrecords=250 HTTP/11" 200 None
2024-12-01 22:47:15,164 - INFO - Data fetched successfully for business from GDELT API.
2024-12-01 22:47:15,190 - DEBUG - Starting new HTTPS connection (1): api.gdeltproject.org:443
2024-12-