In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup


# URL parameters
query = 'economy'
country = 'Russia'
format = 'CSV'
maxrecords = 250
sort = 'DateDesc'
start_datetime = '20181017000000'  # October 17, 2014 (10 years ago from today)
end_datetime = '20241017000000'  # October 17, 2024 (current date)
language = 'EN'  # Filter for English-language articles

# Constructing the URL
url = f'https://api.gdeltproject.org/api/v2/doc/doc?query={query}&format={format}&maxrecords={maxrecords}&sort={sort}&STARTDATETIME={start_datetime}&ENDDATETIME={end_datetime}&sourcecountry={country}&lang={language}'


# Example query for economy-related articles
# url = 'https://api.gdeltproject.org/api/v2/doc/doc?query=economy&format=CSV&maxrecords=20&sort=DateDesc'
response = requests.get(url)

if response.status_code == 200:
    # Save the CSV data
    with open('gdelt_economy_articles.csv', 'wb') as file:
        file.write(response.content)
else:
    print(f"Error: {response.status_code}")


# Load data into a DataFrame
df = pd.read_csv('gdelt_economy_articles.csv')

# Counters for scraping attempts and successful scrapes
scrape_attempts = 0
successful_scrapes = 0

# Phrases that commonly indicate a subscription or paywall
paywall_phrases = ['subscription required', 'sign in to read', 'subscribe', 'purchase', "you don't have permission", 'This website is using a security service']

# Function to scrape article content
def scrape_article(url):
    global scrape_attempts, successful_scrapes
    scrape_attempts += 1  # Increment the attempt counter
    print(f"Scraping article {scrape_attempts} from URL: {url}")

    try:
        response = requests.get(url, timeout=10)  # Add a timeout to avoid hanging
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract paragraphs and combine them
        paragraphs = soup.find_all('p')
        content = ' '.join([para.get_text().strip() for para in paragraphs if para.get_text().strip()])

        # Clean the content: check for paywall, error messages, and ensure it's not empty
        if not content or any(phrase in content.lower() for phrase in paywall_phrases):
            print(f"Skipped article {scrape_attempts}: Paywall or no content")
            return None

        # Increment success counter if the article is valid
        successful_scrapes += 1
        print(f"Successfully scraped article {successful_scrapes}")
        return content
    except Exception as e:
        print(f"Failed to scrape article {scrape_attempts}: {e}")
        return None

# Scrape article content and store it in the 'content' column
df['content'] = df['URL'].apply(scrape_article)

# Filter out rows where 'content' is None or empty
df = df[df['content'].notnull() & df['content'].str.strip().astype(bool)]

df.to_csv('economy_articles_with_content.csv', index=False)
print("Saved economy articles with full content.")



Error: 429


FileNotFoundError: [Errno 2] No such file or directory: 'gdelt_economy_articles.csv'