# DOE Google News Scraper with Accurate Dates
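This notebook scrapes Google News search results for DOE press releases (2020–2025), one year at a time, classifies each headline by topic using keyword matching, and records the publication date shown next to each result (falling back to January 1 of the searched year when no date can be parsed).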

In [None]:
import re
import time
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser

In [None]:
# Helper: classify category based on title
# "act" has to match as a whole word: as a bare substring it also occurs in
# "reactor", "activist", "contract", "impact", ... and would send every such
# headline to "policy" before the later categories are even considered.
_ACT_WORD = re.compile(r"\bacts?\b")


def classify_category(title):
    """Return a coarse topic label for a news headline.

    The headline is lower-cased and checked against keyword lists in a fixed
    priority order; the first category with a hit wins.  Keywords are matched
    as substrings so that stems such as "licens" cover "license" and
    "licensing".  The one exception is "act", which must appear as a whole
    word ("act" or "acts").

    Args:
        title: Headline text.

    Returns:
        One of "policy", "tech", "anniversary", "protest", "disaster",
        "international", or "other" when no keyword matches.
    """
    title = title.lower()
    if _ACT_WORD.search(title) or any(
        k in title
        for k in ["policy", "rule", "licens", "regulation", "executive", "approval", "roadmap", "nrc"]
    ):
        return "policy"
    elif any(k in title for k in ["reactor", "fusion", "smr", "deployment", "technology", "microreactor", "grant", "terrapower"]):
        return "tech"
    elif any(k in title for k in ["anniversary", "hiroshima", "fukushima"]):
        return "anniversary"
    elif any(k in title for k in ["protest", "lawsuit", "opposition", "activist"]):
        return "protest"
    elif any(k in title for k in ["leak", "shutdown", "disaster", "accident", "alert"]):
        return "disaster"
    elif any(k in title for k in ["france", "russia", "china", "ukraine", "international"]):
        return "international"
    return "other"


In [None]:
# Google News Scraper with accurate date extraction
# Column order of the returned DataFrame; also used when no article was found
# so that the (empty) frame and the CSV written from it keep a stable header.
_RESULT_COLUMNS = ["date", "label", "category", "url"]

_REDIRECT_PREFIX = "/url?q="


def _extract_target_url(href):
    """Return the decoded destination of a Google "/url?q=" link, or None."""
    if not href.startswith(_REDIRECT_PREFIX):
        return None
    # The destination is the value of "q", i.e. everything up to the next "&".
    # It is percent-encoded in the href, so decode it to get a usable URL.
    encoded_target = href[len(_REDIRECT_PREFIX):].split("&", 1)[0]
    return urllib.parse.unquote(encoded_target) or None


def _parse_result_date(anchor, year):
    """Return an ISO date parsed from the <span> following *anchor*.

    Falls back to January 1 of *year* when there is no span, the text cannot
    be parsed, or the parsed date lies outside *year*.
    """
    fallback = f"{year}-01-01"
    span = anchor.find_next("span")
    if span is None:
        return fallback
    try:
        parsed = parser.parse(span.get_text(strip=True), fuzzy=True).date()
    except (ValueError, OverflowError):
        return fallback
    # Fuzzy parsing fills missing fields from today's date, so text such as
    # "3 days ago" yields a plausible-looking but wrong date.  The search is
    # restricted to *year*, so anything outside it is treated as a misparse.
    if parsed.year != year:
        return fallback
    return parsed.isoformat()


def google_news_scraper(query, start_year, end_year, pause=2, max_pages=5, timeout=30):
    """Scrape Google News search results year by year into a DataFrame.

    For every year in ``start_year..end_year`` (inclusive) up to *max_pages*
    result pages are requested with a custom date range covering that year.
    Each result link that wraps an ``<h3>`` headline and points through
    Google's ``/url?q=`` redirect becomes one row.  Headlines are
    de-duplicated within a year.

    Args:
        query: Search string; it is URL-encoded before being sent.
        start_year: First year to search.
        end_year: Last year to search (inclusive).
        pause: Seconds to sleep after each request.
        max_pages: Maximum number of result pages (10 results each) per year.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        pandas.DataFrame with columns ``date`` (ISO string), ``label``
        (headline), ``category`` (from ``classify_category``) and ``url``.
        The columns are present even when no article was found.
    """
    base_url = "https://www.google.com/search?q={query}&tbm=nws&tbs=cdr:1,cd_min:{start},cd_max:{end}&start={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    query_encoded = urllib.parse.quote_plus(query)
    articles = []

    for year in range(start_year, end_year + 1):
        start_date = f"01/01/{year}"
        end_date = f"12/31/{year}"
        seen_titles = set()

        for page_num in range(max_pages):
            page = page_num * 10
            url = base_url.format(query=query_encoded, start=start_date, end=end_date, page=page)
            print(f"🔍 Scraping: {url}")
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as exc:
                # Keep what has been collected so far instead of crashing.
                print(f"❌ Failed to fetch page {page_num+1} for {year}: {exc}")
                time.sleep(pause)
                break
            if response.status_code != 200:
                print(f"❌ Failed to fetch page {page_num+1} for {year}")
                # Still pause so the next year's request is not sent back-to-back.
                time.sleep(pause)
                break

            soup = BeautifulSoup(response.text, "html.parser")
            found = 0

            for a in soup.find_all("a", href=True):
                h3 = a.find("h3")
                if not h3:
                    continue
                cleaned_link = _extract_target_url(a["href"])
                if cleaned_link is None:
                    continue
                title = h3.get_text(strip=True)
                # Only mark a title as seen once it has produced a usable link,
                # so an unusable anchor cannot hide a later valid result.
                if title in seen_titles:
                    continue
                seen_titles.add(title)

                articles.append({
                    "date": _parse_result_date(a, year),
                    "label": title,
                    "category": classify_category(title),
                    "url": cleaned_link,
                })
                found += 1

            print(f"✅ Year {year}, Page {page_num+1}: {found} articles found")
            time.sleep(pause)

            if found == 0:
                # Nothing new on this page: the results for this year are
                # exhausted (or the page was not a result page), so further
                # requests would only add load.
                break

    return pd.DataFrame(articles, columns=_RESULT_COLUMNS)


In [None]:
# Scrape DOE press releases for 2020-2025, write them to CSV, and preview the
# first rows as the cell output.
SEARCH_QUERY = "site:energy.gov press release"
OUTPUT_CSV = "doe_press_releases_2020_2025_google_news.csv"

df = google_news_scraper(SEARCH_QUERY, 2020, 2025, pause=2, max_pages=5)
df.to_csv(OUTPUT_CSV, index=False)
df.head()