# DOE Google News Scraper with Real Article Dates
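This notebook scrapes DOE press releases from Google News (2020–2025), classifies them by title keywords, and extracts the actual publication date from each article page. When no date can be extracted from a page, the date falls back to January 1 of the year being searched.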

In [1]:
import re
import time
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser

In [2]:
def _keyword_pattern(prefixes, whole_words=()):
    """Compile one regex that matches any of the given keywords.

    ``prefixes`` match at the start of a word, so stems such as "licens" still
    cover "license"/"licensing". ``whole_words`` must match a complete word
    (an optional trailing "s" is allowed), for short keywords that would
    otherwise fire inside unrelated words.
    """
    alternatives = [re.escape(word) for word in prefixes]
    alternatives += [re.escape(word) + r"s?\b" for word in whole_words]
    return re.compile(r"\b(?:" + "|".join(alternatives) + ")")


# Ordered (category, pattern) pairs: the first pattern that matches wins, so
# the order below is the priority order of the categories.
_CATEGORY_PATTERNS = (
    ("policy", _keyword_pattern(
        ["policy", "rule", "licens", "regulation", "executive", "approval", "roadmap", "nrc"],
        # "act" as a plain substring matched "reactor", "impact", "contract",
        # "activist", ... and pulled all of those titles into "policy".
        whole_words=["act"],
    )),
    ("tech", _keyword_pattern(
        ["reactor", "fusion", "smr", "deployment", "technology", "microreactor", "grant", "terrapower"]
    )),
    ("anniversary", _keyword_pattern(["anniversary", "hiroshima", "fukushima"])),
    ("protest", _keyword_pattern(["protest", "lawsuit", "opposition", "activist"])),
    ("disaster", _keyword_pattern(["leak", "shutdown", "disaster", "accident", "alert"])),
    ("international", _keyword_pattern(["france", "russia", "china", "ukraine", "international"])),
)


def classify_category(title):
    """Assign a coarse topic category to a headline based on its keywords.

    Matching is case-insensitive and anchored at word boundaries. Categories
    are tested in a fixed priority order (policy, tech, anniversary, protest,
    disaster, international) and the first one with a matching keyword is
    returned.

    Args:
        title: Headline text to classify.

    Returns:
        One of "policy", "tech", "anniversary", "protest", "disaster",
        "international", or "other" when no keyword matches.
    """
    lowered = title.lower()
    for category, pattern in _CATEGORY_PATTERNS:
        if pattern.search(lowered):
            return category
    return "other"


In [3]:
def extract_article_date(url, timeout=5):
    """Fetch an article page and return its publication date as ``YYYY-MM-DD``.

    Candidate date strings are collected, in priority order, from the
    ``<meta>`` tags ``article:published_time``, ``date``, ``pubdate`` and
    ``datePublished``, followed by the first ``<time>`` element (its
    ``datetime`` attribute, or its text when the attribute is missing).
    The first candidate that parses wins; a malformed candidate no longer
    prevents the remaining ones from being tried.

    This is a best-effort helper: any failure is reported with a printed
    warning and results in ``None`` rather than an exception.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        ISO-formatted date string, or ``None`` if the page could not be
        fetched or no candidate could be parsed.
    """
    meta_selectors = (
        ("property", "article:published_time"),
        ("name", "date"),
        ("name", "pubdate"),
        ("itemprop", "datePublished"),
    )
    try:
        res = requests.get(url, timeout=timeout)
        # Do not mine dates out of 4xx/5xx error pages.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        candidates = []
        for attr, value in meta_selectors:
            meta = soup.find("meta", attrs={attr: value})
            if meta and meta.get("content"):
                candidates.append(meta["content"])

        time_tag = soup.find("time")
        if time_tag:
            candidates.append(time_tag.get("datetime") or time_tag.get_text(strip=True))

        last_error = None
        for raw_value in candidates:
            try:
                return parser.parse(raw_value, fuzzy=True).date().isoformat()
            except (ValueError, OverflowError) as e:
                # Unparseable candidate: remember why and try the next one.
                last_error = e
        if last_error is not None:
            print(f"⚠️ Failed to extract date from {url}: {last_error}")

    except Exception as e:
        # Deliberately broad: callers rely on this function never raising.
        print(f"⚠️ Failed to extract date from {url}: {e}")
    return None


In [4]:
def _target_from_google_href(href):
    """Return the decoded destination of a Google ``/url?...&q=<target>`` link, else None."""
    if not href.startswith("/url?"):
        return None
    # parse_qs percent-decodes the value, so targets that carry their own
    # query string or escaped characters come back as usable URLs.
    targets = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get("q")
    return targets[0] if targets else None


def google_news_scraper(query, start_year, end_year, pause=2, max_pages=5, timeout=10):
    """Scrape Google News result pages year by year and collect article records.

    For every year in ``[start_year, end_year]`` the function requests up to
    ``max_pages`` result pages restricted to that calendar year. Each result
    link that wraps an ``<h3>`` headline is resolved to its target URL, dated
    via ``extract_article_date`` and labelled via ``classify_category``.

    Paging for a year stops early when a request fails, returns a non-200
    status, or yields no new articles. Articles collected so far are always
    kept.

    Args:
        query: Search query text (URL-encoded internally).
        start_year: First year to search, inclusive.
        end_year: Last year to search, inclusive.
        pause: Seconds to sleep after each search request.
        max_pages: Maximum number of result pages (10 results each) per year.
        timeout: Seconds to wait for each search response.

    Returns:
        pandas.DataFrame with columns ``date``, ``label``, ``category`` and
        ``url``. The columns are present even when nothing was found.
        ``date`` falls back to ``<year>-01-01`` when no date could be
        extracted from the article page.
    """
    base_url = "https://www.google.com/search?q={query}&tbm=nws&tbs=cdr:1,cd_min:{start},cd_max:{end}&start={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    query_encoded = urllib.parse.quote_plus(query)
    articles = []

    for year in range(start_year, end_year + 1):
        start_date = f"01/01/{year}"
        end_date = f"12/31/{year}"
        # Titles are de-duplicated within a year only.
        seen_titles = set()

        for page_num in range(max_pages):
            url = base_url.format(
                query=query_encoded, start=start_date, end=end_date, page=page_num * 10
            )
            print(f"🔍 Scraping: {url}")
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # A network error must not discard the articles gathered so far.
                print(f"❌ Failed to fetch page {page_num+1} for {year}: {e}")
                time.sleep(pause)
                break
            if response.status_code != 200:
                print(f"❌ Failed to fetch page {page_num+1} for {year}")
                # Still pause so the next year's request is not sent back-to-back.
                time.sleep(pause)
                break

            soup = BeautifulSoup(response.text, "html.parser")
            found = 0

            for a in soup.find_all("a", href=True):
                h3 = a.find("h3")
                if h3 is None:
                    continue
                title = h3.get_text(strip=True)
                if title in seen_titles:
                    continue
                cleaned_link = _target_from_google_href(a["href"])
                if cleaned_link is None:
                    continue
                # Mark as seen only once the headline is actually recorded.
                seen_titles.add(title)

                # Fallback date when the article page yields nothing usable.
                real_date = extract_article_date(cleaned_link) or f"{year}-01-01"

                articles.append({
                    "date": real_date,
                    "label": title,
                    "category": classify_category(title),
                    "url": cleaned_link
                })
                found += 1

            print(f"✅ Year {year}, Page {page_num+1}: {found} articles found")
            time.sleep(pause)
            if found == 0:
                # No new results on this page, so later pages will be empty too.
                break

    return pd.DataFrame(articles, columns=["date", "label", "category", "url"])


In [5]:
# Scrape energy.gov press releases for 2020-2025 (up to 5 result pages per
# year, pausing 2 seconds between search requests), then save them as CSV.
df = google_news_scraper(
    query="site:energy.gov press release",
    start_year=2020,
    end_year=2025,
    pause=2,
    max_pages=5,
)
df.to_csv("doe_press_releases_with_real_dates.csv", index=False)
# Preview the first rows as the cell output.
df.head()

🔍 Scraping: https://www.google.com/search?q=site%3Aenergy.gov+press+release&tbm=nws&tbs=cdr:1,cd_min:01/01/2020,cd_max:12/31/2020&start=0
✅ Year 2020, Page 1: 10 articles found
🔍 Scraping: https://www.google.com/search?q=site%3Aenergy.gov+press+release&tbm=nws&tbs=cdr:1,cd_min:01/01/2020,cd_max:12/31/2020&start=10
✅ Year 2020, Page 2: 10 articles found
🔍 Scraping: https://www.google.com/search?q=site%3Aenergy.gov+press+release&tbm=nws&tbs=cdr:1,cd_min:01/01/2020,cd_max:12/31/2020&start=20
✅ Year 2020, Page 3: 10 articles found
🔍 Scraping: https://www.google.com/search?q=site%3Aenergy.gov+press+release&tbm=nws&tbs=cdr:1,cd_min:01/01/2020,cd_max:12/31/2020&start=30
✅ Year 2020, Page 4: 10 articles found
🔍 Scraping: https://www.google.com/search?q=site%3Aenergy.gov+press+release&tbm=nws&tbs=cdr:1,cd_min:01/01/2020,cd_max:12/31/2020&start=40
✅ Year 2020, Page 5: 10 articles found
🔍 Scraping: https://www.google.com/search?q=site%3Aenergy.gov+press+release&tbm=nws&tbs=cdr:1,cd_min:01/01/2021

Unnamed: 0,date,label,category,url
0,2025-07-24,DOE Announces Site Selection for AI Data Cente...,other,https://www.energy.gov/articles/doe-announces-...
1,2025-07-28,Energy Department Appoints Inaugural CEO to Le...,other,https://www.energy.gov/articles/energy-departm...
2,2025-07-23,Department of Energy Terminates Taxpayer-Funde...,other,https://www.energy.gov/articles/department-ene...
3,2025-07-29,Secretary Wright Issues Emergency Order to Saf...,other,https://www.energy.gov/articles/secretary-wrig...
4,2025-07-28,Department of Energy Issues Report Evaluating ...,policy,https://www.energy.gov/articles/department-ene...
