In [4]:
!pip install gdeltdoc newspaper3k pandas tqdm



In [13]:
import os
import time
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from tqdm import tqdm
from gdeltdoc import GdeltDoc, Filters 
from newspaper import Article, Config

# --- Search configuration ---------------------------------------------------
# Phrases searched one at a time; each is crossed with every entry of DOMAINS.
KEYWORDS_LIST = [
    "energy transition",
    "carbon neutrality",
    "climate policy",
]
# News sites the metadata search is restricted to.
DOMAINS = ["theguardian.com", "reuters.com"]
# Overall date range (YYYY-MM-DD) and the size of each query window in days.
START_DATE, END_DATE = "2020-01-01", "2025-12-31"
WINDOW_DAYS = 14

# --- Output location --------------------------------------------------------
# Results are written under ~/Desktop/SOSC314/Data, created here if missing.
desktop_path = Path.home().joinpath("Desktop")
target_folder = desktop_path.joinpath("SOSC314", "Data")
target_folder.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = target_folder.joinpath("energy_narrative_W.csv")

def get_metadata_batch(kw, domain, start_dt, end_dt):
    """Run one GDELT article search for a keyword/domain/date window.

    Args:
        kw: Keyword or phrase, forwarded as ``Filters(keyword=...)``.
        domain: Domain name, forwarded as ``Filters(domain=...)``.
        start_dt: Window start, forwarded as ``Filters(start_date=...)``.
        end_dt: Window end, forwarded as ``Filters(end_date=...)``.

    Returns:
        The result of ``GdeltDoc.article_search`` for that filter. If the
        search raises, or returns ``None``, an empty ``pd.DataFrame`` is
        returned instead so callers can always test ``.empty``.
    """
    gd = GdeltDoc()
    f = Filters(keyword=kw, domain=domain, start_date=start_dt, end_date=end_dt)
    try:
        articles = gd.article_search(f)
    except Exception as exc:
        # Stay best-effort (one bad window must not abort the whole crawl),
        # but report the failure: a silently dropped window looks exactly
        # like "no articles" and leaves an undetected hole in the data.
        print(
            f"\n  [warn] GDELT query failed for '{kw}' @ {domain} "
            f"({start_dt} to {end_dt}): {exc!r}"
        )
        return pd.DataFrame()
    # Defensive: callers read `.empty`, so never hand back None.
    return pd.DataFrame() if articles is None else articles

def scrape_full_text(url, min_chars=150):
    """Download and parse one article, returning its body text.

    Args:
        url: Address of the article to fetch.
        min_chars: The extracted text must be strictly longer than this many
            characters to be accepted. Defaults to 150.

    Returns:
        The extracted article text, or ``None`` when the URL is missing or
        not a string, when downloading/parsing raises, or when the text is
        not longer than ``min_chars``.
    """
    # A blank or non-string URL (e.g. NaN from a DataFrame cell) can never be
    # fetched, so skip it before building any scraper objects.
    if not isinstance(url, str) or not url.strip():
        return None

    config = Config()
    config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    config.request_timeout = 15
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        if len(article.text) > min_chars:
            return article.text
        return None
    except Exception:
        # Best-effort by design: any failure on a single URL just means that
        # article is skipped by the caller.
        return None

# Driver script.
# Phase 1 collects article metadata for every keyword x domain x date window.
# Phase 2 downloads the full text of each unique URL and writes the accepted
# articles to OUTPUT_FILE, checkpointing along the way.
all_metadata_list = []
print(f"System Check: Saving to {OUTPUT_FILE}")
print("\n--- Phase 1: Metadata Retrieval ---")

# A non-positive window would never advance `curr_start` and loop forever.
if WINDOW_DAYS <= 0:
    raise ValueError("WINDOW_DAYS must be a positive number of days")

# The date range is identical for every keyword, so parse it only once.
range_start = datetime.strptime(START_DATE, '%Y-%m-%d')
final_end = datetime.strptime(END_DATE, '%Y-%m-%d')

for kw in KEYWORDS_LIST:
    print(f"Keyword Group: [{kw}]")
    curr_start = range_start

    while curr_start < final_end:
        # Clamp the last window so it never extends past END_DATE.
        curr_end = min(curr_start + timedelta(days=WINDOW_DAYS), final_end)
        s_str, e_str = curr_start.strftime('%Y-%m-%d'), curr_end.strftime('%Y-%m-%d')
        print(f"  Fetching: {s_str} to {e_str} ... ", end="", flush=True)

        batch_start = time.time()
        for dom in DOMAINS:
            batch = get_metadata_batch(kw, dom, s_str, e_str)
            if not batch.empty:
                # Tag every row with the query that produced it.
                batch['source_label'] = dom
                batch['search_keyword'] = kw
                all_metadata_list.append(batch)

        print(f"Done! ({time.time() - batch_start:.1f}s)")
        curr_start = curr_end
        time.sleep(1)  # pause between windows

if not all_metadata_list:
    print("No metadata found.")
else:
    # The same URL can be returned by several keywords or windows; keep its
    # first occurrence. Reset the index because each batch arrives with its
    # own 0..n labels, which would otherwise repeat in the combined frame.
    df_metadata = (
        pd.concat(all_metadata_list)
        .drop_duplicates(subset=['url'])
        .reset_index(drop=True)
    )
    print(f"\nPhase 1 Complete. Unique URLs identified: {len(df_metadata)}")

    print("\n--- Phase 2: Full-Text Extraction ---")
    CHECKPOINT_EVERY = 50  # write a partial CSV after this many new articles
    final_results = []
    for index, row in tqdm(df_metadata.iterrows(), total=len(df_metadata)):
        content = scrape_full_text(row['url'])
        if content:
            final_results.append({
                'url': row['url'],
                'title': row.get('title', 'N/A'),
                'publish_date': row.get('seendate', 'N/A'),
                'source': row['source_label'],
                'keyword': row['search_keyword'],
                'full_content': content
            })
            # Checkpoint only right after an article was added. Testing the
            # count on every iteration would write an empty file while the
            # count is still 0 and rewrite the same file after each failed
            # scrape whenever the count sits on a multiple of the interval.
            if len(final_results) % CHECKPOINT_EVERY == 0:
                pd.DataFrame(final_results).to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
        time.sleep(1.2)  # pause between article downloads

    if final_results:
        df_final = pd.DataFrame(final_results)
        df_final.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
        print(f"Total Valid Articles: {len(df_final)}")
    else:
        print("Scraping failed.")

System Check: Saving to /Users/oushilin/Desktop/SOSC314/Data/energy_narrative_W.csv

--- Phase 1: Metadata Retrieval ---
Keyword Group: [energy transition]
  Fetching: 2020-01-01 to 2020-01-15 ... Done! (3.0s)
  Fetching: 2020-01-15 to 2020-01-29 ... Done! (3.6s)
  Fetching: 2020-01-29 to 2020-02-12 ... Done! (3.3s)
  Fetching: 2020-02-12 to 2020-02-26 ... Done! (3.6s)
  Fetching: 2020-02-26 to 2020-03-11 ... Done! (3.6s)
  Fetching: 2020-03-11 to 2020-03-25 ... Done! (4.0s)
  Fetching: 2020-03-25 to 2020-04-08 ... Done! (3.3s)
  Fetching: 2020-04-08 to 2020-04-22 ... Done! (4.1s)
  Fetching: 2020-04-22 to 2020-05-06 ... Done! (5.0s)
  Fetching: 2020-05-06 to 2020-05-20 ... Done! (3.9s)
  Fetching: 2020-05-20 to 2020-06-03 ... Done! (3.7s)
  Fetching: 2020-06-03 to 2020-06-17 ... Done! (4.7s)
  Fetching: 2020-06-17 to 2020-07-01 ... Done! (4.1s)
  Fetching: 2020-07-01 to 2020-07-15 ... Done! (3.2s)
  Fetching: 2020-07-15 to 2020-07-29 ... Done! (4.3s)
  Fetching: 2020-07-29 to 2020-08-

100%|██████████| 5683/5683 [4:55:57<00:00,  3.12s/it]    

Total Valid Articles: 2117



