In [None]:
import os
import time
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from tqdm import tqdm
from gdeltdoc import GdeltDoc, Filters 
from newspaper import Article, Config

# --- Search configuration ---------------------------------------------------
# Phrases to search for; the metadata phase runs one pass per phrase.
KEYWORDS_LIST = [
    "energy transition",
    "carbon neutrality",
    "climate policy",
]
# News-site domains each query is restricted to.
DOMAINS = [
    "chinadaily.com.cn",
    "en.people.cn",
]
# Bounds of the search period, written as YYYY-MM-DD strings.
START_DATE = "2020-01-01"
END_DATE = "2025-12-31"
# Length, in days, of each search window within the period.
WINDOW_DAYS = 14

# --- Output location --------------------------------------------------------
# Results go to ~/Desktop/SOSC314/Data; the folder is created if it is missing.
desktop_path = Path.home().joinpath("Desktop")
target_folder = desktop_path.joinpath("SOSC314", "Data")
target_folder.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = target_folder.joinpath("energy_narrative_2020_2025.csv")

print(f"System initialized. Data will be stored at: {OUTPUT_FILE}")

def get_metadata_batch(kw, domain, start_dt, end_dt):
    """Run one GDELT article search and return the result.

    Args:
        kw: Keyword (phrase) to search for.
        domain: Domain the search is restricted to.
        start_dt: Start of the search window, passed to ``Filters`` as
            ``start_date``.
        end_dt: End of the search window, passed to ``Filters`` as
            ``end_date``.

    Returns:
        Whatever ``GdeltDoc.article_search`` returns for the query. If the
        search raises, an empty ``pandas.DataFrame`` is returned instead so
        the caller can keep going; the failure is reported on stderr so it
        is not mistaken for a window that simply had no articles.
    """
    import sys  # local import: only needed for the failure diagnostic

    gd = GdeltDoc()
    f = Filters(keyword=kw, domain=domain, start_date=start_dt, end_date=end_dt)
    try:
        articles = gd.article_search(f)
    except Exception as exc:
        # Best-effort by design: one failed window must not abort the whole
        # crawl, but the failure should still be visible to the operator.
        print(
            f"[warn] GDELT query failed for {kw!r} on {domain} "
            f"({start_dt} to {end_dt}): {exc!r}",
            file=sys.stderr,
        )
        return pd.DataFrame()
    return articles

def scrape_full_text(url, *, min_length=150, timeout=15):
    """Download one article and return its extracted body text.

    Args:
        url: Address of the article to fetch.
        min_length: The extracted text is returned only if it is longer
            than this many characters; shorter results are discarded.
            Defaults to 150.
        timeout: Value assigned to the newspaper ``Config.request_timeout``
            setting. Defaults to 15.

    Returns:
        The article text, or ``None`` when the text is not longer than
        ``min_length`` or when downloading/parsing raised any exception.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'

    config = Config()
    # Send a desktop-browser User-Agent instead of the library default.
    config.browser_user_agent = user_agent
    config.request_timeout = timeout
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
    except Exception:
        # Best-effort scraping: a page that cannot be fetched or parsed is
        # skipped rather than stopping the whole run.
        return None

    text = article.text
    if len(text) > min_length:
        return text
    return None

# ---------------------------------------------------------------------------
# Phase 1: collect article metadata, one keyword at a time, by walking the
# search period in consecutive windows of WINDOW_DAYS days and querying every
# domain for each window.
# ---------------------------------------------------------------------------
all_metadata_list = []
print(f"\n--- Phase 1: Metadata Retrieval ({START_DATE} to {END_DATE}) ---")

# The period bounds are the same for every keyword, so parse them once.
range_start = datetime.strptime(START_DATE, '%Y-%m-%d')
final_end = datetime.strptime(END_DATE, '%Y-%m-%d')
window_span = timedelta(days=WINDOW_DAYS)

for kw in KEYWORDS_LIST:
    print(f"\nSearching Keyword: [{kw}]")
    curr_start = range_start

    while curr_start < final_end:
        # Clamp the window so the last one stops at END_DATE instead of
        # requesting dates beyond the configured period.
        # NOTE(review): whether the search treats the end date as inclusive
        # is not visible here -- confirm against the gdeltdoc documentation.
        curr_end = min(curr_start + window_span, final_end)
        s_str = curr_start.strftime('%Y-%m-%d')
        e_str = curr_end.strftime('%Y-%m-%d')

        print(f"  Fetching: {s_str} to {e_str} ... ", end="", flush=True)

        start_time = time.time()
        for dom in DOMAINS:
            batch = get_metadata_batch(kw, dom, s_str, e_str)
            if not batch.empty:
                # Remember which domain and keyword produced these rows.
                batch['source_label'] = dom
                batch['search_keyword'] = kw
                all_metadata_list.append(batch)

        end_time = time.time()
        print(f"Done! ({end_time - start_time:.1f}s)")

        # The next window starts where this one ended.
        curr_start = curr_end
        # Pause between windows to space out the requests.
        time.sleep(1)

# ---------------------------------------------------------------------------
# Phase 2: fetch the full text of every unique article found in Phase 1 and
# write the successfully scraped records to OUTPUT_FILE.
# ---------------------------------------------------------------------------
if not all_metadata_list:
    print("\nError: No metadata retrieved. Check connection.")
else:
    # The same URL can come back from several keywords or windows; keep only
    # the first row seen for each URL.
    df_metadata = pd.concat(all_metadata_list).drop_duplicates(subset=['url'])
    print(f"\nTotal Unique Articles Found: {len(df_metadata)}")

    print("\n--- Phase 2: Content Scraping ---")
    final_results = []
    checkpoint_every = 20  # write a partial CSV after this many new records

    for _idx, row in tqdm(df_metadata.iterrows(), total=len(df_metadata)):
        content = scrape_full_text(row['url'])
        if content:
            final_results.append({
                'url': row['url'],
                'title': row.get('title', 'N/A'),
                'publish_date': row.get('seendate', 'N/A'),
                'source': row['source_label'],
                'keyword': row['search_keyword'],
                'full_content': content
            })

            # Checkpoint only right after a record was added. Testing the
            # count on every iteration would rewrite the whole file for each
            # failed scrape while the count sits on a multiple of the
            # checkpoint size.
            if len(final_results) % checkpoint_every == 0:
                pd.DataFrame(final_results).to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')

        # Pause between requests whether or not the scrape succeeded.
        time.sleep(1.2)

    if final_results:
        # Final write so records added after the last checkpoint are saved.
        df_final = pd.DataFrame(final_results)
        df_final.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
        print(f"\nSuccess! Total valid records: {len(df_final)}")
        print(f"Path: {OUTPUT_FILE}")
    else:
        print("\nProcess finished with 0 valid records.")