In [1]:
import os
import sys
import time
import requests
import pandas as pd
from datetime import datetime, timedelta

In [7]:


# ── CONFIGURATION ─────────────────────────────────────────────────────────────
API_KEY              = "3170106d1a0e96162d6bd13e1921de31"

OUTPUT_CSV   = 'gnews_articles.csv'
INTERVALS    = 52           # one per week
DESIRED_PER  = 20           # target per interval
PER_PAGE_MAX = 10           # free tier limit
PAGES_PER    = (DESIRED_PER + PER_PAGE_MAX - 1) // PER_PAGE_MAX  # = 2

def save_results(articles, filename=OUTPUT_CSV):
    df = pd.DataFrame(articles)
    df.to_csv(filename, index=False)
    print(f"[{datetime.utcnow():%Y-%m-%d %H:%M:%S}] Saved {len(df)} articles")

def fetch_uniform_articles():
    end_dt   = datetime.utcnow()
    start_dt = end_dt - timedelta(days=365)
    # split into 53 timestamps → 52 weekly spans
    splits = pd.date_range(start=start_dt, end=end_dt, periods=INTERVALS+1).to_pydatetime()
    all_articles = []

    for i in range(INTERVALS):
        frm = splits[i].strftime('%Y-%m-%dT%H:%M:%SZ')
        to  = splits[i+1].strftime('%Y-%m-%dT%H:%M:%SZ')

        fetched_this_interval = 0
        for page in range(1, PAGES_PER+1):
            params = {
                'q':       'news',       # all topics
                'from':    frm,
                'to':      to,
                'lang':    'en',
                'country': 'us',
                'max':     PER_PAGE_MAX,
                'page':    page,
                'token':   API_KEY
            }
            try:
                r = requests.get('https://gnews.io/api/v4/search', params=params, timeout=10)
                r.raise_for_status()
                batch = r.json().get('articles', [])
                for art in batch:
                    all_articles.append({
                        'title':       art.get('title'),
                        'description': art.get('description'),
                        'url':         art.get('url'),
                        'publishedAt': art.get('publishedAt'),
                        'source_name': art.get('source', {}).get('name')
                    })
                fetched_this_interval += len(batch)
                print(f"✅ Week {i+1}/{INTERVALS}, page {page}: {len(batch)} articles")
            except Exception as e:
                print(f"⚠️  Week {i+1}, page {page} failed: {e}", file=sys.stderr)
            time.sleep(1)  # throttle between requests

        print(f"— Week {i+1} total: {fetched_this_interval} articles")
        save_results(all_articles)

    return all_articles

if __name__ == '__main__':
    fetch_uniform_articles()


✅ Week 1/52, page 1: 10 articles
✅ Week 1/52, page 2: 10 articles
— Week 1 total: 20 articles
[2025-04-20 15:34:51] Saved 20 articles
✅ Week 2/52, page 1: 10 articles
✅ Week 2/52, page 2: 10 articles
— Week 2 total: 20 articles
[2025-04-20 15:34:54] Saved 40 articles
✅ Week 3/52, page 1: 10 articles
✅ Week 3/52, page 2: 10 articles
— Week 3 total: 20 articles
[2025-04-20 15:34:57] Saved 60 articles
✅ Week 4/52, page 1: 10 articles
✅ Week 4/52, page 2: 10 articles
— Week 4 total: 20 articles
[2025-04-20 15:35:01] Saved 80 articles
✅ Week 5/52, page 1: 10 articles
✅ Week 5/52, page 2: 10 articles
— Week 5 total: 20 articles
[2025-04-20 15:35:04] Saved 100 articles
✅ Week 6/52, page 1: 10 articles
✅ Week 6/52, page 2: 10 articles
— Week 6 total: 20 articles
[2025-04-20 15:35:07] Saved 120 articles
✅ Week 7/52, page 1: 10 articles
✅ Week 7/52, page 2: 10 articles
— Week 7 total: 20 articles
[2025-04-20 15:35:10] Saved 140 articles
✅ Week 8/52, page 1: 10 articles
✅ Week 8/52, page 2: 10 ar

⚠️  Week 51, page 1 failed: 403 Client Error: Forbidden for url: https://gnews.io/api/v4/search?q=news&from=2025-04-06T14%3A39%3A25Z&to=2025-04-13T15%3A07%3A06Z&lang=en&country=us&max=10&page=1&token=3170106d1a0e96162d6bd13e1921de31
⚠️  Week 51, page 2 failed: 403 Client Error: Forbidden for url: https://gnews.io/api/v4/search?q=news&from=2025-04-06T14%3A39%3A25Z&to=2025-04-13T15%3A07%3A06Z&lang=en&country=us&max=10&page=2&token=3170106d1a0e96162d6bd13e1921de31


— Week 51 total: 0 articles
[2025-04-20 15:37:33] Saved 1000 articles


⚠️  Week 52, page 1 failed: 403 Client Error: Forbidden for url: https://gnews.io/api/v4/search?q=news&from=2025-04-13T15%3A07%3A06Z&to=2025-04-20T15%3A34%3A48Z&lang=en&country=us&max=10&page=1&token=3170106d1a0e96162d6bd13e1921de31
⚠️  Week 52, page 2 failed: 403 Client Error: Forbidden for url: https://gnews.io/api/v4/search?q=news&from=2025-04-13T15%3A07%3A06Z&to=2025-04-20T15%3A34%3A48Z&lang=en&country=us&max=10&page=2&token=3170106d1a0e96162d6bd13e1921de31


— Week 52 total: 0 articles
[2025-04-20 15:37:36] Saved 1000 articles


In [12]:
import pandas as pd

# 1. Load both CSVs, parsing dates
other_df = pd.read_csv(
    'old_data.csv',
    parse_dates=['publish_date'],
    date_parser=lambda col: pd.to_datetime(col, utc=True)
)
gnews_df = pd.read_csv(
    'gnews_articles.csv',
    parse_dates=['publishedAt'],
    date_parser=lambda col: pd.to_datetime(col, utc=True)
)

# 2. Strip timezone info from both (make tz-naive)
other_df['publish_date'] = other_df['publish_date'].dt.tz_localize(None)
gnews_df['publishedAt']  = gnews_df['publishedAt'].dt.tz_localize(None)

# 3. Select & rename
other_trim = other_df[['publish_date', 'summary']].rename(
    columns={'publish_date': 'parsed_date', 'summary': 'Headline'}
)
gnews_trim = gnews_df[['publishedAt', 'title']].rename(
    columns={'publishedAt': 'parsed_date', 'title': 'Headline'}
)

# 4. Combine & sort
combined = pd.concat([other_trim, gnews_trim], ignore_index=True)
combined = combined.sort_values('parsed_date').reset_index(drop=True)

combined.to_csv('all_headlines.csv', index=False)

print(combined.head())


          parsed_date                                           Headline
0 2024-04-27 11:27:01  MMA News Roundup: Joe Rogan Gives His Verdict ...
1 2024-04-27 11:27:01  MMA News Roundup: Joe Rogan Gives His Verdict ...
2 2024-04-27 12:54:46  Exclusive: Apple's Upcoming Beats Solo Buds & ...
3 2024-04-27 12:54:46  Exclusive: Apple's Upcoming Beats Solo Buds & ...
4 2024-04-27 13:00:20  Match Group CEO Bernard Kim on romance scams: ...


  other_df = pd.read_csv(
  gnews_df = pd.read_csv(
