#### Data collection successfully completed on Feb 11, 2026.
#### Raw files were downloaded and preserved.
#### API free-tier limits may prevent immediate re-execution; note that re-running the collection cell truncates the preserved raw file before fetching.

In [None]:
!pip -q install requests

import os, json, time, random
import requests

In [None]:
# API key for GNews. Read from the GNEWS_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook; when the
# variable is unset this falls back to the empty placeholder.
GNEWS_API_KEY = os.environ.get("GNEWS_API_KEY", "")

# Directory that receives the raw API output; created on first run and left
# untouched if it already exists.
RAW_DIR = "raw_data"
os.makedirs(RAW_DIR, exist_ok=True)


In [None]:
# Single requests.Session object used for the GNews requests further down.
SESSION = requests.Session()

def polite_sleep(min_s=0.7, max_s=1.4):
    """Pause for a random duration between *min_s* and *max_s* seconds.

    The delay is ``min_s`` plus a random fraction of ``max_s - min_s``.
    """
    delay = random.uniform(min_s, max_s)
    time.sleep(delay)

def append_jsonl(path, obj):
    """Append *obj* to the file at *path* as one JSON document on its own line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written unescaped (``ensure_ascii=False``).
    """
    with open(path, mode="a", encoding="utf-8") as out:
        line = json.dumps(obj, ensure_ascii=False)
        out.write(line + "\n")

In [None]:
# GNews search endpoint; the collection loop further down requests the same URL.
GNEWS_URL = "https://gnews.io/api/v4/search"

# Parameters for a single probe request: one fixed query, first page only.
params = {
    "q": "flu vaccine",
    "lang": "en",
    "max": 10,
    "page": 1,
    "apikey": GNEWS_API_KEY
}

# Send one request and print the status code plus the first 200 characters of
# the body so the response can be inspected by eye.
# NOTE(review): presumably a key/connectivity check before the full run — confirm.
r = SESSION.get(GNEWS_URL, params=params, timeout=30)
print("Status:", r.status_code)
print("Body preview:", r.text[:200])

In [None]:
# Settings read by the collection loop further down.

# Free plan behavior: max=10 results per request
MAX_PER_REQUEST = 10
# Value sent as the "lang" request parameter.
LANG = "en"

# Increase query variety to reduce duplicates
# (the loop de-duplicates by article URL across all of these queries).
QUERIES = [
    "flu vaccine",
    "flu shot",
    "influenza vaccine",
    "influenza shot",
    "flu vaccination",
    "influenza vaccination",
    "flu jab",
    "FluMist",
    "flu shot side effects",
    "flu vaccine side effects",
    "flu shot effectiveness",
    "flu vaccine effectiveness",
    "flu vaccine safety",
    "seasonal flu vaccine",
    "annual flu shot",
    "CDC flu shot",
    "WHO influenza vaccine",
    "should I get a flu shot",
    "flu vaccine pregnancy",
    "flu shot for children",
    "flu shot for elderly",
    "flu vaccine myths"
]

# How deep to paginate per query (raise if you want; may hit daily limit)
# With the 22 queries above, this allows up to 22 * 10 = 220 requests per run.
MAX_PAGES_PER_QUERY = 10  # 10 pages * 10 results = up to 100 per query

In [None]:
# Collect GNews articles for every query in QUERIES and write each previously
# unseen article as one JSON line to RAW_DIR/gnews_articles_raw.jsonl.
#
# Reads:  RAW_DIR, GNEWS_API_KEY, GNEWS_URL, QUERIES, LANG, MAX_PER_REQUEST,
#         MAX_PAGES_PER_QUERY, SESSION, polite_sleep, append_jsonl.
# Leaves: raw_articles_path, seen_urls and total defined for later cells.
# Raises: RuntimeError if the API key is empty or the API returns a status
#         other than 200 / 403 / 429.
raw_articles_path = os.path.join(RAW_DIR, "gnews_articles_raw.jsonl")

# Fail BEFORE truncating: an empty key can only produce an error response, and
# truncating first would wipe a previously collected raw file for nothing.
if not GNEWS_API_KEY:
    raise RuntimeError(
        "GNEWS_API_KEY is empty; refusing to truncate the existing raw file."
    )

# Start from an empty file so a re-run does not append duplicates of an
# earlier run. WARNING: this discards any previously collected data.
with open(raw_articles_path, "w", encoding="utf-8"):
    pass

# Status codes that mean "stop collecting, keep what we have". The GNews
# documentation lists 429 for too many requests and 403 for an exhausted
# daily quota.
LIMIT_STATUS_CODES = (403, 429)

seen_urls = set()   # URLs already written, shared across all queries
total = 0           # number of articles written so far
limit_hit = False   # set when the API reports a rate/quota limit

for q in QUERIES:
    for page in range(1, MAX_PAGES_PER_QUERY + 1):
        polite_sleep()

        params = {
            "q": q,
            "lang": LANG,
            "max": MAX_PER_REQUEST,
            "page": page,
            "sortby": "publishedAt",
            "apikey": GNEWS_API_KEY
        }

        resp = SESSION.get(GNEWS_URL, params=params, timeout=30)

        # Handle rate limit / plan limit gracefully: everything saved so far
        # is already on disk, so just stop instead of raising.
        if resp.status_code in LIMIT_STATUS_CODES:
            print(f"Hit rate/request limit ({resp.status_code}). Stopping collection.")
            limit_hit = True
            break

        if resp.status_code != 200:
            raise RuntimeError(f"GNews error {resp.status_code}: {resp.text[:200]}")

        data = resp.json()
        articles = data.get("articles", [])
        if not articles:
            # No more pages for this query
            break

        added = 0
        for a in articles:
            url = a.get("url")
            if url:
                # De-duplicate by URL; articles without a URL are always kept.
                if url in seen_urls:
                    continue
                seen_urls.add(url)

            # Save the article fields as returned, tagged with a "kind" marker
            append_jsonl(raw_articles_path, {"kind": "gnews_article", **a})
            total += 1
            added += 1

        print(f"Query='{q}' page={page} added={added} total={total}")

        # If we added nothing new on this page, stop paginating this query
        if added == 0:
            break

    # A limit response ends the whole run, not just the current query. An
    # explicit flag is used instead of re-reading `resp`, which would be
    # unbound if the inner loop never executed.
    if limit_hit:
        break

print("Total unique articles saved:", total)
print("Raw file:", raw_articles_path)

In [None]:
def count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*.

    An empty file yields 0.
    """
    n_lines = 0
    with open(path, "r", encoding="utf-8") as handle:
        # enumerate leaves the index of the last line in n_lines
        for n_lines, _ in enumerate(handle, start=1):
            pass
    return n_lines

# Report how many lines the raw JSONL file contains (append_jsonl writes one
# line per saved object).
print("GNews article rows (JSONL lines):", count_lines(raw_articles_path))