In [None]:
!pip -q install requests tqdm

import os, json, time, random
import requests
from tqdm import tqdm

In [None]:
# Read the API key from the environment so the secret never has to be typed
# into (and saved with) the notebook. Falls back to an empty string, which is
# what the placeholder used to be, when the variable is not set.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")

# Directory that receives the raw JSONL output; created on first run.
RAW_DIR = "raw_data"
os.makedirs(RAW_DIR, exist_ok=True)

# Upper bound on the number of comment rows written by the collection loop.
TARGET_ROWS = 5000

In [None]:
# Single requests.Session shared by every API call made through yt_get.
SESSION = requests.Session()

def polite_sleep(min_s=0.2, max_s=0.8):
    """Pause for a random duration between ``min_s`` and ``max_s`` seconds.

    Used to space out consecutive API requests with a little jitter.
    """
    pause = random.uniform(min_s, max_s)
    time.sleep(pause)

def yt_get(url, params, max_retries=5):
    """GET an API endpoint and return the decoded JSON body.

    Every attempt is preceded by ``polite_sleep()``. Failures that can
    plausibly succeed later (network errors, HTTP 429, 5xx, and a 403 whose
    body names a rate-limit reason) are retried with exponential backoff
    (1 s, 2 s, 4 s, ...). Any other 4xx response is treated as permanent and
    aborts immediately instead of burning the whole backoff schedule.

    Args:
        url: Endpoint URL.
        params: Query-string parameters for the request.
        max_retries: Maximum number of attempts.

    Returns:
        The parsed JSON payload of the first HTTP 200 response.

    Raises:
        RuntimeError: If no attempt produced an HTTP 200 response
            (including when ``max_retries`` is zero or negative).
    """
    status = None
    detail = "no request attempted"

    for attempt in range(max_retries):
        polite_sleep()
        try:
            r = SESSION.get(url, params=params, timeout=30)
        except requests.RequestException as exc:
            # Keep only the exception type: the full message may embed the
            # request URL, whose query string carries the API key.
            status, detail = None, type(exc).__name__
        else:
            if r.status_code == 200:
                return r.json()

            status, detail = r.status_code, r.text[:200]
            rate_limited = r.status_code == 429 or (
                r.status_code == 403 and "ratelimitexceeded" in r.text.lower()
            )
            if 400 <= r.status_code < 500 and not rate_limited:
                # Bad request / forbidden / not found: retrying cannot help.
                break

        # Back off only when another attempt will actually follow.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)

    raise RuntimeError(f"Failed: {url} status={status} body={detail}")

def append_jsonl(path, obj):
    """Append ``obj`` to the file at ``path`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(path, "a", encoding="utf-8") as handle:
        handle.writelines((json.dumps(obj, ensure_ascii=False), "\n"))

In [None]:
# Search phrases submitted to the video search, in the order they are run.
QUERIES = [
    # general terms
    "flu vaccine",
    "flu shot",
    "influenza vaccine",
    "influenza shot",
    # side-effect oriented terms
    "flu vaccine side effects",
    "flu shot side effects",
    # other phrasings
    "annual flu shot",
    "flu jab",
    "flu vaccination",
]

# Per-query limit on collected video IDs and per-video limit on comments.
MAX_VIDEOS_PER_QUERY = 30
COMMENTS_PER_VIDEO_CAP = 300

In [None]:
# YouTube Data API v3 endpoint queried by search_videos.
SEARCH_URL = "https://www.googleapis.com/youtube/v3/search"

def search_videos(query, max_videos=30):
    """Return up to ``max_videos`` unique video IDs matching ``query``.

    Pages through the search endpoint until enough distinct IDs have been
    collected or the results run out. IDs keep the order in which the API
    returned them.

    Args:
        query: Search phrase.
        max_videos: Maximum number of distinct video IDs to return.

    Returns:
        A list of unique video ID strings, at most ``max_videos`` long.

    Raises:
        RuntimeError: Propagated from ``yt_get`` when a request fails.
    """
    # A dict is used as an insertion-ordered set, so duplicates never count
    # toward the cap (previously they did, and the deduplicated result could
    # come back shorter than requested).
    unique_ids = {}
    page_token = None

    while len(unique_ids) < max_videos:
        params = {
            "key": YOUTUBE_API_KEY,
            "part": "snippet",
            "q": query,
            "type": "video",
            "maxResults": 50
        }
        if page_token:
            params["pageToken"] = page_token

        data = yt_get(SEARCH_URL, params)
        found_before = len(unique_ids)
        for it in data.get("items", []):
            vid = it.get("id", {}).get("videoId")
            if vid:
                unique_ids.setdefault(vid, None)
            if len(unique_ids) >= max_videos:
                break

        page_token = data.get("nextPageToken")
        if not page_token:
            break
        # A page that contributed nothing new means further paging is only
        # spending quota; stop instead of walking the remaining pages.
        if len(unique_ids) == found_before:
            break

    return list(unique_ids)

# Run every query and merge the results, keeping first-seen order and
# dropping IDs that more than one query returned.
merged_ids = {}
for query in QUERIES:
    for found_id in search_videos(query, MAX_VIDEOS_PER_QUERY):
        merged_ids.setdefault(found_id, None)

all_video_ids = list(merged_ids)
print("Total unique video IDs:", len(all_video_ids))
all_video_ids[:10]

Total unique video IDs: 127


['Z2UqlSo3G-A',
 'rnzuyLQkC6U',
 'HfpGKLsFgjQ',
 'DKByks4MbN4',
 'KiEQTUFmSFQ',
 'YHbvmOByiI0',
 '2h09oj26_H0',
 '4LNbKAatWvM',
 '1OutlE1Y0zg',
 'YZCRkFL6qH0']

In [None]:
COMMENT_THREADS_URL = "https://www.googleapis.com/youtube/v3/commentThreads"

raw_comments_path = os.path.join(RAW_DIR, "youtube_comments_raw.jsonl")
raw_videos_path = os.path.join(RAW_DIR, "youtube_video_ids.jsonl")

# Start from empty output files so re-running the cell does not append to a
# previous run's data.
for output_path in (raw_comments_path, raw_videos_path):
    with open(output_path, "w"):
        pass

# Persist the list of searched video IDs alongside the comments.
for vid in all_video_ids:
    append_jsonl(raw_videos_path, {"videoId": vid})

# Running state for the collection loop: IDs already written, rows written.
seen_comment_ids = set()
count = 0

def fetch_comments_for_video(video_id, cap=300):
    """Collect up to ``cap`` top-level comments for one video.

    Pages through the comment-threads endpoint, 100 threads per request,
    until ``cap`` records are gathered, a page comes back empty, or there is
    no further page. Threads without a top-level comment ID are skipped.
    Author names are deliberately not stored.

    Returns:
        A list of dicts with the keys ``kind``, ``commentId``, ``videoId``,
        ``publishedAt``, ``updatedAt``, ``textDisplay`` and ``likeCount``.
    """
    collected = []
    next_token = None

    while len(collected) < cap:
        request_params = dict(
            key=YOUTUBE_API_KEY,
            part="snippet",
            videoId=video_id,
            maxResults=100,
            textFormat="plainText",
        )
        if next_token:
            request_params["pageToken"] = next_token

        payload = yt_get(COMMENT_THREADS_URL, request_params)
        threads = payload.get("items", [])
        if not threads:
            break

        for thread in threads:
            top_comment = thread.get("snippet", {}).get("topLevelComment", {})
            comment_id = top_comment.get("id")
            if comment_id:
                # Only content and timing fields are kept; no usernames.
                details = top_comment.get("snippet", {})
                collected.append({
                    "kind": "youtube#comment",
                    "commentId": comment_id,
                    "videoId": video_id,
                    "publishedAt": details.get("publishedAt"),
                    "updatedAt": details.get("updatedAt"),
                    "textDisplay": details.get("textDisplay"),
                    "likeCount": details.get("likeCount"),
                })
                if len(collected) >= cap:
                    break

        next_token = payload.get("nextPageToken")
        if not next_token:
            break

    return collected

# Videos whose comment fetch raised; collection is best-effort, so these are
# skipped, but they are reported instead of disappearing silently.
failed_video_ids = []

for vid in tqdm(all_video_ids):
    if count >= TARGET_ROWS:
        break

    try:
        rows = fetch_comments_for_video(vid, COMMENTS_PER_VIDEO_CAP)
    except Exception as e:
        failed_video_ids.append(vid)
        reason = str(e)
        if YOUTUBE_API_KEY:
            # Error text can contain the request URL; never echo the key
            # into the notebook output.
            reason = reason.replace(YOUTUBE_API_KEY, "<redacted>")
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipping video {vid}: {type(e).__name__}: {reason[:200]}")
        continue

    for r in rows:
        if count >= TARGET_ROWS:
            break
        cid = r["commentId"]
        # The same comment must not be written twice.
        if cid in seen_comment_ids:
            continue
        seen_comment_ids.add(cid)

        append_jsonl(raw_comments_path, r)
        count += 1

print("Total comment rows saved:", count)
print("Videos skipped after fetch errors:", len(failed_video_ids))
print("Raw files:", raw_comments_path, raw_videos_path)

100%|██████████| 127/127 [07:39<00:00,  3.62s/it]

Total comment rows saved: 3896
Raw files: raw_data/youtube_comments_raw.jsonl raw_data/youtube_video_ids.jsonl





In [None]:
import os

# Show each file in the raw-data directory with its size, sorted by name.
print("Files in raw_data/:")
for entry_name in sorted(os.listdir("raw_data")):
    size_in_bytes = os.path.getsize(os.path.join("raw_data", entry_name))
    print("-", entry_name, size_in_bytes, "bytes")

Files in raw_data/:
- youtube_comments_raw.jsonl 1391660 bytes
- youtube_video_ids.jsonl 3429 bytes


In [None]:
def count_lines(path):
    """Return the number of lines in the UTF-8 text file at ``path``."""
    total = 0
    with open(path, "r", encoding="utf-8") as handle:
        for _line in handle:
            total += 1
    return total

# Sanity check: one row per line in each raw output file.
for label, jsonl_path in (
    ("YouTube comment rows:", "raw_data/youtube_comments_raw.jsonl"),
    ("Video IDs rows:", "raw_data/youtube_video_ids.jsonl"),
):
    print(label, count_lines(jsonl_path))

YouTube comment rows: 3896
Video IDs rows: 127


CSV export is intentionally deferred to Phase 2 to keep Phase 1 focused on raw data collection.