In [17]:
import os, sys, time, math, itertools, json
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path
import pandas as pd
from googleapiclient.discovery import build
from tqdm import tqdm   # progress bar

from sqlalchemy import create_engine
from urllib.parse import quote_plus  # safely URL-encode the driver name

In [18]:
# YouTube Data API key, read from the environment (exported beforehand or
# loaded from a .env file by the load_dotenv() call in the imports cell).
# Stop right away when it is absent or empty.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if API_KEY is None or API_KEY == "":
    sys.exit("Set YOUTUBE_API_KEY environment variable first!")

In [19]:
def chunks(seq, n):
    """Lazily split ``seq`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``n``.
    Each slice has whatever type slicing ``seq`` produces. Used to batch
    video ids.
    """
    total = len(seq)
    yield from (seq[start:start + n] for start in range(0, total, n))

In [20]:
def safe_get(item, path, default=None):
    """Safely drill into nested dicts.

    Follows ``path`` one key at a time, starting from ``item``, and returns the
    value found at the end.

    Args:
        item: The outermost dict to look into.
        path: Iterable of keys, outermost first. An empty path returns ``item``.
        default: Value returned when the lookup cannot be completed.

    Returns:
        The nested value, or ``default`` when a key is missing, when an
        intermediate value is not a dict, or when the value found is ``None``.
        Falsy but real values (``0``, ``""``, ``False``, ``[]``) are returned
        unchanged rather than being replaced by ``default``.
    """
    missing = object()  # sentinel: tells "key absent" apart from a stored falsy value
    current = item
    for key in path:
        if not isinstance(current, dict):
            # Cannot descend into a scalar, list or None.
            return default
        current = current.get(key, missing)
        if current is missing:
            return default
    return default if current is None else current

In [21]:
def build_service():
    """Create the YouTube Data API v3 client used by the fetch helpers.

    The client is built with the module-level ``API_KEY`` as developer key and
    with discovery-document caching turned off (``cache_discovery=False``).

    Reference for ``build``:
    https://googleapis.github.io/google-api-python-client/docs/epy/googleapiclient.discovery-module.html#build

        build(serviceName, version, developerKey=None, cache_discovery=True)

    ``serviceName`` and ``version`` are the names from the Discovery service;
    ``cache_discovery`` says whether or not to cache the discovery doc.
    """
    service = build("youtube", "v3", developerKey=API_KEY, cache_discovery=False)
    return service

In [22]:
def get_uploads_playlist_id(youtube, channel_id):
    """Step 1: one cheap call → uploads playlistId.

    Args:
        youtube: API client as returned by ``build_service()``.
        channel_id: Id of the channel whose uploads playlist is wanted.

    Returns:
        The value of ``contentDetails.relatedPlaylists.uploads`` of the first
        item in the ``channels.list`` response.

    Raises:
        ValueError: When the response has no items or lacks the uploads
            playlist entry. The original ``KeyError``/``IndexError`` is
            attached as ``__cause__`` so the traceback shows which part of the
            response was missing.
    """
    resp = youtube.channels().list(
        part="contentDetails",
        id=channel_id,
        maxResults=1
    ).execute()
    try:
        return resp["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
    except (KeyError, IndexError) as err:
        # Chain explicitly: the lookup failure is the cause, not a second error.
        raise ValueError("Channel ID not found or no public uploads.") from err

In [23]:
def get_all_video_ids(youtube, uploads_playlist_id):
    """Step 2: page through playlistItems; collect videoIds.

    Args:
        youtube: API client as returned by ``build_service()``.
        uploads_playlist_id: Id of the playlist to walk.

    Returns:
        List of video ids in the order the API returned them, across all pages.

    Any exception raised by the API call propagates to the caller; the
    progress bar is closed in that case too.
    """
    video_ids = []
    next_page = None
    # Context manager guarantees the bar is closed even if a request fails.
    with tqdm(desc="Fetching playlist pages", unit="page") as pbar:
        while True:
            resp = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=uploads_playlist_id,
                maxResults=50,      # API max
                pageToken=next_page
            ).execute()
            # A page without an "items" key is treated as an empty page.
            video_ids.extend(
                item["contentDetails"]["videoId"] for item in resp.get("items", [])
            )
            pbar.update(1)
            next_page = resp.get("nextPageToken")
            if not next_page:
                break
    return video_ids

In [24]:
def fetch_video_metadata(youtube, video_ids, batch_size=50, pause=0.1):
    """Step 3: batch-fetch videos.list in groups of ≤50 ids.

    Args:
        youtube: API client as returned by ``build_service()``.
        video_ids: Sequence of video id strings.
        batch_size: Number of ids sent per ``videos.list`` call (1 to 50).
        pause: Seconds to sleep after each batch; ``0`` disables the pause.

    Returns:
        ``pandas.DataFrame`` with one row per item returned by the API and the
        columns ``video_id``, ``title``, ``published_at``, ``description``,
        ``duration_ISO``, ``tags`` (joined with ``|``), ``view_count``,
        ``like_count``, ``comment_count``, ``favorite_count`` and
        ``channel_title``. The columns are present even when there are no rows.
        Counters missing from the response are stored as ``0``.

    Raises:
        ValueError: When ``batch_size`` is outside 1..50.
    """
    if not 1 <= batch_size <= 50:
        raise ValueError("batch_size must be between 1 and 50")

    columns = [
        "video_id", "title", "published_at", "description", "duration_ISO",
        "tags", "view_count", "like_count", "comment_count", "favorite_count",
        "channel_title",
    ]
    rows = []
    # Materialise the batches so tqdm knows the total and can show a percentage.
    batches = list(chunks(video_ids, batch_size))
    for batch in tqdm(batches, desc="Downloading metadata", unit="batch"):
        # maxResults is deliberately not sent: the API reference states it is
        # not supported together with the `id` filter.
        resp = youtube.videos().list(
            part="snippet,statistics,contentDetails",
            id=",".join(batch)
        ).execute()
        for v in resp.get("items", []):
            # A part missing from an item yields empty values, not a KeyError.
            sni = v.get("snippet", {})
            stats = v.get("statistics", {})
            cd = v.get("contentDetails", {})
            rows.append({
                "video_id"      : v["id"],
                "title"         : sni.get("title"),
                "published_at"  : sni.get("publishedAt"),
                "description"   : sni.get("description"),
                "duration_ISO"  : cd.get("duration"),       # e.g. PT13M20S
                "tags"          : "|".join(sni.get("tags", [])),
                "view_count"    : int(stats.get("viewCount", 0)),
                "like_count"    : int(stats.get("likeCount", 0)),
                "comment_count" : int(stats.get("commentCount", 0)),
                "favorite_count": int(stats.get("favoriteCount", 0)),
                "channel_title" : sni.get("channelTitle")
            })
        # polite pause – keeps you well below quota & QPS limits
        if pause and pause > 0:
            time.sleep(pause)
    # Explicit columns keep the schema stable when no rows were collected.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def df_to_azure_sql(df):
    """
    Write/append the dataframe into dbo.youtube_videos (Azure SQL DB)

    Connection settings are read from the environment variables
    ``AZSQL_SERVER``, ``AZSQL_DATABASE``, ``AZSQL_USERNAME`` and
    ``AZSQL_PASSWORD`` (the imports cell already calls ``load_dotenv()``).

    Args:
        df: ``pandas.DataFrame`` to append. An empty frame is a no-op.

    Raises:
        RuntimeError: When one of the required environment variables is unset
            or empty.

    Errors from SQLAlchemy / the ODBC driver propagate to the caller; the
    insert runs in a single transaction (``engine.begin()``).
    """
    if df.empty:
        return  # nothing to write; avoid opening a connection

    settings = {
        name: os.getenv(name)
        for name in ("AZSQL_SERVER", "AZSQL_DATABASE", "AZSQL_USERNAME", "AZSQL_PASSWORD")
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        # Fail before the literal text "None" ends up in the connection string.
        raise RuntimeError("Missing Azure SQL settings: " + ", ".join(missing))

    drv = "ODBC Driver 18 for SQL Server"            # keep spaces!
    # NOTE(review): values containing ';' would need ODBC brace-quoting; the
    # string is assembled verbatim here.
    odbc_str = (
        f"Driver={drv};Server=tcp:{settings['AZSQL_SERVER']},1433;"
        f"Database={settings['AZSQL_DATABASE']};"
        f"Uid={settings['AZSQL_USERNAME']};"
        f"Pwd={settings['AZSQL_PASSWORD']};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )

    # SQLAlchemy-style URL.  Space → + ;  parentheses → %28 %29, etc.
    params = quote_plus(odbc_str)
    engine = create_engine(
        f"mssql+pyodbc:///?odbc_connect={params}",
        fast_executemany=True        # batches rows under the hood
    )

    try:
        # Plain append: duplicates are NOT skipped.
        # NOTE(review): if the table has a primary key on video_id (not visible
        # here), re-inserting an existing id raises and the whole transaction
        # is rolled back — confirm whether a real upsert/MERGE is needed.
        with engine.begin() as cn:
            df.to_sql(
                name="youtube_videos",
                con=cn,
                schema="dbo",
                if_exists="append",       # create once, then append
                index=False,
                chunksize=1000,           # good balance of  network / TX
                # No method="multi": it would build one INSERT with
                # rows x columns bound parameters per chunk, which exceeds SQL
                # Server's 2100-parameter limit at this chunk size and bypasses
                # fast_executemany. The default executemany path is used.
            )
    finally:
        # The engine is created per call; release its connection pool.
        engine.dispose()

# To also load the results into Azure SQL, call df_to_azure_sql(df) from inside
# main(), right where the DataFrame is built (for example instead of, or after,
# the df.to_csv(...) line) and follow it with a confirmation print.
# It must not be called here: `df` is a local variable of main() and does not
# exist at notebook level, so a module-level call fails with NameError when the
# notebook is run top to bottom.

In [25]:
def main(channel_id, push_to_sql=False):
    """Fetch metadata for every upload of a channel and save it.

    Runs the three steps in order — resolve the uploads playlist, collect all
    video ids, download their metadata — then writes the result to
    ``<channel_id>_videos.csv`` in the current working directory.

    Args:
        channel_id: YouTube channel id to scrape.
        push_to_sql: When true, additionally append the DataFrame to Azure SQL
            via ``df_to_azure_sql`` after the CSV has been written. Defaults to
            ``False``, which keeps the CSV-only behaviour.
    """
    youtube = build_service()
    uploads_id = get_uploads_playlist_id(youtube, channel_id)
    print(f"Uploads playlist ID: {uploads_id}")
    ids = get_all_video_ids(youtube, uploads_id)
    print(f"Total videos: {len(ids):,}")
    df = fetch_video_metadata(youtube, ids)
    outfile = Path(f"{channel_id}_videos.csv")
    df.to_csv(outfile, index=False)
    print(f"Saved → {outfile.resolve()}")
    if push_to_sql:
        # `df` only exists in this scope, so the export has to happen here.
        df_to_azure_sql(df)
        print("Data pushed to Azure SQL 🎉")

In [26]:

# Script-style entry point: expects exactly one command-line argument, the
# channel id, and passes it to main(). The `1 == 0` term makes the condition
# always false, so this block never runs — presumably switched off so that
# executing the cell inside the notebook does not trip the argv check.
if 1 == 0 and __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        raise SystemExit("Usage: python fetch_channel_videos.py <CHANNEL_ID>")
    main(cli_args[0])

In [30]:
# Channel to scrape, taken from the VAUSH_CHANNEL_ID environment variable and
# echoed so the notebook records which channel the run below used.
CHANNEL_ID = os.environ.get("VAUSH_CHANNEL_ID")
print(CHANNEL_ID)

UC1E-JS8L0j1Ei70D9VEFrPQ


In [32]:

# Run the full pipeline for the configured channel.
main(channel_id=CHANNEL_ID)

Uploads playlist ID: UU1E-JS8L0j1Ei70D9VEFrPQ


Fetching playlist pages: 62page [00:10,  6.17page/s]


Total videos: 3,090


Downloading metadata: 100%|██████████| 62/62 [00:20<00:00,  3.02batch/s]

Saved → C:\Users\vboxuser\PycharmProjects\Youtube_Scraping\apps\UC1E-JS8L0j1Ei70D9VEFrPQ_videos.csv



