In [19]:
import os, sys, time, math, itertools, json
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path
import pandas as pd
from googleapiclient.discovery import build
from tqdm import tqdm   # progress bar

from sqlalchemy import create_engine
from urllib.parse import quote_plus  # safely URL-encode the driver name

In [20]:
# YouTube Data API key, taken from the process environment (load_dotenv() is
# called at import time, so a .env file can supply it).  Stop right away when
# the key is absent or empty.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if API_KEY is None or API_KEY == "":
    raise SystemExit("Set YOUTUBE_API_KEY environment variable first!")

In [21]:
def chunks(seq, n):
    """Yield successive n-sized chunks from seq (used for video-id batching).

    Args:
        seq: A sliceable sequence that supports ``len()`` (list, tuple, str).
        n: Maximum chunk size; must be at least 1.

    Yields:
        Consecutive slices of ``seq`` holding ``n`` items each; the final
        slice may be shorter.

    Raises:
        ValueError: If ``n`` is smaller than 1.  This is a generator, so the
            error surfaces when iteration starts.
    """
    if n < 1:
        # A negative step makes range() empty, which would silently drop
        # every item instead of reporting the bad chunk size.
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for start in range(0, len(seq), n):
        yield seq[start:start + n]

In [22]:
def safe_get(item, path, default=None):
    """Safely drill into nested dicts.

    Args:
        item: The outermost dict (any other type is tolerated).
        path: Iterable of keys to follow, outermost first.
        default: Value returned when the path cannot be resolved.

    Returns:
        The value found at ``path``.  ``default`` is returned when a key is
        missing, when an intermediate value is not a dict, or when the value
        found is ``None``.  Falsy-but-real values such as ``0``, ``""`` and
        ``False`` are returned as they are.
    """
    missing = object()   # sentinel: distinguishes "key absent" from a stored falsy value
    current = item
    for key in path:
        if not isinstance(current, dict):
            # Cannot descend into None / list / str etc. -- give up quietly.
            return default
        current = current.get(key, missing)
        if current is missing:
            return default
    return default if current is None else current

In [23]:
def build_service():
    """Create the YouTube Data API v3 client, authenticated with ``API_KEY``.

    Thin wrapper around ``googleapiclient.discovery.build``, whose signature is
    ``build(serviceName, version, developerKey=None, cache_discovery=True)``:
    https://googleapis.github.io/google-api-python-client/docs/epy/googleapiclient.discovery-module.html#build

    * ``serviceName`` / ``version`` are the names from the Discovery service.
    * ``cache_discovery`` is a Boolean saying whether or not to cache the
      discovery doc; it is switched off here.

    Returns:
        The client object produced by ``build``.
    """
    client_kwargs = {"developerKey": API_KEY, "cache_discovery": False}
    return build("youtube", "v3", **client_kwargs)

In [24]:
def get_uploads_playlist_id(youtube, channel_id):
    """Step 1: a single channels.list call -> the channel's uploads playlistId.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_id: ID of the channel to look up.

    Returns:
        The ``contentDetails.relatedPlaylists.uploads`` value of the first
        item in the response.

    Raises:
        ValueError: If the response has no items or lacks that field.
    """
    request = youtube.channels().list(
        part="contentDetails",
        id=channel_id,
        maxResults=1,
    )
    payload = request.execute()
    try:
        first_channel = payload["items"][0]
        related = first_channel["contentDetails"]["relatedPlaylists"]
        return related["uploads"]
    except (KeyError, IndexError):
        raise ValueError("Channel ID not found or no public uploads.")

In [25]:
def get_all_video_ids(youtube, uploads_playlist_id):
    """Step 2: page through playlistItems; collect videoIds.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        uploads_playlist_id: ID of the playlist to walk.

    Returns:
        List of every ``contentDetails.videoId`` found, in response order.
    """
    video_ids = []
    next_page = None
    # The context manager closes the progress bar even when a request raises
    # part-way through pagination, so no half-drawn bar is left behind.
    with tqdm(desc="Fetching playlist pages", unit="page") as pbar:
        while True:
            resp = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=uploads_playlist_id,
                maxResults=50,      # API max
                pageToken=next_page
            ).execute()
            # A page without an "items" key is treated as an empty page.
            video_ids.extend(
                item["contentDetails"]["videoId"] for item in resp.get("items", [])
            )
            pbar.update(1)
            next_page = resp.get("nextPageToken")
            if not next_page:   # no token -> this was the last page
                break
    return video_ids

In [26]:
def fetch_video_metadata(youtube, video_ids):
    """Step 3: batch-fetch videos.list in groups of <=50 ids.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: Sequence of video-id strings.

    Returns:
        A DataFrame with one row per returned video and the columns listed in
        ``columns`` below.  The columns are present even when no video comes
        back, so downstream code can rely on the schema.
    """
    columns = [
        "video_id", "title", "published_at", "description", "duration_ISO",
        "tags", "view_count", "like_count", "comment_count", "favorite_count",
        "channel_title",
    ]
    rows = []
    # Materialised as a list so tqdm knows the total number of batches.
    batches = list(chunks(video_ids, 50))
    for batch in tqdm(batches, desc="Downloading metadata", unit="batch"):
        # maxResults is not passed: the videos.list reference documents it as
        # unsupported in combination with the `id` filter.
        resp = youtube.videos().list(
            part="snippet,statistics,contentDetails",
            id=",".join(batch),
        ).execute()
        for v in resp.get("items", []):
            # A part missing from an item becomes an empty dict rather than
            # a KeyError that would abort the whole download.
            sni = v.get("snippet", {})
            stats = v.get("statistics", {})
            cd = v.get("contentDetails", {})
            rows.append({
                "video_id"      : v["id"],
                "title"         : sni.get("title"),
                "published_at"  : sni.get("publishedAt"),
                "description"   : sni.get("description"),
                "duration_ISO"  : cd.get("duration"),       # e.g. PT13M20S
                "tags"          : "|".join(sni.get("tags", [])),
                # Counters absent from the response are stored as 0.
                "view_count"    : int(stats.get("viewCount", 0)),
                "like_count"    : int(stats.get("likeCount", 0)),
                "comment_count" : int(stats.get("commentCount", 0)),
                "favorite_count": int(stats.get("favoriteCount", 0)),
                "channel_title" : sni.get("channelTitle")
            })
        # polite pause between batches
        time.sleep(0.1)
    return pd.DataFrame(rows, columns=columns)

In [27]:
def normalize_dataframe(df):
    """Return a normalized copy of the video-metadata frame.

    * ``published_at`` is parsed as a UTC timestamp and then made tz-naive.
    * ``view_count``, ``like_count``, ``comment_count`` and ``favorite_count``
      have missing values replaced by 0 and are cast to pandas nullable
      ``Int64``.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        A new DataFrame with the converted columns.
    """
    out = df.copy()

    out["published_at"] = (
        pd.to_datetime(out["published_at"], utc=True)  # parse ISO-8601
          .dt.tz_convert(None)                         # drop the UTC tz-info
    )

    # Make sure the counters are true integers rather than NaN-bearing floats.
    num_cols = ["view_count", "like_count", "comment_count", "favorite_count"]
    for col in num_cols:
        out[col] = out[col].fillna(0).astype("Int64")
    return out

In [28]:
# build engine (same style you've been using)
def select_all_azure_sql(driver="ODBC Driver 18 for SQL Server"):
    """Read every row of YOUTUBE_API.Vaush_VIDEOS into a DataFrame.

    Connection settings come from the environment variables AZSQL_SERVER,
    AZSQL_DATABASE, AZSQL_USERNAME and AZSQL_PASSWORD.

    Args:
        driver: Name of the ODBC driver to put in the connection string.
            NOTE(review): the recorded run of this notebook failed with
            IM002 ("Data source name not found and no default driver
            specified"); presumably the default driver is not installed on
            that machine, in which case pass the installed driver's name.

    Returns:
        DataFrame holding the result of ``SELECT *`` on the table.

    Raises:
        RuntimeError: If any of the required environment variables is unset
            or empty (otherwise the literal text "None" would end up in the
            connection string and fail far less clearly).
    """
    required = ("AZSQL_SERVER", "AZSQL_DATABASE", "AZSQL_USERNAME", "AZSQL_PASSWORD")
    settings = {name: os.getenv(name) for name in required}
    missing = [name for name in required if not settings[name]]
    if missing:
        # Only variable names are reported -- never their values.
        raise RuntimeError(
            "Missing Azure SQL environment variable(s): " + ", ".join(missing)
        )

    odbc_str = (
        f"DRIVER={{{driver}}};"
        f"SERVER=tcp:{settings['AZSQL_SERVER']},1433;"
        f"DATABASE={settings['AZSQL_DATABASE']};"
        f"UID={settings['AZSQL_USERNAME']};"
        f"PWD={settings['AZSQL_PASSWORD']};"
        "Encrypt=yes;"
        "TrustServerCertificate=no;"
        "Connection Timeout=30;"
    )

    params = quote_plus(odbc_str)
    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
    try:
        # read table into pandas
        return pd.read_sql("SELECT * FROM YOUTUBE_API.Vaush_VIDEOS", engine)
    finally:
        # A fresh engine is built on every call; release its pool explicitly.
        engine.dispose()

In [None]:
def df_to_azure_sql(df):
    """
    Draft cell: despite its name, this version only READS the table
    YOUTUBE_API.Vaush_VIDEOS (Azure SQL DB) and returns it as a DataFrame.

    NOTE(review): a later cell defines ``df_to_azure_sql`` again with the
    actual append logic; after a top-to-bottom run that later definition is
    the one in effect, so this draft is a candidate for removal.

    Args:
        df: Unused; kept so the signature matches the later definition.

    Returns:
        DataFrame holding the result of ``SELECT *`` on the table.
    """

    drv = "ODBC Driver 18 for SQL Server"            # keep spaces!
    odbc_str = (
        f"Driver={drv};Server=tcp:{os.getenv('AZSQL_SERVER')},1433;"
        f"Database={os.getenv('AZSQL_DATABASE')};"
        f"Uid={os.getenv('AZSQL_USERNAME')};"
        f"Pwd={os.getenv('AZSQL_PASSWORD')};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )

    # The ODBC string is URL-encoded so it can be embedded in the SQLAlchemy URL.
    params = quote_plus(odbc_str)
    engine = create_engine(
        f"mssql+pyodbc:///?odbc_connect={params}",
        fast_executemany=True
    )

    try:
        # A plain connection is enough for a read; the query runs on that
        # connection instead of opening a second one through the engine.
        with engine.connect() as cn:
            table_df = pd.read_sql("SELECT * FROM YOUTUBE_API.Vaush_VIDEOS", cn)
    finally:
        engine.dispose()   # release the pool created for this one call
    return table_df

In [29]:
def filter_new_videos(df_in_database, df_from_api, key="video_id"):
    """Keep only the API rows whose video id is not already in the database.

    The key column is matched case-insensitively in each frame, because the
    frame built by ``fetch_video_metadata`` names it ``video_id`` while the
    original lookup here used ``VIDEO_ID``.

    Args:
        df_in_database: Frame of rows already stored.
        df_from_api: Frame of freshly fetched rows.
        key: Name of the id column (compared case-insensitively).

    Returns:
        The subset of ``df_from_api`` whose key value does not occur in
        ``df_in_database``.

    Raises:
        KeyError: If either frame has no column matching ``key``.
    """
    def _resolve(frame, label):
        # Find the actual column label whose case-folded text equals the key.
        wanted = key.casefold()
        for col in frame.columns:
            if str(col).casefold() == wanted:
                return col
        raise KeyError(f"{label} has no {key!r} column (case-insensitive match)")

    db_col = _resolve(df_in_database, "df_in_database")
    api_col = _resolve(df_from_api, "df_from_api")
    known_ids = set(df_in_database[db_col])
    return df_from_api[~df_from_api[api_col].isin(known_ids)]

In [30]:
def df_to_azure_sql(df, driver="ODBC Driver 18 for SQL Server"):
    """
    Append the dataframe to the table YOUTUBE_API.Vaush_VIDEOS (Azure SQL DB).

    Connection settings come from the environment variables AZSQL_SERVER,
    AZSQL_DATABASE, AZSQL_USERNAME and AZSQL_PASSWORD.

    This is a plain append, not an upsert: the insert runs inside one
    transaction, so if any row is rejected (for example by a primary-key
    constraint on the target table, if one exists) the exception propagates
    and nothing is written.  Remove already-stored rows first, e.g. with
    ``filter_new_videos``.

    Args:
        df: Rows to append.  An empty frame is a no-op and needs no database
            connection.
        driver: Name of the ODBC driver to put in the connection string.

    Raises:
        RuntimeError: If any of the required environment variables is unset
            or empty.
    """
    if df.empty:
        return   # nothing to write

    required = ("AZSQL_SERVER", "AZSQL_DATABASE", "AZSQL_USERNAME", "AZSQL_PASSWORD")
    settings = {name: os.getenv(name) for name in required}
    missing = [name for name in required if not settings[name]]
    if missing:
        # Only variable names are reported -- never their values.
        raise RuntimeError(
            "Missing Azure SQL environment variable(s): " + ", ".join(missing)
        )

    # Driver name wrapped in braces, matching select_all_azure_sql.
    odbc_str = (
        f"Driver={{{driver}}};Server=tcp:{settings['AZSQL_SERVER']},1433;"
        f"Database={settings['AZSQL_DATABASE']};"
        f"Uid={settings['AZSQL_USERNAME']};"
        f"Pwd={settings['AZSQL_PASSWORD']};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )

    # The ODBC string is URL-encoded so it can be embedded in the SQLAlchemy URL.
    params = quote_plus(odbc_str)
    engine = create_engine(
        f"mssql+pyodbc:///?odbc_connect={params}",
        fast_executemany=True        # batches rows under the hood
    )

    try:
        # engine.begin() commits on success and rolls back on any exception.
        with engine.begin() as cn:
            df.to_sql(
                name="Vaush_VIDEOS",
                con=cn,
                schema="YOUTUBE_API",
                if_exists="append",       # create once, then append
                index=False,
                chunksize=1000,           # rows written per batch
            )
    finally:
        # A fresh engine is built on every call; release its pool explicitly.
        engine.dispose()

In [31]:
def main(channel_id):
    """Fetch and normalize metadata for every upload of ``channel_id``.

    Pipeline: build the API client, resolve the uploads playlist, collect all
    video ids, download their metadata, then normalize the resulting frame.
    The playlist id and the video count are printed along the way.

    Args:
        channel_id: ID of the channel to harvest.

    Returns:
        The DataFrame returned by ``normalize_dataframe``.
    """
    client = build_service()
    playlist_id = get_uploads_playlist_id(client, channel_id)
    print(f"Uploads playlist ID: {playlist_id}")
    all_ids = get_all_video_ids(client, playlist_id)
    print(f"Total videos: {len(all_ids):,}")
    metadata = fetch_video_metadata(client, all_ids)
    # The Azure SQL upload (df_to_azure_sql) is deliberately not called here.
    return normalize_dataframe(metadata)

In [32]:
# Command-line entry point, deliberately switched off: the constant False
# keeps this branch from ever running.
if False and __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        raise SystemExit("Usage: python fetch_channel_videos.py <CHANNEL_ID>")
    main(cli_args[0])

In [33]:
# Channel to harvest, read from the VAUSH_CHANNEL_ID environment variable
# (None when the variable is unset); echoed so the run records which one was used.
CHANNEL_ID = os.environ.get("VAUSH_CHANNEL_ID")
print(CHANNEL_ID)

UC1E-JS8L0j1Ei70D9VEFrPQ


In [34]:
# Run the whole fetch pipeline; the returned DataFrame is this cell's output.
main(channel_id=CHANNEL_ID)

Uploads playlist ID: UU1E-JS8L0j1Ei70D9VEFrPQ


Fetching playlist pages: 68page [00:07,  8.80page/s]


Total videos: 3,370


Downloading metadata: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:19<00:00,  3.53batch/s]


Unnamed: 0,video_id,title,published_at,description,duration_ISO,tags,view_count,like_count,comment_count,favorite_count,channel_title
0,0ZVCjgjSecM,Trump Voters Asked For This,2025-12-07 15:21:12,ðŸ”´ Website & Livestream Chat - https://www.vau...,PT15M17S,trump|trump fox|trump news|trump 2025|trump li...,149860,6775,1256,0,Vaush
1,bFTu4t77YP0,Trump's Admin Is An ABSOLUTE MESS,2025-12-06 16:07:27,ðŸ”´ Website & Livestream Chat - https://www.vau...,PT24M16S,trump|trump fox|trump news|trump 2025|trump li...,121640,4648,730,0,Vaush
2,Pde6FQ0btOY,The Hillary Clinton Situation Is Wild,2025-12-05 18:19:26,ðŸ”´ Website & Livestream Chat - https://www.vau...,PT9M46S,israel: hillary clinton visit|israel: hillary ...,81018,3917,801,0,Vaush
3,DOWnEyXHqK4,The World Is About To Get So Much Worse,2025-12-05 00:35:19,ðŸ”´ Website & Livestream Chat - https://www.vau...,PT55M12S,ai gambling|ai gambling tools|gambling with ai...,131963,5693,1243,0,Vaush
4,ll9OWyZ6rRc,"The Most Unhinged, Racist Thing Trump Has Ever...",2025-12-03 18:20:46,ðŸ”´ Website & Livestream Chat - https://www.vau...,PT13M32S,trump|trump fox|trump news|trump 2025|trump li...,146863,6004,1175,0,Vaush
...,...,...,...,...,...,...,...,...,...,...,...
3365,4CZUNd-N1ko,How to TRIGGER Reactionaries with FACTS and LOGIC,2019-02-03 03:39:16,Please be critical of me down in the comments ...,PT14M24S,debate|socialism|reactionary|reactionaries|alt...,192366,10201,1374,0,Vaush
3366,3G9x8rgrGWQ,1/25/19 Stream Debate with the LEGENDARY Paste...,2019-01-27 06:40:21,Hopefully the first of many debates to come.\n...,PT2H2M29S,leftism|socialism|homophobia|lgbt|transphobia|...,18091,466,115,0,Vaush
3367,c8yQDtLeb14,My Longest Yeah Boi Yet + Thoughts on Israel,2019-01-23 23:37:19,This is the one I'll be remembered for. UpGera...,PT1M13S,israel|socialism|yea boi|politics,30695,1446,75,0,Vaush
3368,JlJkVPn2NRM,A Dialogue on Violent Revolution,2019-01-22 23:05:01,Watch me struggle to find a format which works...,PT14M55S,politics|political violence|revolution|leftism...,27643,1380,165,0,Vaush


In [36]:
# Same steps as main(), run at module level so that the intermediate objects
# (youtube, uploads_id, ids, df) stay available to later cells.
# NOTE(review): this repeats the full API download already done by the
# main(CHANNEL_ID) cell -- consider keeping only one of the two.
youtube = build_service()
uploads_id = get_uploads_playlist_id(youtube, CHANNEL_ID)
print(f"Uploads playlist ID: {uploads_id}")
ids = get_all_video_ids(youtube, uploads_id)
print(f"Total videos: {len(ids):,}")
df = fetch_video_metadata(youtube, ids)
df = normalize_dataframe(df)
# Upload step left disabled:
#df_to_azure_sql(df)
#print("Data pushed to Azure SQL 🎉")

Uploads playlist ID: UU1E-JS8L0j1Ei70D9VEFrPQ


Fetching playlist pages: 68page [00:07,  8.67page/s]


Total videos: 3,370


Downloading metadata: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 68/68 [00:19<00:00,  3.49batch/s]


In [37]:
# Load the rows already stored in YOUTUBE_API.Vaush_VIDEOS.
# NOTE(review): the recorded output of this cell is an IM002 InterfaceError
# from the ODBC driver manager, so df_in_db was not created in that run.
df_in_db = select_all_azure_sql()

InterfaceError: (pyodbc.InterfaceError) ('IM002', '[IM002] [Microsoft][ODBC Driver Manager] Data source name not found and no default driver specified (0) (SQLDriverConnect)')
(Background on this error at: https://sqlalche.me/e/20/rvf5)