In [12]:
import os, sys, time, math, itertools, json
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path
import pandas as pd
from googleapiclient.discovery import build
from tqdm import tqdm   # progress bar

from sqlalchemy import create_engine
from urllib.parse import quote_plus  # safely URL-encode the driver name

from __future__ import annotations

from typing import List, Dict, Optional
from googleapiclient.discovery import build

In [13]:
# The YouTube Data API key is taken from the environment; stop right away
# when it is missing or empty, since nothing below can run without it.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if API_KEY is None or API_KEY == "":
    raise SystemExit("No YOUTUBE_API_KEY Found!")

In [14]:
def normalize_dataframe(df):
    """
    Coerce the timestamp and counter columns of a video DataFrame, in place.

    * ``published_at`` is parsed as ISO-8601 and stored as timezone-naive UTC.
    * ``view_count``, ``like_count``, ``comment_count`` and ``favorite_count``
      have missing values replaced by 0 and are cast to pandas ``Int64``.

    Only columns that are actually present are touched, so frames without
    ``favorite_count`` (the rows built by ``fetch_videos`` never carry it) or
    completely empty frames no longer raise ``KeyError``.

    NOTE: a later cell defines another ``normalize_dataframe``; whichever cell
    ran last is the one bound to the name.

    Args:
        df: DataFrame of video rows. It is modified in place.

    Returns:
        The same DataFrame object, for convenient re-assignment.
    """
    if "published_at" in df.columns:
        df["published_at"] = (
            pd.to_datetime(df["published_at"], utc=True)  # parse ISO-8601
              .dt.tz_convert(None)                       # drop the UTC tz-info
        )

    # --- make sure numeric cols are true ints, not NaN/float strings ----------
    wanted_num_cols = ["view_count", "like_count", "comment_count", "favorite_count"]
    num_cols = [col for col in wanted_num_cols if col in df.columns]
    if num_cols:
        df[num_cols] = (
            df[num_cols]
              .fillna(0)            # YouTube may omit like_count, etc. -> NaN
              .astype("Int64")      # integer dtype instead of float/object
        )
    return df

In [15]:
# Read the existing VOD table from Azure SQL into a DataFrame
def select_all_azure_sql():
    """
    Load every row of YOUTUBE_STREAM_VODS.VAUSH_STREAM_VODS into a DataFrame.

    Connection settings come from the AZSQL_SERVER, AZSQL_DATABASE,
    AZSQL_USERNAME and AZSQL_PASSWORD environment variables.

    Returns:
        pandas.DataFrame holding the result of ``SELECT *`` on the table.

    Raises:
        RuntimeError: if any of the AZSQL_* variables is unset or empty
            (previously the literal text "None" ended up in the connection
            string and the failure only surfaced as a login error).
        sqlalchemy.exc.OperationalError: if the server cannot be reached or
            the login fails.
    """
    env_names = ("AZSQL_SERVER", "AZSQL_DATABASE", "AZSQL_USERNAME", "AZSQL_PASSWORD")
    settings = {name: os.getenv(name) for name in env_names}
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise RuntimeError("Missing Azure SQL settings: " + ", ".join(missing))

    def _braced(value):
        # ODBC quoting: wrap the value in {...} and double any '}' so that
        # characters such as ';' inside a password cannot split the string.
        return "{" + value.replace("}", "}}") + "}"

    drv = "ODBC Driver 18 for SQL Server"
    odbc_str = (
        f"DRIVER={{{drv}}};"
        f"SERVER=tcp:{settings['AZSQL_SERVER']},1433;"
        f"DATABASE={settings['AZSQL_DATABASE']};"
        f"UID={settings['AZSQL_USERNAME']};"
        f"PWD={_braced(settings['AZSQL_PASSWORD'])};"
        "Encrypt=yes;"
        "TrustServerCertificate=no;"
    )

    params = quote_plus(odbc_str)
    # "Connection Timeout=30" is not a keyword this ODBC driver understands (the
    # driver answers "Invalid connection string attribute"); the 30 s login
    # timeout is handed to pyodbc.connect() through connect_args instead.
    engine = create_engine(
        f"mssql+pyodbc:///?odbc_connect={params}",
        connect_args={"timeout": 30},
    )

    try:
        # read table into pandas
        with engine.connect() as cn:
            return pd.read_sql("SELECT * FROM YOUTUBE_STREAM_VODS.VAUSH_STREAM_VODS", cn)
    finally:
        # The engine is created per call, so release its connection pool here.
        engine.dispose()

In [16]:
def filter_new_videos(df_in_database, df_from_api):
    """
    Return the rows of ``df_from_api`` whose ``video_id`` does not occur in
    ``df_in_database``.

    Both frames need a ``video_id`` column. Row order and index labels of
    ``df_from_api`` are kept as they are.
    """
    known_ids = set(df_in_database["video_id"])
    already_stored = df_from_api["video_id"].isin(known_ids)
    return df_from_api[~already_stored]

In [17]:
def df_to_azure_sql(df):
    """
    Append the rows of ``df`` to YOUTUBE_STREAM_VODS.VAUSH_STREAM_VODS (Azure SQL DB).

    Connection settings come from the AZSQL_SERVER, AZSQL_DATABASE,
    AZSQL_USERNAME and AZSQL_PASSWORD environment variables.

    This is a plain append inside one transaction, NOT an upsert: if the
    database rejects a row (for example a duplicate key, should the table
    enforce one), the exception propagates and the whole batch is rolled back.
    Filter out already-stored rows before calling (see ``filter_new_videos``).

    Args:
        df: DataFrame whose columns match the target table. An empty frame is
            a no-op and does not open a connection.

    Returns:
        None.

    Raises:
        RuntimeError: if any of the AZSQL_* variables is unset or empty.
        sqlalchemy.exc.SQLAlchemyError: on connection or insert failure.
    """
    if df.empty:
        return None  # nothing to insert; skip the round trip to the server

    env_names = ("AZSQL_SERVER", "AZSQL_DATABASE", "AZSQL_USERNAME", "AZSQL_PASSWORD")
    settings = {name: os.getenv(name) for name in env_names}
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise RuntimeError("Missing Azure SQL settings: " + ", ".join(missing))

    def _braced(value):
        # ODBC quoting: wrap the value in {...} and double any '}' so that
        # characters such as ';' inside a password cannot split the string.
        return "{" + value.replace("}", "}}") + "}"

    drv = "ODBC Driver 18 for SQL Server"            # keep spaces!
    odbc_str = (
        # Driver name goes in braces, same as in select_all_azure_sql.
        f"Driver={{{drv}}};Server=tcp:{settings['AZSQL_SERVER']},1433;"
        f"Database={settings['AZSQL_DATABASE']};"
        f"Uid={settings['AZSQL_USERNAME']};"
        f"Pwd={_braced(settings['AZSQL_PASSWORD'])};"
        "Encrypt=yes;TrustServerCertificate=no;"
    )

    # SQLAlchemy-style URL.  Space → + ;  braces → %7B %7D, etc.
    params = quote_plus(odbc_str)
    engine = create_engine(
        f"mssql+pyodbc:///?odbc_connect={params}",
        fast_executemany=True,          # batches rows under the hood
        # "Connection Timeout" is not a keyword this ODBC driver understands;
        # the login timeout is passed to pyodbc.connect() instead.
        connect_args={"timeout": 30},
    )

    try:
        # engine.begin() commits on success and rolls back on any exception.
        with engine.begin() as cn:
            df.to_sql(
                name="VAUSH_STREAM_VODS",
                con=cn,
                schema="YOUTUBE_STREAM_VODS",
                if_exists="append",       # create once, then append
                index=False,
                chunksize=1000,           # rows sent per batch
            )
    finally:
        # The engine is created per call, so release its connection pool here.
        engine.dispose()
    return None

In [18]:
# ----------------------------
# 1) Find playlist(s) by expected name
# ----------------------------

def find_playlists_by_name(
    yt,
    channel_id: str,
    expected_names: List[str],
    max_pages: int = 50,
) -> List[Dict]:
    """
    Collect the channel's playlists whose title contains at least one of
    ``expected_names`` as a case-insensitive substring.

    Args:
        yt: API client exposing ``playlists().list(...).execute()``.
        channel_id: channel whose playlists are listed.
        expected_names: phrases to look for, e.g.
            ["stream vod", "stream archive", "past streams", "vods", "live vod"].
        max_pages: upper bound on result pages requested (at least one page
            is always requested).

    Returns:
        One dict per matching playlist with the keys ``playlist_id``,
        ``title`` and ``item_count`` (0 when the API reports no count).
    """
    wanted_phrases = [name.lower() for name in expected_names]
    found: List[Dict] = []

    next_token: Optional[str] = None
    pages_read = 0
    keep_paging = True

    while keep_paging:
        request = yt.playlists().list(
            part="snippet,contentDetails",
            channelId=channel_id,
            maxResults=50,
            pageToken=next_token,
        )
        response = request.execute()

        for playlist in response.get("items", []):
            snippet = playlist.get("snippet", {}) or {}
            raw_title = snippet.get("title", "") or ""
            lowered = raw_title.lower()

            if not any(phrase in lowered for phrase in wanted_phrases):
                continue

            details = playlist.get("contentDetails", {}) or {}
            found.append({
                "playlist_id": playlist["id"],
                "title": raw_title,
                "item_count": details.get("itemCount", 0),
            })

        next_token = response.get("nextPageToken")
        pages_read += 1
        # Stop when the API has no further page or the page budget is used up.
        keep_paging = bool(next_token) and pages_read < max_pages

    return found



In [19]:

# ----------------------------
# 2) Iterate playlist videos and pull video metadata
# ----------------------------

def list_video_ids_in_playlist(yt, playlist_id: str) -> List[str]:
    """
    Walk every page of a playlist and return its distinct video ids in
    first-seen order.

    Entries without a ``videoId`` (e.g. "Private video" / "Deleted video"
    placeholders) are skipped.
    """
    collected: List[str] = []
    next_token: Optional[str] = None
    first_request = True

    # Always request the first page, then follow nextPageToken until it runs out.
    while first_request or next_token:
        first_request = False
        page = yt.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_token,
        ).execute()

        for entry in page.get("items", []):
            snippet = entry.get("snippet", {}) or {}
            resource_id = snippet.get("resourceId", {}) or {}
            video_id = resource_id.get("videoId")
            if video_id:
                collected.append(video_id)

        next_token = page.get("nextPageToken")

    # dict keys are unique and keep insertion order -> stable de-duplication
    return list(dict.fromkeys(collected))


In [20]:
def fetch_videos(yt, video_ids: List[str]) -> List[Dict]:
    """
    Look up metadata for ``video_ids``, 50 ids per request.

    ``liveStreamingDetails`` is requested as well, so each row carries the
    livestream timestamps when the API returns them.

    Returns:
        One flat dict per video returned by the API. Counters are ints, or
        None when the API omits them; absent fields are None.
    """
    def _count(statistics, key):
        # The API delivers counters as strings and may leave them out entirely.
        return int(statistics[key]) if key in statistics else None

    rows: List[Dict] = []

    for start in range(0, len(video_ids), 50):
        id_chunk = video_ids[start:start + 50]
        request = yt.videos().list(
            part="snippet,contentDetails,statistics,status,liveStreamingDetails",
            id=",".join(id_chunk),
        )
        response = request.execute()

        for video in response.get("items", []):
            snippet = video.get("snippet", {}) or {}
            content = video.get("contentDetails", {}) or {}
            statistics = video.get("statistics", {}) or {}
            visibility = video.get("status", {}) or {}
            live_info = video.get("liveStreamingDetails") or {}

            row = {
                "video_id": video.get("id"),
                "title": snippet.get("title"),
                "published_at": snippet.get("publishedAt"),
                "duration": content.get("duration"),
                "view_count": _count(statistics, "viewCount"),
                "comment_count": _count(statistics, "commentCount"),
                "like_count": _count(statistics, "likeCount"),
                "privacy_status": visibility.get("privacyStatus"),
                # livestream timestamps, present only when the API returns them
                "live_actual_start": live_info.get("actualStartTime"),
                "live_actual_end": live_info.get("actualEndTime"),
                "live_scheduled_start": live_info.get("scheduledStartTime"),
            }
            rows.append(row)

    return rows



In [21]:

# ----------------------------
# Single entrypoint (minimal)
# ----------------------------

def get_stream_vod_videos_from_channel(
    api_key: str,
    channel_id: str,
    expected_playlist_names: List[str],
) -> Dict:
    """
    Gather the metadata of every video found in the channel's playlists whose
    title matches ``expected_playlist_names``.

    Steps:
      1) locate the matching playlist(s) via ``find_playlists_by_name``
      2) list each playlist's video ids and fetch their metadata
      3) tag every row with the playlist it came from and keep the first
         occurrence of each ``video_id``

    Returns:
        Dict with the keys ``channel_id``, ``matched_playlists``, ``videos``
        (de-duplicated rows) and ``video_count``.
    """
    client = build("youtube", "v3", developerKey=api_key)

    matched = find_playlists_by_name(
        yt=client,
        channel_id=channel_id,
        expected_names=expected_playlist_names,
    )

    # video_id -> first row seen for it (a video can sit in several playlists)
    unique_videos: Dict = {}
    for playlist in matched:
        source_id = playlist["playlist_id"]
        source_title = playlist["title"]

        ids_in_playlist = list_video_ids_in_playlist(client, source_id)
        for row in fetch_videos(client, ids_in_playlist):
            # tag provenance
            row["discovery_playlist_id"] = source_id
            row["discovery_playlist_title"] = source_title

            if row["video_id"] not in unique_videos:
                unique_videos[row["video_id"]] = row

    summary = {
        "channel_id": channel_id,
        "matched_playlists": matched,
        "videos": list(unique_videos.values()),
        "video_count": len(unique_videos),
    }
    return summary



In [22]:

if __name__ == "__main__":
    # Fail fast with a clear message when the channel id is not configured,
    # mirroring the YOUTUBE_API_KEY check; otherwise None would be sent to the
    # API as channelId.
    channel_id = os.getenv("VAUSH_CHANNEL_ID")
    if not channel_id:
        raise SystemExit("No VAUSH_CHANNEL_ID Found!")

    result = get_stream_vod_videos_from_channel(
        api_key=API_KEY,
        channel_id=channel_id,
        expected_playlist_names=["Stream VODs"],
    )
    print(result["matched_playlists"])
    print(result["video_count"])

    # Raw API rows; dtypes are still strings/ints until normalize_dataframe runs.
    df = pd.DataFrame(result["videos"])


[{'playlist_id': 'PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw', 'title': 'Stream VODs', 'item_count': 1658}]
1652


In [23]:
# Inspect the raw frame: timestamp columns are still plain strings (object dtype) here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   video_id                  1652 non-null   object
 1   title                     1652 non-null   object
 2   published_at              1652 non-null   object
 3   duration                  1652 non-null   object
 4   view_count                1652 non-null   int64 
 5   comment_count             1652 non-null   int64 
 6   like_count                1652 non-null   int64 
 7   privacy_status            1652 non-null   object
 8   live_actual_start         1636 non-null   object
 9   live_actual_end           1636 non-null   object
 10  live_scheduled_start      1634 non-null   object
 11  discovery_playlist_id     1652 non-null   object
 12  discovery_playlist_title  1652 non-null   object
dtypes: int64(3), object(10)
memory usage: 167.9+ KB


In [24]:
# Preview the raw rows returned by the API (pandas truncates the display).
df

Unnamed: 0,video_id,title,published_at,duration,view_count,comment_count,like_count,privacy_status,live_actual_start,live_actual_end,live_scheduled_start,discovery_playlist_id,discovery_playlist_title
0,eJ35lXww21g,"Jan 3, 2026 TRUMP GIVES PRESS CONFERENCE FOLLO...",2026-01-03T15:23:25Z,PT4H43M30S,151613,60,4504,unlisted,2026-01-03T15:52:33Z,2026-01-03T20:36:02Z,2026-01-03T16:15:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
1,Z4e1ta0qHRs,"Jan 3, 2026 BREAKING: AMERICAN INVASION OF VEN...",2026-01-03T06:53:57Z,PT3H31M5S,119879,268,4451,unlisted,2026-01-03T06:54:28Z,2026-01-03T10:25:31Z,2026-01-03T07:00:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
2,5N5XACTXg64,"Jan 2, 2026 MAMDANI SWORN IN, IMMEDIATELY TAKE...",2026-01-02T19:24:54Z,PT5H14M21S,82912,63,3137,unlisted,2026-01-02T20:10:57Z,2026-01-03T01:25:12Z,2026-01-02T20:30:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
3,OgoXU9JeYvM,"Dec 30, 2025 Farewell to the first year of the...",2025-12-30T21:05:00Z,PT3H36M10S,70133,113,2490,unlisted,2025-12-30T21:39:03Z,2025-12-31T01:15:09Z,2025-12-30T22:00:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
4,xwVy4LogYaA,"Dec 27, 2025 WHO ELSE BIG AND ROUND",2025-12-27T18:50:17Z,PT3H25M29S,64748,133,2121,unlisted,2025-12-27T19:36:18Z,2025-12-27T23:01:42Z,2025-12-27T20:00:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,Vn8zk7o9VOw,"Yelling at Reactionary Videos - BraveTheWorld,...",2019-04-23T17:49:35Z,PT1M,3764,30,160,unlisted,2019-04-23T17:46:01Z,2019-04-23T17:46:57Z,2019-04-23T18:00:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
1648,9rukHE7Hzf0,Discussing Racism,2019-04-23T01:41:09Z,PT3H9M57S,7558,30,279,unlisted,2019-04-22T22:00:38Z,2019-04-23T01:10:31Z,2019-04-22T22:00:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
1649,Q7PWX39sCb0,This is a video game stream please do not come...,2019-04-22T06:26:32Z,PT2H13M7S,5163,12,143,unlisted,2019-04-22T04:01:29Z,2019-04-22T06:14:31Z,2019-04-22T04:00:00Z,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs
1650,hQc7QIsP_Yg,I love getting myself banned from streaming pl...,2019-04-18T20:22:47Z,PT51M27S,12153,70,474,unlisted,2019-04-18T18:56:48Z,2019-04-18T19:48:15Z,,PLvVEXejrE-HT5SPUUMaZ1QcTxa2S3PvPw,Stream VODs


In [25]:
def normalize_dataframe(df):
    """
    Convert the timestamp and counter columns of a video DataFrame, in place.

    * ``published_at`` and the three ``live_*`` columns are parsed into
      timezone-aware UTC datetimes; values that cannot be parsed become NaT.
    * ``view_count``, ``like_count`` and ``comment_count`` have missing values
      replaced by 0 and are cast to pandas ``Int64``.

    Only columns that are present are converted, so an empty frame (e.g. when
    no playlist matched and the API returned no videos) passes through instead
    of raising ``KeyError``.

    This definition replaces the ``normalize_dataframe`` from the earlier
    cell; unlike that one it keeps the UTC timezone on the timestamps.

    Args:
        df: DataFrame of video rows. It is modified in place.

    Returns:
        The same DataFrame object, for convenient re-assignment.
    """
    # --- Parse YouTube datetimes ----------
    wanted_dt_cols = [
        "published_at",
        "live_actual_start",
        "live_actual_end",
        "live_scheduled_start",
    ]
    dt_cols = [col for col in wanted_dt_cols if col in df.columns]
    if dt_cols:
        df[dt_cols] = df[dt_cols].apply(
            pd.to_datetime,
            utc=True,
            errors="coerce",
        )

    # --- make sure numeric cols are true ints, not NaN/float strings ----------
    wanted_num_cols = ["view_count", "like_count", "comment_count"]
    num_cols = [col for col in wanted_num_cols if col in df.columns]
    if num_cols:
        df[num_cols] = (
            df[num_cols]
            .fillna(0)            # YouTube may omit like_count, etc. -> NaN
            .astype("Int64")      # integer dtype instead of float/object
        )

    return df

In [26]:
# Rebind df to the normalized frame. The normalize_dataframe used here is the
# definition in the cell directly above, which replaced the earlier one of the
# same name; it also modifies the frame passed in.
df = normalize_dataframe(df)

In [27]:
# Confirm the conversion: timestamps should now be timezone-aware UTC datetimes
# and the counters Int64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   video_id                  1652 non-null   object             
 1   title                     1652 non-null   object             
 2   published_at              1652 non-null   datetime64[ns, UTC]
 3   duration                  1652 non-null   object             
 4   view_count                1652 non-null   Int64              
 5   comment_count             1652 non-null   Int64              
 6   like_count                1652 non-null   Int64              
 7   privacy_status            1652 non-null   object             
 8   live_actual_start         1636 non-null   datetime64[ns, UTC]
 9   live_actual_end           1636 non-null   datetime64[ns, UTC]
 10  live_scheduled_start      1634 non-null   datetime64[ns, UTC]
 11  discovery_playlis

In [28]:
# Uniqueness check: every video_id should show a count of 1.
df['video_id'].value_counts()

video_id
eJ35lXww21g    1
Z4e1ta0qHRs    1
5N5XACTXg64    1
OgoXU9JeYvM    1
xwVy4LogYaA    1
              ..
Vn8zk7o9VOw    1
9rukHE7Hzf0    1
Q7PWX39sCb0    1
hQc7QIsP_Yg    1
1TggpUmWqu0    1
Name: count, Length: 1652, dtype: int64

In [29]:
# Load the rows already stored in Azure SQL. Needs network access to the server
# and the AZSQL_* environment variables.
df_in_db = select_all_azure_sql()
df_in_db

OperationalError: (pyodbc.OperationalError) ('08001', '[08001] [Microsoft][ODBC Driver 18 for SQL Server]TCP Provider: Timeout error [258].  (258) (SQLDriverConnect); [08001] [Microsoft][ODBC Driver 18 for SQL Server]Login timeout expired (0); [08001] [Microsoft][ODBC Driver 18 for SQL Server]Invalid connection string attribute (0); [08001] [Microsoft][ODBC Driver 18 for SQL Server]Unable to complete login process due to delay in login response (258)')
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
# Keep only the API rows whose video_id is not stored yet. Requires df_in_db
# from the previous cell.
# NOTE(review): the recorded run of the previous cell ended in an
# OperationalError, so the output shown for this cell presumably comes from an
# earlier session — re-run both cells to refresh it.
df_new_rows = filter_new_videos(df_in_db, df)
df_new_rows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1649 entries, 0 to 1648
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   video_id                  1649 non-null   object             
 1   title                     1649 non-null   object             
 2   published_at              1649 non-null   datetime64[ns, UTC]
 3   duration                  1649 non-null   object             
 4   view_count                1649 non-null   Int64              
 5   comment_count             1649 non-null   Int64              
 6   like_count                1649 non-null   Int64              
 7   privacy_status            1649 non-null   object             
 8   live_actual_start         1633 non-null   datetime64[ns, UTC]
 9   live_actual_end           1633 non-null   datetime64[ns, UTC]
 10  live_scheduled_start      1631 non-null   datetime64[ns, UTC]
 11  discovery_playlis

In [None]:
# Columns handed to the insert; the two discovery_playlist_* provenance columns
# of df are left out.
cols_to_insert = [
    "video_id",
    "title",
    "published_at",
    "duration",
    "view_count",
    "comment_count",
    "like_count",
    "privacy_status",
    "live_actual_start",
    "live_actual_end",
    "live_scheduled_start",
]
len(cols_to_insert)

11

In [None]:
# Append the not-yet-stored rows, restricted to the columns in cols_to_insert.
df_to_azure_sql(df_new_rows[cols_to_insert])