In [28]:
import pandas as pd
from pathlib import Path

import os
from dotenv import load_dotenv
load_dotenv()

from googleapiclient.discovery import build
from tqdm import tqdm   # progress bar

from sqlalchemy import create_engine
from urllib.parse import quote_plus  # safely URL-encode the driver name

In [8]:
csv_dir = Path(r'C:\Users\vboxuser\PycharmProjects\Youtube_Scraping\data')

df = pd.concat(
    (pd.read_csv(p) for p in csv_dir.glob("*.csv")),
    ignore_index=True
)

In [17]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   STREAM_START_TIME      255 non-null    object 
 1   STREAM_URL             255 non-null    object 
 2   STREAM_LENGTH_MINUTES  255 non-null    int64  
 3   WATCH_TIME_MINUTES     255 non-null    int64  
 4   AVG_VIEWERS            255 non-null    int64  
 5   PEAK_VIEWERS           255 non-null    int64  
 6   FOLLOWERS_GAINED       255 non-null    int64  
 7   FOLLOWERS_PER_HOUR     255 non-null    int64  
 8   VIEWS                  250 non-null    float64
 9   VIEWS_PER_HOUR         250 non-null    float64
 10  GAMES                  255 non-null    object 
dtypes: float64(2), int64(6), object(3)
memory usage: 22.0+ KB


Unnamed: 0,STREAM_START_TIME,STREAM_URL,STREAM_LENGTH_MINUTES,WATCH_TIME_MINUTES,AVG_VIEWERS,PEAK_VIEWERS,FOLLOWERS_GAINED,FOLLOWERS_PER_HOUR,VIEWS,VIEWS_PER_HOUR,GAMES
0,Wednesday 17th April 2019 19:49,https://sullygnome.com/channel/stream/33737140096,266,106932,402,511,36,8,1150.0,255.0,Just Chatting
1,Wednesday 17th April 2019 03:05,https://sullygnome.com/channel/stream/33727821120,235,19505,83,112,6,1,476.0,112.0,"DOOM,Risk of Rain 2"
2,Tuesday 16th April 2019 20:59,https://sullygnome.com/channel/stream/33723035120,256,35072,137,157,13,2,718.0,159.0,"Risk of Rain 2,Serious Sam 3: BFE,Art"
3,Monday 15th April 2019 18:50,https://sullygnome.com/channel/stream/33706366864,325,129350,398,467,32,5,1342.0,244.0,Just Chatting
4,Saturday 13th April 2019 19:49,https://sullygnome.com/channel/stream/33674464304,266,92302,347,412,19,4,1064.0,236.0,Just Chatting


In [15]:
df.drop(columns=['Unnamed: 0'], inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
df.rename(columns={'Stream start time': 'STREAM_START_TIME',
                   'Stream URL': 'STREAM_URL',
                   'Stream length (mins)': 'STREAM_LENGTH_MINUTES',
                   'Watch time (mins)': 'WATCH_TIME_MINUTES',
                   'Avg viewers': 'AVG_VIEWERS',
                   'Peak viewers': 'PEAK_VIEWERS',
                   'Followers gained': 'FOLLOWERS_GAINED',
                   'Followers per hour': 'FOLLOWERS_PER_HOUR',
                   'Views': 'VIEWS',
                   'Views per hour': 'VIEWS_PER_HOUR',
                   'Games': 'GAMES'
                   }, inplace=True)

In [18]:
df["STREAM_START_TIME"] = (
    df["STREAM_START_TIME"]
        .str.replace(r"(\d+)(st|nd|rd|th)", r"\1", regex=True)
        .pipe(pd.to_datetime, format="%A %d %B %Y %H:%M", errors="coerce", utc=True)
)

In [19]:
df["STREAM_START_TIME"]

0     2019-04-17 19:49:00+00:00
1     2019-04-17 03:05:00+00:00
2     2019-04-16 20:59:00+00:00
3     2019-04-15 18:50:00+00:00
4     2019-04-13 19:49:00+00:00
                 ...           
250   2022-10-25 23:46:00+00:00
251   2022-10-24 07:11:00+00:00
252   2022-10-23 01:10:00+00:00
253   2022-10-19 19:39:00+00:00
254   2022-05-21 19:51:00+00:00
Name: STREAM_START_TIME, Length: 255, dtype: datetime64[ns, UTC]

In [20]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   STREAM_START_TIME      255 non-null    datetime64[ns, UTC]
 1   STREAM_URL             255 non-null    object             
 2   STREAM_LENGTH_MINUTES  255 non-null    int64              
 3   WATCH_TIME_MINUTES     255 non-null    int64              
 4   AVG_VIEWERS            255 non-null    int64              
 5   PEAK_VIEWERS           255 non-null    int64              
 6   FOLLOWERS_GAINED       255 non-null    int64              
 7   FOLLOWERS_PER_HOUR     255 non-null    int64              
 8   VIEWS                  250 non-null    float64            
 9   VIEWS_PER_HOUR         250 non-null    float64            
 10  GAMES                  255 non-null    object             
dtypes: datetime64[ns, UTC](1), float64(2), int64(6), object(2)

Unnamed: 0,STREAM_START_TIME,STREAM_URL,STREAM_LENGTH_MINUTES,WATCH_TIME_MINUTES,AVG_VIEWERS,PEAK_VIEWERS,FOLLOWERS_GAINED,FOLLOWERS_PER_HOUR,VIEWS,VIEWS_PER_HOUR,GAMES
0,2019-04-17 19:49:00+00:00,https://sullygnome.com/channel/stream/33737140096,266,106932,402,511,36,8,1150.0,255.0,Just Chatting
1,2019-04-17 03:05:00+00:00,https://sullygnome.com/channel/stream/33727821120,235,19505,83,112,6,1,476.0,112.0,"DOOM,Risk of Rain 2"
2,2019-04-16 20:59:00+00:00,https://sullygnome.com/channel/stream/33723035120,256,35072,137,157,13,2,718.0,159.0,"Risk of Rain 2,Serious Sam 3: BFE,Art"
3,2019-04-15 18:50:00+00:00,https://sullygnome.com/channel/stream/33706366864,325,129350,398,467,32,5,1342.0,244.0,Just Chatting
4,2019-04-13 19:49:00+00:00,https://sullygnome.com/channel/stream/33674464304,266,92302,347,412,19,4,1064.0,236.0,Just Chatting


In [21]:
# build engine (same style you've been using)
def select_all_azure_sql():
    drv = "ODBC Driver 18 for SQL Server"
    odbc_str = (
        f"DRIVER={{{drv}}};"
        f"SERVER=tcp:{os.getenv('AZSQL_SERVER')},1433;"
        f"DATABASE={os.getenv('AZSQL_DATABASE')};"
        f"UID={os.getenv('AZSQL_USERNAME')};"
        f"PWD={os.getenv('AZSQL_PASSWORD')};"
        "Encrypt=yes;"
        "TrustServerCertificate=no;"
        "Connection Timeout=30;"
    )

    params = quote_plus(odbc_str)
    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")

    # read table into pandas
    df = pd.read_sql("SELECT * FROM SULLYGNOME_TWITCH_STREAMS.VAUSH_TWITCH_STREAMS", engine)
    return df

In [22]:
def filter_new_videos(df_in_database, df_from_api):
    keys = set(df_in_database["STREAM_START_TIME"])
    df_filtered = df_from_api[~df_from_api["STREAM_START_TIME"].isin(keys)]
    return df_filtered

In [23]:
def df_to_azure_sql(df):
    """
    Write/append the dataframe into dbo.youtube_videos (Azure SQL DB)
    """

    drv = "ODBC Driver 18 for SQL Server"            # keep spaces!
    odbc_str = (
        f"Driver={drv};Server=tcp:{os.getenv('AZSQL_SERVER')},1433;"
        f"Database={os.getenv('AZSQL_DATABASE')};"
        f"Uid={os.getenv('AZSQL_USERNAME')};"
        f"Pwd={os.getenv('AZSQL_PASSWORD')};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )

    # SQLAlchemy-style URL.  Space → + ;  parentheses → %28 %29, etc.
    params = quote_plus(odbc_str)
    engine = create_engine(
        f"mssql+pyodbc:///?odbc_connect={params}",
        fast_executemany=True        # batches rows under the hood
    )

    # —— upsert strategy: try append-only, let PK skip duplicates
    with engine.begin() as cn:
        df.to_sql(
            name="VAUSH_TWITCH_STREAMS",
            con=cn,
            schema="SULLYGNOME_TWITCH_STREAMS",
            if_exists="append",       # create once, then append
            index=False,
            chunksize=1000,           # good balance of  network / TX
        )

In [29]:
df_in_db = select_all_azure_sql()
df_in_db

Unnamed: 0,STREAM_START_TIME,STREAM_URL,STREAM_LENGTH_MINUTES,WATCH_TIME_MINUTES,AVG_VIEWERS,PEAK_VIEWERS,FOLLOWERS_GAINED,FOLLOWERS_PER_HOUR,VIEWS,VIEWS_PER_HOUR,GAMES,INGESTED_AT


In [30]:
df_new_rows = filter_new_videos(df_in_database=df_in_db, df_from_api=df)
df_new_rows

Unnamed: 0,STREAM_START_TIME,STREAM_URL,STREAM_LENGTH_MINUTES,WATCH_TIME_MINUTES,AVG_VIEWERS,PEAK_VIEWERS,FOLLOWERS_GAINED,FOLLOWERS_PER_HOUR,VIEWS,VIEWS_PER_HOUR,GAMES
0,2019-04-17 19:49:00+00:00,https://sullygnome.com/channel/stream/33737140096,266,106932,402,511,36,8,1150.0,255.0,Just Chatting
1,2019-04-17 03:05:00+00:00,https://sullygnome.com/channel/stream/33727821120,235,19505,83,112,6,1,476.0,112.0,"DOOM,Risk of Rain 2"
2,2019-04-16 20:59:00+00:00,https://sullygnome.com/channel/stream/33723035120,256,35072,137,157,13,2,718.0,159.0,"Risk of Rain 2,Serious Sam 3: BFE,Art"
3,2019-04-15 18:50:00+00:00,https://sullygnome.com/channel/stream/33706366864,325,129350,398,467,32,5,1342.0,244.0,Just Chatting
4,2019-04-13 19:49:00+00:00,https://sullygnome.com/channel/stream/33674464304,266,92302,347,412,19,4,1064.0,236.0,Just Chatting
...,...,...,...,...,...,...,...,...,...,...,...
250,2022-10-25 23:46:00+00:00,https://sullygnome.com/channel/stream/41424838763,269,331677,1233,1564,168,35,,,Subnautica: Below Zero
251,2022-10-24 07:11:00+00:00,https://sullygnome.com/channel/stream/41419279819,109,26705,245,324,30,15,,,Northern Journey
252,2022-10-23 01:10:00+00:00,https://sullygnome.com/channel/stream/41414238587,20,15220,761,1058,78,156,,,Subnautica: Below Zero
253,2022-10-19 19:39:00+00:00,https://sullygnome.com/channel/stream/47330535885,231,603603,2613,4113,398,99,,,Just Chatting


In [31]:
df_to_azure_sql(df_new_rows)