# Setup

### Set CWD

In [2]:
cd ..

d:\Documents\A_DIGIPEN\PersonalSVN\Fall22SVN\CSP400\MRB_II


### Imports

In [3]:
from datetime import datetime
from pytz import timezone

import pandas as pd

import sqlalchemy
import sqlalchemy as sqa

import googleapiclient.discovery

---

# Data

In [4]:
def build_client(path="./secret_keys/"):
    """Creates and returns the YouTube Data API client needed for requests."""
    print("building client...")

    api_key_path = path + "api_key.txt"

    # API information
    api_service_name = "youtube"
    api_version = "v3"

    # Read API key
    with open(api_key_path, "r") as api_file:
        api_key = api_file.read()

    # Create the client with api_key
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey=api_key
    )
    print("client built")

    return youtube

In [5]:
def _get_all_parts():
    parts = (
        "id",
        "snippet",
        "contentDetails",
        "status",
        "statistics",
        "player",
        "topicDetails",
        "recordingDetails",
        "liveStreamingDetails",
        "localizations",
    )

    return ",".join(parts)

In [6]:
def get_trending(youtube):
    """Gets top trending videos in the US."""
    print("getting trending videos...")

    # Get data
    request = youtube.videos().list(
        part=_get_all_parts(),
        chart="mostPopular",
        maxResults=50,
        regionCode="US"
    )
    response = request.execute()
    print("got trending videos")

    return response

In [7]:
def get_categories(youtube, cat_id):
    print("getting category videos...")
    try:
        # Get data
        request = youtube.videos().list(
            part=_get_all_parts(),
            chart="mostPopular",
            maxResults=50,
            regionCode="US",
            videoCategoryId=cat_id,
        )
        response = request.execute()
        print(f"got cat{cat_id} videos")
    except googleapiclient.errors.HttpError:
        print(f"cat{cat_id} chart not found or failed")

    return response

In [8]:
youtube = build_client()
trending = get_trending(youtube)
cat = get_categories(youtube, 1)

building client...
client built
getting trending videos...
got trending videos
getting category videos...
got cat1 videos


### Prelim. Wrangling

In [9]:
current_time = datetime.now(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ")

In [10]:
# Read into dataframe
df = pd.json_normalize(trending["items"], max_level=1)

# Save timestamp
df["queryTime"] = current_time

# Convert columns to datetime
dt_names = (
    "queryTime",
    "snippet.publishedAt",
)
for dt_feat in dt_names:
    df[dt_feat] = pd.to_datetime(df[dt_feat], utc=True)

In [11]:
# Remove features containing the following strings
drop_substrings = (
    "localizations",
)
small = df.loc[:, [col for col in df.columns if not any(d in col for d in drop_substrings)]]

---

# Database

In [10]:
with open("mysql/mysql_key.txt", "r") as file:
    key = file.read()

# engine = create_engine("mysql://user:pwd@localhost/college",echo = True)
engine = sqlalchemy.create_engine(f"mysql://eric:{key}@localhost/trending", echo=True)

In [11]:
def create_table(name: str, features: dict):
    """Given a name and a dictionary of features, creates and returns a table with SQLAlchemy.

    Params:
        name: Name of resulting table
        features: Dictionary of feature names as keys and SQLAlchemy data types as values
    """
    # Create table
    meta = sqa.MetaData()
    table = sqa.Table(
        name, meta,
        sqa.Column("num", sqa.Integer, primary_key=True),
        sqa.Column("queryNum", sqa.Integer, index=True),
        *[sqa.Column(name, dtype) for name, dtype in features.items()],
    )
    meta.create_all(engine)

### Videos

#### Define Features

In [12]:
# Get dictionary of relavant features w/ dtypes
feats_vid = {
    "id": sqa.Text,
    "queryTime": sqa.DateTime,

    "snippet.publishedAt": sqa.DateTime,
    "snippet.channelId": sqa.Text,
    "snippet.title": sqa.Text,
    "snippet.description": sqa.Text,
    "snippet.channelTitle": sqa.Text,
    "snippet.categoryId": sqa.Integer,

    "statistics.viewCount": sqa.BigInteger,
    "statistics.likeCount": sqa.BigInteger,
    "statistics.commentCount": sqa.BigInteger,

    "contentDetails.duration": sqa.BigInteger, 
}

#### Create Table

In [19]:
# Drop table if exists
engine.execute("DROP TABLE IF EXISTS videos")

videos = create_table("videos", feats_vid)

2022-12-22 15:05:46,955 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS videos
2022-12-22 15:05:46,955 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-22 15:05:46,967 INFO sqlalchemy.engine.Engine COMMIT
2022-12-22 15:05:46,968 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-22 15:05:46,969 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-22 15:05:46,969 INFO sqlalchemy.engine.Engine [cached since 1579s ago] ('trending', 'videos')
2022-12-22 15:05:46,970 INFO sqlalchemy.engine.Engine 
CREATE TABLE videos (
	num INTEGER NOT NULL AUTO_INCREMENT, 
	`queryNum` INTEGER, 
	id TEXT, 
	`queryTime` DATETIME, 
	`snippet.publishedAt` DATETIME, 
	`snippet.channelId` TEXT, 
	`snippet.title` TEXT, 
	`snippet.description` TEXT, 
	`snippet.channelTitle` TEXT, 
	`snippet.categoryId` INTEGER, 
	`statistics.viewCount` BIGINT, 
	`statistics.likeCount` BIGINT, 
	`statistics.commentCount` BIGINT, 
	`contentDetails.d

#### Create DF

In [20]:
# Get only features that matter
df_vid = df.loc[:, list(feats_vid.keys())]

# Convert duration to time delta
df_vid["contentDetails.duration"] = pd.to_timedelta(df_vid["contentDetails.duration"].str.slice(start=2).str.replace("M", "m")).dt.seconds

# Save previous maximum query
query_number = pd.read_sql("SELECT MAX(queryNum) FROM videos;", engine).iloc[0][0]
query_number = query_number + 1 if query_number is not None else 1
df_vid["queryNum"] = query_number

2022-12-22 15:05:48,988 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-22 15:05:48,989 INFO sqlalchemy.engine.Engine [cached since 1581s ago] ('trending', 'SELECT MAX(queryNum) FROM videos;')
2022-12-22 15:05:48,990 INFO sqlalchemy.engine.Engine SELECT MAX(queryNum) FROM videos;
2022-12-22 15:05:48,990 INFO sqlalchemy.engine.Engine [raw sql] ()


  data = objects_to_td64ns(data, unit=unit, errors=errors)


#### Insert Data

In [21]:
df_vid.to_sql("videos", engine, 
              index=False, if_exists="append",
              dtype=feats_vid)

2022-12-22 15:05:49,937 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-22 15:05:49,938 INFO sqlalchemy.engine.Engine [cached since 1582s ago] ('trending', 'videos')
2022-12-22 15:05:49,939 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-22 15:05:49,940 INFO sqlalchemy.engine.Engine INSERT INTO videos (id, `queryTime`, `snippet.publishedAt`, `snippet.channelId`, `snippet.title`, `snippet.description`, `snippet.channelTitle`, `snippet.categoryId`, `statistics.viewCount`, `statistics.likeCount`, `statistics.commentCount`, `contentDetails.duration`, `queryNum`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
2022-12-22 15:05:49,941 INFO sqlalchemy.engine.Engine [generated in 0.00054s] (('437eKqGwBQc', datetime.datetime(2022, 12, 22, 22, 39, 23, tzinfo=<UTC>), datetime.datetime(2022, 12, 21, 19, 11, 3, tzinfo=<UTC>), 'UCg3gzldyhCHJjY7AWWTNPPA', 'My Terrifying Experience ALONE at Haunted Ligh

50

### Tags

In [145]:
# Get dictionary of relavant features w/ dtypes
feats_tags = {
    "queryNum": sqa.Integer,
    "id": sqa.Text,
    
    "snippet.tags": sqa.Text, 
}

In [146]:
# Drop table if exists
engine.execute("DROP TABLE IF EXISTS tags")

tags = create_table("tags", feats_tags)

2022-12-21 13:18:18,219 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS tags
2022-12-21 13:18:18,219 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-21 13:18:18,222 INFO sqlalchemy.engine.Engine COMMIT
2022-12-21 13:18:18,223 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-21 13:18:18,223 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:18:18,224 INFO sqlalchemy.engine.Engine [cached since 8900s ago] ('trending', 'tags')
2022-12-21 13:18:18,225 INFO sqlalchemy.engine.Engine 
CREATE TABLE tags (
	num INTEGER NOT NULL AUTO_INCREMENT, 
	`queryNum` INTEGER, 
	id TEXT, 
	`snippet.tags` TEXT, 
	PRIMARY KEY (num)
)


2022-12-21 13:18:18,225 INFO sqlalchemy.engine.Engine [no key 0.00026s] ()
2022-12-21 13:18:18,229 INFO sqlalchemy.engine.Engine CREATE INDEX `ix_tags_queryNum` ON tags (`queryNum`)
2022-12-21 13:18:18,230 INFO sqlalchemy.engine.Engine [no key 0.00045s] ()
2022-12-21 13:18:18,236 IN

  table = sqa.Table(


In [147]:
df_tags = df.loc[:, ["id", "snippet.tags"]]

df_tags = df_tags.explode("snippet.tags")

df_tags["queryNum"] = query_number

df_tags

Unnamed: 0,id,snippet.tags,queryNum
0,P1Ohc8GDFPI,Club Shay Shay,1
0,P1Ohc8GDFPI,fs1,1
0,P1Ohc8GDFPI,fox sports,1
0,P1Ohc8GDFPI,fs1 Club Shay Shay,1
0,P1Ohc8GDFPI,fox,1
...,...,...,...
49,kaObdAuaGqY,game freak,1
49,kaObdAuaGqY,pokemon scarlet is,1
49,kaObdAuaGqY,pokemon violet is,1
49,kaObdAuaGqY,pokemon scarlet review,1


In [148]:
df_tags.to_sql("tags", engine, 
               index=False, if_exists="append",
               dtype=feats_tags)

2022-12-21 13:18:18,536 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:18:18,536 INFO sqlalchemy.engine.Engine [cached since 8900s ago] ('trending', 'tags')
2022-12-21 13:18:18,538 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-21 13:18:18,539 INFO sqlalchemy.engine.Engine INSERT INTO tags (id, `snippet.tags`, `queryNum`) VALUES (%s, %s, %s)
2022-12-21 13:18:18,540 INFO sqlalchemy.engine.Engine [generated in 0.00109s] (('P1Ohc8GDFPI', 'Club Shay Shay', 1), ('P1Ohc8GDFPI', 'fs1', 1), ('P1Ohc8GDFPI', 'fox sports', 1), ('P1Ohc8GDFPI', 'fs1 Club Shay Shay', 1), ('P1Ohc8GDFPI', 'fox', 1), ('P1Ohc8GDFPI', 'fox youtube', 1), ('P1Ohc8GDFPI', 'foxsports youtube', 1), ('P1Ohc8GDFPI', 'fs1 youtube', 1)  ... displaying 10 of 779 total bound parameter sets ...  ('kaObdAuaGqY', 'pokemon scarlet review', 1), ('kaObdAuaGqY', 'pokemon violet review', 1))
2022-12-21 13:18:18,547 INFO sqlalchemy.engine.Engine 

779

### Thumbnails

In [149]:
# Get dictionary of relavant features w/ dtypes
feats_thumb = {
    "queryNum": sqa.Integer,
    "id": sqa.Text,
    
    "default.url": sqa.Text,
    "default.width": sqa.Integer,
    "default.height": sqa.Integer,

    "medium.url": sqa.Text,
    "medium.width": sqa.Integer,
    "medium.height": sqa.Integer,

    "high.url": sqa.Text,
    "high.width": sqa.Integer,
    "high.height": sqa.Integer,

    "standard.url": sqa.Text,
    "standard.width": sqa.Integer,
    "standard.height": sqa.Integer,

    "maxres.url": sqa.Text,
    "maxres.width": sqa.Integer,
    "maxres.height": sqa.Integer,
}

In [150]:
# Drop table if exists
engine.execute("DROP TABLE IF EXISTS thumbnails")

create_table("thumbnails", feats_thumb)

2022-12-21 13:18:19,014 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS thumbnails
2022-12-21 13:18:19,014 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-21 13:18:19,018 INFO sqlalchemy.engine.Engine COMMIT
2022-12-21 13:18:19,019 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-21 13:18:19,019 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:18:19,019 INFO sqlalchemy.engine.Engine [cached since 8901s ago] ('trending', 'thumbnails')
2022-12-21 13:18:19,021 INFO sqlalchemy.engine.Engine 
CREATE TABLE thumbnails (
	num INTEGER NOT NULL AUTO_INCREMENT, 
	`queryNum` INTEGER, 
	id TEXT, 
	`default.url` TEXT, 
	`default.width` INTEGER, 
	`default.height` INTEGER, 
	`medium.url` TEXT, 
	`medium.width` INTEGER, 
	`medium.height` INTEGER, 
	`high.url` TEXT, 
	`high.width` INTEGER, 
	`high.height` INTEGER, 
	`standard.url` TEXT, 
	`standard.width` INTEGER, 
	`standard.height` INTEGER, 
	`maxres.url

  table = sqa.Table(


In [151]:
# Get thumbnail data
df_thumb = pd.json_normalize(df["snippet.thumbnails"])

# Save timestamp
df_thumb["queryNum"] = query_number

# Get id from thumbnail name
df_thumb["id"] = df_thumb["default.url"].str.split("/").apply(lambda x: x[-2])

In [152]:
df_thumb.to_sql("thumbnails", engine, 
                index=False, if_exists="append",
                dtype=feats_thumb)

2022-12-21 13:18:19,627 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:18:19,627 INFO sqlalchemy.engine.Engine [cached since 8902s ago] ('trending', 'thumbnails')
2022-12-21 13:18:19,628 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-21 13:18:19,629 INFO sqlalchemy.engine.Engine INSERT INTO thumbnails (`default.url`, `default.width`, `default.height`, `medium.url`, `medium.width`, `medium.height`, `high.url`, `high.width`, `high.height`, `standard.url`, `standard.width`, `standard.height`, `maxres.url`, `maxres.width`, `maxres.height`, `queryNum`, id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
2022-12-21 13:18:19,630 INFO sqlalchemy.engine.Engine [generated in 0.00059s] (('https://i.ytimg.com/vi/P1Ohc8GDFPI/default.jpg', 120, 90, 'https://i.ytimg.com/vi/P1Ohc8GDFPI/mqdefault.jpg', 320, 180, 'https://i.ytimg.com/vi/P1Ohc8GDFPI/hqdefault.jpg', 480, 360, 'https:

50

# Test Queries

###### Titles

In [164]:
pd.read_sql("SHOW TABLES", engine)

2022-12-21 13:38:04,100 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:38:04,101 INFO sqlalchemy.engine.Engine [cached since 1.009e+04s ago] ('trending', 'SHOW TABLES')
2022-12-21 13:38:04,102 INFO sqlalchemy.engine.Engine SHOW TABLES
2022-12-21 13:38:04,102 INFO sqlalchemy.engine.Engine [raw sql] ()


Unnamed: 0,Tables_in_trending
0,tags
1,thumbnails
2,videos


In [163]:
pd.read_sql("SELECT * FROM videos;", engine)

2022-12-21 13:37:39,025 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:37:39,026 INFO sqlalchemy.engine.Engine [cached since 1.006e+04s ago] ('trending', 'SELECT * FROM videos;')
2022-12-21 13:37:39,027 INFO sqlalchemy.engine.Engine SELECT * FROM videos;
2022-12-21 13:37:39,027 INFO sqlalchemy.engine.Engine [raw sql] ()


Unnamed: 0,num,queryNum,id,queryTime,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.channelTitle,snippet.categoryId,statistics.viewCount,statistics.likeCount,statistics.commentCount,contentDetails.duration
0,1,1,P1Ohc8GDFPI,2022-12-21 18:49:46,2022-12-20 17:30:08,UCQoxJOkwaCgyzQtiuAIDcuw,Deion Sanders receives his flowers from Shanno...,"Club Shay Shay heads to Boulder, Colorado this...",Club Shay Shay,22,791339,39492,4089,5656
1,2,1,IltuQkeU7zU,2022-12-21 18:49:46,2022-12-20 20:37:43,UCWzLmNWhgeh3h1j-M-Isy0g,BADGER'S BEST OF 2022,BUY MY SHAKER CUP (use code BADGER for 10% off...,TheRussianBadger,20,1089263,110274,8858,2485
2,3,1,r5T4TMIXDqM,2022-12-21 18:49:46,2022-12-20 19:39:13,UC5p_l5ZeB_wGjO_yDXwiqvw,Minecraft's Funniest Cooking Show...,"Today TommyInnit, Schlatt, Slimecicle & start ...",TommyInnit,20,847612,98149,4633,1673
3,4,1,dJfE5U6gn9g,2022-12-21 18:49:46,2022-12-20 21:00:30,UCo_IB5145EVNcf8hw1Kku7w,Game Theory: This is Not My Channel,"Friends, today is a BIG day! I am so excited t...",The Game Theorists,20,1609453,145018,11572,1034
4,5,1,qMgb_YhfkUg,2022-12-21 18:49:46,2022-12-20 13:02:10,UC_F8DoJf9MZogEOU51TpTbQ,Santa Hog Rider Saves Clashmas!,Santa Hog Rider has arrived to the Village and...,Clash Royale,20,2564477,112302,2340,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,296,6,WttKbuWfFIQ,2022-12-21 18:49:46,2022-12-19 18:54:08,UCBBsPuUY-8UwkSim4zLXp1w,2 Feet Of Snow Is Coming Before Christmas,"Get your official Ryan Hall, Y'all merch here!...","Ryan Hall, Y'all XTRA",25,1062979,38152,2497,778
296,297,6,YepLtpG-q7A,2022-12-21 18:49:46,2022-12-20 04:17:14,UCSy3TvvwAV12MD0rLsHyClQ,KO and Rollins Take Down The Bloodline; Scare ...,Owens and Rollins get the win as Sami shows up...,WWE on USA,17,335761,6237,508,635
297,298,6,aDwQmnltxNA,2022-12-21 18:49:46,2022-12-20 00:40:14,UC5LGPvoUOfwcLi4Ck8LiR4A,Surprising My Wife With Rod Wave Concert Tickets,I took my wife Biannca to our first ever Rod W...,The Prince Family,22,196680,10306,1140,1312
298,299,6,VM7-ETfhWrg,2022-12-21 18:49:46,2022-12-19 23:01:28,UC3tZRm0-gebQjRqkTxcrkkg,Frieza Saga in a Nutshell,Patreon - https://www.patreon.com/Kyskke\nTwit...,Kyskke,24,564804,50214,4829,983


In [24]:
pd.read_sql("SELECT `snippet.title` FROM videos;", engine)

2022-12-21 10:50:11,394 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 10:50:11,394 INFO sqlalchemy.engine.Engine [cached since 13.3s ago] ('trending', 'SELECT `snippet.title` FROM videos;')
2022-12-21 10:50:11,395 INFO sqlalchemy.engine.Engine SELECT `snippet.title` FROM videos;
2022-12-21 10:50:11,395 INFO sqlalchemy.engine.Engine [raw sql] ()


Unnamed: 0,snippet.title
0,Deion Sanders receives his flowers from Shanno...
1,BADGER'S BEST OF 2022
2,Minecraft's Funniest Cooking Show...
3,Game Theory: This is Not My Channel
4,Santa Hog Rider Saves Clashmas!
5,The Truth About This Upcoming Massive Storm…
6,Argentina vs. France Highlights | 2022 FIFA Wo...
7,Huge DRAMA? Forcing YouTube Animators to becom...
8,Every Mario Kart Wii Course Ranked
9,Mario but I’m Blindfolded


In [167]:
query = "SELECT queryNum, id, group_concat(`snippet.tags`) as tagList \
        FROM tags \
        GROUP BY id, queryNum;"

pd.read_sql(query, engine)

2022-12-21 13:40:26,410 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-21 13:40:26,411 INFO sqlalchemy.engine.Engine [cached since 1.023e+04s ago] ('trending', 'SELECT queryNum, id, group_concat(`snippet.tags`) as tagList         FROM tags         GROUP BY id, queryNum;')
2022-12-21 13:40:26,412 INFO sqlalchemy.engine.Engine SELECT queryNum, id, group_concat(`snippet.tags`) as tagList         FROM tags         GROUP BY id, queryNum;
2022-12-21 13:40:26,412 INFO sqlalchemy.engine.Engine [raw sql] ()


Unnamed: 0,queryNum,id,tagList
0,1,-lsFs2615gw,
1,1,0cfBDICQJM4,"girls,brooklynandbailey,beauty,fashion,school,..."
2,1,34VZzBWBDN0,
3,1,3hJC4dqM6xc,"Rich Eisen Show,sports,NFL,football,college fo..."
4,1,3V5P8EkAa68,"epic,identical twins,ronron,24 hour sleepover,..."
5,1,5ztN90jb9wM,"streaming,Disney,Disney Plus,Disney+"
6,1,aDwQmnltxNA,"rod wave concert,kevin gates,lil baby,nba youn..."
7,1,bK6ldnjE3Y0,
8,1,BtQm1SvhYdk,"Messi,Argentina Celebration Buenos Aires,Messi..."
9,1,d-3XkTz_MBI,"Bowser's Castle,Mario,Luigi,Peach,Yoshi,Donkey..."
