# Setup

##### Set CWD

In [9]:
# Move the working directory up one level; later cells open files through
# relative paths ("./secret_keys/", "mysql/").
# NOTE(review): not idempotent — re-running this cell climbs another level.
cd ..

d:\Documents\A_DIGIPEN\PersonalSVN\Fall22SVN\CSP400\MRB_II


##### Imports

In [46]:
from datetime import datetime
from pytz import timezone

import pandas as pd

import sqlalchemy
import sqlalchemy as sqa

import googleapiclient.discovery
import googleapiclient.errors

# Data

In [16]:
def build_client(path="./secret_keys/"):
    """Creates and returns the YouTube Data API client needed for requests.

    Args:
        path: Directory prefix holding ``api_key.txt``. It is concatenated
            directly with the file name, so it must end with a path separator.

    Returns:
        The object returned by ``googleapiclient.discovery.build`` for the
        "youtube" service, version "v3".

    Raises:
        OSError: If the key file cannot be opened.
    """
    print("building client...")

    api_key_path = path + "api_key.txt"

    # API information
    api_service_name = "youtube"
    api_version = "v3"

    # Read API key. Surrounding whitespace is stripped because key files
    # commonly end with a newline, which must not be sent as part of the key.
    with open(api_key_path, "r") as api_file:
        api_key = api_file.read().strip()

    # Create the client with api_key
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey=api_key
    )
    print("client built")

    return youtube

In [17]:
def _get_all_parts():
    parts = (
        "id",
        "snippet",
        "contentDetails",
        "status",
        "statistics",
        "player",
        "topicDetails",
        "recordingDetails",
        "liveStreamingDetails",
        "localizations",
    )

    return ",".join(parts)

In [18]:
def get_trending(youtube):
    """Fetches the "mostPopular" video chart for the US region.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.

    Returns:
        Whatever ``execute()`` returns for the request (at most 50 results
        are requested).
    """
    print("getting trending videos...")

    # Request parameters, collected up front and passed as keywords
    query = {
        "part": _get_all_parts(),
        "chart": "mostPopular",
        "maxResults": 50,
        "regionCode": "US",
    }
    result = youtube.videos().list(**query).execute()
    print("got trending videos")

    return result

In [19]:
def get_categories(youtube, cat_id):
    """Gets the "mostPopular" US video chart for a single video category.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        cat_id: Value sent as ``videoCategoryId``.

    Returns:
        The value returned by ``execute()``, or ``None`` when the request
        raises ``googleapiclient.errors.HttpError``.
    """
    print("getting category videos...")
    try:
        # Get data
        request = youtube.videos().list(
            part=_get_all_parts(),
            chart="mostPopular",
            maxResults=50,
            regionCode="US",
            videoCategoryId=cat_id,
        )
        response = request.execute()
    except googleapiclient.errors.HttpError:
        # No response exists on this path; return None explicitly so the
        # caller gets a clear "no data" value rather than an unbound-name error.
        print(f"cat{cat_id} chart not found or failed")
        return None

    print(f"got cat{cat_id} videos")
    return response

In [20]:
# Build the API client, then fetch the overall trending chart and the chart
# for video category 1. `trending` is the response wrangled in later cells.
youtube = build_client()
trending = get_trending(youtube)
cat = get_categories(youtube, 1)

building client...
client built
getting trending videos...
got trending videos
getting category videos...
got cat1 videos


In [9]:
# Inspect the raw API response (large output).
trending

{'kind': 'youtube#videoListResponse',
 'etag': 'W5RPgIEJ79sKbQsmjuMVuMHNxjY',
 'items': [{'kind': 'youtube#video',
   'etag': 'eREu-TmQNLCfdVuiOIP-yJ9MjlM',
   'id': '4CRt4Eewe0U',
   'snippet': {'publishedAt': '2022-12-13T17:01:06Z',
    'channelId': 'UCFqyJFbsV-uEcosvNhg0PaQ',
    'title': 'SPIDER-MAN: ACROSS THE SPIDER-VERSE - Official Trailer (HD)',
    'description': 'The next Spider-Man movie is coming to cinemas on June 2, 2023 in English, Hindi, Tamil & Telugu. \nWatch the new trailer for Spider-Man: Across the #SpiderVerse now!\n\nMiles Morales returns for the next chapter of the Oscar¬Æ-winning Spider-Verse saga, Spider-Man: Across the Spider-Verse. After reuniting with Gwen Stacy, Brooklyn‚Äôs full-time, friendly neighborhood Spider-Man is catapulted across the Multiverse, where he encounters a team of Spider-People charged with protecting its very existence. But when the heroes clash on how to handle a new threat, Miles finds himself pitted against the other Spiders and mus

###### Preliminary Wrangling

In [163]:
# Current UTC time as a "YYYY-MM-DDTHH:MM:SSZ" string; stored later as queryTime.
# NOTE(review): this is the time this cell runs, not the time the API request
# was executed — confirm that is the intended meaning of queryTime.
current_time = datetime.now(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ")

In [164]:
# Flatten the response items one level deep, giving dotted column names
# such as "snippet.publishedAt"
df = pd.json_normalize(trending["items"], max_level=1)

# Stamp every row with the query time
df["queryTime"] = current_time

# Parse the timestamp columns into UTC-aware datetimes
dt_names = ("queryTime", "snippet.publishedAt")
df = df.assign(**{name: pd.to_datetime(df[name], utc=True) for name in dt_names})

# Database

In [21]:
# Read the MySQL password. Surrounding whitespace is stripped because key
# files commonly end with a newline that is not part of the password.
with open("mysql/mysql_key.txt", "r") as file:
    key = file.read().strip()

# Build the URL from its components instead of string formatting, so that
# characters such as "@", ":" or "/" in the password are escaped correctly.
db_url = sqlalchemy.engine.URL.create(
    "mysql",
    username="eric",
    password=key,
    host="localhost",
    database="trending",
)

# echo=True logs every emitted SQL statement to the cell output.
engine = sqlalchemy.create_engine(db_url, echo=True)

#### Videos

###### Define Features

In [133]:
# Get dictionary of relavant features w/ dtypes
feats_vid = {
    "id": sqa.Text,
    "queryTime": sqa.DateTime,

    "snippet.publishedAt": sqa.DateTime,
    "snippet.channelId": sqa.Text,
    "snippet.title": sqa.Text,
    "snippet.description": sqa.Text,
    "snippet.channelTitle": sqa.Text,
    "snippet.categoryId": sqa.Integer,

    "statistics.viewCount": sqa.BigInteger,
    "statistics.likeCount": sqa.BigInteger,
    "statistics.favoriteCount": sqa.BigInteger,
    "statistics.commentCount": sqa.BigInteger,

    "contentDetails.duration": sqa.BigInteger, 
}

###### Create Table

In [134]:
# Drop table if exists
engine.execute("DROP TABLE IF EXISTS videos")

# Create table
meta = sqa.MetaData()
videos = sqa.Table(
    "videos", meta,
    sqa.Column("index", sqa.Integer, primary_key=True),
    *[sqa.Column(name, dtype) for name, dtype in feats_vid.items()],
)
meta.create_all(engine)

2022-12-13 18:31:14,986 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS videos
2022-12-13 18:31:14,987 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-13 18:31:14,999 INFO sqlalchemy.engine.Engine COMMIT
2022-12-13 18:31:15,000 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-13 18:31:15,000 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-13 18:31:15,001 INFO sqlalchemy.engine.Engine [cached since 3919s ago] ('trending', 'videos')
2022-12-13 18:31:15,002 INFO sqlalchemy.engine.Engine 
CREATE TABLE videos (
	`index` INTEGER NOT NULL AUTO_INCREMENT, 
	id TEXT, 
	`queryTime` DATETIME, 
	`snippet.publishedAt` DATETIME, 
	`snippet.channelId` TEXT, 
	`snippet.title` TEXT, 
	`snippet.description` TEXT, 
	`snippet.channelTitle` TEXT, 
	`snippet.categoryId` INTEGER, 
	`statistics.viewCount` BIGINT, 
	`statistics.likeCount` BIGINT, 
	`statistics.favoriteCount` BIGINT, 
	`statistics.commentCount` BIGINT, 

###### Create DF

In [135]:
# Remove features containing the following strings
drop_substrings = (
    "localizations",
)
df_vid = df.loc[:, [col for col in df.columns if not any(d in col for d in drop_substrings)]]

# Convert duration to time delta
df_vid["contentDetails.duration"] = pd.to_timedelta(df_vid["contentDetails.duration"].str.slice(start=2)).dt.seconds

# Get only features that matter
df_vid = df_vid.loc[:, list(feats_vid.keys())]

  data = objects_to_td64ns(data, unit=unit, errors=errors)


###### Insert Data

In [136]:
# Append the video rows to the existing `videos` table. The DataFrame index
# is not written (index=False).
df_vid.to_sql("videos", engine, 
              index=False, if_exists="append",
              dtype=feats_vid)

2022-12-13 18:31:19,124 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-13 18:31:19,124 INFO sqlalchemy.engine.Engine [cached since 3923s ago] ('trending', 'videos')
2022-12-13 18:31:19,125 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-13 18:31:19,126 INFO sqlalchemy.engine.Engine INSERT INTO videos (id, `queryTime`, `snippet.publishedAt`, `snippet.channelId`, `snippet.title`, `snippet.description`, `snippet.channelTitle`, `snippet.categoryId`, `statistics.viewCount`, `statistics.likeCount`, `statistics.favoriteCount`, `statistics.commentCount`, `contentDetails.duration`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
2022-12-13 18:31:19,127 INFO sqlalchemy.engine.Engine [generated in 0.00051s] (('cqGjhVJWtEg', datetime.datetime(2022, 12, 14, 2, 31, 10, tzinfo=<UTC>), datetime.datetime(2022, 12, 13, 17, 0, 7, tzinfo=<UTC>), 'UCz97F7dMxBNOfGYu3rx8aCw', 'SPIDER-MAN: ACROSS THE SPIDER-VE

50

#### Tags

In [144]:
# Get dictionary of relavant features w/ dtypes
feats_tags = {
    "id": sqa.Text,
    "queryTime": sqa.DateTime,
    
    "snippet.tags": sqa.Text, 
}

In [145]:
# Drop table if exists
engine.execute("DROP TABLE IF EXISTS tags")

# Create table
meta = sqa.MetaData()
tags = sqa.Table(
    "tags", meta,
    sqa.Column("index", sqa.Integer, primary_key=True),
    *[sqa.Column(name, dtype) for name, dtype in feats_tags.items()],
)
meta.create_all(engine)

2022-12-13 18:33:00,502 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS tags
2022-12-13 18:33:00,502 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-13 18:33:00,506 INFO sqlalchemy.engine.Engine COMMIT
2022-12-13 18:33:00,508 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-13 18:33:00,508 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-13 18:33:00,508 INFO sqlalchemy.engine.Engine [cached since 4024s ago] ('trending', 'tags')
2022-12-13 18:33:00,510 INFO sqlalchemy.engine.Engine 
CREATE TABLE tags (
	`index` INTEGER NOT NULL AUTO_INCREMENT, 
	id TEXT, 
	`queryTime` DATETIME, 
	`snippet.tags` TEXT, 
	PRIMARY KEY (`index`)
)


2022-12-13 18:33:00,510 INFO sqlalchemy.engine.Engine [no key 0.00026s] ()
2022-12-13 18:33:00,516 INFO sqlalchemy.engine.Engine COMMIT


In [151]:
# One row per (video, tag) pair: select the key columns and unpack each
# video's tag list into separate rows.
# NOTE: rows with a missing tag are kept, so a video without tags still
# contributes one row; add .dropna(subset=["snippet.tags"]) to exclude them.
df_tags = df[["id", "queryTime", "snippet.tags"]].explode("snippet.tags")

df_tags

Unnamed: 0,id,queryTime,snippet.tags
0,cqGjhVJWtEg,2022-12-14 02:31:10+00:00,
1,ppwvzOH1sJE,2022-12-14 02:31:10+00:00,vh1
1,ppwvzOH1sJE,2022-12-14 02:31:10+00:00,rupaul
1,ppwvzOH1sJE,2022-12-14 02:31:10+00:00,rupaul's drag race
1,ppwvzOH1sJE,2022-12-14 02:31:10+00:00,drag race
...,...,...,...
49,vHtqsuA8WJ4,2022-12-14 02:31:10+00:00,mrbeast pewdiepie
49,vHtqsuA8WJ4,2022-12-14 02:31:10+00:00,mr beast
49,vHtqsuA8WJ4,2022-12-14 02:31:10+00:00,mrbeast reaction
49,vHtqsuA8WJ4,2022-12-14 02:31:10+00:00,mrbeast reacts


In [147]:
# Append the exploded tag rows to the existing `tags` table. The DataFrame
# index is not written (index=False).
df_tags.to_sql("tags", engine, 
               index=False, if_exists="append",
               dtype=feats_tags)

2022-12-13 18:33:02,117 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-13 18:33:02,117 INFO sqlalchemy.engine.Engine [cached since 4026s ago] ('trending', 'tags')
2022-12-13 18:33:02,119 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-13 18:33:02,120 INFO sqlalchemy.engine.Engine INSERT INTO tags (id, `queryTime`, `snippet.tags`) VALUES (%s, %s, %s)
2022-12-13 18:33:02,121 INFO sqlalchemy.engine.Engine [generated in 0.00097s] (('cqGjhVJWtEg', datetime.datetime(2022, 12, 14, 2, 31, 10, tzinfo=<UTC>), None), ('ppwvzOH1sJE', datetime.datetime(2022, 12, 14, 2, 31, 10, tzinfo=<UTC>), 'vh1'), ('ppwvzOH1sJE', datetime.datetime(2022, 12, 14, 2, 31, 10, tzinfo=<UTC>), 'rupaul'), ('ppwvzOH1sJE', datetime.datetime(2022, 12, 14, 2, 31, 10, tzinfo=<UTC>), "rupaul's drag race"), ('ppwvzOH1sJE', datetime.datetime(2022, 12, 14, 2, 31, 10, tzinfo=<UTC>), 'drag race'), ('T6Yn4cfXt9o', datetime.datetime(2022, 12, 14,

716

#### Thumbnails

In [182]:
# Get dictionary of relavant features w/ dtypes
feats_thumb = {
    "id": sqa.Text,
    "queryTime": sqa.DateTime,
    
    "default.url": sqa.Text,
    "default.width": sqa.Integer,
    "default.height": sqa.Integer,

    "medium.url": sqa.Text,
    "medium.width": sqa.Integer,
    "medium.height": sqa.Integer,

    "high.url": sqa.Text,
    "high.width": sqa.Integer,
    "high.height": sqa.Integer,

    "standard.url": sqa.Text,
    "standard.width": sqa.Integer,
    "standard.height": sqa.Integer,

    "maxres.url": sqa.Text,
    "maxres.width": sqa.Integer,
    "maxres.height": sqa.Integer,
}

In [183]:
# Drop table if exists
engine.execute("DROP TABLE IF EXISTS thumbnails")

# Create table
meta = sqa.MetaData()
thumb = sqa.Table(
    "thumbnails", meta,
    sqa.Column("index", sqa.Integer, primary_key=True),
    *[sqa.Column(name, dtype) for name, dtype in feats_thumb.items()],
)
meta.create_all(engine)

2022-12-13 18:55:00,678 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS thumbnails
2022-12-13 18:55:00,679 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-13 18:55:00,681 INFO sqlalchemy.engine.Engine COMMIT
2022-12-13 18:55:00,683 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-13 18:55:00,684 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-13 18:55:00,684 INFO sqlalchemy.engine.Engine [cached since 5345s ago] ('trending', 'thumbnails')
2022-12-13 18:55:00,686 INFO sqlalchemy.engine.Engine 
CREATE TABLE thumbnails (
	`index` INTEGER NOT NULL AUTO_INCREMENT, 
	id TEXT, 
	`queryTime` DATETIME, 
	`default.url` TEXT, 
	`default.width` INTEGER, 
	`default.height` INTEGER, 
	`medium.url` TEXT, 
	`medium.width` INTEGER, 
	`medium.height` INTEGER, 
	`high.url` TEXT, 
	`high.width` INTEGER, 
	`high.height` INTEGER, 
	`standard.url` TEXT, 
	`standard.width` INTEGER, 
	`standard.height` INTEGER, 
	`maxr

In [189]:
# Get thumbnail data
df_thumb = pd.json_normalize(df["snippet.thumbnails"])

# Save timestamp
df_thumb["queryTime"] = current_time
df_thumb["queryTime"] = pd.to_datetime(df_thumb["queryTime"], utc=True)

# Get id from thumbnail name
df_thumb["id"] = df_thumb["default.url"].str.split("/").apply(lambda x: x[-2])



In [190]:
# Append the thumbnail rows to the existing `thumbnails` table. The DataFrame
# index is not written (index=False).
df_thumb.to_sql("thumbnails", engine, 
                index=False, if_exists="append",
                dtype=feats_thumb)

2022-12-13 18:56:54,893 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-12-13 18:56:54,894 INFO sqlalchemy.engine.Engine [cached since 5459s ago] ('trending', 'thumbnails')
2022-12-13 18:56:54,895 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-13 18:56:54,896 INFO sqlalchemy.engine.Engine INSERT INTO thumbnails (`default.url`, `default.width`, `default.height`, `medium.url`, `medium.width`, `medium.height`, `high.url`, `high.width`, `high.height`, `standard.url`, `standard.width`, `standard.height`, `maxres.url`, `maxres.width`, `maxres.height`, `queryTime`, id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
2022-12-13 18:56:54,896 INFO sqlalchemy.engine.Engine [generated in 0.00049s] (('https://i.ytimg.com/vi/cqGjhVJWtEg/default.jpg', 120, 90, 'https://i.ytimg.com/vi/cqGjhVJWtEg/mqdefault.jpg', 320, 180, 'https://i.ytimg.com/vi/cqGjhVJWtEg/hqdefault.jpg', 480, 360, 'https

50