In [1]:
import os
if os.path.split(os.getcwd())[1] != "MRB_II":
    %cd ../../

C:\Users\eric\Documents\DIGIPEN\PersonalSVN\Fall22SVN\CSP400\MRB_II


In [27]:
# Global
import math
from datetime import datetime as dt

import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import nltk

from collections import Counter

# Local
from yt_utils.yt_categories import YouTubeCategories

In [3]:
# Load the dataset from the remotely hosted feather file into `df`
# (requires network access to the URL below).
df = pd.read_feather("https://squeemos.pythonanywhere.com/static/yt_categories.feather")

In [4]:
# Build the project-local YouTubeCategories helper from the remote JSON file.
# Later cells read `categories.id_to_title` to turn category ids into titles.
categories = YouTubeCategories("https://squeemos.pythonanywhere.com/static/video_categories.json")
# Trailing expression: display the object's repr as the cell output
categories

<yt_utils.yt_categories.YouTubeCategories at 0x136f5647760>

In [5]:
# Make sure the NLTK "stopwords" corpus is available locally
nltk.download('stopwords')
# English stopword list, consulted by __tokenize_tags when filtering tag words
STOPWORDS = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Base

In [6]:
def __tokenize_tags(tags):
    """Flatten a video's tags into lowercase words with stopwords removed.

    Args:
        tags: Iterable of tag strings; a single tag may contain several words.

    Returns:
        list: The whitespace-separated, lowercased words of all tags, in their
        original order and with duplicates kept, excluding every word found in
        the module-level ``STOPWORDS``.
    """
    # Hash the stopwords once per call so each membership test is O(1) instead
    # of a linear scan over the STOPWORDS list for every word.
    stopwords = frozenset(STOPWORDS)
    words = " ".join(tags).lower().split()
    return [word for word in words if word not in stopwords]

In [7]:
# Remove features containing the following strings
drops = (
    "localizations", "liveStreamingDetails", "recordingDetails",
    "regionRestriction", "ytRating", "thumbnails", "defaultLanguage",
)
tag_df = df.loc[:, [col for col in df.columns if not any(d in col for d in drops)]]

# Keep only rows queried within 30 days of the newest query. An explicit cutoff
# replaces DataFrame.last("30D"), which is deprecated in recent pandas and only
# selects the right rows when the index is already sorted.
tag_df = tag_df.set_index("queryTime")
cutoff = tag_df.index.max() - pd.Timedelta(days=30)
tag_df = tag_df.loc[tag_df.index > cutoff].reset_index()

# Drop videos without tags, then add tokenized/lowered/stopword-free tags
tag_df = tag_df.dropna(subset=["snippet.tags"])
tag_df["tags"] = tag_df["snippet.tags"].apply(__tokenize_tags)

In [8]:
# Display the filtered frame (the rich output is truncated to the first and last rows)
tag_df

Unnamed: 0,queryTime,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.channelTitle,snippet.tags,...,status.embeddable,status.publicStatsViewable,status.madeForKids,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,player.embedHtml,topicDetails.topicCategories,tags
0,2022-11-17 21:01:06+00:00,youtube#video,LzoGT4ptNNXrLwXcL_aLQw32C0E,_m-gO0HSCYk,2022-11-13 05:51:19+00:00,UCqFzWxSCi39LnW1JKFR3efg,Dave Chappelle Stand-Up Monologue - SNL,"Dave Chappelle talks about Kanye West, the 202...",Saturday Night Live,"[SNL11122022, snl, saturday night live, snl 48...",...,True,True,False,9746220,273759.0,0,33872.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[snl11122022, snl, saturday, night, live, snl,..."
1,2022-11-17 21:01:06+00:00,youtube#video,qHPlSgGcBIzbYwH6PPEgYH75r_w,6V_sEqfIL9Q,2022-11-16 09:00:14+00:00,UCMtFAi84ehTSYSE9XoHefig,"Jon Stewart On Dave Chappelle, Kyrie Irving, A...",Jon Stewart gives his thoughtful take on the c...,The Late Show with Stephen Colbert,"[The Late Show, Late Show, Stephen Colbert, St...",...,True,True,False,2674894,53422.0,0,7796.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[late, show, late, show, stephen, colbert, ste..."
2,2022-11-17 21:01:06+00:00,youtube#video,1ye0-8lobh-MKAHG8H7vsMB_fdE,m1y6aXeybhI,2022-11-16 02:29:00+00:00,UCwWhs_6x42TyRM4Wstoq8HA,Kari Lake Questions Election Results & Taylor ...,"Kari Lake denies the Arizona election results,...",The Daily Show with Trevor Noah,"[the daily show, trevor noah, daily show with ...",...,True,False,False,2042119,45640.0,0,3081.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[daily, show, trevor, noah, daily, show, trevo..."
4,2022-11-17 21:01:06+00:00,youtube#video,1XUn1ZBUbIP4Rs8k1mRPM3RqPJk,ByF3xP5kNlo,2022-11-14 13:49:13+00:00,UCt7H1EuNhttU64JGivX_9uw,"Chappelle on why Trump is ""so loved"" in Ohio!!",,The Trumpest,"[usa, news, foxnews, America, trump, trump2024...",...,True,True,False,1567811,153317.0,0,3991.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[usa, news, foxnews, america, trump, trump2024..."
5,2022-11-17 21:01:06+00:00,youtube#video,KNji9dFzWP1WsEPGh3Lk1bn6-gg,KIReDVQMtBA,2022-11-15 01:00:09+00:00,UC_hK9fOxyy_TM8FJGXIyG8Q,"EVIL FOSTER MOM Won't Feed Child, She Lives To...","💥NEW Pop Art Merch Drop OUT NOW, shop here: ht...",Dhar Mann,"[Dhar Mann, motivation, motivational video, in...",...,True,True,False,4526869,92352.0,0,6226.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[dhar, mann, motivation, motivational, video, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156005,2022-11-29 00:02:29+00:00,youtube#video,Uzppeh6TKar1_uQaq8xVvnCTYsU,j-FOnsbjHvs,2022-11-25 21:00:22+00:00,UCdC0An4ZPNr_YiFiYoVbwaw,The Pilot Turned Off Gravity,"Hello everyone, this is YOUR Daily Dose of Int...",Daily Dose Of Internet,"[Internet, Internet videos, Daily Dose, Animal...",...,True,True,False,4336650,205512.0,0,6195.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[internet, internet, videos, daily, dose, anim..."
156006,2022-11-29 00:02:29+00:00,youtube#video,9gCi_0x-kHSQUeoQ6aHwWAQif5E,82gYn_QdhpU,2022-11-18 19:00:04+00:00,UC9VX0KXNH20x9MCH3xGjisg,Tanners Last Video 😢 | Buying Whatever The Mag...,"In the episode of Dope or Nope, we say our las...",DOPE or NOPE,"[dope or nope, dope, matthias, hi5 studios, fu...",...,True,False,False,639746,33037.0,0,2119.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[dope, nope, dope, matthias, hi5, studios, fun..."
156007,2022-11-29 00:02:29+00:00,youtube#video,gEH2KI4N2a1KQu7PumA09DKYwYo,KDMw9pQ5RAk,2022-11-13 13:00:18+00:00,UCe8D5lo_zrqS8xtsNspeCgg,Gaming kid couldn’t charge his #iphone 🔋Batter...,@MONEY TALKS WIRELESS \n\nMerch : www.mtwshop....,MONEY TALKS WIRELESS,"[Apple, iPhone, iPad, Android, Samsung, MacBoo...",...,True,True,False,8111906,536376.0,0,6442.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[apple, iphone, ipad, android, samsung, macboo..."
156009,2022-11-29 00:02:29+00:00,youtube#video,NZWWYs4Ny90OgkxQkfVlZ0jfprc,QiYvXKQksgI,2022-11-23 11:55:29+00:00,UC6n8I1UDTKP1IWjQMg6_TwA,The Insane Scale of Europe’s New Mega-Tunnel,Denmark is building a record-breaking tunnel t...,The B1M,"[B1M, TheB1M, Construction, architecture, engi...",...,True,False,False,2618099,62146.0,0,4453.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[b1m, theb1m, construction, architecture, engi..."


### Hist

In [9]:
# Keep one row per video id: the last-occurring row for each id (the newest
# observation, assuming rows are in query-time order — TODO confirm).
latest_videos = tag_df.drop_duplicates(subset="id", keep="last", ignore_index=True)

# Gather every token of every video in a category into one list. Flattening in
# a single pass avoids the repeated list copying that summing lists with `+`
# performs for each additional video.
cat_tags = latest_videos.groupby("snippet.categoryId")["tags"].apply(
    lambda token_lists: [token for tokens in token_lists for token in tokens]
)

cat_tags

snippet.categoryId
1     [packing, traveling, vacation, sister, forever...
2     [thestradman, stradman, supercars, super, cars...
10    [sabrina, carpenter, nonsense, island, records...
15    [pet, collective, pet, collective, animals, fu...
17    [eagles, vs, commanders, philadelphia, eagles,...
19    [vanlife, solofemalevanlife, van, life, living...
20    [funny, moments, montage, video, games, gaming...
22    [cboystv, funny, friends, fast, facebook, mark...
23    [largecat, meme, compilation, funny, memes, tr...
24    [#blackgirlnews, #leahgordone, #lostbrowngirls...
25    [lawrence, o'donnell, health, international, n...
26    [spy, ninjas, spy, ninja, chad, wild, clay, cw...
27    [veritasium, science, physics, smart, nonsense...
28    [adobe, pantone, creative, cloud, creative, su...
Name: tags, dtype: object

In [10]:
# Move the category id out of the index into a regular column, then look up
# the human-readable title for every id
cat_tags = cat_tags.reset_index()
cat_tags["category"] = [
    categories.id_to_title[category_id]
    for category_id in cat_tags["snippet.categoryId"]
]
cat_tags

Unnamed: 0,snippet.categoryId,tags,category
0,1,"[packing, traveling, vacation, sister, forever...",Film & Animation
1,2,"[thestradman, stradman, supercars, super, cars...",Autos & Vehicles
2,10,"[sabrina, carpenter, nonsense, island, records...",Music
3,15,"[pet, collective, pet, collective, animals, fu...",Pets & Animals
4,17,"[eagles, vs, commanders, philadelphia, eagles,...",Sports
5,19,"[vanlife, solofemalevanlife, van, life, living...",Travel & Events
6,20,"[funny, moments, montage, video, games, gaming...",Gaming
7,22,"[cboystv, funny, friends, fast, facebook, mark...",People & Blogs
8,23,"[largecat, meme, compilation, funny, memes, tr...",Comedy
9,24,"[#blackgirlnews, #leahgordone, #lostbrowngirls...",Entertainment


In [11]:
# Replace each category's token list with a {token: count} dict holding only
# its 10 most frequent tokens
cat_tags["tags"] = cat_tags["tags"].apply(
    lambda tokens: dict(Counter(tokens).most_common(10))
)
cat_tags

Unnamed: 0,snippet.categoryId,tags,category
0,1,"{'vlogs': 16, 'leroy': 15, 'movie': 13, 'forev...",Film & Animation
1,2,"{'road': 32, 'recovery': 30, 'car': 19, 'lambo...",Autos & Vehicles
2,10,"{'eladio': 41, 'sauce': 39, 'video': 30, 'boyz...",Music
3,15,"{'animals': 28, 'dog': 22, 'fish': 15, 'funny'...",Pets & Animals
4,17,"{'football': 88, 'first': 87, 'nba': 84, 'take...",Sports
5,19,"{'van': 10, 'life': 7, 'bec': 4, 'vanlife': 3,...",Travel & Events
6,20,"{'minecraft': 592, 'roblox': 484, '2': 390, 'f...",Gaming
7,22,"{'vlog': 56, 'family': 45, 'life': 39, 'funny'...",People & Blogs
8,23,"{'funny': 89, 'comedy': 88, 'show': 45, 'memes...",Comedy
9,24,"{'minecraft': 236, 'funny': 161, 'laugh': 108,...",Entertainment


# Time Series

In [28]:
# Index the tag frame by query timestamp (set_index returns a new frame, so
# tag_df itself is left untouched)
month_df = tag_df.set_index("queryTime")

# Natural log of each video's view count
month_df["logViews"] = np.log(month_df["statistics.viewCount"])

# Whole days elapsed since the earliest query. Using min() instead of index[0]
# keeps the offsets non-negative and correct even if rows are not time-sorted.
month_df["days"] = (month_df.index - month_df.index.min()).days

In [29]:
%%time
# Change to dicts with view count for values
month_df["tagViews"] = month_df.apply(lambda row: Counter({tag: row["logViews"] for tag in row["snippet.tags"]}), axis=1)

# Consolidate tag views
day_views = pd.DataFrame(month_df.groupby(["days", "snippet.categoryId"])["tagViews"].sum())
day_views = day_views.reset_index()

# Get top tags per category for entire month
cat_views = pd.DataFrame(day_views.groupby("snippet.categoryId")["tagViews"].sum())
cat_views.columns = ["catViews"]
cat_views["topTags"] = cat_views["catViews"].apply(lambda x: list(dict(x.most_common(10)).keys()))

# Get viewcounts for top tags across all days
view_df = pd.merge(day_views, cat_views["topTags"], how="left", on="snippet.categoryId")
view_df["topViews"] = view_df.apply(lambda row: {k: v for k, v in row["tagViews"].items() if k in row["topTags"]}, axis=1)
view_df

KeyError: 'category'

In [26]:
# Select category
category_name = "Gaming"

# Derive each row's category title from its id here instead of relying on a
# "category" column being present in view_df, then order the rows by day
row_titles = view_df["snippet.categoryId"].apply(lambda cid: categories.id_to_title[cid])
cat_day = view_df[row_titles == category_name].sort_values("days")
if cat_day.empty:
    raise ValueError(f"No rows found for category '{category_name}'")

# One column per top tag; a tag that does not appear on a given day scores 0
# (indexing with x[tag] would raise KeyError on those days)
top_tags = cat_day["topTags"].iloc[0]
day_data = pd.DataFrame(
    {tag: [views.get(tag, 0) for views in cat_day["topViews"]] for tag in top_tags}
)

# Create df for vis: use the real day offsets for the x axis rather than the
# row position, so days without data for this category do not shift the series
day_data.insert(0, "day", cat_day["days"].to_numpy())
day_data = day_data.melt(id_vars="day", var_name="tag")

# Plot
fig = px.line(day_data, x="day", y="value", color="tag")
fig.update_layout(
    title_text=f"Monthly Popularity of '{category_name}' Tags", title_x=0.5,
    yaxis_title="Cumulative log(Views)"
)
fig.show()

NameError: name 'px' is not defined