# Questions

* Feature exploration
* Classifier
    * Idea
        * Create explainable classifer that predicts video category and explains the discriminating features by which the decision is made.
    * Applications
        * Submit video link/id to show category information.
        * Fill in title, description, thumbnail, tags, etc. to show category probability distribution.
* Thumbnail analysis
* NLP
    * Title analysis
    * Tag compositions
    * Description

---

# Setup

In [1]:
import os
if os.path.split(os.getcwd())[1] != "MRB_II":
    %cd ../../

D:\Documents\A_DIGIPEN\PersonalSVN\Fall22SVN\CSP400\MRB_II


In [2]:
# Global
import math
from datetime import datetime as dt

import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import nltk

from collections import Counter

# Local
from yt_utils.yt_accessor import YouTubeAccessor
from yt_utils.yt_categories import YouTubeCategories

In [3]:
pd.set_option('display.max_columns', 100)

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### Load Data

In [4]:
raw = pd.read_feather("https://squeemos.pythonanywhere.com/static/yt_categories.feather")

In [5]:
raw

Unnamed: 0,queryTime,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,snippet.thumbnails.medium.url,snippet.thumbnails.medium.width,snippet.thumbnails.medium.height,snippet.thumbnails.high.url,snippet.thumbnails.high.width,snippet.thumbnails.high.height,snippet.thumbnails.standard.url,snippet.thumbnails.standard.width,snippet.thumbnails.standard.height,snippet.thumbnails.maxres.url,snippet.thumbnails.maxres.width,snippet.thumbnails.maxres.height,snippet.channelTitle,snippet.categoryId,snippet.liveBroadcastContent,snippet.localized.title,snippet.localized.description,contentDetails.duration,contentDetails.dimension,contentDetails.definition,contentDetails.caption,contentDetails.licensedContent,contentDetails.projection,status.uploadStatus,status.privacyStatus,status.license,status.embeddable,status.publicStatsViewable,status.madeForKids,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,player.embedHtml,topicDetails.topicCategories,snippet.tags,snippet.defaultAudioLanguage,contentDetails.regionRestriction.blocked,snippet.defaultLanguage,...,localizations.sv.title,localizations.sv.description,localizations.el.title,localizations.el.description,localizations.sk.title,localizations.sk.description,localizations.lt.title,localizations.lt.description,localizations.cs.title,localizations.cs.description,localizations.lv.title,localizations.lv.description,localizations.ro.title,localizations.ro.description,localizations.hr.title,localizations.hr.description,localizations.fil.title,localizations.fil.description,localizations.mn.title,localizations.mn.description,localizations.fi.title,localizations.fi.description,localizations.et.title,localizations.et.description,localizations.it.title,localizations.it.description,localizations.da.title,localizations.da.description,localizations.bg.title,localizations.bg.description,localizations.sr.title,localizations.sr.description,localizations.kk.title,localizations.kk.description,localizations.es-ES.title,localizations.es-ES.description,localizations.es-MX.title,localizations.es-MX.description,localizations.de-DE.title,localizations.de-DE.description,localizations.fr-FR.title,localizations.fr-FR.description,localizations.pt-BR.title,localizations.pt-BR.description,localizations.zh-TW.title,localizations.zh-TW.description,localizations.zh-CN.title,localizations.zh-CN.description,localizations.en-CA.title,localizations.en-CA.description
0,2022-10-13 03:00:47+00:00,youtube#video,Zg6LxNWRUK0wxJdkyCxkzZZrrJM,pFI3hVpV9S8,2022-10-07 01:47:40+00:00,UCYIEv9W7RmdpvFkHX7IEmyg,THE HAVE IT ALL TOUR STARTS TOMORROW! 🤩 #short...,,https://i.ytimg.com/vi/pFI3hVpV9S8/default.jpg,120,90,https://i.ytimg.com/vi/pFI3hVpV9S8/mqdefault.jpg,320,180,https://i.ytimg.com/vi/pFI3hVpV9S8/hqdefault.jpg,480,360,https://i.ytimg.com/vi/pFI3hVpV9S8/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/pFI3hVpV9S8/maxresdefau...,1280.0,720.0,Taylor Tomlinson,23,none,THE HAVE IT ALL TOUR STARTS TOMORROW! 🤩 #short...,,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,9162782,787058.0,0,4504.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...",,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2022-10-13 03:00:47+00:00,youtube#video,y9OsgzQdx0AFBXSyUZZU6ECoRW4,UIGNMccGh8w,2022-10-02 18:20:31+00:00,UCQz2x0BlBM9MVaUW_kYPosg,مشهد سينمائي من فيلم | roman j israel esq,#سينمائيونHD #أقتباسات_افلام #أقتباسات_سينمائي...,https://i.ytimg.com/vi/UIGNMccGh8w/default.jpg,120,90,https://i.ytimg.com/vi/UIGNMccGh8w/mqdefault.jpg,320,180,https://i.ytimg.com/vi/UIGNMccGh8w/hqdefault.jpg,480,360,https://i.ytimg.com/vi/UIGNMccGh8w/sddefault.jpg,640.0,480.0,,,,𝐑𝐢𝐜𝐤🎭,24,none,مشهد سينمائي من فيلم | roman j israel esq,#سينمائيونHD #أقتباسات_افلام #أقتباسات_سينمائي...,PT52S,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,15589366,1036675.0,0,23024.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[اقتباسات افلام, اقتباسات واقوال, اقتباسات أفل...",ar,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2022-10-13 03:00:47+00:00,youtube#video,bSQUt0FboQ9S9qdcHGViJzEovgA,7-U1EUqTGFA,2022-10-09 14:36:05+00:00,UCGpC5tALOoJ8M7axkm9n9vQ,Bart's a nerd! | The Simpsons #shorts,"Nice shoes, uh... two feet",https://i.ytimg.com/vi/7-U1EUqTGFA/default.jpg,120,90,https://i.ytimg.com/vi/7-U1EUqTGFA/mqdefault.jpg,320,180,https://i.ytimg.com/vi/7-U1EUqTGFA/hqdefault.jpg,480,360,https://i.ytimg.com/vi/7-U1EUqTGFA/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/7-U1EUqTGFA/maxresdefau...,1280.0,720.0,Perfectly Cromulent,1,none,Bart's a nerd! | The Simpsons #shorts,"Nice shoes, uh... two feet",PT1M,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,5548482,694632.0,0,1013.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...",,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2022-10-13 03:00:47+00:00,youtube#video,MHIIeS45a6qMKEDn2-pFz5nTLkc,SS7HXxy3_2c,2022-10-09 05:38:34+00:00,UCqFzWxSCi39LnW1JKFR3efg,Try Guys - SNL,A CNN broadcast is interrupted by breaking new...,https://i.ytimg.com/vi/SS7HXxy3_2c/default.jpg,120,90,https://i.ytimg.com/vi/SS7HXxy3_2c/mqdefault.jpg,320,180,https://i.ytimg.com/vi/SS7HXxy3_2c/hqdefault.jpg,480,360,,,,https://i.ytimg.com/vi/SS7HXxy3_2c/maxresdefau...,1280.0,720.0,Saturday Night Live,23,none,Try Guys - SNL,A CNN broadcast is interrupted by breaking new...,PT5M52S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,2069780,31249.0,0,18810.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[SNL10082022, snl, saturday night live, snl 48...",en,"[AD, AE, AF, AG, AI, AO, AR, AS, AU, AW, BB, B...",,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2022-10-13 03:00:47+00:00,youtube#video,pvxaPRxR_IfrhyuT32U7XIBBjBs,oibJn6Ct1tg,2022-10-03 17:21:17+00:00,UC7bouvhSTd2RQwYOi7zq0hQ,Interrupted in San Diego #shorts #comedy #funny,i appreciate the gift but not the poor crowd e...,https://i.ytimg.com/vi/oibJn6Ct1tg/default.jpg,120,90,https://i.ytimg.com/vi/oibJn6Ct1tg/mqdefault.jpg,320,180,https://i.ytimg.com/vi/oibJn6Ct1tg/hqdefault.jpg,480,360,https://i.ytimg.com/vi/oibJn6Ct1tg/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/oibJn6Ct1tg/maxresdefau...,1280.0,720.0,Stavros Halkias,23,none,Interrupted in San Diego #shorts #comedy #funny,i appreciate the gift but not the poor crowd e...,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,8233551,659746.0,0,2802.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[stand up comedy, crowd work comedy, crowdwork...",en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238708,2022-10-30 05:00:55+00:00,youtube#video,nQl9esYAJFcCtTKT8NYs25_hEQU,Ix28jjDCEh4,2022-10-26 00:39:24+00:00,UC_Vl1oLTGjWYJLmbTpaqorQ,"How Ned Fulmer Ruined the Try Guys Reputation,...",Thanks to Established Titles for sponsoring th...,https://i.ytimg.com/vi/Ix28jjDCEh4/default.jpg,120,90,https://i.ytimg.com/vi/Ix28jjDCEh4/mqdefault.jpg,320,180,https://i.ytimg.com/vi/Ix28jjDCEh4/hqdefault.jpg,480,360,https://i.ytimg.com/vi/Ix28jjDCEh4/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/Ix28jjDCEh4/maxresdefau...,1280.0,720.0,Spill,24,none,"How Ned Fulmer Ruined the Try Guys Reputation,...",Thanks to Established Titles for sponsoring th...,PT51M15S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,375729,14411.0,0,813.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[ned & ariel, try guys, keith, ned fulmer, ned...",en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
238709,2022-10-30 05:00:55+00:00,youtube#video,4cvwwRc3rfMUQiK0A8RkEDDWarg,3tRsaJ2AuU0,2022-10-23 17:00:29+00:00,UC5Qbo0AR3CwpmEq751BIy0g,10 NEW Costco Deals You NEED To Buy in October...,So many people have shopped at Costco but not ...,https://i.ytimg.com/vi/3tRsaJ2AuU0/default.jpg,120,90,https://i.ytimg.com/vi/3tRsaJ2AuU0/mqdefault.jpg,320,180,https://i.ytimg.com/vi/3tRsaJ2AuU0/hqdefault.jpg,480,360,https://i.ytimg.com/vi/3tRsaJ2AuU0/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/3tRsaJ2AuU0/maxresdefau...,1280.0,720.0,The Deal Guy,28,none,10 NEW Costco Deals You NEED To Buy in October...,So many people have shopped at Costco but not ...,PT15M24S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,897609,32929.0,0,978.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[the deal guy, costco, costco store, costco se...",en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
238710,2022-10-30 05:00:55+00:00,youtube#video,lWOayebu2zA79Nvx6uSWME6erjw,VyoPSKJcBII,2022-10-25 15:00:33+00:00,UCvRHSRvLxt9JwRaAeVI1sew,Water test,"If you looking for me on other social media, h...",https://i.ytimg.com/vi/VyoPSKJcBII/default.jpg,120,90,https://i.ytimg.com/vi/VyoPSKJcBII/mqdefault.jpg,320,180,https://i.ytimg.com/vi/VyoPSKJcBII/hqdefault.jpg,480,360,https://i.ytimg.com/vi/VyoPSKJcBII/sddefault.jpg,640.0,480.0,,,,James Butler,24,none,Water test,"If you looking for me on other social media, h...",PT54S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,155413,17345.0,0,301.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,,en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
238711,2022-10-30 05:00:55+00:00,youtube#video,K9wfRdeYdpcdWRtyWeRJ8bQ16sk,XSajdGCFJ_s,2022-10-25 00:30:07+00:00,UCzcuUus-ef8RIHx63obZvbQ,The Salesman That Won't Let You Close The Door...,SUB TO MY CHANNEL → https://bit.ly/2XBJhd3​\n\...,https://i.ytimg.com/vi/XSajdGCFJ_s/default.jpg,120,90,https://i.ytimg.com/vi/XSajdGCFJ_s/mqdefault.jpg,320,180,https://i.ytimg.com/vi/XSajdGCFJ_s/hqdefault.jpg,480,360,https://i.ytimg.com/vi/XSajdGCFJ_s/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/XSajdGCFJ_s/maxresdefau...,1280.0,720.0,More of Dtay Known,24,none,The Salesman That Won't Let You Close The Door...,SUB TO MY CHANNEL → https://bit.ly/2XBJhd3​\n\...,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,762948,51170.0,0,108.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[Ransom, Parody, comedy, Kyle Exum, Dtay Known...",en-US,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
df = raw.set_index("queryTime")
df = df.last("30D").reset_index()
df

Unnamed: 0,queryTime,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,snippet.thumbnails.medium.url,snippet.thumbnails.medium.width,snippet.thumbnails.medium.height,snippet.thumbnails.high.url,snippet.thumbnails.high.width,snippet.thumbnails.high.height,snippet.thumbnails.standard.url,snippet.thumbnails.standard.width,snippet.thumbnails.standard.height,snippet.thumbnails.maxres.url,snippet.thumbnails.maxres.width,snippet.thumbnails.maxres.height,snippet.channelTitle,snippet.categoryId,snippet.liveBroadcastContent,snippet.localized.title,snippet.localized.description,contentDetails.duration,contentDetails.dimension,contentDetails.definition,contentDetails.caption,contentDetails.licensedContent,contentDetails.projection,status.uploadStatus,status.privacyStatus,status.license,status.embeddable,status.publicStatsViewable,status.madeForKids,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,player.embedHtml,topicDetails.topicCategories,snippet.tags,snippet.defaultAudioLanguage,contentDetails.regionRestriction.blocked,snippet.defaultLanguage,...,localizations.sv.title,localizations.sv.description,localizations.el.title,localizations.el.description,localizations.sk.title,localizations.sk.description,localizations.lt.title,localizations.lt.description,localizations.cs.title,localizations.cs.description,localizations.lv.title,localizations.lv.description,localizations.ro.title,localizations.ro.description,localizations.hr.title,localizations.hr.description,localizations.fil.title,localizations.fil.description,localizations.mn.title,localizations.mn.description,localizations.fi.title,localizations.fi.description,localizations.et.title,localizations.et.description,localizations.it.title,localizations.it.description,localizations.da.title,localizations.da.description,localizations.bg.title,localizations.bg.description,localizations.sr.title,localizations.sr.description,localizations.kk.title,localizations.kk.description,localizations.es-ES.title,localizations.es-ES.description,localizations.es-MX.title,localizations.es-MX.description,localizations.de-DE.title,localizations.de-DE.description,localizations.fr-FR.title,localizations.fr-FR.description,localizations.pt-BR.title,localizations.pt-BR.description,localizations.zh-TW.title,localizations.zh-TW.description,localizations.zh-CN.title,localizations.zh-CN.description,localizations.en-CA.title,localizations.en-CA.description
0,2022-10-13 03:00:47+00:00,youtube#video,Zg6LxNWRUK0wxJdkyCxkzZZrrJM,pFI3hVpV9S8,2022-10-07 01:47:40+00:00,UCYIEv9W7RmdpvFkHX7IEmyg,THE HAVE IT ALL TOUR STARTS TOMORROW! 🤩 #short...,,https://i.ytimg.com/vi/pFI3hVpV9S8/default.jpg,120,90,https://i.ytimg.com/vi/pFI3hVpV9S8/mqdefault.jpg,320,180,https://i.ytimg.com/vi/pFI3hVpV9S8/hqdefault.jpg,480,360,https://i.ytimg.com/vi/pFI3hVpV9S8/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/pFI3hVpV9S8/maxresdefau...,1280.0,720.0,Taylor Tomlinson,23,none,THE HAVE IT ALL TOUR STARTS TOMORROW! 🤩 #short...,,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,9162782,787058.0,0,4504.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...",,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2022-10-13 03:00:47+00:00,youtube#video,y9OsgzQdx0AFBXSyUZZU6ECoRW4,UIGNMccGh8w,2022-10-02 18:20:31+00:00,UCQz2x0BlBM9MVaUW_kYPosg,مشهد سينمائي من فيلم | roman j israel esq,#سينمائيونHD #أقتباسات_افلام #أقتباسات_سينمائي...,https://i.ytimg.com/vi/UIGNMccGh8w/default.jpg,120,90,https://i.ytimg.com/vi/UIGNMccGh8w/mqdefault.jpg,320,180,https://i.ytimg.com/vi/UIGNMccGh8w/hqdefault.jpg,480,360,https://i.ytimg.com/vi/UIGNMccGh8w/sddefault.jpg,640.0,480.0,,,,𝐑𝐢𝐜𝐤🎭,24,none,مشهد سينمائي من فيلم | roman j israel esq,#سينمائيونHD #أقتباسات_افلام #أقتباسات_سينمائي...,PT52S,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,15589366,1036675.0,0,23024.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[اقتباسات افلام, اقتباسات واقوال, اقتباسات أفل...",ar,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2022-10-13 03:00:47+00:00,youtube#video,bSQUt0FboQ9S9qdcHGViJzEovgA,7-U1EUqTGFA,2022-10-09 14:36:05+00:00,UCGpC5tALOoJ8M7axkm9n9vQ,Bart's a nerd! | The Simpsons #shorts,"Nice shoes, uh... two feet",https://i.ytimg.com/vi/7-U1EUqTGFA/default.jpg,120,90,https://i.ytimg.com/vi/7-U1EUqTGFA/mqdefault.jpg,320,180,https://i.ytimg.com/vi/7-U1EUqTGFA/hqdefault.jpg,480,360,https://i.ytimg.com/vi/7-U1EUqTGFA/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/7-U1EUqTGFA/maxresdefau...,1280.0,720.0,Perfectly Cromulent,1,none,Bart's a nerd! | The Simpsons #shorts,"Nice shoes, uh... two feet",PT1M,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,5548482,694632.0,0,1013.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...",,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2022-10-13 03:00:47+00:00,youtube#video,MHIIeS45a6qMKEDn2-pFz5nTLkc,SS7HXxy3_2c,2022-10-09 05:38:34+00:00,UCqFzWxSCi39LnW1JKFR3efg,Try Guys - SNL,A CNN broadcast is interrupted by breaking new...,https://i.ytimg.com/vi/SS7HXxy3_2c/default.jpg,120,90,https://i.ytimg.com/vi/SS7HXxy3_2c/mqdefault.jpg,320,180,https://i.ytimg.com/vi/SS7HXxy3_2c/hqdefault.jpg,480,360,,,,https://i.ytimg.com/vi/SS7HXxy3_2c/maxresdefau...,1280.0,720.0,Saturday Night Live,23,none,Try Guys - SNL,A CNN broadcast is interrupted by breaking new...,PT5M52S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,2069780,31249.0,0,18810.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[SNL10082022, snl, saturday night live, snl 48...",en,"[AD, AE, AF, AG, AI, AO, AR, AS, AU, AW, BB, B...",,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2022-10-13 03:00:47+00:00,youtube#video,pvxaPRxR_IfrhyuT32U7XIBBjBs,oibJn6Ct1tg,2022-10-03 17:21:17+00:00,UC7bouvhSTd2RQwYOi7zq0hQ,Interrupted in San Diego #shorts #comedy #funny,i appreciate the gift but not the poor crowd e...,https://i.ytimg.com/vi/oibJn6Ct1tg/default.jpg,120,90,https://i.ytimg.com/vi/oibJn6Ct1tg/mqdefault.jpg,320,180,https://i.ytimg.com/vi/oibJn6Ct1tg/hqdefault.jpg,480,360,https://i.ytimg.com/vi/oibJn6Ct1tg/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/oibJn6Ct1tg/maxresdefau...,1280.0,720.0,Stavros Halkias,23,none,Interrupted in San Diego #shorts #comedy #funny,i appreciate the gift but not the poor crowd e...,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,8233551,659746.0,0,2802.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[stand up comedy, crowd work comedy, crowdwork...",en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238708,2022-10-30 05:00:55+00:00,youtube#video,nQl9esYAJFcCtTKT8NYs25_hEQU,Ix28jjDCEh4,2022-10-26 00:39:24+00:00,UC_Vl1oLTGjWYJLmbTpaqorQ,"How Ned Fulmer Ruined the Try Guys Reputation,...",Thanks to Established Titles for sponsoring th...,https://i.ytimg.com/vi/Ix28jjDCEh4/default.jpg,120,90,https://i.ytimg.com/vi/Ix28jjDCEh4/mqdefault.jpg,320,180,https://i.ytimg.com/vi/Ix28jjDCEh4/hqdefault.jpg,480,360,https://i.ytimg.com/vi/Ix28jjDCEh4/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/Ix28jjDCEh4/maxresdefau...,1280.0,720.0,Spill,24,none,"How Ned Fulmer Ruined the Try Guys Reputation,...",Thanks to Established Titles for sponsoring th...,PT51M15S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,375729,14411.0,0,813.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[ned & ariel, try guys, keith, ned fulmer, ned...",en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
238709,2022-10-30 05:00:55+00:00,youtube#video,4cvwwRc3rfMUQiK0A8RkEDDWarg,3tRsaJ2AuU0,2022-10-23 17:00:29+00:00,UC5Qbo0AR3CwpmEq751BIy0g,10 NEW Costco Deals You NEED To Buy in October...,So many people have shopped at Costco but not ...,https://i.ytimg.com/vi/3tRsaJ2AuU0/default.jpg,120,90,https://i.ytimg.com/vi/3tRsaJ2AuU0/mqdefault.jpg,320,180,https://i.ytimg.com/vi/3tRsaJ2AuU0/hqdefault.jpg,480,360,https://i.ytimg.com/vi/3tRsaJ2AuU0/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/3tRsaJ2AuU0/maxresdefau...,1280.0,720.0,The Deal Guy,28,none,10 NEW Costco Deals You NEED To Buy in October...,So many people have shopped at Costco but not ...,PT15M24S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,897609,32929.0,0,978.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[the deal guy, costco, costco store, costco se...",en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
238710,2022-10-30 05:00:55+00:00,youtube#video,lWOayebu2zA79Nvx6uSWME6erjw,VyoPSKJcBII,2022-10-25 15:00:33+00:00,UCvRHSRvLxt9JwRaAeVI1sew,Water test,"If you looking for me on other social media, h...",https://i.ytimg.com/vi/VyoPSKJcBII/default.jpg,120,90,https://i.ytimg.com/vi/VyoPSKJcBII/mqdefault.jpg,320,180,https://i.ytimg.com/vi/VyoPSKJcBII/hqdefault.jpg,480,360,https://i.ytimg.com/vi/VyoPSKJcBII/sddefault.jpg,640.0,480.0,,,,James Butler,24,none,Water test,"If you looking for me on other social media, h...",PT54S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,155413,17345.0,0,301.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,,en,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
238711,2022-10-30 05:00:55+00:00,youtube#video,K9wfRdeYdpcdWRtyWeRJ8bQ16sk,XSajdGCFJ_s,2022-10-25 00:30:07+00:00,UCzcuUus-ef8RIHx63obZvbQ,The Salesman That Won't Let You Close The Door...,SUB TO MY CHANNEL → https://bit.ly/2XBJhd3​\n\...,https://i.ytimg.com/vi/XSajdGCFJ_s/default.jpg,120,90,https://i.ytimg.com/vi/XSajdGCFJ_s/mqdefault.jpg,320,180,https://i.ytimg.com/vi/XSajdGCFJ_s/hqdefault.jpg,480,360,https://i.ytimg.com/vi/XSajdGCFJ_s/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/XSajdGCFJ_s/maxresdefau...,1280.0,720.0,More of Dtay Known,24,none,The Salesman That Won't Let You Close The Door...,SUB TO MY CHANNEL → https://bit.ly/2XBJhd3​\n\...,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,762948,51170.0,0,108.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[Ransom, Parody, comedy, Kyle Exum, Dtay Known...",en-US,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238713 entries, 0 to 238712
Columns: 159 entries, queryTime to localizations.en-CA.description
dtypes: bool(5), datetime64[ns, UTC](2), float64(9), int64(9), object(134)
memory usage: 281.6+ MB


##### Load Categories

In [95]:
categories = YouTubeCategories("https://squeemos.pythonanywhere.com/static/video_categories.json")
categories

<yt_utils.yt_categories.YouTubeCategories at 0x28a2096c5e0>

In [96]:
categories.id_to_title[23]

'Comedy'

---

# Preprocess

In [137]:
df = raw.copy()

##### Drop Features

In [138]:
# For checking features that contain string for removal
fname = "contentDetails"
[col for col in df.columns if fname in col]

['contentDetails.duration',
 'contentDetails.dimension',
 'contentDetails.definition',
 'contentDetails.caption',
 'contentDetails.licensedContent',
 'contentDetails.projection',
 'contentDetails.regionRestriction.blocked',
 'contentDetails.regionRestriction.allowed',
 'contentDetails.contentRating.ytRating']

In [139]:
# Feature substrings to drop
drops = (
    "localizations", "liveStreamingDetails", "recordingDetails", 
    "regionRestriction", "ytRating", "thumbnails", "defaultLanguage",
)

# Remove all features with names
df = df.loc[:, [col for col in df.columns if not any(d in col for d in drops)]]

df

Unnamed: 0,queryTime,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.channelTitle,snippet.categoryId,snippet.liveBroadcastContent,snippet.localized.title,snippet.localized.description,contentDetails.duration,contentDetails.dimension,contentDetails.definition,contentDetails.caption,contentDetails.licensedContent,contentDetails.projection,status.uploadStatus,status.privacyStatus,status.license,status.embeddable,status.publicStatsViewable,status.madeForKids,statistics.viewCount,statistics.likeCount,statistics.favoriteCount,statistics.commentCount,player.embedHtml,topicDetails.topicCategories,snippet.tags,snippet.defaultAudioLanguage
0,2022-10-13 03:00:47+00:00,youtube#video,Zg6LxNWRUK0wxJdkyCxkzZZrrJM,pFI3hVpV9S8,2022-10-07 01:47:40+00:00,UCYIEv9W7RmdpvFkHX7IEmyg,THE HAVE IT ALL TOUR STARTS TOMORROW! 🤩 #short...,,Taylor Tomlinson,23,none,THE HAVE IT ALL TOUR STARTS TOMORROW! 🤩 #short...,,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,9162782,787058.0,0,4504.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...",,
1,2022-10-13 03:00:47+00:00,youtube#video,y9OsgzQdx0AFBXSyUZZU6ECoRW4,UIGNMccGh8w,2022-10-02 18:20:31+00:00,UCQz2x0BlBM9MVaUW_kYPosg,مشهد سينمائي من فيلم | roman j israel esq,#سينمائيونHD #أقتباسات_افلام #أقتباسات_سينمائي...,𝐑𝐢𝐜𝐤🎭,24,none,مشهد سينمائي من فيلم | roman j israel esq,#سينمائيونHD #أقتباسات_افلام #أقتباسات_سينمائي...,PT52S,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,15589366,1036675.0,0,23024.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[اقتباسات افلام, اقتباسات واقوال, اقتباسات أفل...",ar
2,2022-10-13 03:00:47+00:00,youtube#video,bSQUt0FboQ9S9qdcHGViJzEovgA,7-U1EUqTGFA,2022-10-09 14:36:05+00:00,UCGpC5tALOoJ8M7axkm9n9vQ,Bart's a nerd! | The Simpsons #shorts,"Nice shoes, uh... two feet",Perfectly Cromulent,1,none,Bart's a nerd! | The Simpsons #shorts,"Nice shoes, uh... two feet",PT1M,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,5548482,694632.0,0,1013.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...",,
3,2022-10-13 03:00:47+00:00,youtube#video,MHIIeS45a6qMKEDn2-pFz5nTLkc,SS7HXxy3_2c,2022-10-09 05:38:34+00:00,UCqFzWxSCi39LnW1JKFR3efg,Try Guys - SNL,A CNN broadcast is interrupted by breaking new...,Saturday Night Live,23,none,Try Guys - SNL,A CNN broadcast is interrupted by breaking new...,PT5M52S,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,2069780,31249.0,0,18810.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[SNL10082022, snl, saturday night live, snl 48...",en
4,2022-10-13 03:00:47+00:00,youtube#video,pvxaPRxR_IfrhyuT32U7XIBBjBs,oibJn6Ct1tg,2022-10-03 17:21:17+00:00,UC7bouvhSTd2RQwYOi7zq0hQ,Interrupted in San Diego #shorts #comedy #funny,i appreciate the gift but not the poor crowd e...,Stavros Halkias,23,none,Interrupted in San Diego #shorts #comedy #funny,i appreciate the gift but not the poor crowd e...,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,8233551,659746.0,0,2802.0,"<iframe width=""480"" height=""270"" src=""//www.yo...","[https://en.wikipedia.org/wiki/Entertainment, ...","[stand up comedy, crowd work comedy, crowdwork...",en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151587,2022-10-23 23:00:51+00:00,youtube#video,bwiePeToSA5F_KVlt0OXCFdMMzg,_mE-XoBGP3k,2022-10-07 00:28:11+00:00,UCtDQmItYdtZmyBf1poBfsTw,Robot 🤖 cleaning 🧹,Welcome To #Bunnal𝚃𝚎𝚌𝚑 this channel we created...,Bunnal 𝚃𝚎𝚌𝚑,28,none,Robot 🤖 cleaning 🧹,Welcome To #Bunnal𝚃𝚎𝚌𝚑 this channel we created...,PT57S,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,34922012,834009.0,0,6317.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,"[Amazing, Concrete, Foundations, Most Solid Co...",en-US
151588,2022-10-23 23:00:51+00:00,youtube#video,5xia_Vk9cMauGxAk8tDV9VYf1Uo,yA4YEo3vkxM,2022-10-20 01:12:09+00:00,UC4EmjdHH2GTLzy9jx14svEQ,Dating in 2022…wow. 😂 #shorts,,Sarah Dawn Moore,22,none,Dating in 2022…wow. 😂 #shorts,,PT1M,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,244320,12047.0,0,5249.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,,
151589,2022-10-23 23:00:51+00:00,youtube#video,dN0uWWeyLBesbQksJLojzVUj7nw,2_xiBNTBL2s,2022-10-11 01:03:33+00:00,UC3OQ4umnnq-a4pHE5IzICmQ,INCREÍBLE INVENTÓ PARA PEGAR CERÁMICA,,PoliSonyer,24,none,INCREÍBLE INVENTÓ PARA PEGAR CERÁMICA,,PT23S,2d,hd,True,False,rectangular,processed,public,youtube,True,True,False,5986059,88810.0,0,2952.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,,es-419
151590,2022-10-23 23:00:51+00:00,youtube#video,nAa3-MLfjEwXLB6u1vJQx3f8AnY,cYyRZ3uzjyM,2022-10-08 18:45:02+00:00,UCDCjRyou_S-8JBKny7egDMg,I'm Your UBER @TopNotch Idiots 👈,,TopNotch Shorts,22,none,I'm Your UBER @TopNotch Idiots 👈,,PT1M,2d,hd,True,True,rectangular,processed,public,youtube,True,True,False,5777709,470185.0,0,2377.0,"<iframe width=""480"" height=""270"" src=""//www.yo...",[https://en.wikipedia.org/wiki/Lifestyle_(soci...,,


---

# Tags

* Questions
    * What tags indicate category?
    * What tags correspond with more views?
    * What tags correspond with more views per category?

### Setup

In [140]:
stopwords = nltk.corpus.stopwords.words("english")

In [141]:
def tokenize_tags(tag_list):
    out = " ".join(tag_list).lower().split()
    out = [w for w in out if w not in stopwords]
    return out

In [145]:
# Get only latest videos with tags
tag_df = df.dropna(subset="snippet.tags")
tag_latest = df.drop_duplicates(subset="id", keep="last", ignore_index=True).dropna(subset="snippet.tags")

In [150]:
tag_df["snippet.tags"] = tag_df["snippet.tags"].apply(list)
tag_latest["snippet.tags"] = tag_latest["snippet.tags"].apply(list)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Tag Counts

##### Create new df with categories and tags

In [153]:
# Get tags for each category
cat_tags = tag_latest.groupby("snippet.categoryId")["snippet.tags"].sum()

# Create column for category id
cat_tags = pd.DataFrame(cat_tags).reset_index()

# Get category names
cat_tags["category"] = cat_tags["snippet.categoryId"].apply(lambda x: categories.id_to_title[x])

# Rearrange columns
cols = cat_tags.columns.tolist()
cols = cols[-1:] + cols[:-1]
cat_tags = cat_tags[cols]

cat_tags

Unnamed: 0,category,snippet.categoryId,snippet.tags
0,Film & Animation,1,"[hbo, hbo max, hbo max movie, hbo max series, ..."
1,Autos & Vehicles,2,"[Off road recovery, Rescue, Matts off road rec..."
2,Music,10,"[BoyWithUke, Sick, Mercury, Records/Republic, ..."
3,Pets & Animals,15,"[animal video, animals, the dodo, Animal Rescu..."
4,Sports,17,"[Chelsea, Milan, Milan vs. Chelsea, Televisa, ..."
5,Travel & Events,19,"[Toya Johnson, Weight No More, TI & Tiny Frien..."
6,Gaming,20,"[simulation games, graystillplays, simulator, ..."
7,People & Blogs,22,"[daily bumps, daily bumps baby, welcome briell..."
8,Comedy,23,"[jimmy, jimmy kimmel, jimmy kimmel live, late ..."
9,Entertainment,24,"[piers morgan, piers morgan andrew tate, andre..."


##### Get tag counts (includes lowering and stopword removal)

In [154]:
# Tokenize tags (includes lowering and stopword removal)
cat_tags["counts"] = cat_tags["snippet.tags"].apply(tokenize_tags).apply(Counter)

# Get 10 ten tags per category
cat_tags["top_counts"] = cat_tags["counts"].apply(lambda x: dict(x.most_common(10)))

cat_tags

Unnamed: 0,category,snippet.categoryId,snippet.tags,counts,top_counts
0,Film & Animation,1,"[hbo, hbo max, hbo max movie, hbo max series, ...","{'hbo': 23, 'max': 21, 'movie': 30, 'series': ...","{'movie': 30, 'leroy': 25, 'hbo': 23, 'max': 2..."
1,Autos & Vehicles,2,"[Off road recovery, Rescue, Matts off road rec...","{'road': 75, 'recovery': 57, 'rescue': 14, 'ma...","{'road': 75, 'recovery': 57, 'truck': 25, 'san..."
2,Music,10,"[BoyWithUke, Sick, Mercury, Records/Republic, ...","{'boywithuke': 1, 'sick': 1, 'mercury': 1, 're...","{'grizzley': 59, 'tee': 54, 'video': 45, 'lil'..."
3,Pets & Animals,15,"[animal video, animals, the dodo, Animal Rescu...","{'animal': 15, 'video': 10, 'animals': 37, 'do...","{'animals': 37, 'funny': 34, 'dodo': 25, 'vide..."
4,Sports,17,"[Chelsea, Milan, Milan vs. Chelsea, Televisa, ...","{'chelsea': 9, 'milan': 3, 'vs.': 10, 'televis...","{'highlights': 124, 'vs': 116, 'sports': 102, ..."
5,Travel & Events,19,"[Toya Johnson, Weight No More, TI & Tiny Frien...","{'toya': 3, 'johnson': 1, 'weight': 1, 'ti': 1...","{'yacht': 8, 'travel': 7, 'boat': 7, 'kara': 5..."
6,Gaming,20,"[simulation games, graystillplays, simulator, ...","{'simulation': 24, 'games': 144, 'graystillpla...","{'minecraft': 492, 'roblox': 452, 'fortnite': ..."
7,People & Blogs,22,"[daily bumps, daily bumps baby, welcome briell...","{'daily': 18, 'bumps': 3, 'baby': 13, 'welcome...","{'vlog': 103, 'nichlmao': 70, 'life': 59, 'hou..."
8,Comedy,23,"[jimmy, jimmy kimmel, jimmy kimmel live, late ...","{'jimmy': 13, 'kimmel': 9, 'live': 28, 'late':...","{'comedy': 134, 'funny': 109, 'stand': 58, 'be..."
9,Entertainment,24,"[piers morgan, piers morgan andrew tate, andre...","{'piers': 12, 'morgan': 14, 'andrew': 23, 'tat...","{'minecraft': 272, 'funny': 227, 'ozark': 154,..."


##### Visualize top counts

In [156]:
cat_count = cat_tags.shape[0]
rows, cols = math.ceil(cat_count / 4), 4

# Create subplots
fig = make_subplots(
    rows=rows, cols=cols,
    vertical_spacing=0.06, horizontal_spacing=0.1,
    subplot_titles=cat_tags.category
)

# Plot
cat_idx = 0
for i in range(rows):
    for j in range(cols):
        data = cat_tags[cat_tags.category == cat_tags.category[cat_idx]].top_counts.iloc[0]

        fig.append_trace(
            go.Bar(
                x=list(data.values()), y=list(data.keys()), orientation="h",
            ),
            row=i + 1, col=j + 1,
        )
        
        # Quit if all categories done
        cat_idx = cat_idx + 1
        if cat_idx >= cat_count:
            break
    
# Layout
fig.update_layout(
    title_text="YouTube Tag Frequency by Category",
    title_x=0.5,
    showlegend=False,
    autosize=False,
    width=1000,
    height=930,
)

fig.show()

### Monthly Time Series

* Track tag popularity per category for last month
    * Popularity ~= sum_per_day(log(views_per_tag))
        * Log to see general usage rather than just most popular

##### Wranglin'

In [308]:
# Get last month's data
month_df = tag_df.set_index("queryTime")
month_df = month_df.last("30D")
month_df = month_df.loc[:, ["snippet.categoryId", "id", "snippet.tags", "statistics.viewCount"]]

# Get log(views)
month_df["logViews"] = np.log(month_df["statistics.viewCount"])

# Get number of days since beginning of df
month_df["days"] = (month_df.index - month_df.index[0]).days

# Get category names
month_df["category"] = month_df["snippet.categoryId"].apply(lambda x: categories.id_to_title[x])

# Rearrange columns
cols = month_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
month_df = month_df[cols]

month_df

Unnamed: 0_level_0,days,category,snippet.categoryId,id,snippet.tags,statistics.viewCount,logViews
queryTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-10-13 03:00:47+00:00,0,Entertainment,24,UIGNMccGh8w,"[اقتباسات افلام, اقتباسات واقوال, اقتباسات أفل...",15589366,16.562100
2022-10-13 03:00:47+00:00,0,Comedy,23,SS7HXxy3_2c,"[SNL10082022, snl, saturday night live, snl 48...",2069780,14.542953
2022-10-13 03:00:47+00:00,0,Comedy,23,oibJn6Ct1tg,"[stand up comedy, crowd work comedy, crowdwork...",8233551,15.923728
2022-10-13 03:00:47+00:00,0,Entertainment,24,c2s1ASeKDhc,"[Universal, Illumination Entertainment, Ninten...",3511043,15.071424
2022-10-13 03:00:47+00:00,0,Comedy,23,_xQ27AsF4_k,"[the daily show, trevor noah, daily show with ...",1549624,14.253523
...,...,...,...,...,...,...,...
2022-10-23 23:00:51+00:00,10,Comedy,23,dbsoMbZlqi0,"[reddit, r/, subreddit, best of reddit, r/ top...",232328,12.355905
2022-10-23 23:00:51+00:00,10,Entertainment,24,p4AE9nENXTw,"[amazon, unboxing, lucas, marcus, lucas and ma...",488715,13.099535
2022-10-23 23:00:51+00:00,10,Entertainment,24,LT9QbOp8LjA,"[broma, dos chicas, bromas, ENTREVISTA EN MADR...",3060282,14.934018
2022-10-23 23:00:51+00:00,10,Entertainment,24,2M2dF_21ANc,"[dan and phil, dan phil, dan howell, phil lest...",601648,13.307428


In [309]:
# Tokenize tags
month_df["snippet.tags"] = month_df["snippet.tags"].apply(tokenize_tags)

In [310]:
# Change to dicts with view count for values
month_df["tagViews"] = month_df.apply(lambda row: Counter({tag: row["logViews"] for tag in row["snippet.tags"]}), axis=1)

In [311]:
# Consolidate tag views
day_views = pd.DataFrame(month_df.groupby(["days", "category"])["tagViews"].sum())
day_views = day_views.reset_index()

In [312]:
# Get top tags per category for entire month
cat_views = pd.DataFrame(day_views.groupby("category")["tagViews"].sum())
cat_views.columns = ["catViews"]
cat_views["topTags"] = cat_views["catViews"].apply(lambda x: list(dict(x.most_common(10)).keys()))
cat_views

Unnamed: 0_level_0,catViews,topTags
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Autos & Vehicles,"{'motortrend': 3506.734698683277, 'roadkill': ...","[truck, cars, road, run, ford, car, audi, buil..."
Comedy,"{'snl10082022': 5237.3301586849075, 'snl': 186...","[comedy, funny, show, shorts, stand, videos, n..."
Education,"{'albert': 2008.7303946246057, 'einstein': 200...","[shorts, video, military, funny, defense, dr, ..."
Entertainment,"{'اقتباسات': 8257.952144469215, 'افلام': 8257....","[funny, tiktok, shorts, family, comedy, videos..."
Film & Animation,"{'adventure': 4785.741954013368, 'fun': 4598.8...","[movie, film, 2, youtube, movies, horror, trai..."
Gaming,"{'#cliptriệuview': 4444.293252166985, '#toptop...","[funny, gameplay, game, gaming, minecraft, mom..."
Howto & Style,"{'rebuilding': 3408.5361013352176, 'wrecked': ...","[moving, alisha, marie, cjrelectrical, rebuild..."
Music,"{'dolly': 760.03181043736, 'parton': 760.03181...","[hip, hop, music, video, records, lil, rap, ba..."
News & Politics,"{'fnc': 6142.408210421934, 'fox': 14064.838438...","[news, breaking, trump, biden, politics, joe, ..."
Nonprofits & Activism,"{'behold': 465.5452863866138, 'israel': 465.54...","[behold, israel, amir, tsarfati, bible, prophe..."


In [313]:
# Get viewcounts for top tags across all days
view_df = pd.merge(day_views, cat_views["topTags"], how="left", on="category")
view_df["topViews"] = view_df.apply(lambda row: {k: v for k, v in row["tagViews"].items() if k in row["topTags"]}, axis=1)
view_df

Unnamed: 0,days,category,tagViews,topTags,topViews
0,0,Autos & Vehicles,"{'motortrend': 346.443437146495, 'roadkill': 3...","[truck, cars, road, run, ford, car, audi, buil...","{'ford': 695.7402230229945, 'run': 661.0255482..."
1,0,Comedy,"{'snl10082022': 1452.010643818261, 'snl': 2552...","[comedy, funny, show, shorts, stand, videos, n...","{'new': 4670.287999505596, 'comedy': 12984.347..."
2,0,Education,"{'albert': 416.50417757513577, 'einstein': 416...","[shorts, video, military, funny, defense, dr, ...","{'funny': 416.50417757513577, 'shorts': 416.50..."
3,0,Entertainment,"{'اقتباسات': 829.503392379331, 'افلام': 829.50...","[funny, tiktok, shorts, family, comedy, videos...","{'family': 2886.4704706830507, 'tik': 2777.123..."
4,0,Film & Animation,"{'adventure': 1355.7097141919855, 'fun': 1168....","[movie, film, 2, youtube, movies, horror, trai...","{'movie': 978.4689867468991, 'horror': 734.347..."
...,...,...,...,...,...
150,10,People & Blogs,"{'rainbow': 913.5046983383546, 'friends': 913....","[funny, life, video, vlogs, vlog, short, comed...","{'life': 4112.610968199318, 'funny': 3906.8126..."
151,10,Pets & Animals,"{'xl': 375.86670607195254, 'bully': 375.866706...","[videos, cute, dog, dogs, video, animal, anima...","{'dog': 655.6665917816081, 'dogs': 375.8667060..."
152,10,Science & Technology,"{'pest': 253.8068414325177, 'control': 253.806...","[shorts, tech, iphone, building, apple, amazin...","{'tech': 371.9389979008261, 'shorts': 777.4600..."
153,10,Sports,"{'sportsnation': 292.67259001267985, '#shorts'...","[highlights, sports, vs, football, nfl, espn, ...","{'espn': 2466.559842268247, 'nba': 1768.463623..."


##### Visualization

In [314]:
# Select category
category_name = "Gaming"

# Get data for category
cat_day = view_df[view_df["category"] == category_name]
day_data = dict()
for tag in cat_day["topTags"].iloc[0]:
    day_data[tag] = cat_day["topViews"].apply(lambda x: x[tag]).values
    
# Create df for viss
day_data = pd.DataFrame(day_data).reset_index().rename({"index": "day"}, axis=1)
day_data = pd.melt(day_data, id_vars="day").rename({"variable": "tag"}, axis=1)

# Plot
fig = px.line(day_data, x="day", y="value", color="tag")
fig.update_layout(
    title_text=f"Monthly Popularity of '{category_name}' Tags", title_x=0.5,
    yaxis_title="Cumulative log(Views)"
)
fig.show()

> Note: currently tracks **EVERY VIDEO for EVERY HOUR**, could instead only use each video as of latest point of each day it is end trending. This could further limit the impact of extremely popular videos alongside the use of log transform.