In [51]:
import json
import pandas as pd

# Explore the types of videos

In [52]:
# Load the YouTube category definitions shipped with the dataset.
with open("../data/US_category_id.json") as json_input:
    categories = json.load(json_input)

# Create the id -> title lookup here so the cell works on a fresh kernel
# (it was previously only filled, never defined).
categories_mapping = {}

print("Categories:")
for c in categories['items']:
    print(f"{c['snippet']['title']}, id: {c['id']}")
    # Ids are strings in the JSON; store them as ints so they can be matched
    # against the numeric category_id column of the videos table.
    categories_mapping[int(c['id'])] = c['snippet']['title']

print(categories_mapping)
# Category used in the comment analysis further down: Music, id: 10

Categories:
Film & Animation, id: 1
Autos & Vehicles, id: 2
Music, id: 10
Pets & Animals, id: 15
Sports, id: 17
Short Movies, id: 18
Travel & Events, id: 19
Gaming, id: 20
Videoblogging, id: 21
People & Blogs, id: 22
Comedy, id: 23
Entertainment, id: 24
News & Politics, id: 25
Howto & Style, id: 26
Education, id: 27
Science & Technology, id: 28
Nonprofits & Activism, id: 29
Movies, id: 30
Anime/Animation, id: 31
Action/Adventure, id: 32
Classics, id: 33
Comedy, id: 34
Documentary, id: 35
Drama, id: 36
Family, id: 37
Foreign, id: 38
Horror, id: 39
Sci-Fi/Fantasy, id: 40
Thriller, id: 41
Shorts, id: 42
Shows, id: 43
Trailers, id: 44
{29: 'Nonprofits & Activism', 1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 30: 'Movies'

In [53]:
# Load only the columns needed to relate each video to its category.
column_list = ['video_id', 'title', 'channel_title', 'category_id']
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# on_bad_lines="warn" is the equivalent: malformed rows are skipped and reported.
videos = pd.read_csv("../data/USvideos.csv", usecols=column_list, on_bad_lines="warn")
print(videos.columns)

videos.describe()

Index(['video_id', 'title', 'channel_title', 'category_id'], dtype='object')


Unnamed: 0,category_id
count,7998.0
mean,20.217679
std,7.415364
min,1.0
25%,17.0
50%,23.0
75%,25.0
max,43.0


Many videos appear several times because the same video was recorded on different dates.
We will keep only one entry for each `video_id`.

In [54]:
# Keep a single row per video. Deduplicating on video_id (rather than on the
# whole row) also collapses videos whose title or channel name differs between
# rows, which would otherwise survive more than once.
videos = videos.drop_duplicates(subset='video_id')

In [55]:
# Add the human-readable category name next to each numeric category_id.
# Series.map performs the dictionary lookup for the whole column at once,
# replacing the row-by-row iterrows() loop.
category_name = videos['category_id'].astype(int).map(categories_mapping)

# The row loop raised KeyError for an id missing from the mapping; keep failing
# loudly instead of silently storing NaN.
unknown_ids = videos.loc[category_name.isna(), 'category_id'].unique()
if len(unknown_ids):
    raise KeyError(f"category ids missing from categories_mapping: {sorted(unknown_ids)}")

videos['category_name'] = category_name

videos.describe()

Unnamed: 0,category_id
count,2408.0
mean,20.436047
std,7.226186
min,1.0
25%,17.0
50%,24.0
75%,25.0
max,43.0


In [56]:
# Preview the first rows, including the category_name column added above.
videos.head()

Unnamed: 0,video_id,title,channel_title,category_id,category_name
0,XpVt6Z1Gjjo,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,Entertainment
1,K4wEI5zhHB0,iPhone X — Introducing iPhone X — Apple,Apple,28,Science & Technology
2,cLdxuaxaQwc,My Response,PewDiePie,22,People & Blogs
3,WYYvHb03Eog,Apple iPhone X first look,The Verge,28,Science & Technology
4,sjlHnJvXdQs,iPhone X (parody),jacksfilms,23,Comedy


In [57]:
# Number of videos in each (category_id, category_name) pair
group_keys = ['category_id', 'category_name']
videos.groupby(by=group_keys).size()

category_id  category_name        
1            Film & Animation         101
2            Autos & Vehicles          38
10           Music                    339
15           Pets & Animals            40
17           Sports                   153
19           Travel & Events           18
20           Gaming                    29
22           People & Blogs           259
23           Comedy                   215
24           Entertainment            488
25           News & Politics          199
26           Howto & Style            274
27           Education                 94
28           Science & Technology     155
29           Nonprofits & Activism      5
43           Shows                      1
dtype: int64

The dataset contains 339 music videos (category id 10).

# Explore the number of comments for a given category

In [90]:
# Category to analyse, given by its title.
category = 'Music'

# Several ids can share one title (the category listing shows 'Comedy' as both
# 23 and 34), so a plain inverted dict would silently keep only one of them.
# Keep the first id encountered for each title instead of letting later ones
# overwrite it.
inv_mapping = {}
for cat_id, cat_title in categories_mapping.items():
    inv_mapping.setdefault(cat_title, cat_id)

category_id = inv_mapping[category]

# Select on the name column so every id carrying this title is included,
# whichever id the inverse lookup resolved to.
category_videos = videos[videos['category_name'] == category]
category_videos.head()

Unnamed: 0,video_id,title,channel_title,category_id,category_name
20,-Ifnaxi2LQg,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
24,JhA1Wi9mrns,Kid Rock - Tennessee Mountain Top [Lyrics],Kid Rock,10,Music
30,QBGaO89cBMI,Radiohead - Lift,Radiohead,10,Music
41,O78Lpo4ctSE,LANY - Super Far (Official Video),LANYVEVO,10,Music
53,eM_FR7I2Ttw,Harry Styles - The Chain (Fleetwood Mac cover)...,BBCRadio1VEVO,10,Music


In [73]:
# Load the comments table. `error_bad_lines` was deprecated in pandas 1.3 and
# removed in 2.0; on_bad_lines="warn" keeps the same behaviour (rows with the
# wrong number of fields are skipped and reported).
comments = pd.read_csv("../data/UScomments.csv", on_bad_lines="warn")

# NOTE(review): the recorded describe() output lists unique/top/freq for
# `likes` and `replies`, which suggests they were parsed as strings rather than
# numbers — confirm and convert before doing arithmetic on them.
comments.describe()

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'


Unnamed: 0,video_id,comment_text,likes,replies
count,691400,691375,691400,691400
unique,2266,434076,1284,479
top,sjlHnJvXdQs,Lol,0,0
freq,800,310,456894,525730


In [92]:
# Preview the first few comment rows.
comments.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [95]:
# Attach the comments to the selected category's videos.
# video_id is the only column the two frames share, so naming it as the key is
# the same join the implicit form performed; how='right' keeps every row of
# category_videos.
comments_category = pd.merge(
    comments,
    category_videos,
    how='right',
    on='video_id',
)

In [96]:
# Summarise the merged frame; category_id should be constant for the chosen category.
comments_category.describe()

Unnamed: 0,category_id
count,118606.0
mean,10.0
std,0.0
min,10.0
25%,10.0
50%,10.0
75%,10.0
max,10.0


In [97]:
# Preview the merged rows: each comment alongside its video's metadata.
comments_category.head()

Unnamed: 0,video_id,comment_text,likes,replies,title,channel_title,category_id,category_name
0,-Ifnaxi2LQg,Trash and I like fergie but this shit is not g...,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
1,-Ifnaxi2LQg,So glad she's back,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
2,-Ifnaxi2LQg,Nicki Minaj looks like a melted Galaxy bar cov...,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
3,-Ifnaxi2LQg,"Sorry fergie for my words. but, you are succes...",0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
4,-Ifnaxi2LQg,Sounds like the song You know you like it by ...,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
