In [25]:
import json

import matplotlib.pyplot as plt
import pandas as pd
from langdetect import DetectorFactory, LangDetectException, detect

# Relative directory holding the input files read below; the filtered
# comments CSV produced at the end of the notebook is written here as well.
data_path = '../../data/'

# Explore type of videos

In [2]:
# Read the category lookup file (numeric id -> human readable title).
with open(data_path+"GB_category_id.json") as json_input:
    categories = json.load(json_input)

print("Categories:")
# Id 29 is seeded by hand as 'Others'; every id listed in the JSON file is
# then added on top of that seed.
categories_mapping = {29: 'Others'}
for item in categories['items']:
    title = item['snippet']['title']
    print(f"{title}, id: {item['id']}")
    categories_mapping[int(item['id'])] = title

print(categories_mapping)
# Category of interest further down: Music, id: 10

Categories:
Film & Animation, id: 1
Autos & Vehicles, id: 2
Music, id: 10
Pets & Animals, id: 15
Sports, id: 17
Short Movies, id: 18
Travel & Events, id: 19
Gaming, id: 20
Videoblogging, id: 21
People & Blogs, id: 22
Comedy, id: 23
Entertainment, id: 24
News & Politics, id: 25
Howto & Style, id: 26
Education, id: 27
Science & Technology, id: 28
Movies, id: 30
Anime/Animation, id: 31
Action/Adventure, id: 32
Classics, id: 33
Comedy, id: 34
Documentary, id: 35
Drama, id: 36
Family, id: 37
Foreign, id: 38
Horror, id: 39
Sci-Fi/Fantasy, id: 40
Thriller, id: 41
Shorts, id: 42
Shows, id: 43
Trailers, id: 44
{29: 'Others', 1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventur

In [3]:
# Load only the columns needed to study the video categories.
column_list = ['video_id', 'title', 'channel_title', 'category_id']
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement with the same behaviour: malformed
# rows are skipped and reported instead of aborting the load.
videos = pd.read_csv(data_path+"GBvideos.csv", usecols=column_list, on_bad_lines='warn')
print(videos.columns)

videos.describe()

Index(['video_id', 'title', 'channel_title', 'category_id'], dtype='object')


Unnamed: 0,category_id
count,7995.0
mean,19.738462
std,7.177458
min,1.0
25%,17.0
50%,23.0
75%,24.0
max,29.0


Many entries refer to the same video, recorded on different dates.
We will keep only one entry for each video ID.

In [4]:
# Keep a single row per video. Deduplicating on the id alone (first occurrence
# kept) is required for that: a whole-row comparison would keep a video twice
# whenever any other loaded column differs between two of its entries.
videos = videos.drop_duplicates(subset='video_id')

In [5]:
# Add the category name as a column.
# Vectorised lookup: `iterrows` builds a Series per row and is far slower.
category_name = videos['category_id'].astype(int).map(categories_mapping)

# `map` yields NaN for ids missing from the lookup; fail loudly, as plain
# dictionary indexing would, rather than carrying unnamed categories forward.
if category_name.isna().any():
    unknown_ids = sorted(videos.loc[category_name.isna(), 'category_id'].unique().tolist())
    raise KeyError(f"category ids missing from categories_mapping: {unknown_ids}")

videos['category_name'] = category_name

videos.describe()

Unnamed: 0,category_id
count,1776.0
mean,19.766329
std,7.025187
min,1.0
25%,17.0
50%,22.0
75%,24.0
max,29.0


In [6]:
# Preview the first rows, which now carry the category_name column.
videos.head()

Unnamed: 0,video_id,title,channel_title,category_id,category_name
0,jt2OHQh0HoQ,Live Apple Event - Apple September Event 2017 ...,Apple Event,28,Science & Technology
1,AqokkXoa7uE,Holly and Phillip Meet Samantha the Sex Robot ...,This Morning,24,Entertainment
2,YPVcg45W0z4,My DNA Test Results! I'm WHAT?!,emmablackery,24,Entertainment
3,T_PuZBdT2iM,getting into a conversation in a language you ...,ProZD,1,Film & Animation
4,NsjsmgmbCfc,Baby Name Challenge!,Sprinkleofglitter,26,Howto & Style


In [7]:
# Count videos per category; grouping on both id and name shows the
# numeric id next to its readable title in the result.
videos.groupby(['category_id', 'category_name']).size()

category_id  category_name       
1            Film & Animation         78
2            Autos & Vehicles         14
10           Music                   288
15           Pets & Animals           21
17           Sports                  171
19           Travel & Events          10
20           Gaming                   67
22           People & Blogs          250
23           Comedy                  124
24           Entertainment           327
25           News & Politics          69
26           Howto & Style           256
27           Education                46
28           Science & Technology     51
29           Others                    4
dtype: int64

288 music videos (see the Music row in the table above).

# Explore the number of comments for a given category

In [8]:
# Category to analyse, given by its title.
category = 'Music'

# Titles are not unique in the lookup ("Comedy" is listed under both id 23 and
# id 34), so a plain {title: id} inversion silently keeps the LAST id and can
# select a category that has no videos. Keep the first id seen for each title.
inv_mapping = {}
for cat_id, cat_title in categories_mapping.items():
    inv_mapping.setdefault(cat_title, cat_id)

category_id = inv_mapping[category]

# Videos belonging to the selected category.
category_videos = videos[videos['category_id'] == category_id]
category_videos.head()

Unnamed: 0,video_id,title,channel_title,category_id,category_name
9,QBGaO89cBMI,Radiohead - Lift,Radiohead,10,Music
12,eM_FR7I2Ttw,Harry Styles - The Chain (Fleetwood Mac cover)...,BBCRadio1VEVO,10,Music
16,-Ifnaxi2LQg,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
42,qZVm-2nM0sA,Rudimental - Sun Comes Up feat. James Arthur [...,Rudimental,10,Music
47,3-yamPXZQtU,Bryson Tiller - Run Me Dry (Official Video),BrysonTillerVEVO,10,Music


In [9]:
# Load every comment. Some rows of the file have more fields than expected;
# they are skipped and reported rather than aborting the load.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is the equivalent replacement.
comments = pd.read_csv(data_path+"GBcomments.csv", on_bad_lines='warn')

comments.describe()

b'Skipping line 113225: expected 4 fields, saw 5\n'
b'Skipping line 158379: expected 4 fields, saw 7\nSkipping line 241590: expected 4 fields, saw 5\nSkipping line 245637: expected 4 fields, saw 7\n'
b'Skipping line 521402: expected 4 fields, saw 5\n'


Unnamed: 0,likes,replies
count,718452.0,718452.0
mean,5.237454,0.38125
std,203.883893,8.967958
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,60630.0,521.0


In [10]:
# Preview the raw comments: one row per comment, keyed by video_id.
comments.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [11]:
# Join the two on video_id. The key is spelled out so the join cannot silently
# widen to every shared column name if the frames ever gain another column in
# common. how='right' keeps every selected video, so a video without any
# comment still yields one row (with missing comment fields).
comments_category = comments.merge(category_videos, how='right', on='video_id')

In [12]:
# (rows, columns) of the joined comments/videos frame.
comments_category.shape

(121156, 8)

In [13]:
# Preview the joined frame: comment fields followed by the video fields.
comments_category.head()

Unnamed: 0,video_id,comment_text,likes,replies,title,channel_title,category_id,category_name
0,QBGaO89cBMI,I didn't really like the song,0.0,0.0,Radiohead - Lift,Radiohead,10,Music
1,QBGaO89cBMI,alright... I'll take the stairs,0.0,0.0,Radiohead - Lift,Radiohead,10,Music
2,QBGaO89cBMI,a fable on a realistic ground is a cry of pro...,0.0,0.0,Radiohead - Lift,Radiohead,10,Music
3,QBGaO89cBMI,hope you will exist for many years..,0.0,0.0,Radiohead - Lift,Radiohead,10,Music
4,QBGaO89cBMI,Would this song exist if he lived on the first...,0.0,0.0,Radiohead - Lift,Radiohead,10,Music


Check for missing values in comment_text and remove the affected rows

In [14]:
# How many joined rows have no comment text, and which videos do they belong to?
missing_text = comments_category['comment_text'].isna()
print(missing_text.sum())
video_nan_comment = comments_category.loc[missing_text, 'video_id'].unique()
print(video_nan_comment)
# Show all raw comments of those videos, to see whether they have usable text too.
comments[comments['video_id'].isin(video_nan_comment)]

11
['ZXvbz8cqafM' 'NPovS8i2WDo' '7YAAyUFL1GQ' 'V_dsWL6VcjU' 'I6rh8WdBSfQ'
 '_209r9TMB4M' 's7eVr-OUYkQ' '2spUxnYdQXE' '8I1B4n_8Cto' '5gH8iXNW8wE']


Unnamed: 0,video_id,comment_text,likes,replies
16686,NPovS8i2WDo,Every time I get more shooketh,6,0
16687,NPovS8i2WDo,When she mentioned her grandma i cried,4,0
16688,NPovS8i2WDo,NO DON'T EAT KIT KATS LIKE THAT,0,0
16689,NPovS8i2WDo,I was sad watching this as I didn't think I'd ...,1,0
16690,NPovS8i2WDo,This was so Amazing what a great idea!,0,0
...,...,...,...,...
716697,8I1B4n_8Cto,Burn this video,0,0
716698,8I1B4n_8Cto,4:03 yellow slime looks too much like human fa...,0,0
716699,8I1B4n_8Cto,what is a beck,0,0
716700,8I1B4n_8Cto,BECK IS SO COOL his vids keep getting better a...,1,0


11 comments have no text. They belong to 10 different videos. These videos have other comments with a regular text section.
We just remove the 11 comments without text and keep the others.

In [15]:
# Drop the rows whose comment_text is missing; all other columns are left as they are.
comments_category = comments_category.dropna(subset=['comment_text'])
# Should print 0 now that the rows without text are gone.
print(comments_category['comment_text'].isna().sum())

0


Check for duplicate comment texts

In [16]:
# Number of rows per distinct comment text; a count above 1 means the same
# text appears more than once.
comments_category.pivot_table(columns=['comment_text'], aggfunc='size')

comment_text
 'Literally' 'like' my bestfriend\nCamila wut?🤔                                                                                                                          6
 How To Make An Amazing and Iconic Theme Song Even Better by Trent and Atticus.                                                                                          1
 I be like send me the addy, hop out the Porsche wit a bhaddie, got a nice  face and a fattie,, do what I say like I'm daddy I loveeeee this verse😍😍😍 you bad asffff😍    1
 I dont mind if i sell my soul to get a song on the radio boy whatt?                                                                                                     2
 I gess i don'f fit the mold of rap \nCuz i'm respectin' women \nThat's all i wanna hear for the rest of my life                                                         1
                                                                                                                                    

In [17]:
# Sanity check: video ids of rows still lacking comment_text (expected to be
# empty after the dropna performed earlier).
comments_category[comments_category['comment_text'].isna()]['video_id']

Series([], Name: video_id, dtype: object)

# Explore some stats about the comments text

In [18]:
# Comment length in number of characters, summarised with describe()
comments_len = comments_category['comment_text'].astype(str).str.len()
comments_len.describe()

count    121145.000000
mean         67.242882
std         169.378058
min           1.000000
25%          19.000000
50%          36.000000
75%          67.000000
max        6921.000000
Name: comment_text, dtype: float64

In [19]:
# Print the text of one specific comment, addressed by its row label.
print(comments_category.loc[115887, 'comment_text'])

ONCE👑👑


In [20]:
# Compare the number of distinct comment texts with the number of rows.
n_unique = comments_category['comment_text'].nunique()
n_total = comments_category.shape[0]
print(f"Unique comments: {n_unique}\nTotal comments: {n_total}")

Unique comments: 75048
Total comments: 121145


In [21]:
# Show the texts of the very long comments (more than 6000 characters).
is_very_long = comments_category['comment_text'].astype(str).str.len() > 6000
comments_category.loc[is_very_long, 'comment_text']

76914    Featuring Sandra Prescott , Kimberly O'Brien ,...
84821    (The Great and terrible Day of the Lord. When ...
84905    (The Great and terrible Day of the Lord. When ...
Name: comment_text, dtype: object

In [22]:
# Keep only the first occurrence of each comment text.
comments_category = comments_category.drop_duplicates(subset=['comment_text'], keep='first')
n_unique = comments_category['comment_text'].nunique()
n_total = comments_category.shape[0]
print(f"Unique comments: {n_unique}\nTotal comments: {n_total}")
comments_category.describe()

Unique comments: 75048
Total comments: 75048


Unnamed: 0,likes,replies,category_id
count,75048.0,75048.0,75048.0
mean,0.876546,0.097338,10.0
std,42.459609,2.72491,0.0
min,0.0,0.0,10.0
25%,0.0,0.0,10.0
50%,0.0,0.0,10.0
75%,0.0,0.0,10.0
max,10244.0,504.0,10.0


In [26]:
# Only keep english comments

# langdetect is non-deterministic by default, so short or ambiguous texts can
# be classified differently from one run to the next. Fixing the seed makes
# the filter (and therefore the exported file) reproducible.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies `text` as English ('en').

    Texts on which `detect` raises LangDetectException are treated as
    not English instead of interrupting the whole filtering pass.
    """
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


# One boolean per row of comments_category, in row order.
filt = [is_english(text) for text in comments_category['comment_text']]

In [27]:
# Apply the boolean list built in the language-detection cell: rows flagged
# True (detected as English) are kept.
comments_category_filtered = comments_category[filt]
# Summary of the remaining texts (count, number of unique values, most frequent one).
comments_category_filtered['comment_text'].describe()

count                         51790
unique                        51790
top       omg i was in it from 0:01
freq                              1
Name: comment_text, dtype: object

In [29]:
# Save the filtered comments next to the input data.
# NOTE(review): the DataFrame index is written as well (index=False is not
# passed), so the file starts with an unnamed index column — confirm that
# whatever reads this file expects it.
comments_category_filtered.to_csv(data_path+"GB_comments_filtered.csv")