In [65]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

  and should_run_async(code)


# Explore type of videos

In [34]:
with open("../data/US_category_id.json") as json_input:
    categories = json.load(json_input)

print("Categories:")
categories_mapping = {}
for c in categories['items']:
    print(f"{c['snippet']['title']}, id: {c['id']}")
    categories_mapping[int(c['id'])] = c['snippet']['title']
    
print(categories_mapping)
#Music, id: 10

Categories:
Film & Animation, id: 1
Autos & Vehicles, id: 2
Music, id: 10
Pets & Animals, id: 15
Sports, id: 17
Short Movies, id: 18
Travel & Events, id: 19
Gaming, id: 20
Videoblogging, id: 21
People & Blogs, id: 22
Comedy, id: 23
Entertainment, id: 24
News & Politics, id: 25
Howto & Style, id: 26
Education, id: 27
Science & Technology, id: 28
Nonprofits & Activism, id: 29
Movies, id: 30
Anime/Animation, id: 31
Action/Adventure, id: 32
Classics, id: 33
Comedy, id: 34
Documentary, id: 35
Drama, id: 36
Family, id: 37
Foreign, id: 38
Horror, id: 39
Sci-Fi/Fantasy, id: 40
Thriller, id: 41
Shorts, id: 42
Shows, id: 43
Trailers, id: 44
{1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 29: 'Nonprofits & Activism', 30: 'Movies'

In [35]:
column_list = ['video_id', 'title', 'channel_title', 'category_id']
videos = pd.read_csv("../data/USvideos.csv", usecols=column_list, error_bad_lines=False)
print(videos.columns)

videos.describe()

Index(['video_id', 'title', 'channel_title', 'category_id'], dtype='object')


Unnamed: 0,category_id
count,7998.0
mean,20.217679
std,7.415364
min,1.0
25%,17.0
50%,23.0
75%,25.0
max,43.0


Many entries are repeated for the same videos but taken in different dates. 
We will keep only one entry for each videoID

In [36]:
videos = videos.drop_duplicates()

In [37]:
#Add category column as a name
category_name = []
for index, row in videos.iterrows():
    category_name.append(categories_mapping[int(row['category_id'])])

videos['category_name'] = category_name

videos.describe()

Unnamed: 0,category_id
count,2408.0
mean,20.436047
std,7.226186
min,1.0
25%,17.0
50%,24.0
75%,25.0
max,43.0


In [38]:
videos.head()

Unnamed: 0,video_id,title,channel_title,category_id,category_name
0,XpVt6Z1Gjjo,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,Entertainment
1,K4wEI5zhHB0,iPhone X — Introducing iPhone X — Apple,Apple,28,Science & Technology
2,cLdxuaxaQwc,My Response,PewDiePie,22,People & Blogs
3,WYYvHb03Eog,Apple iPhone X first look,The Verge,28,Science & Technology
4,sjlHnJvXdQs,iPhone X (parody),jacksfilms,23,Comedy


In [39]:
#Count videos per category
videos.groupby(['category_id', 'category_name']).size()

category_id  category_name        
1            Film & Animation         101
2            Autos & Vehicles          38
10           Music                    339
15           Pets & Animals            40
17           Sports                   153
19           Travel & Events           18
20           Gaming                    29
22           People & Blogs           259
23           Comedy                   215
24           Entertainment            488
25           News & Politics          199
26           Howto & Style            274
27           Education                 94
28           Science & Technology     155
29           Nonprofits & Activism      5
43           Shows                      1
dtype: int64

339 music videos

# Explore the number of comments for a given category

In [40]:
category = 'Music'
inv_mapping = {v: k for k, v in categories_mapping.items()}

category_id = inv_mapping[category]

category_videos = videos[videos['category_id']== category_id] 
category_videos.head()

Unnamed: 0,video_id,title,channel_title,category_id,category_name
20,-Ifnaxi2LQg,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
24,JhA1Wi9mrns,Kid Rock - Tennessee Mountain Top [Lyrics],Kid Rock,10,Music
30,QBGaO89cBMI,Radiohead - Lift,Radiohead,10,Music
41,O78Lpo4ctSE,LANY - Super Far (Official Video),LANYVEVO,10,Music
53,eM_FR7I2Ttw,Harry Styles - The Chain (Fleetwood Mac cover)...,BBCRadio1VEVO,10,Music


In [41]:
comments = pd.read_csv("../data/UScomments.csv", error_bad_lines=False)

comments.describe()

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'


Unnamed: 0,video_id,comment_text,likes,replies
count,691400,691375,691400,691400
unique,2266,434076,1284,479
top,sjlHnJvXdQs,Lol,0,0
freq,800,310,456894,525730


In [42]:
comments.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [43]:
#Join the two on video_id
comments_category = comments.merge(category_videos, how='right')

In [44]:
comments_category.shape

(118606, 8)

In [45]:
comments_category.head()

Unnamed: 0,video_id,comment_text,likes,replies,title,channel_title,category_id,category_name
0,-Ifnaxi2LQg,Trash and I like fergie but this shit is not g...,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
1,-Ifnaxi2LQg,So glad she's back,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
2,-Ifnaxi2LQg,Nicki Minaj looks like a melted Galaxy bar cov...,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
3,-Ifnaxi2LQg,"Sorry fergie for my words. but, you are succes...",0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music
4,-Ifnaxi2LQg,Sounds like the song You know you like it by ...,0,0,Fergie - You Already Know ft. Nicki Minaj,FergieVEVO,10,Music


Check for missing data in comment_text and remove them

In [46]:
print(comments_category['comment_text'].isna().sum())
video_nan_comment = comments_category[comments_category['comment_text'].isna()]['video_id'].unique()
print(video_nan_comment)
comments[comments['video_id'].isin(video_nan_comment)]

13
['ZXvbz8cqafM' '7YAAyUFL1GQ' '-3lMEZ6k5NA' 'AYyXQcuo8wA' 'zmg9tVaMVd4'
 's3Hk_lDw5yo' 'mP_fKeq_o8A' '941Bl3oxk8Y' 'Kn4KUvDzQFg' '8I1B4n_8Cto'
 '5gH8iXNW8wE']


Unnamed: 0,video_id,comment_text,likes,replies
34780,7YAAyUFL1GQ,Who's here before a million views ?,98,12
34781,7YAAyUFL1GQ,'IM PRETTY SURE THUS ISNT HOW THE STORY ENDS'\...,8,0
34782,7YAAyUFL1GQ,This song is keeping me going honestly,0,0
34783,7YAAyUFL1GQ,this band should fall out of a boat,0,0
34784,7YAAyUFL1GQ,"Okay pretty good, the video was far out the mu...",0,0
...,...,...,...,...
639152,8I1B4n_8Cto,With this and his other new stuff Beck is push...,0,0
639153,8I1B4n_8Cto,"Song’s addicting, won’t come out of my head",1,0
639154,8I1B4n_8Cto,i wanna eat some of dat yummy stuff and die,0,0
639155,8I1B4n_8Cto,All of these hands are white... smh,0,0


13 comments have no text. They regard 11 different videos. These videos have other comments with a regular text section.
We just remove the 13 comments without text and keep the others

In [47]:
comments_category = comments_category.dropna(subset=['comment_text'])
print(comments_category['comment_text'].isna().sum())

0


Check for duplicates text

In [48]:
comments_category.pivot_table(columns=['comment_text'], aggfunc='size')

comment_text
 'Literally' 'like' my bestfriend\nCamila wut?🤔                                                                                                                          3
 *But I Will Tell You If I Don't Like You* 🔥                                                                                                                             1
 EVE - Love Is Blind  is better                                                                                                                                          1
 I LIKE IT MORE THAN THE ORIGINAL CLUB HERE :))))                                                                                                                        4
 I be like send me the addy, hop out the Porsche wit a bhaddie, got a nice  face and a fattie,, do what I say like I'm daddy I loveeeee this verse😍😍😍 you bad asffff😍    1
                                                                                                                                    

In [49]:
comments_category[comments_category['comment_text'].isna()]['video_id']

Series([], Name: video_id, dtype: object)

# Explore some stats about the comments text

In [50]:
# Average lenght in number of characters
comments_len = comments_category['comment_text'].apply(lambda x: len(str(x)))
comments_len.describe()

count    118593.000000
mean         65.042701
std         162.447460
min           1.000000
25%          19.000000
50%          36.000000
75%          67.000000
max        8163.000000
Name: comment_text, dtype: float64

In [51]:
print(comments_category.loc[115887]['comment_text'])

AWSOME!


In [52]:
print(f"Unique comments: {comments_category['comment_text'].nunique()}\nTotal comments: {comments_category.shape[0]}")

Unique comments: 72130
Total comments: 118593


In [53]:
comments_category[comments_category['comment_text'].apply(lambda x: len(str(x))) >6000]['comment_text']

3881      Like if you're watching in 2016, 2017, 2018, 2...
76713     Featuring Sandra Prescott , Kimberly O'Brien ,...
78762     (The Great and terrible Day of the Lord. When ...
78883     (The Great and terrible Day of the Lord. When ...
78967     (The Great and terrible Day of the Lord. When ...
115893    (JESUS; THE ALMIGHTY!!!):\n\nAnd I saw heaven ...
115970    (JESUS; THE ALMIGHTY!!!):\n\nAnd I saw heaven ...
Name: comment_text, dtype: object

In [74]:
#Keep only uniques
comments_category = comments_category.drop_duplicates(subset='comment_text')
print(f"Unique comments: {comments_category['comment_text'].nunique()}\nTotal comments: {comments_category.shape[0]}")
comments_category.describe()

Unique comments: 72130
Total comments: 72130
  and should_run_async(code)


Unnamed: 0,category_id
count,72130.0
mean,10.0
std,0.0
min,10.0
25%,10.0
50%,10.0
75%,10.0
max,10.0


In [75]:
# Only keep english comments
from langdetect import detect, LangDetectException
'''
filt = [detect(text) == 'en' for text in comments_category['comment_text']]
'''

filt = []
for text in comments_category['comment_text']:
    try:
        en = detect(text)
    except LangDetectException:
        filt.append(False)
        
    else:
        if en == 'en':
            filt.append(True)
        else:
            filt.append(False)

  and should_run_async(code)


In [76]:
comments_category_filtered = comments_category[filt]
comments_category_filtered['comment_text'].describe()

  and should_run_async(code)


count                 50304
unique                50304
top       I LOVEEEE it 💜💜💜💜
freq                      1
Name: comment_text, dtype: object

# Create document term matrix

In [81]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(comments_category_filtered['comment_text'])
id2word = cv.get_feature_names()
dtm

  and should_run_async(code)


<50304x12701 sparse matrix of type '<class 'numpy.int64'>'
	with 311894 stored elements in Compressed Sparse Row format>

In [82]:
# LDA for Topic Modelling
LDA = LatentDirichletAllocation(n_components=5,random_state=1)
LDA.fit(dtm)

'''
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=dtm, id2word=id2word, num_topics=5, random_state=42, 
   update_every=1, chunksize=5, passes=10, alpha='auto', per_word_topics=True)
   '''

  and should_run_async(code)


"\nlda_model = gensim.models.ldamodel.LdaModel(\n   corpus=dtm, id2word=id2word, num_topics=5, random_state=42, \n   update_every=1, chunksize=5, passes=10, alpha='auto', per_word_topics=True)\n   "

In [83]:
for index,topic in enumerate(LDA.components_):
    print(f'topic #{index} : ')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-20:]])

  and should_run_async(code)
topic #0 : 
['yeah', 'time', 'nbut', 'right', 'oh', 'baby', 'way', 'want', 'let', 'like', 'need', 'love', 'nand', 'll', 've', 'just', 'nyou', 'know', 'don', 'ni']
topic #1 : 
['im', 'smith', 'forever', 'queen', 'going', 'happy', 'heart', 'perfect', 'waiting', 'awesome', 'life', 'like', 'jesus', 'lol', 'got', 'sam', 'chester', 'god', 'shit', 'video']
topic #2 : 
['channel', 'voice', 'https', 'taylor', 'really', 'com', 'watch', 'youtube', 'sounds', 'songs', 'video', 'wait', 'just', 'good', 'new', 'music', 'song', 'album', 'like', 'love']
topic #3 : 
['lol', 'pretty', 'really', 'shit', 'video', 'don', 'girl', 'look', 'didn', 'cool', 'trending', 'thought', 'better', 'make', 'know', 'did', 'looks', 'people', 'just', 'like']
topic #4 : 
['god', 'awesome', 'fuck', 'really', 'songs', 'like', 'wow', 'thank', 'omg', 'best', 'just', 'good', 'voice', 'video', 'great', 'beautiful', 'music', 'amazing', 'love', 'song']


In [84]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, cv)
pyLDAvis.display(vis)

  and should_run_async(code)
