## We use YouTube comment data for the top 200 videos from each day, obtained from https://www.kaggle.com/datasnaek/youtube.

In [207]:
%matplotlib inline
import matplotlib
import pandas as pd

In [208]:
# Load the GB comments CSV. error_bad_lines=False makes pandas skip rows whose
# field count does not match the header instead of raising (see the
# "Skipping line ..." messages printed below).
comments = pd.read_csv("youtube-data/GBcomments.csv", error_bad_lines=False)

Skipping line 113225: expected 4 fields, saw 5

Skipping line 158379: expected 4 fields, saw 7



In [209]:
# List the column labels of the loaded frame.
comments.keys()

Index([u'video_id', u'comment_text', u'likes', u'replies'], dtype='object')

In [210]:
# Cast every comment to str so the string methods used during cleaning work on every row
# (presumably guards against non-string cells such as NaN -- TODO confirm).
comments['comment_text'] = comments['comment_text'].astype(str)

In [211]:
# Preview the raw comments before any cleaning.
comments

Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0
5,jt2OHQh0HoQ,Keep calm and buy iphone 8 Keep calm and buy i...,0,0
6,jt2OHQh0HoQ,i am a big fan of youtube and u !!!!!!!!!!!!!,0,0
7,jt2OHQh0HoQ,You will never find Losers who line up and pay...,0,0
8,jt2OHQh0HoQ,*APPLE JUST COMMENTED ON MY LAST VIDEO* I'm cr...,0,0
9,jt2OHQh0HoQ,"I'm only here to see Emma, I love her so much!...",0,0


In [212]:
import re
import string  # provides string.punctuation for the last step; without it a fresh kernel raises NameError
# convert all comments to lowercase and escape every non-ASCII character as ASCII text
# (an emoji becomes e.g. \U0001f602), so each comment is a plain ASCII byte string afterwards
# NOTE(review): lower() runs on the raw bytes before decoding, so only ASCII letters are lowercased
comments['comment_text'] = comments['comment_text'].apply(lambda x: unicode(x.lower(), 'utf-8').encode('unicode_escape'))
# surround every \u.... / \U........ escape with spaces so it becomes a separate token
# NOTE(review): the pattern is greedy over [0-9a-z]{4,8}, so letters or digits that directly
# follow a 4-digit escape are pulled into the token (e.g. 'u2019t' in the token list printed later)
comments['comment_text'] = comments['comment_text'].apply(lambda x: re.sub(r'(\\[uU][0-9a-z]{4,8})', r" \1 ", x))
# replace backslash-n sequences with a space; the pattern matches two backslashes followed by n,
# i.e. a backslash-n that was already literal text in the comment before escaping
comments['comment_text'] = comments['comment_text'].apply(lambda x: re.sub(r'\\\\n', r" ", x))
# replace escaped non-breaking spaces (\xa0) with a plain space
comments['comment_text'] = comments['comment_text'].apply(lambda x: re.sub(r'\\xa0', r" ", x))
# strip all ASCII punctuation; this also removes the backslash in front of the remaining escapes
comments['comment_text'] = comments['comment_text'].apply(lambda x: x.translate(None, string.punctuation))

In [213]:
# Preview the comments again after cleaning.
comments

Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,its more accurate to call it the m 1000 becaus...,0,0
1,jt2OHQh0HoQ,to be there with a samsung phone U0001f602 U...,1,0
2,jt2OHQh0HoQ,thank gosh a place i can watch it without havi...,0,0
3,jt2OHQh0HoQ,what happened to the home button on the iphone...,0,0
4,jt2OHQh0HoQ,power is the disease care is the cure keep c...,0,0
5,jt2OHQh0HoQ,keep calm and buy iphone 8 keep calm and buy i...,0,0
6,jt2OHQh0HoQ,i am a big fan of youtube and u,0,0
7,jt2OHQh0HoQ,you will never find losers who line up and pay...,0,0
8,jt2OHQh0HoQ,apple just commented on my last video im cryin...,0,0
9,jt2OHQh0HoQ,im only here to see emma i love her so much im...,0,0


## The first thing we want to do is get some overall statistics on the words used across all comments.

In [214]:
# join all comments in a single string, then split it on whitespace into one flat list of tokens
all_comments = ' '.join(comments['comment_text']).split()
# show the first 50 tokens as a sanity check
print all_comments[:50]

['its', 'more', 'accurate', 'to', 'call', 'it', 'the', 'm', '1000', 'because', 'the', 'price', 'is', 'closer', 'than', 'calling', 'it', 'the', 'x', '10', 'to', 'be', 'there', 'with', 'a', 'samsung', 'phone', 'U0001f602', 'U0001f602', 'U0001f602', 'thank', 'gosh', 'a', 'place', 'i', 'can', 'watch', 'it', 'without', 'having', 'to', 'be', 'at', 'hd', 'my', 'speed', 'doesn', 'u2019t', 'support', 'hd']


In [215]:
# Count how often each token occurs across all comments (most frequent first, see output).
word_frequency = pd.Series(all_comments).value_counts()
word_frequency

the                                 95078
i                                   69865
a                                   54965
to                                  54634
and                                 54023
you                                 49966
is                                  41696
of                                  35016
this                                34441
it                                  33496
in                                  29790
that                                27122
so                                  23438
for                                 23303
your                                19051
on                                  19044
my                                  18550
love                                17927
like                                17665
was                                 16109
are                                 15578
but                                 15439
with                                14785
have                              

## Using this data, we can normalize the word vector for the comment pool for each video.

In [278]:
# Concatenate all comments of a video, separated by spaces, into one string per video_id.
# The result is wrapped in a DataFrame indexed by video_id with a single 'comment_text' column.
grouped_comments = pd.DataFrame(comments.groupby('video_id')['comment_text'].aggregate(lambda x: ' '.join(x)))

In [283]:
# Per-video token counts: one value_counts Series per row of grouped_comments, in row order.
word_count = [pd.Series(x.split()).value_counts() for x in grouped_comments['comment_text']]
# Inspect the counts for the first video.
word_count[0]

and                                186
the                                173
a                                  166
i                                  128
to                                 127
is                                 102
my                                 101
cat                                 88
you                                 79
that                                74
of                                  72
it                                  70
they                                66
cats                                61
in                                  58
he                                  52
not                                 50
milk                                50
U0001f60d                           48
with                                48
was                                 48
have                                44
for                                 43
are                                 43
kittens                             41
or                       

In [300]:
# Divide each video's token counts by the corpus-wide counts. The division aligns on the word
# labels, so words that do not occur in a video come out as NaN and are then set to 0.
# NOTE(review): the later positional use of these vectors presumably relies on every result
# carrying the same words in the same order -- verify.
word_vectors_unnormed = [pd.Series(x/word_frequency).fillna(0) for x in word_count]

In [313]:
from sklearn import preprocessing
# Rescale every video's vector with sklearn's normalize, using its default norm
# (L2 according to the scikit-learn documentation).
word_vectors_normed = preprocessing.normalize(word_vectors_unnormed)
# Store each normalised row as a plain Python list in a new 'word_vector' column.
grouped_comments['word_vector'] = word_vectors_normed.tolist()

In [321]:
# Inspect the stored per-video vectors.
grouped_comments['word_vector']

video_id
-1fzGnFwz9M    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
-2hRYEFiPSA    [0.00235298548795, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
-Ifnaxi2LQg    [0.0, 0.00104160295654, 0.0017301201651, 0.0, ...
-JmNKGfFj7w    [0.000765150192485, 0.0, 0.0, 0.0, 0.0, 0.0, 0...
-yKW9NG2yNc    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
08RHSZvixec    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
0Ac0Q7iL_T4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
0H3MirHyX2w    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
0Jjm3d_7N6A    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
0KLFeWlJtvY    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
0QgTwwloZL8    [0.00420840101425, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
0eKdNcP4xrM    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
0ptNpGfMRCg    [0.0, 0.0, 0.00235832768983, 0.0, 0.0, 0.0, 0....
0qfgZJNCCJQ    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
12fphklmFzk    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1MA8LIQ5Tec    [

## Now we add in the title for each video.

In [245]:
# Build one watch-page URL per video id. The ids are the row labels of grouped_comments, so they
# are read from .index: on a DataFrame, .keys() returns the column labels instead, which would
# yield one URL per column rather than one per video.
video_url = ["https://www.youtube.com/watch?v="+x for x in grouped_comments.index]

In [247]:
import urllib2
from BeautifulSoup import BeautifulSoup
# Download every watch page and keep the text of its <title> tag.
# This issues one HTTP request per entry of video_url, so the cell needs network access and is slow.
# NOTE(review): the titles keep HTML entities as-is (e.g. "&amp;" and "&#39;" in the outputs
# further down); nothing here decodes them.
video_title = [BeautifulSoup(urllib2.urlopen(x)).title.string for x in video_url]

In [330]:
# Attach the scraped titles as a new column. Assigning a list is positional, so video_title
# must hold exactly one entry per row of grouped_comments, in row order.
grouped_comments['title'] = video_title

In [331]:
# Preview the per-video frame: joined comment text, word vector and title.
grouped_comments

Unnamed: 0_level_0,comment_text,word_vector,title
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1fzGnFwz9M,i make interesting cartoons and i need your he...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9 Things You Need To Know About Kittens - Simo...
-2hRYEFiPSA,one thing thats upset me the most about cheste...,"[0.00235298548795, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",Mike Shinoda of Linkin Park - KROQ Interview: ...
-Ifnaxi2LQg,film tube new channel frequency U0001f48b U0...,"[0.0, 0.00104160295654, 0.0017301201651, 0.0, ...",Fergie - You Already Know ft. Nicki Minaj - Yo...
-JmNKGfFj7w,her reaction annoyed me so much U0001f621 in...,"[0.000765150192485, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",Jim Carrey Sounds Off on Icons and More at NYF...
-yKW9NG2yNc,i subscribed immediately lol i look like one o...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",FENTY BEAUTY.. IS THIS EVEN REAL?? | FULL REVI...
08RHSZvixec,what the f is going on in the mirror behind yo...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",BLIND TASTE TEST // Grace Helbig - YouTube
0Ac0Q7iL_T4,louise you are beautiful oooooh loving the hai...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Homeware &amp; Disney Primark Haul! - YouTube
0H3MirHyX2w,so whacking off into a cup and having random w...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Sperm Donor Meets His 19 Children All Together...
0Jjm3d_7N6A,loved the footage of duke at the beginning im...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",My UNEDITED Morning Routine! | Amelia Liana - ...
0KLFeWlJtvY,super proud as always xxx this is insanely goo...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Tom Fletcher Book Club at WHSmith - YouTube


## First let's look at video similarities.

In [453]:
import warnings
warnings.filterwarnings('ignore')
import scipy.spatial.distance
def calculateDistances(a):
    """Compare the vector ``a`` against every video in ``grouped_comments``.

    Returns a DataFrame that shares ``grouped_comments``' index and has two
    columns: 'distance' (Euclidean distance between ``a`` and the video's
    'word_vector') and 'title' (the video's title with every " - YouTube"
    occurrence removed).
    """
    euclidean = scipy.spatial.distance.euclidean
    per_video_distance = grouped_comments['word_vector'].apply(lambda vec: euclidean(a, vec))
    clean_title = grouped_comments['title'].apply(lambda raw: raw.replace(" - YouTube", ""))
    result = pd.DataFrame(index=grouped_comments.index)
    result['distance'] = per_video_distance
    result['title'] = clean_title
    return result
def closestVideos(vecA):
    """Print the title of video ``vecA`` followed by the titles of its five nearest videos.

    vecA is a video id, i.e. a label of ``grouped_comments``' index. Distances
    are taken from ``calculateDistances`` applied to that video's 'word_vector'.
    Nothing is returned; the result is only printed.
    """
    a = grouped_comments.loc[vecA, "word_vector"]
    distances = calculateDistances(a)
    # Single-argument print(...) behaves identically under Python 2's print statement.
    print("Video being compared:")
    # Look the query up by id instead of assuming it is the first row after sorting,
    # which is not guaranteed when another video is at distance 0 as well.
    print(distances.loc[[vecA], 'title'].values)
    print("-------------------")
    print("Nearest videos:")
    # Drop the query itself and sort once. sort_values replaces DataFrame.sort,
    # which newer pandas releases no longer provide.
    neighbours = distances.drop(vecA).sort_values('distance')
    print(neighbours['title'][:5].values)

In [454]:
# Example: nearest neighbours of a single video, looked up by its video id.
closestVideos("zgLtEob6X-Q")

Video being compared:
[u'Honest Trailers - The Mummy (2017)']
-------------------
Nearest videos:
[u'Everything Wrong With Alien: Covenant In 16 Minutes Or Less'
 u'100% PURE UNCUT HOLO *not drugz* *for nails*'
 u'Things you need to know about Pennywise'
 u'The Square - Official Trailer'
 u'Heung-Min Son has different celebrations with everyone! | Eric Dier Teammates 2.0']


In [455]:
# Example: nearest neighbours of a second video id.
closestVideos("0ptNpGfMRCg")

Video being compared:
[u'Gold iPhone 8 Plus Unboxing &amp; First Impressions!']
-------------------
Nearest videos:
[u'10 Reasons NOT To Buy iPhone X!' u'YouTube'
 u'Pink Hair, Don&#39;t Care! | Weekly 35' u'Apple iPhone X first look'
 u'The Road to 5,000,000! [LIVE]']


In [456]:
# Example: nearest neighbours of a third video id.
closestVideos("0H3MirHyX2w")

Video being compared:
[u'Sperm Donor Meets His 19 Children All Together']
-------------------
Nearest videos:
[u'IT (2017), The Fan REACTION at Cinema!'
 u'\u041d\u0430 \u0443\u0447\u0435\u043d\u0438\u044f\u0445 \xab\u0417\u0430\u043f\u0430\u0434-2017\xbb \u043d\u0430 \u043f\u043e\u043b\u0438\u0433\u043e\u043d\u0435 \u0432 \u041b\u0435\u043d\u043e\u0431\u043b\u0430\u0441\u0442\u0438 \u0431\u043e\u0435\u0432\u043e\u0439 \u0432\u0435\u0440\u0442\u043e\u043b\u0435\u0442 \u043f\u043e\u043f\u0430\u043b \u0440\u0430\u043a\u0435\u0442\u0430\u043c\u0438 \u043f\u043e \u0437\u0440\u0438\u0442\u0435\u043b\u044f\u043c'
 u'Everything I Ate In PARIS | Food Diary Friday Vlog | Melanie Murphy'
 u'Red Sparrow | Official Trailer [HD] | 20th Century FOX'
 u'Highlights | Millwall 1-0 Leeds']


## Let's do some vector addition!

In [474]:
def _printNearestToCombination(heading, vecA, vecB, combined):
    """Print both source titles and the seven videos closest to ``combined``.

    heading  -- first line to print (e.g. "Videos being added:")
    vecA     -- id of the first video (label of grouped_comments' index)
    vecB     -- id of the second video
    combined -- element-wise combination of the two videos' word vectors
    """
    # normalize expects a 2-D (samples x features) input, so the single vector is
    # wrapped in a list and row 0 of the result is taken back out.
    unit_vector = preprocessing.normalize([combined])[0]
    distances = calculateDistances(unit_vector)
    # Single-argument print(...) behaves identically under Python 2's print statement.
    print(heading)
    print(grouped_comments.loc[vecA, 'title'])
    print(grouped_comments.loc[vecB, 'title'])
    print("-------------------")
    print("Nearest videos:")
    # sort_values replaces DataFrame.sort, which newer pandas releases no longer provide.
    print(distances.sort_values('distance')['title'][:7].values)


def addVectors(vecA, vecB):
    """Add the word vectors of videos ``vecA`` and ``vecB`` (video ids) and print
    the seven videos nearest to the normalised sum."""
    a = grouped_comments.loc[vecA, "word_vector"]
    b = grouped_comments.loc[vecB, "word_vector"]
    c = [i + j for i, j in zip(a, b)]
    _printNearestToCombination("Videos being added:", vecA, vecB, c)


def subtractVectors(vecA, vecB):
    """Subtract the word vector of video ``vecB`` from that of ``vecA`` (video ids)
    and print the seven videos nearest to the normalised difference."""
    a = grouped_comments.loc[vecA, "word_vector"]
    b = grouped_comments.loc[vecB, "word_vector"]
    c = [i - j for i, j in zip(a, b)]
    _printNearestToCombination("Videos being subtracted:", vecA, vecB, c)
import random
def randomVec():
    """Return one label drawn at random from ``grouped_comments``' index."""
    candidate_ids = grouped_comments.index
    return random.choice(candidate_ids)

In [463]:
# Example: add the vectors of two specific videos and list the videos closest to the sum.
addVectors('yuwp7x_IiZ4', 'zcqZHYo7ONs')

Videos being added:
Playing with old fridges  Kills -  public information film 1970s UK advert - YouTube
Bell&#39;s Theorem: The Quantum Venn Diagram Paradox - YouTube
-------------------
Nearest videos:
[u'Bell&#39;s Theorem: The Quantum Venn Diagram Paradox'
 u'Playing with old fridges  Kills -  public information film 1970s UK advert'
 u'The Problem With Our Phones'
 u'Stevie Wonder Mocks Climate Change Deniers During &#39;Hand In Hand&#39; Telethon'
 u'My DNA Test Results! I&#39;m WHAT?!'
 u'The real reason To Kill A Mockingbird became so famous'
 u'President Trump Delivers a Statement on Hurricane Irma 9/10/17']


In [471]:
# Example: the same addition for another pair of video ids.
addVectors('VJ4yhjTkuNE', '0qfgZJNCCJQ')

Videos being added:
Cardi B. On Her BET Nominations, Nicki Minaj, Dating Offset &amp; Keeping It Hood - YouTube
Everytime The Beat Drops With The Chainsmokers | Ranz and Niana - YouTube
-------------------
Nearest videos:
[u'Everytime The Beat Drops With The Chainsmokers | Ranz and Niana'
 u'Cardi B. On Her BET Nominations, Nicki Minaj, Dating Offset &amp; Keeping It Hood'
 u'6X EXTREME SPICY NOODLE CAR RIDE (I CRIED) | Ranz and Niana'
 u'Cardi B - Bodak Yellow (feat. Kodak Black) [Remix]'
 u'Trap Kitchen: How Two Opposing Gang Members Formed A Bond Over Food'
 u'TESTING WEIRD PUPPY TOYS WITH PUPPY!!! *CUTE*'
 u'Killing spawn peekers can be hard sometimes']


In [476]:
# Add two randomly drawn videos. No random seed is set in the visible cells,
# so the output is expected to differ between runs.
addVectors(randomVec(), randomVec())

Videos being added:
How Fast Do Tails’ Tails Gotta Go To Fly? (Because Science w/ Kyle Hill) - YouTube
getting into a conversation in a language you don&#39;t actually speak that well - YouTube
-------------------
Nearest videos:
[ u'getting into a conversation in a language you don&#39;t actually speak that well'
 u'How Fast Do Tails\u2019 Tails Gotta Go To Fly? (Because Science w/ Kyle Hill)'
 u'Drowning for Power - Rooster Teeth Animated Adventures'
 u'The Honda Civic has a very small feature, that I never noticed. Until now.'
 u'Bell&#39;s Theorem: The Quantum Venn Diagram Paradox'
 u'What&#39;s Actually the Plane of the Future'
 u'Bolton 0-1 Blades - match action']


In [470]:
# Decode the escaped (Russian) title that appeared in an earlier output into readable text.
something = "\u041d\u0430 \u0443\u0447\u0435\u043d\u0438\u044f\u0445 \xab\u0417\u0430\u043f\u0430\u0434-2017\xbb \u043d\u0430 \u043f\u043e\u043b\u0438\u0433\u043e\u043d\u0435 \u0432 \u041b\u0435\u043d\u043e\u0431\u043b\u0430\u0441\u0442\u0438 \u0431\u043e\u0435\u0432\u043e\u0439 \u0432\u0435\u0440\u0442\u043e\u043b\u0435\u0442 \u043f\u043e\u043f\u0430\u043b \u0440\u0430\u043a\u0435\u0442\u0430\u043c\u0438 \u043f\u043e \u0437\u0440\u0438\u0442\u0435\u043b\u044f\u043c"
print something.decode('unicode-escape')
# translates to: At the "West-2017" (Zapad-2017) exercises, at a training ground in the Leningrad region, a combat helicopter hit spectators with rockets

На учениях «Запад-2017» на полигоне в Ленобласти боевой вертолет попал ракетами по зрителям


In [465]:
# Example: subtract the second video's vector from the first and list the videos closest to the difference.
subtractVectors('59X_zkBglEY', '1MA8LIQ5Tec')

Videos being subtracted:
Trap Kitchen: How Two Opposing Gang Members Formed A Bond Over Food - YouTube
Everything I Ate In PARIS | Food Diary Friday Vlog | Melanie Murphy - YouTube
-------------------
Nearest videos:
[u'Trap Kitchen: How Two Opposing Gang Members Formed A Bond Over Food'
 u'Cardi B. On Her BET Nominations, Nicki Minaj, Dating Offset &amp; Keeping It Hood'
 u'Juicy J - Intro (Prod by $uicideBoy$) [Highly Intoxicated]' u'YouTube'
 u'Stephen A. Smith, Snoop Dogg and Magic Johnson discuss Colin Kaepernick | First Take | ESPN'
 u'AN EMOTIONAL DAY :(' u'Pregnant Day Out! #Ad']


In [479]:
# Subtract two randomly drawn videos. No random seed is set in the visible cells,
# so the output is expected to differ between runs.
subtractVectors(randomVec(), randomVec())

Videos being subtracted:
The Cast of “It” Rates Everyday Scary Things | Teen Vogue - YouTube
I Ordered The First 5 &quot;Free&quot; Things From Wish - YouTube
-------------------
Nearest videos:
[u'The Cast of \u201cIt\u201d Rates Everyday Scary Things | Teen Vogue'
 u'How The Cast Of It Should Really Look'
 u'CNCO, Little Mix - Reggaet\xf3n Lento (Remix) [Official Video]'
 u'The Honda Civic has a very small feature, that I never noticed. Until now.'
 u'Marilyn Manson - WE KNOW WHERE YOU FUCKING LIVE (official audio)'
 u'Mexico 7.1 Earthquake: &#39;Absolutely Horrific Images&#39; | MSNBC'
 u'SPIKED by a Sea Urchin?']
