## Sentiment Analysis of YouTube Comments

In [18]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [7]:
df = pd.read_csv('../data/video_comment.csv', lineterminator='\n', index_col=[0])
df.head()

Unnamed: 0,videoid,commentid,authorchannelid,authorchannelurl,authordisplayname,authorprofileimageurl,likecount,textdisplay
0,koMbIaJ8Tmo,Ugwap8R6iQe_QRizueB4AaABAg,UC7LrscDTqi-4bOKztaGX8Rg,http://www.youtube.com/channel/UC7LrscDTqi-4bO...,Alexzandria Hensley,https://yt3.ggpht.com/a/AATXAJwC9j2RbiyLEVwWNt...,1,Watching this is 2020 reminiscing on old YouTu...
1,koMbIaJ8Tmo,UgyjDeNnLrkrYs_YnhJ4AaABAg,UCs_pJrtGpyb8gbwBefFy6Ow,http://www.youtube.com/channel/UCs_pJrtGpyb8gb...,Mavi Cole,https://yt3.ggpht.com/a/AATXAJykjvSxTLOa9OIDVL...,3,It’s that time of year ladies
2,koMbIaJ8Tmo,Ugx0gSeHpCbYDzbCcyB4AaABAg,UCL9WyuzDeeHWJN6vl4yOMUw,http://www.youtube.com/channel/UCL9WyuzDeeHWJN...,Insiyah Naqvi,https://yt3.ggpht.com/a/AATXAJwX_oqLooHlvVci8m...,5,who else is here to feel the fall vibes becaus...
3,koMbIaJ8Tmo,Ugz-gHk8r-HXOeRQntN4AaABAg,UCYPSVAaTYU6_r76XGj-OE9g,http://www.youtube.com/channel/UCYPSVAaTYU6_r7...,Isa,https://yt3.ggpht.com/a/AATXAJx3rFIpU8WIw8Jx3I...,2,sometimes i come back and watch this to feel l...
4,koMbIaJ8Tmo,UgxVAWmD8FkDFKA55Dl4AaABAg,UC8S42p9_xrinNkOJ4qvKOTA,http://www.youtube.com/channel/UC8S42p9_xrinNk...,mendlering,https://yt3.ggpht.com/a/AATXAJxbNmNOl5z9gNM1A5...,1,"I always love Beth&#39;s makeup sess, her eyes..."


In [3]:
df_comments = df[['videoid','commentid','textdisplay']]

In [12]:
df = pd.DataFrame(np.array(df.textdisplay), columns=['textdisplay'])
df

Unnamed: 0,textdisplay
0,Watching this is 2020 reminiscing on old YouTu...
1,It’s that time of year ladies
2,who else is here to feel the fall vibes becaus...
3,sometimes i come back and watch this to feel l...
4,"I always love Beth&#39;s makeup sess, her eyes..."
...,...
30982,Yall crazy they is all the BESTTTTT
30983,The last candy they ate is my favourite candy ...
30984,We are just hating on Mexican and they are not...
30985,"Ro just went for it <a href=""https://www.youtu..."


In [13]:
# Calculate word count
df['word_count'] = df['textdisplay'].apply(lambda x: len(str(x).split(" ")))
df

Unnamed: 0,textdisplay,word_count
0,Watching this is 2020 reminiscing on old YouTu...,15
1,It’s that time of year ladies,6
2,who else is here to feel the fall vibes becaus...,21
3,sometimes i come back and watch this to feel l...,31
4,"I always love Beth&#39;s makeup sess, her eyes...",13
...,...,...
30982,Yall crazy they is all the BESTTTTT,7
30983,The last candy they ate is my favourite candy ...,13
30984,We are just hating on Mexican and they are not...,55
30985,"Ro just went for it <a href=""https://www.youtu...",7


In [15]:
# Calculate character count
df['char_count'] = df['textdisplay'].str.len()
df

Unnamed: 0,textdisplay,word_count,char_count
0,Watching this is 2020 reminiscing on old YouTu...,15,87
1,It’s that time of year ladies,6,29
2,who else is here to feel the fall vibes becaus...,21,104
3,sometimes i come back and watch this to feel l...,31,146
4,"I always love Beth&#39;s makeup sess, her eyes...",13,68
...,...,...,...
30982,Yall crazy they is all the BESTTTTT,7,35
30983,The last candy they ate is my favourite candy ...,13,64
30984,We are just hating on Mexican and they are not...,55,237
30985,"Ro just went for it <a href=""https://www.youtu...",7,94


In [17]:
def avg_word(review):
    words = review.split()
    return (sum(len(word) for word in words) / len(words))

# Calculate average words
df['avg_word'] = df['textdisplay'].apply(lambda x: avg_word(x))
df

Unnamed: 0,textdisplay,word_count,char_count,avg_word
0,Watching this is 2020 reminiscing on old YouTu...,15,87,4.866667
1,It’s that time of year ladies,6,29,4.000000
2,who else is here to feel the fall vibes becaus...,21,104,4.000000
3,sometimes i come back and watch this to feel l...,31,146,3.741935
4,"I always love Beth&#39;s makeup sess, her eyes...",13,68,4.307692
...,...,...,...,...
30982,Yall crazy they is all the BESTTTTT,7,35,4.142857
30983,The last candy they ate is my favourite candy ...,13,64,4.000000
30984,We are just hating on Mexican and they are not...,55,237,3.588235
30985,"Ro just went for it <a href=""https://www.youtu...",7,94,12.571429


In [21]:
stop_words = stopwords.words('english')
df['stopword_coun'] = df['textdisplay'].apply(lambda x: len([x for x in x.split() if x in stop_words]))
df.head()

Unnamed: 0,textdisplay,word_count,char_count,avg_word,stopword_coun
0,Watching this is 2020 reminiscing on old YouTu...,15,87,4.866667,7
1,It’s that time of year ladies,6,29,4.0,2
2,who else is here to feel the fall vibes becaus...,21,104,4.0,13
3,sometimes i come back and watch this to feel l...,31,146,3.741935,13
4,"I always love Beth&#39;s makeup sess, her eyes...",13,68,4.307692,4


In [22]:
# Lower case all words
df['review_lower'] = df['textdisplay'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df

Unnamed: 0,textdisplay,word_count,char_count,avg_word,stopword_coun,review_lower
0,Watching this is 2020 reminiscing on old YouTu...,15,87,4.866667,7,watching this is 2020 reminiscing on old youtu...
1,It’s that time of year ladies,6,29,4.000000,2,it’s that time of year ladies
2,who else is here to feel the fall vibes becaus...,21,104,4.000000,13,who else is here to feel the fall vibes becaus...
3,sometimes i come back and watch this to feel l...,31,146,3.741935,13,sometimes i come back and watch this to feel l...
4,"I always love Beth&#39;s makeup sess, her eyes...",13,68,4.307692,4,"i always love beth&#39;s makeup sess, her eyes..."
...,...,...,...,...,...,...
30982,Yall crazy they is all the BESTTTTT,7,35,4.142857,4,yall crazy they is all the besttttt
30983,The last candy they ate is my favourite candy ...,13,64,4.000000,5,the last candy they ate is my favourite candy ...
30984,We are just hating on Mexican and they are not...,55,237,3.588235,24,we are just hating on mexican and they are not...
30985,"Ro just went for it <a href=""https://www.youtu...",7,94,12.571429,3,"ro just went for it <a href=""https://www.youtu..."


In [23]:
# Remove Punctuation
df['review_nopunc'] = df['review_lower'].str.replace('[^\w\s]', '')
df

Unnamed: 0,textdisplay,word_count,char_count,avg_word,stopword_coun,review_lower,review_nopunc
0,Watching this is 2020 reminiscing on old YouTu...,15,87,4.866667,7,watching this is 2020 reminiscing on old youtu...,watching this is 2020 reminiscing on old youtu...
1,It’s that time of year ladies,6,29,4.000000,2,it’s that time of year ladies,its that time of year ladies
2,who else is here to feel the fall vibes becaus...,21,104,4.000000,13,who else is here to feel the fall vibes becaus...,who else is here to feel the fall vibes becaus...
3,sometimes i come back and watch this to feel l...,31,146,3.741935,13,sometimes i come back and watch this to feel l...,sometimes i come back and watch this to feel l...
4,"I always love Beth&#39;s makeup sess, her eyes...",13,68,4.307692,4,"i always love beth&#39;s makeup sess, her eyes...",i always love beth39s makeup sess her eyes are...
...,...,...,...,...,...,...,...
30982,Yall crazy they is all the BESTTTTT,7,35,4.142857,4,yall crazy they is all the besttttt,yall crazy they is all the besttttt
30983,The last candy they ate is my favourite candy ...,13,64,4.000000,5,the last candy they ate is my favourite candy ...,the last candy they ate is my favourite candy ...
30984,We are just hating on Mexican and they are not...,55,237,3.588235,24,we are just hating on mexican and they are not...,we are just hating on mexican and they are not...
30985,"Ro just went for it <a href=""https://www.youtu...",7,94,12.571429,3,"ro just went for it <a href=""https://www.youtu...",ro just went for it a hrefhttpswwwyoutubecomwa...


In [25]:
# Import stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Remove Stopwords
df['review_nopunc_nostop'] = df['review_nopunc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
df

Unnamed: 0,textdisplay,word_count,char_count,avg_word,stopword_coun,review_lower,review_nopunc,review_nopunc_nostop
0,Watching this is 2020 reminiscing on old YouTu...,15,87,4.866667,7,watching this is 2020 reminiscing on old youtu...,watching this is 2020 reminiscing on old youtu...,watching 2020 reminiscing old youtube videos p...
1,It’s that time of year ladies,6,29,4.000000,2,it’s that time of year ladies,its that time of year ladies,time year ladies
2,who else is here to feel the fall vibes becaus...,21,104,4.000000,13,who else is here to feel the fall vibes becaus...,who else is here to feel the fall vibes becaus...,else feel fall vibes youtubers videos match back
3,sometimes i come back and watch this to feel l...,31,146,3.741935,13,sometimes i come back and watch this to feel l...,sometimes i come back and watch this to feel l...,sometimes come back watch feel like 15 year ol...
4,"I always love Beth&#39;s makeup sess, her eyes...",13,68,4.307692,4,"i always love beth&#39;s makeup sess, her eyes...",i always love beth39s makeup sess her eyes are...,always love beth39s makeup sess eyes big pretty
...,...,...,...,...,...,...,...,...
30982,Yall crazy they is all the BESTTTTT,7,35,4.142857,4,yall crazy they is all the besttttt,yall crazy they is all the besttttt,yall crazy besttttt
30983,The last candy they ate is my favourite candy ...,13,64,4.000000,5,the last candy they ate is my favourite candy ...,the last candy they ate is my favourite candy ...,last candy ate favourite candy whole world
30984,We are just hating on Mexican and they are not...,55,237,3.588235,24,we are just hating on mexican and they are not...,we are just hating on mexican and they are not...,hating mexican mess br br people handle spicy ...
30985,"Ro just went for it <a href=""https://www.youtu...",7,94,12.571429,3,"ro just went for it <a href=""https://www.youtu...",ro just went for it a hrefhttpswwwyoutubecomwa...,ro went hrefhttpswwwyoutubecomwatchvqbif88usxd...


In [26]:
# Return frequency of values
freq= pd.Series(" ".join(df['review_nopunc_nostop']).split()).value_counts()[:30]
freq

ew          7086
br          5413
like        2386
love        1908
de          1221
video       1137
one         1053
que         1049
im           811
good         794
2020         750
get          744
know         716
watching     693
la           689
people       684
really       669
i39m         644
much         631
time         629
would        594
u            584
lol          576
make         542
it39s        538
en           516
best         512
song         510
still        509
see          496
dtype: int64

In [None]:
"""
other_stopwords = ['get', 'us', 'see', 'use', 'said', 'asked', 'day', 'go' \
  'even', 'ive', 'right', 'left', 'always', 'would', 'told', \
  'get', 'us', 'would', 'get', 'one', 'ive', 'go', 'even', \
  'also', 'ever', 'x', 'take', 'let' ]
df['review_nopunc_nostop_nocommon'] = df['review_nopunc_nostop'].apply(lambda x: "".join(" ".join(x for x in x.split() if x not in other_stopwords)))
"""

In [None]:
# Import textblob
from textblob import Word

# Lemmatize final review format
df['cleaned_review'] = df['review_nopunc_nostop_nocommon']\
.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))