In [2]:
import translators as ts
from langdetect import detect

In [3]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm

In [4]:
# Read the scraped comments CSV and drop its "Unnamed: 0" column
# (presumably an index column written by an earlier export - confirm).
df = pd.read_csv(
    "../datasets/covid_philippines/covid_philippines_comments.csv"
).drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same


In [5]:
# Accumulator for the translation pass: column name -> {row position: value}.
translated_comments = {"video_id": {}, "comment": {}}

# Plain Python lists so the translation loop can address rows by position.
video_id_list = df["video_id"].tolist()
comments_list = df["comment"].tolist()

In [6]:
# Translate every non-English comment to English, keeping the original text
# whenever language detection or translation fails (best effort).
with tqdm(total=len(video_id_list), desc="Translating...") as pbar:
    for i, (video_id, comment) in enumerate(zip(video_id_list, comments_list)):
        # Advance once per input row so the bar always reaches its total.
        pbar.update(1)

        # pandas represents missing cells as NaN, which `!= None` never catches;
        # pd.isna covers both None and NaN.
        if pd.isna(comment):
            continue

        new_comment = comment
        try:
            # langdetect is used as a gate because sending every entry to the
            # translator errors out on large batches; only non-English
            # comments are translated.
            if detect(comment) != 'en':
                new_comment = ts.translate_text(comment, 'google', to_language='en')
        except Exception:
            # Deliberate best effort: keep the untranslated comment. Catching
            # Exception (not a bare except) lets Ctrl-C still interrupt the run.
            pass

        translated_comments["video_id"][i] = video_id
        translated_comments["comment"][i] = new_comment

translated_df = pd.DataFrame.from_dict(translated_comments)

Translating...: 100%|██████████| 1216/1216 [01:50<00:00, 11.04it/s]


In [7]:
# Display the translated comments frame built by the translation loop.
translated_df

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same
...,...,...
1211,5DvMPgoKZmM,This covid will be a never-ending fuckery as l...
1212,5DvMPgoKZmM,A new variant is inevitable.
1213,5DvMPgoKZmM,The man that should resign from his office is ...
1214,5DvMPgoKZmM,"Is that the people who got been vaccinated.,"


---

In [8]:
def translate(video_id_list, comments_list):
    """Translate non-English comments to English and return them as a DataFrame.

    Parameters
    ----------
    video_id_list : sequence
        Video id for each comment, aligned position-by-position with
        ``comments_list``.
    comments_list : sequence
        Comment texts. Missing entries (``None`` / NaN) are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``video_id`` and ``comment``, indexed by the position of each
        kept row in the input sequences.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(video_id_list) != len(comments_list):
        raise ValueError("video_id_list and comments_list must have the same length")

    translated_comments = {"video_id": {}, "comment": {}}

    with tqdm(total=len(video_id_list)) as pbar:
        pbar.set_description("Translating...")

        # Iterate over the arguments themselves rather than a notebook-level frame.
        for i, (video_id, comment) in enumerate(zip(video_id_list, comments_list)):
            pbar.update(1)

            # pd.isna covers both None and the NaN pandas uses for missing cells.
            if pd.isna(comment):
                continue

            new_comment = comment
            try:
                # langdetect is used as a gate because sending every entry to
                # the translator errors out on large batches; only non-English
                # comments are translated.
                if detect(comment) != 'en':
                    new_comment = ts.translate_text(comment, 'google', to_language='en')
            except Exception:
                # Best effort: on detection/translation failure keep the original text.
                pass

            translated_comments["video_id"][i] = video_id
            translated_comments["comment"][i] = new_comment

    return pd.DataFrame.from_dict(translated_comments)

## SENTIMENT ANALYSIS

TEST RoBERTa

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from scipy.special import softmax

def roberta_sentiment_scores(list_comments):
    """Print softmax-normalised RoBERTa sentiment scores for each comment.

    For every comment this prints the cleaned text, the tokenizer output and
    one ``label score`` line per entry of the model's score vector.

    Parameters
    ----------
    list_comments : iterable of str
        Comments to score.
    """
    checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    for text in list_comments:
        # Blank out line breaks, non-breaking spaces and ? : ; characters,
        # then squeeze every whitespace run down to a single space.
        for char in ("\n", "\xa0", "?", ":", ";"):
            text = text.replace(char, " ")
        text = re.sub(r"\s+", ' ', text)
        print(text)

        # Mask user mentions and links with fixed placeholder tokens.
        masked_words = [
            '@user' if word.startswith('@') and len(word) > 1
            else "http" if word.startswith('http')
            else word
            for word in text.split(' ')
        ]

        encoded = tokenizer(
            " ".join(masked_words),
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding=True,
        )
        print(encoded)

        raw_scores = model(**encoded)[0][0].detach().numpy()
        normalised = softmax(raw_scores)

        for idx in range(len(normalised)):
            print(labels[idx], normalised[idx])

In [10]:
# Try the RoBERTa scorer on the first 50 translated comments.
roberta_sentiment_scores(translated_df["comment"][:50])

  return self.fget.__get__(instance, owner)()


Loved the calmness of Manilla.
{'input_ids': tensor([[    0,   574, 12677,     5,  6327,  1825,     9,  1554,  4699,     4,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Negative 0.0026419058
Neutral 0.031785045
Positive 0.9655731
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
{'input_ids': tensor([[    0,  4993,    42, 23387, 14414,    38,   206,   358,   247,   197,
            33,    10, 23076,   358,   353,     7,   492,   985,  2574,    10,
            86,     7, 14384,     6,    19,    70,     5,  6631,    14,     5,
          6872,    16,  7242,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Negative 0.72357625
Neutral 0.25514364
Positive 0.021280115
manila looks beautifull with less people
{'input_ids': tensor

TEST TEXTBLOB

In [11]:
from textblob import TextBlob

def textblob_sentiment_scores(list_comments):
    """Print a sentiment label and the TextBlob polarity for each comment.

    A polarity above zero is labelled "positive", below zero "negative",
    and anything else "neutral".
    """
    for text in list_comments:
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0:
            label = "positive"
        elif polarity < 0:
            label = "negative"
        else:
            label = "neutral"
        print(label, polarity)

In [12]:
# Try the TextBlob scorer on the first 50 translated comments.
textblob_sentiment_scores(translated_df["comment"][:50])

positive 0.8
neutral 0.0
negative 0.06666666666666667
positive 0.5375
neutral 0.125
positive 0.75
positive 0.9
negative 0.95
positive 0.7
neutral 0.05
negative 1.0
neutral 0.0
negative 0.8
negative 0.4083333333333334
negative 0.9414285714285715
negative 1.0
neutral 0.0
negative 0.6666666666666666
neutral 0.0
positive 0.45
positive 0.75
neutral 0.0
positive 0.7
positive 0.2
positive 0.56875
neutral 0.0
neutral 0.0
neutral 0.0
negative 0.375
positive 0.5166666666666667
neutral 0.0
neutral 0.0
neutral 0.0
neutral 0.0
negative 0.25
neutral 0.0
negative 0.9
negative 0.6466666666666666
positive 0.44722222222222224
neutral 0.0
neutral 0.0
negative 0.18666666666666665
negative 0.6
positive 0.5
positive 0.35
positive 0.5583333333333333
positive 1.0
negative 0.6839285714285714
positive 0.5576388888888888
positive 0.275


TEST STANZA

In [13]:
import stanza
def stanza_sentiment_scores(list_comments):
    """Print Stanza's per-sentence sentiment value for each comment.

    Each comment is printed as-is, followed by one ``index -> sentiment``
    line per sentence Stanza returns for it.
    """
    # NOTE(review): tokenize_no_ssplit=True presumably keeps each comment as a
    # single sentence (the recorded output only ever shows index 0) - confirm.
    pipeline = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for text in list_comments:
        # Newlines are flattened only for the pipeline input; the printed
        # comment keeps its original line breaks.
        parsed = pipeline(text.replace("\n", " "))
        print(text)
        for position, sent in enumerate(parsed.sentences):
            print("%d -> %d" % (position, sent.sentiment))

In [14]:
# Try the Stanza scorer on the first 50 translated comments.
stanza_sentiment_scores(translated_df["comment"][:50])

2024-02-24 14:09:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-24 14:09:12 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

2024-02-24 14:09:12 INFO: Using device: cpu
2024-02-24 14:09:12 INFO: Loading: tokenize
2024-02-24 14:09:13 INFO: Loading: mwt
2024-02-24 14:09:13 INFO: Loading: sentiment
2024-02-24 14:09:13 INFO: Done loading processors!


Loved the calmness of Manilla.
0 -> 2
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
0 -> 0
manila looks beautifull with less people
0 -> 2
The lockdown makes the city look like a place I want to explore.

Before the lockdown, it looked like a crowded mess filled with traffic and pollution.
0 -> 0
India also same
0 -> 1
The president promised that he'll do his best to ease the traffic in the metro like 5mins of travel along EDSA. Fortunately it happened but unfortunately it wasn't in the ideal way. Nature's sense of humor tho.
0 -> 0
I went up the viewing spot at antipolo, its a very beautiful, almost smog free view of the metro skyline
0 -> 2
The sad part is people around the world did not intend to heal earth but afraid of death.
0 -> 1
i hope it stays like this forever its beautiful with no messy people
0 -> 2
2020 is the year when nature fight back and reduce h

TEST VADER

In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def vader_sentiment_scores (list_comments):
    """Print VADER sentiment scores and an overall label for each comment.

    For every comment this prints the text, the raw score dictionary, the
    neg/neu/pos shares as percentages and an overall label derived from the
    ``compound`` score (>= 0.05 Positive, <= -0.05 Negative, else Neutral).

    Parameters
    ----------
    list_comments : iterable of str
        Comments to score.
    """
    # Build the analyzer once; it does not depend on the sentence, so
    # re-creating it on every iteration was wasted work.
    sid_obj = SentimentIntensityAnalyzer()

    for sentence in list_comments:
        sentiment_dict = sid_obj.polarity_scores(sentence)
        print(sentence)
        print("Overall sentiment dictionary is : ", sentiment_dict)
        print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
        print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
        print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")

        print("Sentence Overall Rated As", end = " ")

        # Decide the overall sentiment from the compound score.
        compound = sentiment_dict['compound']
        if compound >= 0.05:
            print("Positive")
        elif compound <= -0.05:
            print("Negative")
        else:
            print("Neutral")

    

In [16]:
# Try the VADER scorer on the first 50 translated comments.
vader_sentiment_scores (translated_df["comment"][:50])

Loved the calmness of Manilla.
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'compound': 0.765}
sentence was rated as  0.0 % Negative
sentence was rated as  31.2 % Neutral
sentence was rated as  68.8 % Positive
Sentence Overall Rated As Positive
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as  0.0 % Negative
sentence was rated as  100.0 % Neutral
sentence was rated as  0.0 % Positive
Sentence Overall Rated As Neutral
manila looks beautifull with less people
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as  0.0 % Negative
sentence was rated as  100.0 % Neutral
sentence was rated as  0.0 % Positive
Sentence Overall Rated As Neutral
The lockdown makes the city look like

## Get sentiment for each video

#### TEXTBLOB APPROACH

In [17]:
# NOTE(review): this re-declares the function already defined in the
# "TEST TEXTBLOB" cell; the later definition shadows the earlier one.
def textblob_sentiment_scores(list_comments):
    """Print "positive", "negative" or "neutral" plus the TextBlob polarity per comment."""
    for comment in list_comments:
        score = TextBlob(comment).sentiment.polarity
        label = "positive" if score > 0 else "negative" if score < 0 else "neutral"
        print(label, score)

In [18]:
# List the distinct video ids present in the translated comments.
translated_df["video_id"].unique()

array(['aLZ85hb4wjE', 'sYI97jv-pZg', '3YFpjgIQqEo', 'dIsaz_XlmTw',
       'DWxIvQlpJK8', 'pMUumjHY3tw', 'nTUWK8vufOk', 'cPVE7QGS7As',
       'CEIrzjA8euQ', 'lw16DeB6zns', '8dTelszbObM', 'Mxf8uGFcqSE',
       'MQ5aYS4YFlQ', 'iOE6rAY8l-k', '_a-rQYfsCck', 'jEINZXA_ujo',
       '587N9bJ5J5k', 'E5F3xA_zkFc', '05JLyd58R-w', '2fRQ8OsqOLs',
       'E56W-5xVOss', 'c1oU8U05puY', 'SEHcakm-fAc', '1psSvU1km0I',
       'PP3Yu-ro1tA', 'NQeI1CRCqeo', 'RhDHGgo4yZg', 'sdsz-t540WI',
       '7SKGXkZKjV8', 'oy0wHScCPds', 'o1kPskxFkQ8', 'aoNQUUQno00',
       'liEUC1_l8e8', '-b0EuuMvvy8', 'MmyIvf7bEGc', 'Fd3HcncV6Zk',
       '-n9Ks3VTub4', 'LzymJ2xZhho', '-qFcO_onBdA', 'fp9uRsmTWqg',
       '3ZXR2eARmuQ', '57wz3HuIVLA', 'lbK7UjoLr8o', 'xV1oR-RbOGU',
       'kn6DKdInYXk', '2T3yZ6lNRDg', 'HLYSlKY6Ww4', 'sH_YoV-NA6s',
       'ZLT6L3PHz78', 'lczwrm68u6I', 'zhBdbLj5y6A', 'ibWFsmcnefk',
       'e0fN7HxkIUc', 'Ji47WRv2tQE', 'dj5ov38ihZI', 'j3E3NF9nk44',
       'BomdsEJjb0E', 'T7c6GvrF82k', '6DBFwIlT4fg', '2FgFNBIJT

In [19]:
def per_vid_textblob_polarity (df):
    """Return the mean TextBlob polarity of the comments of each video.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``video_id`` and ``comment`` columns.

    Returns
    -------
    dict
        ``{video_id: mean polarity}`` in order of first appearance of each
        video. Missing comments are ignored; a video whose comments are all
        missing maps to ``nan``.
    """
    video_polarity = {}

    # One grouped pass instead of re-filtering the whole frame per video;
    # sort=False keeps the first-appearance order that .unique() gave.
    for video, comments in df.groupby("video_id", sort=False)["comment"]:
        # Drop missing comments so the numerator and the divisor agree
        # (TextBlob cannot score NaN).
        comments = comments.dropna()
        if comments.empty:
            video_polarity[video] = float("nan")
            continue

        total_polarity = sum(TextBlob(comment).sentiment.polarity for comment in comments)
        video_polarity[video] = total_polarity / len(comments)

    return video_polarity


In [20]:
# Mean TextBlob polarity per video over all translated comments.
per_vid_textblob_polarity(translated_df)

{'aLZ85hb4wjE': 0.48791666666666667,
 'sYI97jv-pZg': 0.5,
 '3YFpjgIQqEo': 0.5016428571428572,
 'dIsaz_XlmTw': 0.23604166666666665,
 'DWxIvQlpJK8': 0.2804861111111111,
 'pMUumjHY3tw': 0.09333333333333332,
 'nTUWK8vufOk': 0.48999007936507943,
 'cPVE7QGS7As': 0.5091309523809524,
 'CEIrzjA8euQ': 0.5,
 'lw16DeB6zns': 0.529623015873016,
 '8dTelszbObM': 0.5007678534202924,
 'Mxf8uGFcqSE': 0.16666666666666666,
 'MQ5aYS4YFlQ': 0.5803968013468015,
 'iOE6rAY8l-k': 0.3170726495726496,
 '_a-rQYfsCck': 0.15295454545454545,
 'jEINZXA_ujo': 0.4664790764790764,
 '587N9bJ5J5k': 0.20842063492063495,
 'E5F3xA_zkFc': 0.5186439153439153,
 '05JLyd58R-w': 0.0,
 '2fRQ8OsqOLs': 0.2924603174603175,
 'E56W-5xVOss': 0.400765873015873,
 'c1oU8U05puY': 0.32416666666666666,
 'SEHcakm-fAc': 0.25199044011544014,
 '1psSvU1km0I': 0.6003121693121694,
 'PP3Yu-ro1tA': 0.5254621212121211,
 'NQeI1CRCqeo': 0.1293939393939394,
 'RhDHGgo4yZg': 0.34,
 'sdsz-t540WI': 0.4252083333333333,
 '7SKGXkZKjV8': 0.4162460317460318,
 'oy0wHS

#### VADER APPROACH

In [23]:
def per_vid_vader_polarity (df):
    """Return the mean VADER scores of the comments of each video.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``video_id`` and ``comment`` columns.

    Returns
    -------
    dict
        ``{video_id: {"Neg", "Pos", "Neu", "Overall"}}`` where each value is
        the mean of VADER's ``neg``/``pos``/``neu``/``compound`` score over the
        video's comments, in order of first appearance of each video. Missing
        comments are ignored; a video whose comments are all missing gets
        ``nan`` for every score.
    """
    polarity_scores = {}
    sid_obj = SentimentIntensityAnalyzer()

    # One grouped pass instead of re-filtering the whole frame several times
    # per video; sort=False keeps the first-appearance order of .unique().
    for video, comments in df.groupby("video_id", sort=False)["comment"]:
        # Drop missing comments so the sums and the divisor cover the same rows.
        comments = comments.dropna()

        totals = {'neg': 0, 'pos': 0, 'neu': 0, 'compound': 0}
        for sentence in comments:
            sentiment_dict = sid_obj.polarity_scores(sentence)
            for key in totals:
                totals[key] += sentiment_dict[key]

        n_comments = len(comments)

        def mean(total):
            # An average over zero comments is undefined.
            return total / n_comments if n_comments else float("nan")

        polarity_scores[video] = {
            "Neg": mean(totals['neg']),
            "Pos": mean(totals['pos']),
            "Neu": mean(totals['neu']),
            "Overall": mean(totals['compound']),
        }

    return polarity_scores
        

In [24]:
# Mean VADER scores per video over all translated comments.
per_vid_vader_polarity(translated_df)

{'aLZ85hb4wjE': {'Neg': 0.07980000000000001,
  'Pos': 0.194,
  'Neu': 0.7261,
  'Overall': 0.17872000000000002},
 'sYI97jv-pZg': {'Neg': 0.0, 'Pos': 0.056, 'Neu': 0.944, 'Overall': 0.11315},
 '3YFpjgIQqEo': {'Neg': 0.0635,
  'Pos': 0.11320000000000001,
  'Neu': 0.8231999999999999,
  'Overall': 0.029220000000000003},
 'dIsaz_XlmTw': {'Neg': 0.0671,
  'Pos': 0.14550000000000002,
  'Neu': 0.7875,
  'Overall': 0.16365},
 'DWxIvQlpJK8': {'Neg': 0.139625,
  'Pos': 0.09925,
  'Neu': 0.761125,
  'Overall': -0.00888750000000002},
 'pMUumjHY3tw': {'Neg': 0.0475,
  'Pos': 0.034,
  'Neu': 0.9185,
  'Overall': -0.15775},
 'nTUWK8vufOk': {'Neg': 0.09920000000000001,
  'Pos': 0.1785,
  'Neu': 0.7224,
  'Overall': 0.2861},
 'cPVE7QGS7As': {'Neg': 0.07909999999999999,
  'Pos': 0.2439,
  'Neu': 0.6769999999999999,
  'Overall': 0.28220999999999996},
 'CEIrzjA8euQ': {'Neg': 0.196, 'Pos': 0.0, 'Neu': 0.804, 'Overall': -0.296},
 'lw16DeB6zns': {'Neg': 0.0499,
  'Pos': 0.16350000000000003,
  'Neu': 0.7866,
 

In [43]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import re
#from scipy.special import softmax

def _clean_comment_for_roberta(text):
    """Return one comment with punctuation blanked and mentions/links masked."""
    # Blank out line breaks, non-breaking spaces and ? : ; characters, then
    # squeeze every whitespace run down to a single space.
    for char in ("\n", "\xa0", "?", ":", ";"):
        text = text.replace(char, " ")
    text = re.sub(r"\s+", ' ', text)

    masked_words = []
    for word in text.split(' '):
        if word.startswith('@') and len(word) > 1:
            word = '@user'
        elif word.startswith('http'):
            word = "http"
        masked_words.append(word)
    return " ".join(masked_words)


def roberta_per_vid_scores(df):
    """Return the mean RoBERTa sentiment scores of the comments of each video.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``video_id`` and ``comment`` columns.

    Returns
    -------
    dict
        ``{video_id: {"Neg", "Pos", "Neu"}}`` where each value is the mean
        softmax-normalised model score for that label over the video's
        comments, in order of first appearance of each video. Missing comments
        are ignored; a video whose comments are all missing gets ``nan`` for
        every score.
    """
    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)

    polarity_scores = {}

    # One grouped pass instead of re-filtering the whole frame several times
    # per video; sort=False keeps the first-appearance order of .unique().
    for video, comments in df.groupby("video_id", sort=False)["comment"]:
        # Drop missing comments so the sums and the divisor cover the same rows
        # (NaN has no .replace and would crash the cleaning step).
        comments = comments.dropna()

        totals = dict.fromkeys(labels, 0.0)
        for comment in comments:
            encoded = tokenizer(
                _clean_comment_for_roberta(comment),
                return_tensors='pt',
                max_length=512,
                truncation=True,
                padding=True,
            )
            output = model(**encoded)
            label_scores = softmax(output[0][0].detach().numpy())

            # Convert to Python floats so sums do not depend on numpy's
            # scalar promotion rules.
            for label, score in zip(labels, label_scores):
                totals[label] += float(score)

        n_comments = len(comments)

        def mean(total):
            # An average over zero comments is undefined.
            return total / n_comments if n_comments else float("nan")

        polarity_scores[video] = {
            "Neg": mean(totals['Negative']),
            "Pos": mean(totals['Positive']),
            "Neu": mean(totals['Neutral']),
        }

    return polarity_scores

In [44]:
# Mean RoBERTa sentiment scores per video over all translated comments.
roberta_per_vid_scores(translated_df)

{'aLZ85hb4wjE': {'Neg': 0.24424575177254154,
  'Pos': 0.4741820393828675,
  'Neu': 0.28157221488654616},
 'sYI97jv-pZg': {'Neg': 0.3894941806793213,
  'Pos': 0.0859330091625452,
  'Neu': 0.5245728343725204},
 '3YFpjgIQqEo': {'Neg': 0.4642800234258175,
  'Pos': 0.23067305744625627,
  'Neu': 0.30504688881337644},
 'dIsaz_XlmTw': {'Neg': 0.33587032787036153,
  'Pos': 0.2490432311780751,
  'Neu': 0.4150864414870739},
 'DWxIvQlpJK8': {'Neg': 0.289873747547972,
  'Pos': 0.29180154629284516,
  'Neu': 0.41832468984648585},
 'pMUumjHY3tw': {'Neg': 0.28391369991004467,
  'Pos': 0.05910225957632065,
  'Neu': 0.6569840013980865},
 'nTUWK8vufOk': {'Neg': 0.4252079260069877,
  'Pos': 0.3226866300450638,
  'Neu': 0.25210543572902677},
 'cPVE7QGS7As': {'Neg': 0.15659660201054065,
  'Pos': 0.5135279775597155,
  'Neu': 0.3298754207789898},
 'CEIrzjA8euQ': {'Neg': 0.15780450403690338,
  'Pos': 0.1664552390575409,
  'Neu': 0.6757402420043945},
 'lw16DeB6zns': {'Neg': 0.2425454837968573,
  'Pos': 0.2886490