## SENTIMENT ANALYSIS

In [89]:
import pandas as pd
import numpy as np
import csv

In [96]:
covid_ph_df = pd.read_csv("../datasets/covid_philippines/filtered_comments.csv").drop("Unnamed: 0", axis=1)
covid_ph_df.head()

Unnamed: 0,video_id,comment,comment_cleaned
0,aLZ85hb4wjE,Loved the calmness of Manilla.,loved the calmness of manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...,after this pandemic i think every country shou...
2,aLZ85hb4wjE,manila looks beautiful with less people,manila looks beautiful with less peopl
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,the lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same,india also sam


In [95]:
covid_vax_df = pd.read_csv("../datasets/covid_vaccine/filtered_comments.csv").drop("Unnamed: 0", axis=1)
covid_vax_df.head()

Unnamed: 0,video_id,comment,comment_cleaned
0,2IXl4qJGrRk,That is one Guinness Book of records that will...,that is one guinness book of records that will...
1,2IXl4qJGrRk,He's now the strongest man in the world.,he's now the strongest man in the world.
2,2IXl4qJGrRk,He got more vaccines than all Texans together!,he got more vaccines than all texans together!
3,2IXl4qJGrRk,this man is more vaccine than he is human,this man is more vaccine than he is human
4,2IXl4qJGrRk,That’s the difference between Germany and Amer...,that’s the difference between germany and amer...


In [94]:
is_pal_df = pd.read_csv("../datasets/israel-palestine_conflict_history/filtered_comments.csv").drop("Unnamed: 0", axis=1)
is_pal_df.head()

Unnamed: 0,video_id,comment,comment_cleaned
0,R0ftmf_Uv9A,No matter how many times these information get...,no matter how many times these information get...
1,R0ftmf_Uv9A,Let peace prevail.,let peace prevail.
2,R0ftmf_Uv9A,Why start at 1946?,why start at 1946?
3,R0ftmf_Uv9A,What happened the video was removed and put ag...,what happened the video was removed and put ag...
4,R0ftmf_Uv9A,"This is not a conflict, this is occupation.","this is not a conflict, this is occupation."


TEST RoBERTa

In [102]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from scipy.special import softmax

In [103]:

def roberta_sentiment_scores(df):
    df = df[['video_id','comment']]
    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)

    for comments in df['comment']:
        comment_words = []
        comments = comments.replace("\n", " ")
        comments = comments.replace("\xa0", " ")
        comments = comments.replace("?", " ")
        comments = comments.replace(":", " ")
        comments = comments.replace(";", " ")
        comments = comments.replace(";", " ")
        comments = re.sub(r"\s+", ' ', comments) 
        #print(comments)
        for word in comments.split(' '):
            if word.startswith('@') and len(word) > 1:
               word = '@user'
        
            elif word.startswith('http'):
                word = "http"
            comment_words.append(word)

        comment_procs = " ".join(comment_words)

        encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
        #print(encoded)
        output = model(**encoded)

        scores = output[0][0].detach().numpy()

        scores = softmax(scores)

        for i in range(len(scores)):

            l = labels[i]
            s = scores[i]
            df.loc[df['comment']==comments,[labels[i]]] =s
            #print(l, s)
    return df

In [104]:
roberta_sentiment_scores(covid_vax_df)

Unnamed: 0,video_id,comment,Negative,Neutral,Positive
0,2IXl4qJGrRk,That is one Guinness Book of records that will...,0.008738,0.220693,0.770569
1,2IXl4qJGrRk,He's now the strongest man in the world.,0.002007,0.030271,0.967721
2,2IXl4qJGrRk,He got more vaccines than all Texans together!,0.429495,0.470088,0.100417
3,2IXl4qJGrRk,this man is more vaccine than he is human,0.941847,0.054342,0.003811
4,2IXl4qJGrRk,That’s the difference between Germany and Amer...,,,
...,...,...,...,...,...
1480,Ea4cUAlZhY8,"india has the best vaccines in the world, all ...",0.004628,0.029972,0.965400
1481,Ea4cUAlZhY8,Om Sarve Bhavantu Sukhinah \r\n May all be hea...,,,
1482,Ea4cUAlZhY8,"BBC, CNN and Al Jazeera now let's shoot our op...",0.033171,0.649342,0.317487
1483,Ea4cUAlZhY8,Hopefully world will be a better place!! Love ...,0.002632,0.025578,0.971790


TEST TEXTBLOB

In [100]:
from textblob import TextBlob


In [143]:

def textblob_sentiment_scores(input_df):
    df = input_df[['video_id','comment']].copy()
    for comment in df['comment']:
        testimonial = TextBlob(comment)
        df.loc[df['comment']==comment, 'polarity'] =testimonial.sentiment.polarity
        df.loc[df['comment']==comment, 'subjectivity'] =testimonial.sentiment.subjectivity
        if (testimonial.sentiment.polarity > 0):
            #print("positive", testimonial.sentiment.polarity)
            df.loc[df['comment']==comment, 'sentiment'] ='positive'

        elif (testimonial.sentiment.polarity < 0):
            #print("negative", testimonial.sentiment.polarity)
            df.loc[df['comment']==comment, 'sentiment'] ='negative'
            
        else:
            #print("neutral", testimonial.sentiment.polarity)
            df.loc[df['comment']==comment, 'sentiment'] ='neutral'
    return df

In [144]:
textblob_sentiment_scores(covid_vax_df)

Unnamed: 0,video_id,comment,polarity,subjectivity,sentiment
0,2IXl4qJGrRk,That is one Guinness Book of records that will...,0.000000,0.000000,neutral
1,2IXl4qJGrRk,He's now the strongest man in the world.,0.000000,0.000000,neutral
2,2IXl4qJGrRk,He got more vaccines than all Texans together!,0.625000,0.500000,positive
3,2IXl4qJGrRk,this man is more vaccine than he is human,0.250000,0.300000,positive
4,2IXl4qJGrRk,That’s the difference between Germany and Amer...,0.000000,0.000000,neutral
...,...,...,...,...,...
1480,Ea4cUAlZhY8,"india has the best vaccines in the world, all ...",1.000000,0.300000,positive
1481,Ea4cUAlZhY8,Om Sarve Bhavantu Sukhinah \r\n May all be hea...,0.500000,0.500000,positive
1482,Ea4cUAlZhY8,"BBC, CNN and Al Jazeera now let's shoot our op...",0.000000,0.000000,neutral
1483,Ea4cUAlZhY8,Hopefully world will be a better place!! Love ...,0.640625,0.550000,positive


TEST STANZA

In [34]:
import stanza

In [107]:
def stanza_sentiment_scores(input_df):
    df= input_df[['video_id','comment']].copy()
    nlp = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for comment in df['comment']:
        doc = nlp(comment.replace("\n", " "))
       # print(comment)
    #doc.sentences[0].print_dependencies()
        for i, sentence in enumerate(doc.sentences):
            #The existing models each support negative, neutral, and positive, represented by 0, 1, 2 respectively
            if (sentence.sentiment ==0 ):
                df.loc[df['comment']==comment,'sentiment'] = 'negative'
            elif (sentence.sentiment ==1):
                df.loc[df['comment']==comment,'sentiment'] = 'neutral'
            else:
                df.loc[df['comment']==comment,'sentiment'] = 'positive'

                
    return df
                
            #print("%d -> %d" % (i, sentence.sentiment))

In [109]:
stanza_sentiment_scores(covid_vax_df)

2024-03-16 13:48:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-16 13:48:46 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

2024-03-16 13:48:46 INFO: Using device: cpu
2024-03-16 13:48:46 INFO: Loading: tokenize
2024-03-16 13:48:47 INFO: Loading: mwt
2024-03-16 13:48:47 INFO: Loading: sentiment
2024-03-16 13:48:47 INFO: Done loading processors!


That is one Guinness Book of records that will never fall.
He's now the strongest man in the world.
He got more vaccines than all Texans together!
this man is more vaccine than he is human
That’s the difference between Germany and America: man receives 217 Covid shots vs man gives 217 gunshots.
I would love to hear the response from vaccine sceptics who predicted the end of time for people who accepted one vaccine!
Doesn't he know you need 218 vaccinations to be really protected. 😂 🐑
There are two factors helping your immunity with a vaccine. Temporary heightened immune response to all infections and later an immune response to the specific infectious agent.
Wow, this dude’s 5g signal must be off the charts!
It's good to know the vaccine is safe to use.
Strange kink. Though not gonna judge him for it.
no proof, no identity, just words.
In todays late news..He exploded 3 minutes after this footage was doctored...err, filmed.
Wasn't this video supposed to be posted on April Fools Day?
😂😂

Unnamed: 0,video_id,comment,sentiment
0,2IXl4qJGrRk,That is one Guinness Book of records that will...,positive
1,2IXl4qJGrRk,He's now the strongest man in the world.,positive
2,2IXl4qJGrRk,He got more vaccines than all Texans together!,negative
3,2IXl4qJGrRk,this man is more vaccine than he is human,neutral
4,2IXl4qJGrRk,That’s the difference between Germany and Amer...,neutral
...,...,...,...
1480,Ea4cUAlZhY8,"india has the best vaccines in the world, all ...",positive
1481,Ea4cUAlZhY8,Om Sarve Bhavantu Sukhinah \r\n May all be hea...,neutral
1482,Ea4cUAlZhY8,"BBC, CNN and Al Jazeera now let's shoot our op...",neutral
1483,Ea4cUAlZhY8,Hopefully world will be a better place!! Love ...,neutral


TEST VADER

In [37]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [112]:

def vader_sentiment_scores (input_df):
    df= input_df[['video_id','comment']].copy()

    for sentence in df['comment']:
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict = sid_obj.polarity_scores(sentence)
        #print(sentence)
        #print("Overall sentiment dictionary is : ", sentiment_dict)
        #print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
        #print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
        #print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")

        df.loc[df['comment']==sentence,'positive'] = sentiment_dict['pos']
        df.loc[df['comment']==sentence,'neutral'] = sentiment_dict['neu']
        df.loc[df['comment']==sentence,'negative'] = sentiment_dict['neg']

        df.loc[df['comment']==sentence,'compound'] = sentiment_dict['compound']



        #print("Sentence Overall Rated As", end = " ")
 
        # decide sentiment as positive, negative and neutral
        ### compound -> (computed by normalizing the scores above
        if sentiment_dict['compound'] >= 0.05 :
            #print("Positive")
            df.loc[df['comment']==sentence,'Overall sentiment'] = 'positive'

 
        elif sentiment_dict['compound'] <= - 0.05 :
            #print("Negative")
            df.loc[df['comment']==sentence,'Overall sentiment'] = 'negative'
 
        else :
            df.loc[df['comment']==sentence,'Overall sentiment'] = 'neutral'


    return df

In [113]:
vader_sentiment_scores (covid_vax_df)

Unnamed: 0,video_id,comment,positive,neutral,negative,compound,Overall sentiment
0,2IXl4qJGrRk,That is one Guinness Book of records that will...,0.000,1.000,0.000,0.0000,neutral
1,2IXl4qJGrRk,He's now the strongest man in the world.,0.293,0.707,0.000,0.4404,positive
2,2IXl4qJGrRk,He got more vaccines than all Texans together!,0.000,1.000,0.000,0.0000,neutral
3,2IXl4qJGrRk,this man is more vaccine than he is human,0.000,1.000,0.000,0.0000,neutral
4,2IXl4qJGrRk,That’s the difference between Germany and Amer...,0.000,1.000,0.000,0.0000,neutral
...,...,...,...,...,...,...,...
1480,Ea4cUAlZhY8,"india has the best vaccines in the world, all ...",0.418,0.582,0.000,0.8360,positive
1481,Ea4cUAlZhY8,Om Sarve Bhavantu Sukhinah \r\n May all be hea...,0.302,0.534,0.164,0.6310,positive
1482,Ea4cUAlZhY8,"BBC, CNN and Al Jazeera now let's shoot our op...",0.000,0.912,0.088,-0.3400,negative
1483,Ea4cUAlZhY8,Hopefully world will be a better place!! Love ...,0.597,0.403,0.000,0.8856,positive


## Get Sentiment sentiment for each video

### TEXTBLOB APPROACH

In [129]:
def per_vid_textblob_polarity (df_input):
    df = pd.DataFrame(df_input["video_id"].copy())
    df =df.drop_duplicates('video_id')
    df= df.reset_index(drop=True)
    for video in df_input["video_id"].unique():
        sum_polarity=0
        for comment in df_input.loc[df_input["video_id"]==video]["comment"]:
            sum_polarity+=(TextBlob(comment)).sentiment.polarity
        average=sum_polarity/df_input.loc[df_input["video_id"]==video]["comment"].count()
        df.loc[df["video_id"]==video,'polarity'] =average
       # video_polarity[video] = [sum_polarity/df.loc[df["video_id"]==video]["comment"].count()]
    ''' if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.subjectivity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.subjectivity)
        else:
            print("neutral", testimonial.sentiment.subjectivity)'''
    
    return df

In [130]:
per_vid_textblob_polarity (covid_vax_df)

Unnamed: 0,video_id,polarity
0,2IXl4qJGrRk,0.202778
1,HtTalpY-J-M,0.050000
2,jPs4_MeuX7U,0.095748
3,WhiBpmH1mE4,-0.004616
4,LfmhYVCCGhc,0.167498
...,...,...
163,l6mTAJ08aUI,-0.016204
164,2XVCPdPAUbA,0.030952
165,XHfRpNJI0c8,0.088384
166,wuqyRBmashA,-0.015464


### VADER APPROACH

In [138]:
def per_vid_vader_polarity (df_input):
    df = pd.DataFrame(df_input["video_id"].copy())
    df =df.drop_duplicates('video_id')
    df= df.reset_index(drop=True)
    polarity_scores={}
    sid_obj = SentimentIntensityAnalyzer()

    for video in df_input["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0
        sum_compound= 0
        scores={}
        for sentence in df_input.loc[df_input["video_id"]==video]["comment"]:

            sentiment_dict = sid_obj.polarity_scores(sentence)
            sum_neg += sentiment_dict['neg']
            sum_pos += sentiment_dict['pos']
            sum_neu += sentiment_dict['neu']
            sum_compound += sentiment_dict['compound']
            
        df.loc[df['video_id']==video,'negative'] = sum_neg/df_input.loc[df_input["video_id"]==video]["comment"].count()
        df.loc[df['video_id']==video,'positive'] = sum_pos/df_input.loc[df_input["video_id"]==video]["comment"].count()
        df.loc[df['video_id']==video,'neutral'] = sum_neu/df_input.loc[df_input["video_id"]==video]["comment"].count()
        df.loc[df['video_id']==video,'compound'] = sum_compound/df_input.loc[df_input["video_id"]==video]["comment"].count()
        
        #polarity_scores[video]=scores
        '''# decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")'''
    return df
        

In [139]:
per_vid_vader_polarity(covid_vax_df)

Unnamed: 0,video_id,negative,positive,neutral,compound
0,2IXl4qJGrRk,0.016111,0.152111,0.831778,0.353367
1,HtTalpY-J-M,0.105667,0.165556,0.728667,0.167778
2,jPs4_MeuX7U,0.220143,0.128429,0.651286,-0.189100
3,WhiBpmH1mE4,0.033250,0.162375,0.804500,0.334662
4,LfmhYVCCGhc,0.108000,0.044857,0.847286,-0.122543
...,...,...,...,...,...
163,l6mTAJ08aUI,0.081222,0.081444,0.837444,0.106000
164,2XVCPdPAUbA,0.104571,0.045714,0.849857,-0.170800
165,XHfRpNJI0c8,0.057889,0.136889,0.805111,0.150944
166,wuqyRBmashA,0.110300,0.067700,0.822000,-0.195360


### RoBERTa APPROACH

In [184]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import re
#from scipy.special import softmax

def roberta_per_vid_scores(df_input):
    df = pd.DataFrame(df_input["video_id"].copy())
    df =df.drop_duplicates('video_id')
    df= df.reset_index(drop=True)
    polarity_scores={}

    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)
    for video in df_input["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0 
        scores={}
        for comments in df_input.loc[df_input["video_id"]==video]["comment"]:
            comment_words = []
            comments = comments.replace("\n", " ")
            comments = comments.replace("\xa0", " ")
            comments = comments.replace("?", " ")
            comments = comments.replace(":", " ")
            comments = comments.replace(";", " ")
            comments = comments.replace(";", " ")
            comments = re.sub(r"\s+", ' ', comments) 
     #   print(comments)
            for word in comments.split(' '):
                if word.startswith('@') and len(word) > 1:
                    word = '@user'
        
                elif word.startswith('http'):
                    word = "http"
                comment_words.append(word)

            comment_procs = " ".join(comment_words)

            encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
           # print(encoded)
            output = model(**encoded)

            scores = output[0][0].detach().numpy()

            scores = softmax(scores)

            for i in range(len(scores)):

                l = labels[i]
                s = scores[i]
                #    labels = ['Negative', 'Neutral', 'Positive']

                if l=="Negative":
                    sum_neg+= s
                elif l == "Neutral":
                    sum_neu+=s
                else : 
                    sum_pos+=s
        
        df.loc[df['video_id']==video,'negative'] = sum_neg/df_input.loc[df_input["video_id"]==video]["comment"].count()
        df.loc[df['video_id']==video,'positive'] = sum_pos/df_input.loc[df_input["video_id"]==video]["comment"].count()
        df.loc[df['video_id']==video,'neutral'] = sum_neu/df_input.loc[df_input["video_id"]==video]["comment"].count()


       # scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        #scores =dict([("Neg", sum_neg/df.loc[df["video_id"]==video]["comment"].count()),("Pos",sum_pos/df.loc[df["video_id"]==video]["comment"].count()),("Neu",sum_neu/df.loc[df["video_id"]==video]["comment"].count()
#)])
        
        #scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        #scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()

        #polarity_scores[video] = scores
    return df

In [185]:
roberta_per_vid_scores(covid_vax_df)

Unnamed: 0,video_id,negative,positive,neutral
0,2IXl4qJGrRk,0.263802,0.377510,0.358689
1,HtTalpY-J-M,0.341281,0.200210,0.458508
2,jPs4_MeuX7U,0.557907,0.156772,0.285321
3,WhiBpmH1mE4,0.179869,0.505061,0.315071
4,LfmhYVCCGhc,0.488205,0.171317,0.340478
...,...,...,...,...
163,l6mTAJ08aUI,0.503442,0.054259,0.442300
164,2XVCPdPAUbA,0.518415,0.058549,0.423037
165,XHfRpNJI0c8,0.606950,0.056705,0.336344
166,wuqyRBmashA,0.709377,0.042647,0.247976


### STANZA

In [218]:
def stanza_pervid_sentiment(input_df):
    df= input_df[['video_id','comment']].copy()
    video_df = pd.DataFrame(input_df["video_id"].copy())
    video_df =video_df.drop_duplicates('video_id')
    video_df= video_df.reset_index(drop=True)

    nlp = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for comment in df['comment']:
        doc = nlp(comment.replace("\n", " "))
       # print(comment)
    #doc.sentences[0].print_dependencies()
        for i, sentence in enumerate(doc.sentences):
            #The existing models each support negative, neutral, and positive, represented by 0, 1, 2 respectively
            df.loc[df['comment']==comment,'sentiment'] = sentence.sentiment
            
    for video in video_df['video_id']:            
        video_df.loc[video_df['video_id']==video,'mode_sentiment'] = df.mode()['sentiment'][0]
                
    return video_df
                
            #print("%d -> %d" % (i, sentence.sentiment))

In [219]:
stanza_pervid_sentiment(covid_vax_df)


2024-03-16 16:20:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-16 16:20:03 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

2024-03-16 16:20:03 INFO: Using device: cpu
2024-03-16 16:20:03 INFO: Loading: tokenize
2024-03-16 16:20:03 INFO: Loading: mwt
2024-03-16 16:20:03 INFO: Loading: sentiment
2024-03-16 16:20:03 INFO: Done loading processors!


Unnamed: 0,video_id,mode_sentiment
0,2IXl4qJGrRk,0.0
1,HtTalpY-J-M,0.0
2,jPs4_MeuX7U,0.0
3,WhiBpmH1mE4,0.0
4,LfmhYVCCGhc,0.0
...,...,...
163,l6mTAJ08aUI,0.0
164,2XVCPdPAUbA,0.0
165,XHfRpNJI0c8,0.0
166,wuqyRBmashA,0.0
