In [2]:
import translators as ts
from langdetect import detect

In [3]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm

In [4]:
df = pd.read_csv("../datasets/covid_philippines/covid_philippines_comments.csv").drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same


In [5]:
translated_comments = {}
translated_comments["video_id"] = {}
translated_comments["comment"] = {}
video_id_list = df["video_id"].to_list()
comments_list = df["comment"].to_list()

In [6]:
pbar = tqdm(total=len(video_id_list))
pbar.set_description("Translating...")

for i in range(df.shape[0]):
    if comments_list[i] != None:
        new_comment = comments_list[i]
        try:
            lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
            if lang != 'en':
                new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')
                
        except:
            # No change; get same comment from list
            pass
            
        finally:
            translated_comments["video_id"][i] = video_id_list[i]
            translated_comments["comment"][i] = new_comment
            pbar.update(1)

translated_df = pd.DataFrame.from_dict(translated_comments)
pbar.close()

Translating...: 100%|██████████| 1216/1216 [06:11<00:00,  3.28it/s]


In [7]:
translated_df

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same
...,...,...
1211,5DvMPgoKZmM,This covid will be a never-ending fuckery as l...
1212,5DvMPgoKZmM,A new variant is inevitable.
1213,5DvMPgoKZmM,The man that should resign from his office is ...
1214,5DvMPgoKZmM,"Is that the people who got been vaccinated.,"


---

In [8]:
def translate(video_id_list, comments_list):
    translated_comments = {}
    translated_comments["video_id"] = {}
    translated_comments["comment"] = {}
    
    pbar = tqdm(total=len(video_id_list))
    pbar.set_description("Translating...")

    for i in range(df.shape[0]):
        if comments_list[i] != None:
            new_comment = comments_list[i]
            try:
                lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
                if lang != 'en':
                    new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')

            except:
                # No change; get same comment from list
                pass

            finally:
                translated_comments["video_id"][i] = video_id_list[i]
                translated_comments["comment"][i] = new_comment
                pbar.update(1)

    translated_df = pd.DataFrame.from_dict(translated_comments)
    pbar.close()

## DETECT SPAM

- 0:Not Spam
- 1:Spam

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [10]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection
psy_model_df = pd.read_csv("../datasets/model_train/Youtube01-Psy.csv")
psy_model_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [11]:
x=psy_model_df["CONTENT"]
y=psy_model_df["CLASS"]

cv= CountVectorizer()
x=cv.fit_transform(x)

x_train, x_test, y_train, y_test  = train_test_split(x,y)

In [12]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9923664122137404
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       127
           1       0.99      1.00      0.99       135

    accuracy                           0.99       262
   macro avg       0.99      0.99      0.99       262
weighted avg       0.99      0.99      0.99       262



In [13]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9431818181818182
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        48
           1       0.93      0.95      0.94        40

    accuracy                           0.94        88
   macro avg       0.94      0.94      0.94        88
weighted avg       0.94      0.94      0.94        88



dataset from YouTube01-Psy.csv performed better, lets test with another dataset

In [14]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

lmfao_model_df = pd.read_csv("../datasets/model_train/Youtube03-LMFAO.csv")
lmfao_model_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k,Corey Wilson,2015-05-28T21:39:52.376000,"<a href=""http://www.youtube.com/watch?v=KQ6zr6...",0
1,z124jvczaz3dxhnbc04cffk43oiugj25yzo0k,Epic Gaming,2015-05-28T20:07:20.610000,wierd but funny﻿,0
2,z13tczjy5xj0vjmu5231unho1ofey5zdk,LaS Music,2015-05-28T19:23:35.355000,"Hey guys, I&#39;m a human.<br /><br /><br />Bu...",1
3,z13tzr0hdpnayhqqc04cd3zqqqjkf3ngckk0k,Cheryl Fox,2015-05-28T17:49:35.294000,Party Rock....lol...who wants to shuffle!!!﻿,0
4,z12pcvix4zedcjvyb04ccr1r0mr2g5xwyng0k,PATRICK_TW,2015-05-28T16:28:26.818000,Party rock﻿,0


In [15]:
x_lmfao=lmfao_model_df["CONTENT"]
y_lmfao= lmfao_model_df["CLASS"]

x_lmfao=cv.transform(x_lmfao)

In [16]:
predictions=spam_nb.predict(x_lmfao)
accuracy = accuracy_score(y_lmfao, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_lmfao, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.6027397260273972
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.82      0.66       202
           1       0.73      0.42      0.53       236

    accuracy                           0.60       438
   macro avg       0.64      0.62      0.59       438
weighted avg       0.65      0.60      0.59       438



Test accuracy not good, merge dataframes?

In [17]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

kp_model_df = pd.read_csv("../datasets/model_train/Youtube02-KatyPerry.csv")
shakira_df = pd.read_csv("../datasets/model_train/Youtube05-Shakira.csv")
model_df = pd.concat([psy_model_df,lmfao_model_df,kp_model_df,shakira_df])
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [18]:
x=model_df["CONTENT"]
y=model_df["CLASS"]

cv= CountVectorizer()

x=cv.fit_transform(x)

x_train, x_test, y_train, y_test  = train_test_split(x,y)


In [19]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9734748010610079
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       562
           1       0.97      0.98      0.97       569

    accuracy                           0.97      1131
   macro avg       0.97      0.97      0.97      1131
weighted avg       0.97      0.97      0.97      1131



In [20]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9230769230769231
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       186
           1       0.94      0.91      0.92       191

    accuracy                           0.92       377
   macro avg       0.92      0.92      0.92       377
weighted avg       0.92      0.92      0.92       377



test with another dataset again

In [21]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

em_model_df = pd.read_csv("../datasets/model_train/Youtube04-Eminem.csv")
em_model_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z12rwfnyyrbsefonb232i5ehdxzkjzjs2,Lisa Wellas,,+447935454150 lovely girl talk to me xxx﻿,1
1,z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04,jason graham,2015-05-29T02:26:10.652000,I always end up coming back to this song<br />﻿,0
2,z13vsfqirtavjvu0t22ezrgzyorwxhpf3,Ajkal Khan,,"my sister just received over 6,500 new <a rel=...",1
3,z12wjzc4eprnvja4304cgbbizuved35wxcs,Dakota Taylor,2015-05-29T02:13:07.810000,Cool﻿,0
4,z13xjfr42z3uxdz2223gx5rrzs3dt5hna,Jihad Naser,,Hello I&#39;am from Palastine﻿,1


In [22]:
x=em_model_df["CONTENT"]
y=em_model_df["CLASS"]

x=cv.transform(x)



In [23]:
predictions=spam_nb.predict(x)
accuracy = accuracy_score(y, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.875
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.77      0.85       203
           1       0.83      0.96      0.89       245

    accuracy                           0.88       448
   macro avg       0.89      0.87      0.87       448
weighted avg       0.88      0.88      0.87       448



above results showed better test  from a different dataset

try model on translated_df

In [24]:
transformed = cv.transform(translated_df["comment"])

translated_df["spam"]=spam_nb.predict(transformed)
translated_df.head()

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,Loved the calmness of Manilla.,0
1,aLZ85hb4wjE,After this pandemic I think every country shou...,0
2,aLZ85hb4wjE,manila looks beautifull with less people,0
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,0
4,aLZ85hb4wjE,India also same,0


In [25]:
og_translated_df = translated_df.copy()#in case we want to preserve the original dataframe

In [26]:
translated_df.drop(translated_df[translated_df["spam"]==1].index,inplace = True)


In [28]:
translated_df[translated_df["spam"]==1].count()

video_id    0
comment     0
spam        0
dtype: int64

In [29]:
translated_df[translated_df["spam"]==0].count()

video_id    665
comment     665
spam        665
dtype: int64

In [30]:
og_translated_df[og_translated_df["spam"]==1].count()

video_id    551
comment     551
spam        551
dtype: int64

## SENTIMENT ANALYSIS

TEST RoBERTa

In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from scipy.special import softmax

def roberta_sentiment_scores(list_comments):
    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)

    for comments in list_comments:
        comment_words = []
        comments = comments.replace("\n", " ")
        comments = comments.replace("\xa0", " ")
        comments = comments.replace("?", " ")
        comments = comments.replace(":", " ")
        comments = comments.replace(";", " ")
        comments = comments.replace(";", " ")
        comments = re.sub(r"\s+", ' ', comments) 
        print(comments)
        for word in comments.split(' '):
            if word.startswith('@') and len(word) > 1:
               word = '@user'
        
            elif word.startswith('http'):
                word = "http"
            comment_words.append(word)

        comment_procs = " ".join(comment_words)

        encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
        print(encoded)
        output = model(**encoded)

        scores = output[0][0].detach().numpy()

        scores = softmax(scores)

        for i in range(len(scores)):

            l = labels[i]
            s = scores[i]
            print(l, s)

In [32]:
roberta_sentiment_scores(translated_df["comment"][:50])

  return self.fget.__get__(instance, owner)()


Loved the calmness of Manilla.
{'input_ids': tensor([[    0,   574, 12677,     5,  6327,  1825,     9,  1554,  4699,     4,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Negative 0.0026419058
Neutral 0.031785045
Positive 0.9655731
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
{'input_ids': tensor([[    0,  4993,    42, 23387, 14414,    38,   206,   358,   247,   197,
            33,    10, 23076,   358,   353,     7,   492,   985,  2574,    10,
            86,     7, 14384,     6,    19,    70,     5,  6631,    14,     5,
          6872,    16,  7242,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Negative 0.72357625
Neutral 0.25514364
Positive 0.021280115
manila looks beautifull with less people
{'input_ids': tensor

TEST TEXTBLOB

In [33]:
from textblob import TextBlob

def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [34]:
textblob_sentiment_scores(translated_df["comment"][:50])

positive 0.7
neutral 0.0
negative -0.16666666666666666
positive 0.11250000000000002
neutral 0.0
positive 0.44999999999999996
positive 0.7
negative -0.55
positive 0.475
neutral 0.0
negative -0.6
negative -0.19999999999999998
negative -0.5
negative -0.3729166666666667
positive 0.2
positive 0.48828125
positive 0.425
positive 0.14375
neutral 0.0
negative -0.125
neutral 0.0
neutral 0.0
negative -0.4
negative -0.14666666666666664
positive 0.18611111111111112
neutral 0.0
negative -0.10166666666666668
positive 0.09999999999999998
positive 0.5
negative -0.07857142857142857
positive 0.18194444444444446
positive 0.3
neutral 0.0
positive 0.2508928571428572
positive 0.8463541666666665
positive 0.5
positive 0.48750000000000004
neutral 0.0
positive 0.18000000000000002
negative -0.25
positive 0.02361111111111111
negative -0.15
positive 0.10476190476190476
neutral 0.0
positive 0.5
neutral 0.0
negative -0.5
neutral 0.0
positive 0.2
negative -0.3333333333333333


TEST STANZA

In [35]:
import stanza
def stanza_sentiment_scores(list_comments):
    nlp = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for comment in list_comments:
        doc = nlp(comment.replace("\n", " "))
        print(comment)
    #doc.sentences[0].print_dependencies()
        for i, sentence in enumerate(doc.sentences):
            print("%d -> %d" % (i, sentence.sentiment))

In [36]:
stanza_sentiment_scores(translated_df["comment"][:50])

2024-02-26 14:10:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-26 14:10:13 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

2024-02-26 14:10:13 INFO: Using device: cpu
2024-02-26 14:10:13 INFO: Loading: tokenize
2024-02-26 14:10:13 INFO: Loading: mwt
2024-02-26 14:10:13 INFO: Loading: sentiment
2024-02-26 14:10:14 INFO: Done loading processors!


Loved the calmness of Manilla.
0 -> 2
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
0 -> 0
manila looks beautifull with less people
0 -> 2
The lockdown makes the city look like a place I want to explore.

Before the lockdown, it looked like a crowded mess filled with traffic and pollution.
0 -> 0
India also same
0 -> 1
The president promised that he'll do his best to ease the traffic in the metro like 5mins of travel along EDSA. Fortunately it happened but unfortunately it wasn't in the ideal way. Nature's sense of humor tho.
0 -> 0
I went up the viewing spot at antipolo, its a very beautiful, almost smog free view of the metro skyline
0 -> 2
The sad part is people around the world did not intend to heal earth but afraid of death.
0 -> 1
i hope it stays like this forever its beautiful with no messy people
0 -> 2
2020 is the year when nature fight back and reduce h

TEST VADER

In [37]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def vader_sentiment_scores (list_comments):
    for sentence in list_comments:
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict = sid_obj.polarity_scores(sentence)
        print(sentence)
        print("Overall sentiment dictionary is : ", sentiment_dict)
        print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
        print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
        print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
        print("Sentence Overall Rated As", end = " ")
 
        # decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")

    

In [38]:
vader_sentiment_scores (translated_df["comment"][:50])

Loved the calmness of Manilla.
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'compound': 0.765}
sentence was rated as  0.0 % Negative
sentence was rated as  31.2 % Neutral
sentence was rated as  68.8 % Positive
Sentence Overall Rated As Positive
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as  0.0 % Negative
sentence was rated as  100.0 % Neutral
sentence was rated as  0.0 % Positive
Sentence Overall Rated As Neutral
manila looks beautifull with less people
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as  0.0 % Negative
sentence was rated as  100.0 % Neutral
sentence was rated as  0.0 % Positive
Sentence Overall Rated As Neutral
The lockdown makes the city look like

## Get Sentiment sentiment for each video

#### TEXTBLOB APPROACH

In [39]:
def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [40]:
translated_df["video_id"].unique()

array(['aLZ85hb4wjE', 'sYI97jv-pZg', '3YFpjgIQqEo', 'dIsaz_XlmTw',
       'DWxIvQlpJK8', 'pMUumjHY3tw', 'nTUWK8vufOk', 'cPVE7QGS7As',
       'CEIrzjA8euQ', 'lw16DeB6zns', '8dTelszbObM', 'Mxf8uGFcqSE',
       'MQ5aYS4YFlQ', 'iOE6rAY8l-k', '_a-rQYfsCck', 'jEINZXA_ujo',
       '587N9bJ5J5k', 'E5F3xA_zkFc', '05JLyd58R-w', '2fRQ8OsqOLs',
       'E56W-5xVOss', 'c1oU8U05puY', 'SEHcakm-fAc', '1psSvU1km0I',
       'PP3Yu-ro1tA', 'NQeI1CRCqeo', 'RhDHGgo4yZg', 'sdsz-t540WI',
       '7SKGXkZKjV8', 'oy0wHScCPds', 'o1kPskxFkQ8', 'aoNQUUQno00',
       'liEUC1_l8e8', '-b0EuuMvvy8', 'MmyIvf7bEGc', 'Fd3HcncV6Zk',
       '-n9Ks3VTub4', 'LzymJ2xZhho', '-qFcO_onBdA', 'fp9uRsmTWqg',
       '3ZXR2eARmuQ', '57wz3HuIVLA', 'lbK7UjoLr8o', 'xV1oR-RbOGU',
       'kn6DKdInYXk', '2T3yZ6lNRDg', 'sH_YoV-NA6s', 'ZLT6L3PHz78',
       'lczwrm68u6I', 'zhBdbLj5y6A', 'ibWFsmcnefk', 'e0fN7HxkIUc',
       'Ji47WRv2tQE', 'dj5ov38ihZI', 'j3E3NF9nk44', 'BomdsEJjb0E',
       'T7c6GvrF82k', '6DBFwIlT4fg', '2FgFNBIJTIA', '254GdlKEz

In [41]:
def per_vid_textblob_polarity (df):
    video_polarity={}
    for video in df["video_id"].unique():
        sum_polarity=0
        for comment in df.loc[df["video_id"]==video]["comment"]:
            sum_polarity+=(TextBlob(comment)).sentiment.polarity
        video_polarity[video] = sum_polarity/df.loc[df["video_id"]==video]["comment"].count()
    ''' if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.subjectivity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.subjectivity)
        else:
            print("neutral", testimonial.sentiment.subjectivity)'''
    return video_polarity


In [42]:
per_vid_textblob_polarity(translated_df)

{'aLZ85hb4wjE': 0.1720833333333333,
 'sYI97jv-pZg': -0.6,
 '3YFpjgIQqEo': -0.07692708333333331,
 'dIsaz_XlmTw': 0.08875,
 'DWxIvQlpJK8': -0.09013888888888888,
 'pMUumjHY3tw': -0.05083333333333334,
 'nTUWK8vufOk': 0.16722883597883598,
 'cPVE7QGS7As': 0.37745783730158733,
 'CEIrzjA8euQ': -0.25,
 'lw16DeB6zns': -0.002703373015873013,
 '8dTelszbObM': 0.14872621027346639,
 'Mxf8uGFcqSE': 0.0,
 'MQ5aYS4YFlQ': 0.24710370370370374,
 'iOE6rAY8l-k': 0.06736111111111112,
 '_a-rQYfsCck': 0.10000000000000002,
 'jEINZXA_ujo': 0.33730158730158727,
 '587N9bJ5J5k': 0.15,
 'E5F3xA_zkFc': 0.096230987119876,
 '05JLyd58R-w': 0.0,
 '2fRQ8OsqOLs': -0.08333333333333333,
 'E56W-5xVOss': 0.11555555555555556,
 'c1oU8U05puY': 0.23500000000000001,
 'SEHcakm-fAc': 0.08875000000000001,
 '1psSvU1km0I': 0.1614844209288654,
 'PP3Yu-ro1tA': 0.015138888888888887,
 'NQeI1CRCqeo': 0.0,
 'RhDHGgo4yZg': -0.08333333333333333,
 'sdsz-t540WI': 0.09146825396825395,
 '7SKGXkZKjV8': -0.2558333333333333,
 'oy0wHScCPds': -0.07902777

#### VADER APPROACH

In [43]:
def per_vid_vader_polarity (df):
    polarity_scores={}
    sid_obj = SentimentIntensityAnalyzer()

    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0
        sum_compound= 0
        scores={}
        for sentence in df.loc[df["video_id"]==video]["comment"]:

            sentiment_dict = sid_obj.polarity_scores(sentence)
            sum_neg += sentiment_dict['neg']
            sum_pos += sentiment_dict['pos']
            sum_neu += sentiment_dict['neu']
            sum_compound += sentiment_dict['compound']
            
        scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()
        scores["Overall"] = sum_compound/df.loc[df["video_id"]==video]["comment"].count()
        
        polarity_scores[video]=scores
        '''# decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")'''
    return polarity_scores
        

In [44]:
per_vid_vader_polarity(translated_df)

{'aLZ85hb4wjE': {'Neg': 0.07980000000000001,
  'Pos': 0.194,
  'Neu': 0.7261,
  'Overall': 0.17872000000000002},
 'sYI97jv-pZg': {'Neg': 0.0, 'Pos': 0.112, 'Neu': 0.888, 'Overall': 0.2263},
 '3YFpjgIQqEo': {'Neg': 0.085,
  'Pos': 0.0834,
  'Neu': 0.8316000000000001,
  'Overall': -0.062260000000000024},
 'dIsaz_XlmTw': {'Neg': 0.1342,
  'Pos': 0.1558,
  'Neu': 0.7101999999999999,
  'Overall': 0.030299999999999994},
 'DWxIvQlpJK8': {'Neg': 0.17275000000000001,
  'Pos': 0.04425,
  'Neu': 0.783,
  'Overall': -0.10742499999999999},
 'pMUumjHY3tw': {'Neg': 0.0475,
  'Pos': 0.034,
  'Neu': 0.9185,
  'Overall': -0.15775},
 'nTUWK8vufOk': {'Neg': 0.1105,
  'Pos': 0.1418333333333333,
  'Neu': 0.7478333333333333,
  'Overall': 0.24549999999999997},
 'cPVE7QGS7As': {'Neg': 0.11383333333333334,
  'Pos': 0.24016666666666664,
  'Neu': 0.646,
  'Overall': 0.33454999999999996},
 'CEIrzjA8euQ': {'Neg': 0.196, 'Pos': 0.0, 'Neu': 0.804, 'Overall': -0.296},
 'lw16DeB6zns': {'Neg': 0.062375,
  'Pos': 0.148,


### RoBERTa APPROACH

In [45]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import re
#from scipy.special import softmax

def roberta_per_vid_scores(df):
    polarity_scores={}

    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)
    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0 
        scores={}
        for comments in df.loc[df["video_id"]==video]["comment"]:
            comment_words = []
            comments = comments.replace("\n", " ")
            comments = comments.replace("\xa0", " ")
            comments = comments.replace("?", " ")
            comments = comments.replace(":", " ")
            comments = comments.replace(";", " ")
            comments = comments.replace(";", " ")
            comments = re.sub(r"\s+", ' ', comments) 
     #   print(comments)
            for word in comments.split(' '):
                if word.startswith('@') and len(word) > 1:
                    word = '@user'
        
                elif word.startswith('http'):
                    word = "http"
                comment_words.append(word)

            comment_procs = " ".join(comment_words)

            encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
           # print(encoded)
            output = model(**encoded)

            scores = output[0][0].detach().numpy()

            scores = softmax(scores)

            for i in range(len(scores)):

                l = labels[i]
                s = scores[i]
                #    labels = ['Negative', 'Neutral', 'Positive']

                if l=="Negative":
                    sum_neg+= s
                elif l == "Neutral":
                    sum_neu+=s
                else : 
                    sum_pos+=s
       # scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores =dict([("Neg", sum_neg/df.loc[df["video_id"]==video]["comment"].count()),("Pos",sum_pos/df.loc[df["video_id"]==video]["comment"].count()),("Neu",sum_neu/df.loc[df["video_id"]==video]["comment"].count()
)])
        #scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        #scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()

        polarity_scores[video] = scores
    return polarity_scores

In [46]:
roberta_per_vid_scores(translated_df)

{'aLZ85hb4wjE': {'Neg': 0.24424575177254154,
  'Pos': 0.4741820393828675,
  'Neu': 0.28157221488654616},
 'sYI97jv-pZg': {'Neg': 0.5715848803520203,
  'Pos': 0.045553598552942276,
  'Neu': 0.38286152482032776},
 '3YFpjgIQqEo': {'Neg': 0.5375544439069927,
  'Pos': 0.2158916786313057,
  'Neu': 0.2465538427233696},
 'dIsaz_XlmTw': {'Neg': 0.2898032539524138,
  'Pos': 0.2768005203455687,
  'Neu': 0.43339620530605316},
 'DWxIvQlpJK8': {'Neg': 0.41515710740350187,
  'Pos': 0.24553405412007123,
  'Neu': 0.33930883556604385},
 'pMUumjHY3tw': {'Neg': 0.28391369991004467,
  'Pos': 0.05910225957632065,
  'Neu': 0.6569840013980865},
 'nTUWK8vufOk': {'Neg': 0.5233465706308683,
  'Pos': 0.2215364076740419,
  'Neu': 0.2551170115669568},
 'cPVE7QGS7As': {'Neg': 0.20489353542992225,
  'Pos': 0.47540556530778605,
  'Neu': 0.3197009079158306},
 'CEIrzjA8euQ': {'Neg': 0.15780450403690338,
  'Pos': 0.1664552390575409,
  'Neu': 0.6757402420043945},
 'lw16DeB6zns': {'Neg': 0.2628073657397181,
  'Pos': 0.2353