In [1]:
import translators as ts
from langdetect import detect

In [2]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm

In [3]:
df = pd.read_csv("../datasets/covid_philippines/covid_philippines_comments.csv").drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same


In [4]:
translated_comments = {}
translated_comments["video_id"] = {}
translated_comments["comment"] = {}
video_id_list = df["video_id"].to_list()
comments_list = df["comment"].to_list()

In [5]:
pbar = tqdm(total=len(video_id_list))
pbar.set_description("Translating...")

for i in range(df.shape[0]):
    if comments_list[i] != None:
        new_comment = comments_list[i]
        try:
            lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
            if lang != 'en':
                new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')
                
        except:
            # No change; get same comment from list
            pass
            
        finally:
            translated_comments["video_id"][i] = video_id_list[i]
            translated_comments["comment"][i] = new_comment
            pbar.update(1)

translated_df = pd.DataFrame.from_dict(translated_comments)
pbar.close()

Translating...:   0%|          | 0/1216 [00:00<?, ?it/s]

Translating...: 100%|██████████| 1216/1216 [07:06<00:00,  2.85it/s]


In [6]:
translated_df

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same
...,...,...
1211,5DvMPgoKZmM,This covid will be a never-ending fuckery as l...
1212,5DvMPgoKZmM,A new variant is inevitable.
1213,5DvMPgoKZmM,The man that should resign from his office is ...
1214,5DvMPgoKZmM,"Is that the people who got been vaccinated.,"


---

In [7]:
def translate(video_id_list, comments_list):
    translated_comments = {}
    translated_comments["video_id"] = {}
    translated_comments["comment"] = {}
    
    pbar = tqdm(total=len(video_id_list))
    pbar.set_description("Translating...")

    for i in range(df.shape[0]):
        if comments_list[i] != None:
            new_comment = comments_list[i]
            try:
                lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
                if lang != 'en':
                    new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')

            except:
                # No change; get same comment from list
                pass

            finally:
                translated_comments["video_id"][i] = video_id_list[i]
                translated_comments["comment"][i] = new_comment
                pbar.update(1)

    translated_df = pd.DataFrame.from_dict(translated_comments)
    pbar.close()

## DETECT SPAM

- 0:Not Spam
- 1:Spam

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [63]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

psy_model_df = pd.read_csv("../datasets/model_train/Youtube01-Psy.csv")
lmfao_model_df = pd.read_csv("../datasets/model_train/Youtube03-LMFAO.csv")
kp_model_df = pd.read_csv("../datasets/model_train/Youtube02-KatyPerry.csv")
shakira_df = pd.read_csv("../datasets/model_train/Youtube05-Shakira.csv")
model_df = pd.concat([psy_model_df,lmfao_model_df,kp_model_df,shakira_df])
model_df.reset_index(inplace=True, drop=True) 
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [64]:
model_df.drop_duplicates(subset="CONTENT")

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1
...,...,...,...,...,...
1502,_2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4,Dinova Sharon,2013-07-13T14:44:00.700000,well done shakira,0
1503,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0
1504,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
1506,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0


In [65]:
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [66]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Melanie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
model_df["CONTENT"]

0       Huh, anyway check out this you[tube] channel: ...
1       Hey guys check out my new channel and our firs...
2                  just for test I have to say murdev.com
3        me shaking my sexy ass on my channel enjoy ^_^ ﻿
4                 watch?v=vtaRGgvGtWQ   Check this out .﻿
                              ...                        
1503    I love this song because we sing it at Camp al...
1504    I love this song for two reasons: 1.it is abou...
1505                                                  wow
1506                              Shakira u are so wiredo
1507                           Shakira is the best dancer
Name: CONTENT, Length: 1508, dtype: object

In [72]:
model_df['CONTENT'] = model_df['CONTENT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in model_df["CONTENT"]: 
    model_df['CONTENT'] = model_df["CONTENT"].apply(ps.stem) 
    model_df['CONTENT'] = model_df["CONTENT"].str.lower()


In [73]:
model_df["CONTENT"]

0         huh, anyway check you[tube] channel: kobyoshi02
1       hey guys check new channel first vid us monkey...
2                                     test say murdev.com
3                    shaking sexy ass channel enjoy ^_^ ﻿
4                            watch?v=vtarggvgtwq check .﻿
                              ...                        
1503                           love song sing camp time!!
1504    love song two reasons: 1.it africa 2.i born be...
1505                                                  wow
1506                                     shakira u wiredo
1507                                    shakira best danc
Name: CONTENT, Length: 1508, dtype: object

In [86]:
x=model_df["CONTENT"]
y=model_df["CLASS"]

cv= CountVectorizer()

x=cv.fit_transform(x)

x_train, x_test, y_train, y_test  = train_test_split(x,y,random_state=42)


In [87]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9699381078691424
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       544
           1       0.97      0.97      0.97       587

    accuracy                           0.97      1131
   macro avg       0.97      0.97      0.97      1131
weighted avg       0.97      0.97      0.97      1131



In [88]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9230769230769231
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       204
           1       0.93      0.90      0.91       173

    accuracy                           0.92       377
   macro avg       0.92      0.92      0.92       377
weighted avg       0.92      0.92      0.92       377



test with another dataset again

In [89]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

em_model_df = pd.read_csv("../datasets/model_train/Youtube04-Eminem.csv")
em_model_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z12rwfnyyrbsefonb232i5ehdxzkjzjs2,Lisa Wellas,,+447935454150 lovely girl talk to me xxx﻿,1
1,z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04,jason graham,2015-05-29T02:26:10.652000,I always end up coming back to this song<br />﻿,0
2,z13vsfqirtavjvu0t22ezrgzyorwxhpf3,Ajkal Khan,,"my sister just received over 6,500 new <a rel=...",1
3,z12wjzc4eprnvja4304cgbbizuved35wxcs,Dakota Taylor,2015-05-29T02:13:07.810000,Cool﻿,0
4,z13xjfr42z3uxdz2223gx5rrzs3dt5hna,Jihad Naser,,Hello I&#39;am from Palastine﻿,1


In [90]:
x=em_model_df["CONTENT"]
y=em_model_df["CLASS"]

x=cv.transform(x)

In [91]:
predictions=spam_nb.predict(x)
accuracy = accuracy_score(y, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8325892857142857
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.69      0.79       203
           1       0.79      0.95      0.86       245

    accuracy                           0.83       448
   macro avg       0.85      0.82      0.83       448
weighted avg       0.85      0.83      0.83       448



try model on translated_df

In [92]:
og_translated_df = translated_df.copy()#in case we want to preserve the original dataframe

In [96]:
translated_df["comment"] = translated_df["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in translated_df["comment"]: 
    translated_df["comment"] = translated_df["comment"].apply(ps.stem) 
    translated_df["comment"] = translated_df["comment"].str.lower()

In [97]:
transformed = cv.transform(translated_df["comment"])

translated_df["spam"]=spam_nb.predict(transformed)
translated_df.head()

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,loved calmness manilla.,0
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,0
2,aLZ85hb4wjE,manila looks beautifull less peopl,0
3,aLZ85hb4wjE,the lockdown makes city look like place i want...,0
4,aLZ85hb4wjE,india also,1


In [98]:
translated_df[translated_df["spam"]==1].count()

video_id    563
comment     563
spam        563
dtype: int64

In [99]:
translated_df[translated_df["spam"]==0].count()

video_id    653
comment     653
spam        653
dtype: int64

seems like there are too many tagged as spam

In [100]:
translated_df[translated_df["spam"]==1]

Unnamed: 0,video_id,comment,spam
4,aLZ85hb4wjE,india also,1
5,aLZ85hb4wjE,the president promised he'll best ease traffic...,1
9,aLZ85hb4wjE,2020 year nature fight back reduce human emiss...,1
11,sYI97jv-pZg,ala n covid tngina,1
12,3YFpjgIQqEo,this cold make worse make money,1
...,...,...,...
1204,Wjj__vIdew0,nest,1
1210,5DvMPgoKZmM,"please, option get vaccinated, it. it still po...",1
1212,5DvMPgoKZmM,a new variant inevitable.,1
1213,5DvMPgoKZmM,the man resign office nonetheless present head...,1


In [101]:
translated_df[translated_df["spam"]==0]

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,loved calmness manilla.,0
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,0
2,aLZ85hb4wjE,manila looks beautifull less peopl,0
3,aLZ85hb4wjE,the lockdown makes city look like place i want...,0
6,aLZ85hb4wjE,"i went viewing spot antipolo, beautiful, almos...",0
...,...,...,...
1207,5DvMPgoKZmM,"it's almost 2 years this, many people honestly...",0
1208,5DvMPgoKZmM,"""cases"" mean absolutely nothing whatsoever, re...",0
1209,5DvMPgoKZmM,"honestly, i hope things least get slightly bet...",0
1211,5DvMPgoKZmM,this covid never-ending fuckery long media kee...,0


In [39]:
translated_df[translated_df["spam"]==1].to_csv("check_spam.csv")

In [40]:
translated_df[translated_df["spam"]==0].to_csv("check_not_spam.csv")

Try different dataset from a different source

In [102]:
#source: https://www.kaggle.com/datasets/madhuragl/5000-youtube-spamnot-spam-dataset/data
comments_5k_df = pd.read_csv("../datasets/model_train/5000 YT comments.csv",encoding='cp1252')
comments_5k_df.head()

Unnamed: 0,Name,Comment,Time,Likes,Reply Count,Spam
0,Taofeekat,&lt;????i make my first million investing in f...,2022-09-28T02:08:55Z,30,30,1
1,Angelina Jordan,&lt;?l will forever be indebted to you I will ...,2022-09-23T05:26:48Z,0,0,1
2,Fernandez Joe,<b>????I recommend a professional forex/Bitcoi...,2022-09-20T12:56:30Z,5,2,1
3,Jessica Billy,I think I’m blessed because if not I wouldn’t ...,2022-09-17T20:20:24Z,21,34,1
4,Allison Zar,<b>I recommend a professional broker to you g...,2022-09-05T09:19:30Z,19,27,1


In [103]:
comments_5k_df.count()

Name           5000
Comment        5000
Time           5000
Likes          5000
Reply Count    5000
Spam           5000
dtype: int64

In [104]:
comments_5k_df.drop_duplicates(subset="Comment")

Unnamed: 0,Name,Comment,Time,Likes,Reply Count,Spam
0,Taofeekat,&lt;????i make my first million investing in f...,2022-09-28T02:08:55Z,30,30,1
1,Angelina Jordan,&lt;?l will forever be indebted to you I will ...,2022-09-23T05:26:48Z,0,0,1
2,Fernandez Joe,<b>????I recommend a professional forex/Bitcoi...,2022-09-20T12:56:30Z,5,2,1
3,Jessica Billy,I think I’m blessed because if not I wouldn’t ...,2022-09-17T20:20:24Z,21,34,1
4,Allison Zar,<b>I recommend a professional broker to you g...,2022-09-05T09:19:30Z,19,27,1
...,...,...,...,...,...,...
4995,Anjan Das,She is so beautiful!,2020-06-05T04:18:26Z,5,0,0
4996,Humza Navaid,3 seconds in and I want to marry her. I am goi...,2020-06-04T21:03:14Z,0,0,0
4997,Aadil Ranesh,She talks a lot like Tanmay Bakshi,2020-06-03T17:29:04Z,0,0,0
4998,Fuzail Ahmad,Why does her face look like a bad deepfake?,2020-06-03T11:17:48Z,1,0,0


In [105]:
comments_5k_df.count()

Name           5000
Comment        5000
Time           5000
Likes          5000
Reply Count    5000
Spam           5000
dtype: int64

In [106]:
comments_5k_df['Comment'] = comments_5k_df['Comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

ps = PorterStemmer() 
for w in comments_5k_df['Comment']: 
    comments_5k_df['Comment'] = comments_5k_df['Comment'].apply(ps.stem) 
    comments_5k_df['Comment'] = comments_5k_df['Comment'].str.lower()

In [107]:
x=comments_5k_df["Comment"]
y=comments_5k_df["Spam"]

cv= CountVectorizer()

x=cv.fit_transform(x)

x_train, x_test, y_train, y_test  = train_test_split(x,y,random_state=42)

In [108]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9301333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1876
           1       0.92      0.94      0.93      1874

    accuracy                           0.93      3750
   macro avg       0.93      0.93      0.93      3750
weighted avg       0.93      0.93      0.93      3750



In [109]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8768
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       624
           1       0.86      0.90      0.88       626

    accuracy                           0.88      1250
   macro avg       0.88      0.88      0.88      1250
weighted avg       0.88      0.88      0.88      1250



In [110]:
translated_df = og_translated_df.copy()#in case we want to preserve the original dataframe

In [111]:
translated_df["comment"] = translated_df["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in translated_df["comment"]: 
    translated_df["comment"] = translated_df["comment"].apply(ps.stem) 
    translated_df["comment"] = translated_df["comment"].str.lower()

In [112]:
transformed = cv.transform(translated_df["comment"])

translated_df["spam"]=spam_nb.predict(transformed)
translated_df.head()

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,loved calmness manilla.,0
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,1
2,aLZ85hb4wjE,manila looks beautifull less peopl,0
3,aLZ85hb4wjE,the lockdown makes city look like place i want...,0
4,aLZ85hb4wjE,india also,1


In [113]:
translated_df[translated_df["spam"]==1].count()

video_id    670
comment     670
spam        670
dtype: int64

In [114]:
translated_df[translated_df["spam"]==0].count()

video_id    546
comment     546
spam        546
dtype: int64

In [115]:
translated_df[translated_df["spam"]==1]

Unnamed: 0,video_id,comment,spam
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,1
4,aLZ85hb4wjE,india also,1
6,aLZ85hb4wjE,"i went viewing spot antipolo, beautiful, almos...",1
10,sYI97jv-pZg,cough cold season weather cold .... covid amg ...,1
11,sYI97jv-pZg,ala n covid tngina,1
...,...,...,...
1210,5DvMPgoKZmM,"please, option get vaccinated, it. it still po...",1
1211,5DvMPgoKZmM,this covid never-ending fuckery long media kee...,1
1212,5DvMPgoKZmM,a new variant inevitable.,1
1214,5DvMPgoKZmM,"is people got vaccinated.,",1


## SENTIMENT ANALYSIS

TEST RoBERTa

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from scipy.special import softmax

def roberta_sentiment_scores(list_comments):
    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)

    for comments in list_comments:
        comment_words = []
        comments = comments.replace("\n", " ")
        comments = comments.replace("\xa0", " ")
        comments = comments.replace("?", " ")
        comments = comments.replace(":", " ")
        comments = comments.replace(";", " ")
        comments = comments.replace(";", " ")
        comments = re.sub(r"\s+", ' ', comments) 
        print(comments)
        for word in comments.split(' '):
            if word.startswith('@') and len(word) > 1:
               word = '@user'
        
            elif word.startswith('http'):
                word = "http"
            comment_words.append(word)

        comment_procs = " ".join(comment_words)

        encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
        print(encoded)
        output = model(**encoded)

        scores = output[0][0].detach().numpy()

        scores = softmax(scores)

        for i in range(len(scores)):

            l = labels[i]
            s = scores[i]
            print(l, s)

In [None]:
roberta_sentiment_scores(translated_df["comment"][:50])

TEST TEXTBLOB

In [None]:
from textblob import TextBlob

def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [None]:
textblob_sentiment_scores(translated_df["comment"][:50])

TEST STANZA

In [None]:
import stanza
def stanza_sentiment_scores(list_comments):
    nlp = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for comment in list_comments:
        doc = nlp(comment.replace("\n", " "))
        print(comment)
    #doc.sentences[0].print_dependencies()
        for i, sentence in enumerate(doc.sentences):
            print("%d -> %d" % (i, sentence.sentiment))

In [None]:
stanza_sentiment_scores(translated_df["comment"][:50])

TEST VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def vader_sentiment_scores (list_comments):
    for sentence in list_comments:
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict = sid_obj.polarity_scores(sentence)
        print(sentence)
        print("Overall sentiment dictionary is : ", sentiment_dict)
        print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
        print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
        print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
        print("Sentence Overall Rated As", end = " ")
 
        # decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")

    

In [None]:
vader_sentiment_scores (translated_df["comment"][:50])

## Get Sentiment sentiment for each video

#### TEXTBLOB APPROACH

In [None]:
def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [None]:
translated_df["video_id"].unique()

In [None]:
def per_vid_textblob_polarity (df):
    video_polarity={}
    for video in df["video_id"].unique():
        sum_polarity=0
        for comment in df.loc[df["video_id"]==video]["comment"]:
            sum_polarity+=(TextBlob(comment)).sentiment.polarity
        video_polarity[video] = sum_polarity/df.loc[df["video_id"]==video]["comment"].count()
    ''' if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.subjectivity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.subjectivity)
        else:
            print("neutral", testimonial.sentiment.subjectivity)'''
    return video_polarity


In [None]:
per_vid_textblob_polarity(translated_df)

#### VADER APPROACH

In [None]:
def per_vid_vader_polarity (df):
    polarity_scores={}
    sid_obj = SentimentIntensityAnalyzer()

    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0
        sum_compound= 0
        scores={}
        for sentence in df.loc[df["video_id"]==video]["comment"]:

            sentiment_dict = sid_obj.polarity_scores(sentence)
            sum_neg += sentiment_dict['neg']
            sum_pos += sentiment_dict['pos']
            sum_neu += sentiment_dict['neu']
            sum_compound += sentiment_dict['compound']
            
        scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()
        scores["Overall"] = sum_compound/df.loc[df["video_id"]==video]["comment"].count()
        
        polarity_scores[video]=scores
        '''# decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")'''
    return polarity_scores
        

In [None]:
per_vid_vader_polarity(translated_df)

### RoBERTa APPROACH

In [None]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import re
#from scipy.special import softmax

def roberta_per_vid_scores(df):
    polarity_scores={}

    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)
    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0 
        scores={}
        for comments in df.loc[df["video_id"]==video]["comment"]:
            comment_words = []
            comments = comments.replace("\n", " ")
            comments = comments.replace("\xa0", " ")
            comments = comments.replace("?", " ")
            comments = comments.replace(":", " ")
            comments = comments.replace(";", " ")
            comments = comments.replace(";", " ")
            comments = re.sub(r"\s+", ' ', comments) 
     #   print(comments)
            for word in comments.split(' '):
                if word.startswith('@') and len(word) > 1:
                    word = '@user'
        
                elif word.startswith('http'):
                    word = "http"
                comment_words.append(word)

            comment_procs = " ".join(comment_words)

            encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
           # print(encoded)
            output = model(**encoded)

            scores = output[0][0].detach().numpy()

            scores = softmax(scores)

            for i in range(len(scores)):

                l = labels[i]
                s = scores[i]
                #    labels = ['Negative', 'Neutral', 'Positive']

                if l=="Negative":
                    sum_neg+= s
                elif l == "Neutral":
                    sum_neu+=s
                else : 
                    sum_pos+=s
       # scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores =dict([("Neg", sum_neg/df.loc[df["video_id"]==video]["comment"].count()),("Pos",sum_pos/df.loc[df["video_id"]==video]["comment"].count()),("Neu",sum_neu/df.loc[df["video_id"]==video]["comment"].count()
)])
        #scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        #scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()

        polarity_scores[video] = scores
    return polarity_scores

In [None]:
roberta_per_vid_scores(translated_df)