In [1]:
import translators as ts
from langdetect import detect

In [2]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm

In [3]:
df = pd.read_csv("../datasets/covid_philippines/covid_philippines_comments.csv").drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same


## TRANSLATE

In [4]:
translated_comments = {}
translated_comments["video_id"] = {}
translated_comments["comment"] = {}
video_id_list = df["video_id"].to_list()
comments_list = df["comment"].to_list()

In [5]:
pbar = tqdm(total=len(video_id_list))
pbar.set_description("Translating...")

for i in range(df.shape[0]):
    if comments_list[i] != None:
        new_comment = comments_list[i]
        try:
            lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
            if lang != 'en':
                new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')
                
        except:
            # No change; get same comment from list
            pass
            
        finally:
            translated_comments["video_id"][i] = video_id_list[i]
            translated_comments["comment"][i] = new_comment
            pbar.update(1)

translated_df = pd.DataFrame.from_dict(translated_comments)
pbar.close()

Translating...: 100%|██████████| 1216/1216 [02:59<00:00,  6.77it/s]


In [6]:
translated_df

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same
...,...,...
1211,5DvMPgoKZmM,This covid will be a never-ending fuckery as l...
1212,5DvMPgoKZmM,A new variant is inevitable.
1213,5DvMPgoKZmM,The man that should resign from his office is ...
1214,5DvMPgoKZmM,"Is that the people who got been vaccinated.,"


---

In [7]:
def translate(video_id_list, comments_list):
    translated_comments = {}
    translated_comments["video_id"] = {}
    translated_comments["comment"] = {}
    
    pbar = tqdm(total=len(video_id_list))
    pbar.set_description("Translating...")

    for i in range(df.shape[0]):
        if comments_list[i] != None:
            new_comment = comments_list[i]
            try:
                lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
                if lang != 'en':
                    new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')

            except:
                # No change; get same comment from list
                pass

            finally:
                translated_comments["video_id"][i] = video_id_list[i]
                translated_comments["comment"][i] = new_comment
                pbar.update(1)

    translated_df = pd.DataFrame.from_dict(translated_comments)
    pbar.close()

## DETECT SPAM

### MultinomialNB

- 0:Not Spam
- 1:Spam

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [9]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

psy_model_df = pd.read_csv("../datasets/model_train/Youtube01-Psy.csv")
lmfao_model_df = pd.read_csv("../datasets/model_train/Youtube03-LMFAO.csv")
kp_model_df = pd.read_csv("../datasets/model_train/Youtube02-KatyPerry.csv")
shakira_df = pd.read_csv("../datasets/model_train/Youtube05-Shakira.csv")
model_df = pd.concat([psy_model_df,lmfao_model_df,kp_model_df,shakira_df])
model_df.reset_index(inplace=True, drop=True) 
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [10]:
model_df.drop_duplicates(subset="CONTENT")

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1
...,...,...,...,...,...
1502,_2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4,Dinova Sharon,2013-07-13T14:44:00.700000,well done shakira,0
1503,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0
1504,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
1506,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0


In [11]:
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Melanie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


preprocessing

In [13]:
model_df['CONTENT'] = model_df['CONTENT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in model_df["CONTENT"]: 
    #convert to lowercase
    model_df['CONTENT'] = model_df["CONTENT"].str.lower()
    #Stem
    model_df['CONTENT'] = model_df["CONTENT"].apply(ps.stem) 
    


In [28]:
x = model_df["CONTENT"]
y  =model_df["CLASS"]

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)

x_train, x_test, y_train, y_test  = train_test_split(x,y,random_state=42)



In [15]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9717064544650752
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       544
           1       0.97      0.97      0.97       587

    accuracy                           0.97      1131
   macro avg       0.97      0.97      0.97      1131
weighted avg       0.97      0.97      0.97      1131



In [16]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9177718832891246
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       204
           1       0.92      0.90      0.91       173

    accuracy                           0.92       377
   macro avg       0.92      0.92      0.92       377
weighted avg       0.92      0.92      0.92       377



test with another dataset again

In [17]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

em_model_df = pd.read_csv("../datasets/model_train/Youtube04-Eminem.csv")
em_model_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z12rwfnyyrbsefonb232i5ehdxzkjzjs2,Lisa Wellas,,+447935454150 lovely girl talk to me xxx﻿,1
1,z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04,jason graham,2015-05-29T02:26:10.652000,I always end up coming back to this song<br />﻿,0
2,z13vsfqirtavjvu0t22ezrgzyorwxhpf3,Ajkal Khan,,"my sister just received over 6,500 new <a rel=...",1
3,z12wjzc4eprnvja4304cgbbizuved35wxcs,Dakota Taylor,2015-05-29T02:13:07.810000,Cool﻿,0
4,z13xjfr42z3uxdz2223gx5rrzs3dt5hna,Jihad Naser,,Hello I&#39;am from Palastine﻿,1


In [18]:
em_model_df['CONTENT'] = em_model_df['CONTENT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in em_model_df["CONTENT"]: 
    #convert to lowercase
    em_model_df['CONTENT'] = em_model_df["CONTENT"].str.lower()
    #Stem
    em_model_df['CONTENT'] = em_model_df["CONTENT"].apply(ps.stem) 

In [19]:
em_x=em_model_df["CONTENT"]
em_y=em_model_df["CLASS"]

em_x=vectorizer.transform(em_x)

In [20]:
predictions=spam_nb.predict(em_x)
accuracy = accuracy_score(em_y, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(em_y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8526785714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82       203
           1       0.82      0.94      0.87       245

    accuracy                           0.85       448
   macro avg       0.86      0.84      0.85       448
weighted avg       0.86      0.85      0.85       448



In [None]:
#Hypertuning MultinomialNB

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [111]:
hyperparameters={
    'alpha': [0.01, 0.1, 0.5, 1.0, 10.0,2,0.2 ],
    'fit_prior': [True, False],
    'class_prior': [None, [0.1,]* len(model_df["CLASS"]), ]
         
}

In [112]:
#multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=hyperparameters, n_jobs=-1, cv=5, verbose=5).fi t(x_train,y_train)
#https://coderzcolumn.com/tutorials/machine-learning/scikit-learn-sklearn-naive-bayes#6
multinomial_nb_random = RandomizedSearchCV(MultinomialNB(),param_distributions=hyperparameters,n_iter=500,cv=10,random_state=42).fit(x_train,y_train)




140 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
140 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\naive_bayes.py", line 775, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\naive_bayes.py", line 590, in _update_class_log_pr

In [113]:
multinomial_nb_random.best_params_

{'fit_prior': False, 'class_prior': None, 'alpha': 1.0}

In [114]:
predictions = multinomial_nb_random.best_estimator_.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9161147902869757
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       244
           1       0.92      0.90      0.91       209

    accuracy                           0.92       453
   macro avg       0.92      0.91      0.92       453
weighted avg       0.92      0.92      0.92       453



In [115]:
predictions = multinomial_nb_random.best_estimator_.predict(em_x)
accuracy = accuracy_score(em_y, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(em_y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8928571428571429
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88       203
           1       0.87      0.94      0.91       245

    accuracy                           0.89       448
   macro avg       0.90      0.89      0.89       448
weighted avg       0.89      0.89      0.89       448



### SVM 

In [55]:
x=model_df["CONTENT"]
y=model_df["CLASS"]

vectorizer = CountVectorizer()
x=vectorizer.fit_transform(x)

x_train, x_test, y_train, y_test  = train_test_split(x,y,test_size=0.30,random_state=42)


In [56]:
#from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svm_model = SVC(kernel = 'sigmoid', gamma = 1.0)
svm_model.fit(x_train, y_train)

predictions = svm_model.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.8265402843601896
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       504
           1       0.84      0.83      0.83       551

    accuracy                           0.83      1055
   macro avg       0.83      0.83      0.83      1055
weighted avg       0.83      0.83      0.83      1055



In [57]:
predictions = svm_model.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8432671081677704
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       244
           1       0.81      0.86      0.83       209

    accuracy                           0.84       453
   macro avg       0.84      0.84      0.84       453
weighted avg       0.84      0.84      0.84       453



In [58]:
em_x=em_model_df["CONTENT"]
em_y=em_model_df["CLASS"]

em_x=vectorizer.transform(em_x)

predictions = svm_model.predict(em_x)
accuracy = accuracy_score(em_y, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(em_y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8058035714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       203
           1       0.83      0.80      0.82       245

    accuracy                           0.81       448
   macro avg       0.80      0.81      0.80       448
weighted avg       0.81      0.81      0.81       448



In [99]:
svc = SVC(max_iter=1000)

In [123]:
hyperparameters= [
    {
        "C":[0.0001, 0.001, 0.01 , 0.1, 1.0, 5, 30, 50],
        "kernel": ["linear","poly","rbf","sigmoid"],
        "degree" :[1, 3, 5, 10, 25, 50,100],
        "gamma" :["scale", "auto",1000, 10, 5, 2.5, 1.5, 1.0]
    }
]

In [124]:
rssvc = RandomizedSearchCV(estimator = svc, param_distributions = hyperparameters, n_iter =200, cv=10, random_state=42).fit(x_train,y_train)


111 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
111 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\svm\_base.py", line 268, in fit
    raise ValueError(
ValueError: The dual coefficients or intercepts are not finite. The input data may contain large values and need to be preprocessed.

 0.55545373 0.9

In [126]:
rssvc.best_estimator_

In [127]:
predictions=rssvc.best_estimator_.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9580573951434879
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       244
           1       0.97      0.94      0.95       209

    accuracy                           0.96       453
   macro avg       0.96      0.96      0.96       453
weighted avg       0.96      0.96      0.96       453



In [128]:
predictions=rssvc.best_estimator_.predict(em_x)
accuracy = accuracy_score(em_y, predictions)
print(f"Test Accuracy (EMINEM DF): {accuracy}")

class_report = classification_report(em_y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy (EMINEM DF): 0.96875
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       203
           1       1.00      0.95      0.97       245

    accuracy                           0.97       448
   macro avg       0.97      0.97      0.97       448
weighted avg       0.97      0.97      0.97       448



better performance with eminem dataset using SVM

In [129]:
spam_svc=rssvc.best_estimator_
spam_nb=multinomial_nb_random.best_estimator_

### Check Models Wih Translated_df

try models on translated_df

#### MultinomialNB

In [117]:
nb_spam_filtered = translated_df.copy()
#svm_spam_filtered = translated_df.copy()

In [118]:
nb_spam_filtered["comment_cleaned"] = nb_spam_filtered["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in nb_spam_filtered["comment"]: 
    nb_spam_filtered["comment_cleaned"] = nb_spam_filtered["comment"].apply(ps.stem) 
    nb_spam_filtered["comment_cleaned"] = nb_spam_filtered["comment"].str.lower()

In [119]:
svm_spam_filtered = nb_spam_filtered.copy() #so no need to go through same cleaning/preprocessing

In [133]:
transformed = vectorizer.transform(nb_spam_filtered["comment_cleaned"])

nb_spam_filtered["spam"]=spam_nb.predict(transformed)
nb_spam_filtered.head()

Unnamed: 0,video_id,comment,comment_cleaned,spam
0,aLZ85hb4wjE,Loved the calmness of Manilla.,loved the calmness of manilla.,0
1,aLZ85hb4wjE,After this pandemic I think every country shou...,after this pandemic i think every country shou...,0
2,aLZ85hb4wjE,manila looks beautifull with less people,manila looks beautifull with less people,0
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,the lockdown makes the city look like a place ...,0
4,aLZ85hb4wjE,India also same,india also same,1


In [134]:
nb_spam_filtered[nb_spam_filtered["spam"]==1].count()

video_id           470
comment            470
comment_cleaned    470
spam               470
dtype: int64

In [135]:
nb_spam_filtered[nb_spam_filtered["spam"]==0].count()

video_id           746
comment            746
comment_cleaned    746
spam               746
dtype: int64

#### SVM CHECK

In [130]:
transformed = vectorizer.transform(svm_spam_filtered["comment_cleaned"])

svm_spam_filtered["spam"]=spam_svc.predict(transformed)
svm_spam_filtered.head()

Unnamed: 0,video_id,comment,comment_cleaned,spam
0,aLZ85hb4wjE,Loved the calmness of Manilla.,loved the calmness of manilla.,0
1,aLZ85hb4wjE,After this pandemic I think every country shou...,after this pandemic i think every country shou...,0
2,aLZ85hb4wjE,manila looks beautifull with less people,manila looks beautifull with less people,0
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,the lockdown makes the city look like a place ...,0
4,aLZ85hb4wjE,India also same,india also same,0


In [131]:
svm_spam_filtered[svm_spam_filtered["spam"]==1].count()

video_id           155
comment            155
comment_cleaned    155
spam               155
dtype: int64

In [132]:
svm_spam_filtered[svm_spam_filtered["spam"]==0].count()

video_id           1061
comment            1061
comment_cleaned    1061
spam               1061
dtype: int64

In [136]:
svm_spam_filtered[svm_spam_filtered["spam"]==1]

Unnamed: 0,video_id,comment,comment_cleaned,spam
12,3YFpjgIQqEo,This is how cold it is to just make it worse f...,this is how cold it is to just make it worse f...,1
24,dIsaz_XlmTw,To those who say 18 cases are only detected. T...,to those who say 18 cases are only detected. t...,1
28,dIsaz_XlmTw,Are we back when you hit tooth ache just covid...,are we back when you hit tooth ache just covid...,1
49,nTUWK8vufOk,I'm here out of curiosity if they include how ...,i'm here out of curiosity if they include how ...,1
58,cPVE7QGS7As,Deaths have been low compared to other countri...,deaths have been low compared to other countri...,1
...,...,...,...,...
1170,sEjN7muHOLc,Revelation 13: 15-17 \r\n Are there scientific...,revelation 13: 15-17 \r\n are there scientific...,1
1187,Y-xQtgvNuvA,"In the Philippines, mandatory faceshields and ...","in the philippines, mandatory faceshields and ...",1
1191,W7n2FoRinVk,Find out in your LGU first if you can. It's a ...,find out in your lgu first if you can. it's a ...,1
1207,5DvMPgoKZmM,"It's been almost 2 years of this, and how many...","it's been almost 2 years of this, and how many...",1


In [137]:
svm_spam_filtered[svm_spam_filtered["spam"]==0]

Unnamed: 0,video_id,comment,comment_cleaned,spam
0,aLZ85hb4wjE,Loved the calmness of Manilla.,loved the calmness of manilla.,0
1,aLZ85hb4wjE,After this pandemic I think every country shou...,after this pandemic i think every country shou...,0
2,aLZ85hb4wjE,manila looks beautifull with less people,manila looks beautifull with less people,0
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,the lockdown makes the city look like a place ...,0
4,aLZ85hb4wjE,India also same,india also same,0
...,...,...,...,...
1210,5DvMPgoKZmM,"Please, if you have an option to get vaccinate...","please, if you have an option to get vaccinate...",0
1211,5DvMPgoKZmM,This covid will be a never-ending fuckery as l...,this covid will be a never-ending fuckery as l...,0
1212,5DvMPgoKZmM,A new variant is inevitable.,a new variant is inevitable.,0
1213,5DvMPgoKZmM,The man that should resign from his office is ...,the man that should resign from his office is ...,0


### Israel-Palestine comments check

In [138]:
is_pal_df = pd.read_csv("../datasets/israel-palestine_conflict_history/comments.csv").drop("Unnamed: 0", axis=1)
is_pal_df.head()

Unnamed: 0,video_id,comment
0,R0ftmf_Uv9A,No matter how many times these information get...
1,R0ftmf_Uv9A,"*To learn who RULES over you, simply find out ..."
2,R0ftmf_Uv9A,"Say that part again: Jewish , Christian’s and ..."
3,R0ftmf_Uv9A,So sad. They were living in peace and now suff...
4,R0ftmf_Uv9A,Why start at 1946?


In [139]:
is_pal_df["comment"] = is_pal_df["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in is_pal_df["comment"]: 
    is_pal_df["comment"] = is_pal_df["comment"].apply(ps.stem) 
    is_pal_df["comment"] = is_pal_df["comment"].str.lower()

#### MultinomialNB

In [140]:
transformed = vectorizer.transform(is_pal_df["comment"])

is_pal_df["spam"]=spam_nb.predict(transformed)
is_pal_df.head()

Unnamed: 0,video_id,comment,spam
0,R0ftmf_Uv9A,no matter many times information gets thrown e...,0
1,R0ftmf_Uv9A,"*to learn rules you, simply find not allowed c...",1
2,R0ftmf_Uv9A,"say part again: jewish , christian’s muslims l...",1
3,R0ftmf_Uv9A,so sad. they living peace suffering 7 decades 😢,0
4,R0ftmf_Uv9A,why start 1946?,0


In [141]:
is_pal_df[is_pal_df["spam"]==1].count()

video_id    583
comment     583
spam        583
dtype: int64

In [142]:
is_pal_df[is_pal_df["spam"]==0].count()

video_id    1288
comment     1288
spam        1288
dtype: int64

In [143]:
is_pal_df[is_pal_df["spam"]==1]

Unnamed: 0,video_id,comment,spam
1,R0ftmf_Uv9A,"*to learn rules you, simply find not allowed c...",1
2,R0ftmf_Uv9A,"say part again: jewish , christian’s muslims l...",1
5,R0ftmf_Uv9A,"thank's ireland, consistent n vocal supporting...",1
12,Bno1m1zhIWs,finally. an objective concise summary religiou...,1
13,Bno1m1zhIWs,this first video actually explained things tho...,1
...,...,...,...
1859,SsKpy2ftuF4,they gave fair warning time evacuation. that t...,1
1861,JuU7pSDs8f4,angel one demat account(free) - https://tinyur...,1
1863,JuU7pSDs8f4,nobody explain clear do. pl. keep videos.,1
1864,JuU7pSDs8f4,in telugu best channel facts best content rema...,1


#### SVM

In [144]:
transformed = vectorizer.transform(is_pal_df["comment"])

is_pal_df["spam"]=spam_svc.predict(transformed)
is_pal_df.head()

Unnamed: 0,video_id,comment,spam
0,R0ftmf_Uv9A,no matter many times information gets thrown e...,0
1,R0ftmf_Uv9A,"*to learn rules you, simply find not allowed c...",0
2,R0ftmf_Uv9A,"say part again: jewish , christian’s muslims l...",1
3,R0ftmf_Uv9A,so sad. they living peace suffering 7 decades 😢,0
4,R0ftmf_Uv9A,why start 1946?,0


In [145]:
is_pal_df[is_pal_df["spam"]==1].count()

video_id    182
comment     182
spam        182
dtype: int64

In [146]:
is_pal_df[is_pal_df["spam"]==0].count()

video_id    1689
comment     1689
spam        1689
dtype: int64

Seems like SVM works better

In [166]:
covid_filtered_out_df = svm_spam_filtered[svm_spam_filtered["spam"]==0]
covid_filtered_out_df.head()

Unnamed: 0,video_id,comment,comment_cleaned,spam
0,aLZ85hb4wjE,Loved the calmness of Manilla.,loved the calmness of manilla.,0
1,aLZ85hb4wjE,After this pandemic I think every country shou...,after this pandemic i think every country shou...,0
2,aLZ85hb4wjE,manila looks beautifull with less people,manila looks beautifull with less people,0
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,the lockdown makes the city look like a place ...,0
4,aLZ85hb4wjE,India also same,india also same,0


In [167]:
is_pal_filtered_df = svm_spam_filtered[svm_spam_filtered["spam"]==0]
is_pal_filtered_df.head()

Unnamed: 0,video_id,comment,comment_cleaned,spam
0,aLZ85hb4wjE,Loved the calmness of Manilla.,loved the calmness of manilla.,0
1,aLZ85hb4wjE,After this pandemic I think every country shou...,after this pandemic i think every country shou...,0
2,aLZ85hb4wjE,manila looks beautifull with less people,manila looks beautifull with less people,0
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...,the lockdown makes the city look like a place ...,0
4,aLZ85hb4wjE,India also same,india also same,0


## SENTIMENT ANALYSIS

TEST RoBERTa

In [150]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from scipy.special import softmax

def roberta_sentiment_scores(list_comments):
    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)

    for comments in list_comments:
        comment_words = []
        comments = comments.replace("\n", " ")
        comments = comments.replace("\xa0", " ")
        comments = comments.replace("?", " ")
        comments = comments.replace(":", " ")
        comments = comments.replace(";", " ")
        comments = comments.replace(";", " ")
        comments = re.sub(r"\s+", ' ', comments) 
        print(comments)
        for word in comments.split(' '):
            if word.startswith('@') and len(word) > 1:
               word = '@user'
        
            elif word.startswith('http'):
                word = "http"
            comment_words.append(word)

        comment_procs = " ".join(comment_words)

        encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
        print(encoded)
        output = model(**encoded)

        scores = output[0][0].detach().numpy()

        scores = softmax(scores)

        for i in range(len(scores)):

            l = labels[i]
            s = scores[i]
            print(l, s)

In [151]:
roberta_sentiment_scores(covid_filtered_out_df["comment"][:50])

  return self.fget.__get__(instance, owner)()


Loved the calmness of Manilla.
{'input_ids': tensor([[    0,   574, 12677,     5,  6327,  1825,     9,  1554,  4699,     4,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Negative 0.0026419058
Neutral 0.031785045
Positive 0.9655731
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
{'input_ids': tensor([[    0,  4993,    42, 23387, 14414,    38,   206,   358,   247,   197,
            33,    10, 23076,   358,   353,     7,   492,   985,  2574,    10,
            86,     7, 14384,     6,    19,    70,     5,  6631,    14,     5,
          6872,    16,  7242,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Negative 0.72357625
Neutral 0.25514364
Positive 0.021280115
manila looks beautifull with less people
{'input_ids': tensor

TEST TEXTBLOB

In [152]:
from textblob import TextBlob

def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [153]:
textblob_sentiment_scores(covid_filtered_out_df["comment"][:50])

positive 0.7
neutral 0.0
negative -0.16666666666666666
positive 0.11250000000000002
neutral 0.0
positive 0.44999999999999996
positive 0.7
negative -0.55
positive 0.475
neutral 0.0
negative -0.6
neutral 0.0
negative -0.19999999999999998
negative -0.45285714285714285
negative -0.5
neutral 0.0
negative -0.3729166666666667
neutral 0.0
positive 0.2
positive 0.48828125
neutral 0.0
positive 0.425
positive 0.2
neutral 0.0
neutral 0.0
neutral 0.0
positive 0.0020833333333333346
neutral 0.0
neutral 0.0
neutral 0.0
neutral 0.0
negative -0.16666666666666666
neutral 0.0
negative -0.4
negative -0.14666666666666664
positive 0.18611111111111112
neutral 0.0
neutral 0.0
negative -0.10166666666666668
negative -0.35
positive 0.5
positive 0.09999999999999998
positive 0.30833333333333335
positive 0.5
negative -0.07857142857142857
positive 0.18194444444444446
neutral 0.0
neutral 0.0
positive 0.2508928571428572
positive 0.4333333333333333


TEST STANZA

In [154]:
import stanza
def stanza_sentiment_scores(list_comments):
    nlp = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for comment in list_comments:
        doc = nlp(comment.replace("\n", " "))
        print(comment)
    #doc.sentences[0].print_dependencies()
        for i, sentence in enumerate(doc.sentences):
            print("%d -> %d" % (i, sentence.sentiment))

In [155]:
stanza_sentiment_scores(covid_filtered_out_df["comment"][:50])

2024-03-07 16:15:00 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-07 16:15:01 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

2024-03-07 16:15:01 INFO: Using device: cpu
2024-03-07 16:15:01 INFO: Loading: tokenize
2024-03-07 16:15:01 INFO: Loading: mwt
2024-03-07 16:15:01 INFO: Loading: sentiment
2024-03-07 16:15:02 INFO: Done loading processors!


Loved the calmness of Manilla.
0 -> 2
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
0 -> 0
manila looks beautifull with less people
0 -> 2
The lockdown makes the city look like a place I want to explore.

Before the lockdown, it looked like a crowded mess filled with traffic and pollution.
0 -> 0
India also same
0 -> 1
The president promised that he'll do his best to ease the traffic in the metro like 5mins of travel along EDSA. Fortunately it happened but unfortunately it wasn't in the ideal way. Nature's sense of humor tho.
0 -> 0
I went up the viewing spot at antipolo, its a very beautiful, almost smog free view of the metro skyline
0 -> 2
The sad part is people around the world did not intend to heal earth but afraid of death.
0 -> 1
i hope it stays like this forever its beautiful with no messy people
0 -> 2
2020 is the year when nature fight back and reduce h

TEST VADER

In [156]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def vader_sentiment_scores (list_comments):
    for sentence in list_comments:
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict = sid_obj.polarity_scores(sentence)
        print(sentence)
        print("Overall sentiment dictionary is : ", sentiment_dict)
        print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
        print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
        print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
        print("Sentence Overall Rated As", end = " ")
 
        # decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")

    

In [157]:
vader_sentiment_scores (covid_filtered_out_df["comment"][:50])

Loved the calmness of Manilla.
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'compound': 0.765}
sentence was rated as  0.0 % Negative
sentence was rated as  31.2 % Neutral
sentence was rated as  68.8 % Positive
Sentence Overall Rated As Positive
After this pandemic I think every country should have a lockdown every month to give mother nature a time to heal, with all the pollution that the earth is experiencing.
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as  0.0 % Negative
sentence was rated as  100.0 % Neutral
sentence was rated as  0.0 % Positive
Sentence Overall Rated As Neutral
manila looks beautifull with less people
Overall sentiment dictionary is :  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as  0.0 % Negative
sentence was rated as  100.0 % Neutral
sentence was rated as  0.0 % Positive
Sentence Overall Rated As Neutral
The lockdown makes the city look like

### Get Sentiment sentiment for each video

#### TEXTBLOB APPROACH

In [158]:
def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [159]:
covid_filtered_out_df["video_id"].unique()

array(['aLZ85hb4wjE', 'sYI97jv-pZg', '3YFpjgIQqEo', 'dIsaz_XlmTw',
       'DWxIvQlpJK8', 'pMUumjHY3tw', 'nTUWK8vufOk', 'cPVE7QGS7As',
       'CEIrzjA8euQ', 'lw16DeB6zns', '8dTelszbObM', 'Mxf8uGFcqSE',
       'MQ5aYS4YFlQ', 'iOE6rAY8l-k', '_a-rQYfsCck', 'jEINZXA_ujo',
       '587N9bJ5J5k', 'E5F3xA_zkFc', '05JLyd58R-w', '2fRQ8OsqOLs',
       'E56W-5xVOss', 'c1oU8U05puY', 'SEHcakm-fAc', '1psSvU1km0I',
       'PP3Yu-ro1tA', 'NQeI1CRCqeo', 'RhDHGgo4yZg', 'sdsz-t540WI',
       '7SKGXkZKjV8', 'oy0wHScCPds', 'o1kPskxFkQ8', 'aoNQUUQno00',
       'liEUC1_l8e8', '-b0EuuMvvy8', 'MmyIvf7bEGc', 'Fd3HcncV6Zk',
       '-n9Ks3VTub4', 'LzymJ2xZhho', '-qFcO_onBdA', 'fp9uRsmTWqg',
       '3ZXR2eARmuQ', '57wz3HuIVLA', 'lbK7UjoLr8o', 'xV1oR-RbOGU',
       'kn6DKdInYXk', '2T3yZ6lNRDg', 'HLYSlKY6Ww4', 'sH_YoV-NA6s',
       'ZLT6L3PHz78', 'lczwrm68u6I', 'zhBdbLj5y6A', 'ibWFsmcnefk',
       'e0fN7HxkIUc', 'Ji47WRv2tQE', 'dj5ov38ihZI', 'j3E3NF9nk44',
       'BomdsEJjb0E', 'T7c6GvrF82k', '6DBFwIlT4fg', '2FgFNBIJT

In [160]:
def per_vid_textblob_polarity (df):
    video_polarity={}
    for video in df["video_id"].unique():
        sum_polarity=0
        for comment in df.loc[df["video_id"]==video]["comment"]:
            sum_polarity+=(TextBlob(comment)).sentiment.polarity
        video_polarity[video] = sum_polarity/df.loc[df["video_id"]==video]["comment"].count()
    ''' if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.subjectivity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.subjectivity)
        else:
            print("neutral", testimonial.sentiment.subjectivity)'''
    return video_polarity


In [161]:
per_vid_textblob_polarity(covid_filtered_out_df)

{'aLZ85hb4wjE': 0.1720833333333333,
 'sYI97jv-pZg': -0.3,
 '3YFpjgIQqEo': -0.09305472883597882,
 'dIsaz_XlmTw': 0.07838541666666667,
 'DWxIvQlpJK8': -0.06590277777777777,
 'pMUumjHY3tw': -0.05083333333333334,
 'nTUWK8vufOk': 0.12907848324514992,
 'cPVE7QGS7As': 0.38312003968253966,
 'CEIrzjA8euQ': -0.25,
 'lw16DeB6zns': 0.11604662698412699,
 '8dTelszbObM': 0.20164552024308124,
 'Mxf8uGFcqSE': 0.16666666666666666,
 'MQ5aYS4YFlQ': 0.26589898989898986,
 'iOE6rAY8l-k': 0.04490740740740742,
 '_a-rQYfsCck': 0.05265151515151515,
 'jEINZXA_ujo': 0.39894179894179893,
 '587N9bJ5J5k': 0.10679012345679012,
 'E5F3xA_zkFc': 0.0866078884078884,
 '05JLyd58R-w': 0.0,
 '2fRQ8OsqOLs': 0.02433862433862434,
 'E56W-5xVOss': 0.10351587301587302,
 'c1oU8U05puY': 0.13055555555555556,
 'SEHcakm-fAc': 0.07676204004329004,
 '1psSvU1km0I': 0.15870664315108762,
 'PP3Yu-ro1tA': 0.11888888888888889,
 'NQeI1CRCqeo': 0.020672348484848484,
 'RhDHGgo4yZg': -0.11822916666666668,
 'sdsz-t540WI': 0.09243055555555554,
 '7SKG

#### VADER APPROACH

In [162]:
def per_vid_vader_polarity (df):
    polarity_scores={}
    sid_obj = SentimentIntensityAnalyzer()

    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0
        sum_compound= 0
        scores={}
        for sentence in df.loc[df["video_id"]==video]["comment"]:

            sentiment_dict = sid_obj.polarity_scores(sentence)
            sum_neg += sentiment_dict['neg']
            sum_pos += sentiment_dict['pos']
            sum_neu += sentiment_dict['neu']
            sum_compound += sentiment_dict['compound']
            
        scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()
        scores["Overall"] = sum_compound/df.loc[df["video_id"]==video]["comment"].count()
        
        polarity_scores[video]=scores
        '''# decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")'''
    return polarity_scores
        

In [163]:
per_vid_vader_polarity(covid_filtered_out_df)

{'aLZ85hb4wjE': {'Neg': 0.07980000000000001,
  'Pos': 0.194,
  'Neu': 0.7261,
  'Overall': 0.17872000000000002},
 'sYI97jv-pZg': {'Neg': 0.0, 'Pos': 0.056, 'Neu': 0.944, 'Overall': 0.11315},
 '3YFpjgIQqEo': {'Neg': 0.05155555555555555,
  'Pos': 0.1257777777777778,
  'Neu': 0.8225555555555556,
  'Overall': 0.08543333333333335},
 'dIsaz_XlmTw': {'Neg': 0.032125,
  'Pos': 0.15962500000000002,
  'Neu': 0.8083750000000001,
  'Overall': 0.23491250000000002},
 'DWxIvQlpJK8': {'Neg': 0.139625,
  'Pos': 0.09925,
  'Neu': 0.761125,
  'Overall': -0.00888750000000002},
 'pMUumjHY3tw': {'Neg': 0.0475,
  'Pos': 0.034,
  'Neu': 0.9185,
  'Overall': -0.15775},
 'nTUWK8vufOk': {'Neg': 0.10566666666666667,
  'Pos': 0.1892222222222222,
  'Neu': 0.7052222222222223,
  'Overall': 0.2995888888888889},
 'cPVE7QGS7As': {'Neg': 0.0758888888888889,
  'Pos': 0.262,
  'Neu': 0.6621111111111111,
  'Overall': 0.3731222222222222},
 'CEIrzjA8euQ': {'Neg': 0.196, 'Pos': 0.0, 'Neu': 0.804, 'Overall': -0.296},
 'lw16DeB6

### RoBERTa APPROACH

In [164]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import re
#from scipy.special import softmax

def roberta_per_vid_scores(df):
    polarity_scores={}

    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)
    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0 
        scores={}
        for comments in df.loc[df["video_id"]==video]["comment"]:
            comment_words = []
            comments = comments.replace("\n", " ")
            comments = comments.replace("\xa0", " ")
            comments = comments.replace("?", " ")
            comments = comments.replace(":", " ")
            comments = comments.replace(";", " ")
            comments = comments.replace(";", " ")
            comments = re.sub(r"\s+", ' ', comments) 
     #   print(comments)
            for word in comments.split(' '):
                if word.startswith('@') and len(word) > 1:
                    word = '@user'
        
                elif word.startswith('http'):
                    word = "http"
                comment_words.append(word)

            comment_procs = " ".join(comment_words)

            encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
           # print(encoded)
            output = model(**encoded)

            scores = output[0][0].detach().numpy()

            scores = softmax(scores)

            for i in range(len(scores)):

                l = labels[i]
                s = scores[i]
                #    labels = ['Negative', 'Neutral', 'Positive']

                if l=="Negative":
                    sum_neg+= s
                elif l == "Neutral":
                    sum_neu+=s
                else : 
                    sum_pos+=s
       # scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores =dict([("Neg", sum_neg/df.loc[df["video_id"]==video]["comment"].count()),("Pos",sum_pos/df.loc[df["video_id"]==video]["comment"].count()),("Neu",sum_neu/df.loc[df["video_id"]==video]["comment"].count()
)])
        #scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        #scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()

        polarity_scores[video] = scores
    return polarity_scores

In [165]:
roberta_per_vid_scores(covid_filtered_out_df)

{'aLZ85hb4wjE': {'Neg': 0.24424575177254154,
  'Pos': 0.4741820393828675,
  'Neu': 0.28157221488654616},
 'sYI97jv-pZg': {'Neg': 0.3894941806793213,
  'Pos': 0.0859330091625452,
  'Neu': 0.5245728343725204},
 '3YFpjgIQqEo': {'Neg': 0.4141863211989403,
  'Pos': 0.25543777147928876,
  'Neu': 0.33037588579787147},
 'dIsaz_XlmTw': {'Neg': 0.33079710984020494,
  'Pos': 0.2652561893919483,
  'Neu': 0.4039467005059123},
 'DWxIvQlpJK8': {'Neg': 0.289873747547972,
  'Pos': 0.29180154629284516,
  'Neu': 0.41832468984648585},
 'pMUumjHY3tw': {'Neg': 0.28391369991004467,
  'Pos': 0.05910225957632065,
  'Neu': 0.6569840013980865},
 'nTUWK8vufOk': {'Neg': 0.38223098513359827,
  'Pos': 0.3571269003312207,
  'Neu': 0.2606421046786838},
 'cPVE7QGS7As': {'Neg': 0.16431427941036722,
  'Pos': 0.5078818136826158,
  'Neu': 0.32780391226212185},
 'CEIrzjA8euQ': {'Neg': 0.15780450403690338,
  'Pos': 0.1664552390575409,
  'Neu': 0.6757402420043945},
 'lw16DeB6zns': {'Neg': 0.18314840432140045,
  'Pos': 0.33922