In [1]:
import translators as ts
from langdetect import detect

In [2]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm

In [3]:
df = pd.read_csv("../datasets/covid_philippines/covid_philippines_comments.csv").drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same


## TRANSLATE

In [4]:
translated_comments = {}
translated_comments["video_id"] = {}
translated_comments["comment"] = {}
video_id_list = df["video_id"].to_list()
comments_list = df["comment"].to_list()

In [5]:
pbar = tqdm(total=len(video_id_list))
pbar.set_description("Translating...")

for i in range(df.shape[0]):
    if comments_list[i] != None:
        new_comment = comments_list[i]
        try:
            lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
            if lang != 'en':
                new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')
                
        except:
            # No change; get same comment from list
            pass
            
        finally:
            translated_comments["video_id"][i] = video_id_list[i]
            translated_comments["comment"][i] = new_comment
            pbar.update(1)

translated_df = pd.DataFrame.from_dict(translated_comments)
pbar.close()

Translating...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1216/1216 [04:12<00:00,  4.81it/s]


In [6]:
translated_df

Unnamed: 0,video_id,comment
0,aLZ85hb4wjE,Loved the calmness of Manilla.
1,aLZ85hb4wjE,After this pandemic I think every country shou...
2,aLZ85hb4wjE,manila looks beautifull with less people
3,aLZ85hb4wjE,The lockdown makes the city look like a place ...
4,aLZ85hb4wjE,India also same
...,...,...
1211,5DvMPgoKZmM,This covid will be a never-ending fuckery as l...
1212,5DvMPgoKZmM,A new variant is inevitable.
1213,5DvMPgoKZmM,The man that should resign from his office is ...
1214,5DvMPgoKZmM,"Is that the people who got been vaccinated.,"


---

In [7]:
def translate(video_id_list, comments_list):
    translated_comments = {}
    translated_comments["video_id"] = {}
    translated_comments["comment"] = {}
    
    pbar = tqdm(total=len(video_id_list))
    pbar.set_description("Translating...")

    for i in range(df.shape[0]):
        if comments_list[i] != None:
            new_comment = comments_list[i]
            try:
                lang = detect(comments_list[i]) #added langdetect since it errors if there are many entries to translate, so now it will ontly tranlate if comment not english
                if lang != 'en':
                    new_comment = ts.translate_text(comments_list[i], 'google', to_language = 'en')

            except:
                # No change; get same comment from list
                pass

            finally:
                translated_comments["video_id"][i] = video_id_list[i]
                translated_comments["comment"][i] = new_comment
                pbar.update(1)

    translated_df = pd.DataFrame.from_dict(translated_comments)
    pbar.close()

## DETECT SPAM

### MultinomialNB

- 0:Not Spam
- 1:Spam

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [63]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

psy_model_df = pd.read_csv("../datasets/model_train/Youtube01-Psy.csv")
lmfao_model_df = pd.read_csv("../datasets/model_train/Youtube03-LMFAO.csv")
kp_model_df = pd.read_csv("../datasets/model_train/Youtube02-KatyPerry.csv")
shakira_df = pd.read_csv("../datasets/model_train/Youtube05-Shakira.csv")
model_df = pd.concat([psy_model_df,lmfao_model_df,kp_model_df,shakira_df])
model_df.reset_index(inplace=True, drop=True) 
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [64]:
model_df.drop_duplicates(subset="CONTENT")

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ Ôªø,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .Ôªø,1
...,...,...,...,...,...
1502,_2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4,Dinova Sharon,2013-07-13T14:44:00.700000,well done shakira,0
1503,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0
1504,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
1506,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0


In [65]:
model_df.count()

COMMENT_ID    1508
AUTHOR        1508
DATE          1508
CONTENT       1508
CLASS         1508
dtype: int64

In [66]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Melanie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


preprocessing

In [67]:
model_df['CONTENT'] = model_df['CONTENT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in model_df["CONTENT"]: 
    #convert to lowercase
    model_df['CONTENT'] = model_df["CONTENT"].str.lower()
    #Stem
    model_df['CONTENT'] = model_df["CONTENT"].apply(ps.stem) 
    


In [119]:
x=model_df["CONTENT"]
y=model_df["CLASS"]

vectorizer = CountVectorizer()


x_train, x_test, y_train, y_test  = train_test_split(x,y,random_state=42)
x_train=vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


In [120]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9708222811671088
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       544
           1       0.97      0.97      0.97       587

    accuracy                           0.97      1131
   macro avg       0.97      0.97      0.97      1131
weighted avg       0.97      0.97      0.97      1131



In [121]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.9204244031830239
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       204
           1       0.89      0.94      0.92       173

    accuracy                           0.92       377
   macro avg       0.92      0.92      0.92       377
weighted avg       0.92      0.92      0.92       377



test with another dataset again

In [122]:
# dataset source: https://archive.ics.uci.edu/dataset/380/youtube+spam+collection

em_model_df = pd.read_csv("../datasets/model_train/Youtube04-Eminem.csv")
em_model_df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z12rwfnyyrbsefonb232i5ehdxzkjzjs2,Lisa Wellas,,+447935454150 lovely girl talk to me xxxÔªø,1
1,z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04,jason graham,2015-05-29T02:26:10.652000,I always end up coming back to this song<br />Ôªø,0
2,z13vsfqirtavjvu0t22ezrgzyorwxhpf3,Ajkal Khan,,"my sister just received over 6,500 new <a rel=...",1
3,z12wjzc4eprnvja4304cgbbizuved35wxcs,Dakota Taylor,2015-05-29T02:13:07.810000,CoolÔªø,0
4,z13xjfr42z3uxdz2223gx5rrzs3dt5hna,Jihad Naser,,Hello I&#39;am from PalastineÔªø,1


In [123]:
em_model_df['CONTENT'] = em_model_df['CONTENT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in em_model_df["CONTENT"]: 
    #convert to lowercase
    em_model_df['CONTENT'] = em_model_df["CONTENT"].str.lower()
    #Stem
    em_model_df['CONTENT'] = em_model_df["CONTENT"].apply(ps.stem) 

In [170]:
em_x=em_model_df["CONTENT"]
em_y=em_model_df["CLASS"]

em_x=vectorizer.transform(em_x)

In [125]:
predictions=spam_nb.predict(em_x)
accuracy = accuracy_score(em_y, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(em_y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8526785714285714
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82       203
           1       0.82      0.94      0.87       245

    accuracy                           0.85       448
   macro avg       0.86      0.84      0.85       448
weighted avg       0.86      0.85      0.85       448



### SVM 

In [126]:
x=model_df["CONTENT"]
y=model_df["CLASS"]

vectorizer = CountVectorizer()


x_train, x_test, y_train, y_test  = train_test_split(x,y,test_size=0.30,random_state=42)
x_train=vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [128]:
#from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svm_model = SVC(kernel = 'sigmoid', gamma = 1.0)
svm_model.fit(x_train, y_train)

predictions = svm_model.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.8265402843601896
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       504
           1       0.84      0.83      0.83       551

    accuracy                           0.83      1055
   macro avg       0.83      0.83      0.83      1055
weighted avg       0.83      0.83      0.83      1055



In [129]:
predictions = svm_model.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.8432671081677704
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       244
           1       0.81      0.86      0.83       209

    accuracy                           0.84       453
   macro avg       0.84      0.84      0.84       453
weighted avg       0.84      0.84      0.84       453



In [163]:
from sklearn.model_selection import RandomizedSearchCV
svc = SVC(max_iter=200)

In [164]:
hyperparameters= [
    {
        "C":[0.0001, 0.001, 0.01 , 0.1, 1.0, 5, 30, 50],
        "kernel": ["linear","poly","rbf","sigmoid"],
        "degree" :[1, 3, 5, 10, 25, 50,100],
        "gamma" :["scale", "auto",1000, 10, 5, 2.5, 1.5, 1.0]
    }
]

In [165]:
rssvc = RandomizedSearchCV(estimator = svc, param_distributions = hyperparameters, n_iter =30, cv=5, random_state=42).fit(x_train,y_train)




14 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Program Files\miniconda3\lib\site-packages\sklearn\svm\_base.py", line 268, in fit
    raise ValueError(
ValueError: The dual coefficients or intercepts are not finite. The input data may contain large values and need to be preprocessed.

 0.77630332 0.9298

In [166]:
rssvc.best_estimator_

In [167]:
predictions=rssvc.best_estimator_.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.9448123620309051
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       244
           1       0.94      0.94      0.94       209

    accuracy                           0.94       453
   macro avg       0.94      0.94      0.94       453
weighted avg       0.94      0.94      0.94       453



In [171]:
predictions=rssvc.best_estimator_.predict(em_x)
accuracy = accuracy_score(em_y, predictions)
print(f"Test Accuracy (EMINEM DF): {accuracy}")

class_report = classification_report(em_y, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy (EMINEM DF): 0.9665178571428571
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       203
           1       0.99      0.95      0.97       245

    accuracy                           0.97       448
   macro avg       0.97      0.97      0.97       448
weighted avg       0.97      0.97      0.97       448



In [172]:
spam_svc=rssvc.best_estimator_

try model on translated_df

In [173]:
nb_spam_filtered = translated_df.copy()
#svm_spam_filtered = translated_df.copy()

In [174]:
nb_spam_filtered["comment_cleaned"] = nb_spam_filtered["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in nb_spam_filtered["comment"]: 
    nb_spam_filtered["comment_cleaned"] = nb_spam_filtered["comment"].apply(ps.stem) 
    nb_spam_filtered["comment_cleaned"] = nb_spam_filtered["comment"].str.lower()

In [None]:
svm_spam_filtered = nb_spam_filtered.copy() #so no need to go through same cleaning/preprocessing

In [24]:
transformed = vectorizer.transform(translated_df["comment"])

translated_df["spam"]=spam_nb.predict(transformed)
translated_df.head()

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,loved calmness manilla.,0
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,0
2,aLZ85hb4wjE,manila looks beautifull less peopl,0
3,aLZ85hb4wjE,the lockdown makes city look like place i want...,0
4,aLZ85hb4wjE,india also,1


In [25]:
translated_df[translated_df["spam"]==1].count()

video_id    562
comment     562
spam        562
dtype: int64

In [26]:
translated_df[translated_df["spam"]==0].count()

video_id    654
comment     654
spam        654
dtype: int64

seems like there are too many tagged as spam

In [27]:
translated_df[translated_df["spam"]==1]

Unnamed: 0,video_id,comment,spam
4,aLZ85hb4wjE,india also,1
9,aLZ85hb4wjE,2020 year nature fight back reduce human emiss...,1
10,sYI97jv-pZg,cough cold season weather cold .... covid amg ...,1
11,sYI97jv-pZg,ala n covid tngina,1
12,3YFpjgIQqEo,this cold make worse make money,1
...,...,...,...
1201,Wjj__vIdew0,your daily dose fearmong,1
1203,Wjj__vIdew0,gma bat numb,1
1204,Wjj__vIdew0,nest,1
1212,5DvMPgoKZmM,a new variant inevitable.,1


In [28]:
translated_df[translated_df["spam"]==0]

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,loved calmness manilla.,0
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,0
2,aLZ85hb4wjE,manila looks beautifull less peopl,0
3,aLZ85hb4wjE,the lockdown makes city look like place i want...,0
5,aLZ85hb4wjE,the president promised he'll best ease traffic...,0
...,...,...,...
1209,5DvMPgoKZmM,"honestly, i hope things least get slightly bet...",0
1210,5DvMPgoKZmM,"please, option get vaccinated, it. it still po...",0
1211,5DvMPgoKZmM,this covid never-ending fuckery long media kee...,0
1213,5DvMPgoKZmM,the man resign office nonetheless present head...,0


In [29]:
translated_df[translated_df["spam"]==1].to_csv("check_spam.csv")

In [30]:
translated_df[translated_df["spam"]==0].to_csv("check_not_spam.csv")

Israel-Palestine comments check

In [31]:
is_pal_df = pd.read_csv("../datasets/israel-palestine_conflict_history/comments.csv").drop("Unnamed: 0", axis=1)
is_pal_df.head()

Unnamed: 0,video_id,comment
0,R0ftmf_Uv9A,No matter how many times these information get...
1,R0ftmf_Uv9A,"*To learn who RULES over you, simply find out ..."
2,R0ftmf_Uv9A,"Say that part again: Jewish , Christian‚Äôs and ..."
3,R0ftmf_Uv9A,So sad. They were living in peace and now suff...
4,R0ftmf_Uv9A,Why start at 1946?


In [32]:
is_pal_df["comment"] = is_pal_df["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in is_pal_df["comment"]: 
    is_pal_df["comment"] = is_pal_df["comment"].apply(ps.stem) 
    is_pal_df["comment"] = is_pal_df["comment"].str.lower()

In [33]:
transformed = vectorizer.transform(is_pal_df["comment"])

is_pal_df["spam"]=spam_nb.predict(transformed)
is_pal_df.head()

Unnamed: 0,video_id,comment,spam
0,R0ftmf_Uv9A,no matter many times information gets thrown e...,0
1,R0ftmf_Uv9A,"*to learn rules you, simply find not allowed c...",1
2,R0ftmf_Uv9A,"say part again: jewish , christian‚Äôs muslims l...",1
3,R0ftmf_Uv9A,so sad. they living peace suffering 7 decades üò¢,0
4,R0ftmf_Uv9A,why start 1946?,0


In [34]:
is_pal_df[is_pal_df["spam"]==1].count()

video_id    757
comment     757
spam        757
dtype: int64

In [35]:
is_pal_df[is_pal_df["spam"]==0].count()

video_id    1114
comment     1114
spam        1114
dtype: int64

In [36]:
is_pal_df[is_pal_df["spam"]==1]

Unnamed: 0,video_id,comment,spam
1,R0ftmf_Uv9A,"*to learn rules you, simply find not allowed c...",1
2,R0ftmf_Uv9A,"say part again: jewish , christian‚Äôs muslims l...",1
5,R0ftmf_Uv9A,"thank's ireland, consistent n vocal supporting...",1
8,R0ftmf_Uv9A,let peace prevail.,1
12,Bno1m1zhIWs,finally. an objective concise summary religiou...,1
...,...,...,...
1861,JuU7pSDs8f4,angel one demat account(free) - https://tinyur...,1
1863,JuU7pSDs8f4,nobody explain clear do. pl. keep videos.,1
1867,JuU7pSDs8f4,i'm upsc aspirant ....i seen many videos regar...,1
1869,JuU7pSDs8f4,bro literally i searched video war started. no...,1


For this model 40% of israel palestine comments were tagged Spam. While translated covid dataset has 45.39 of data tagged as spam.

In [43]:
#source: https://www.kaggle.com/datasets/madhuragl/5000-youtube-spamnot-spam-dataset/data
comments_5k_df = pd.read_csv("../datasets/model_train/5000 YT comments.csv",encoding='cp1252')
comments_5k_df.head()

Unnamed: 0,Name,Comment,Time,Likes,Reply Count,Spam
0,Taofeekat,&lt;????i make my first million investing in f...,2022-09-28T02:08:55Z,30,30,1
1,Angelina Jordan,&lt;?l will forever be indebted to you I will ...,2022-09-23T05:26:48Z,0,0,1
2,Fernandez Joe,<b>????I recommend a professional forex/Bitcoi...,2022-09-20T12:56:30Z,5,2,1
3,Jessica Billy,I think I‚Äôm blessed because if not I wouldn‚Äôt ...,2022-09-17T20:20:24Z,21,34,1
4,Allison Zar,<b>I recommend a professional broker to you g...,2022-09-05T09:19:30Z,19,27,1


In [44]:
comments_5k_df.count()

Name           5000
Comment        5000
Time           5000
Likes          5000
Reply Count    5000
Spam           5000
dtype: int64

In [45]:
comments_5k_df.drop_duplicates(subset="Comment")

Unnamed: 0,Name,Comment,Time,Likes,Reply Count,Spam
0,Taofeekat,&lt;????i make my first million investing in f...,2022-09-28T02:08:55Z,30,30,1
1,Angelina Jordan,&lt;?l will forever be indebted to you I will ...,2022-09-23T05:26:48Z,0,0,1
2,Fernandez Joe,<b>????I recommend a professional forex/Bitcoi...,2022-09-20T12:56:30Z,5,2,1
3,Jessica Billy,I think I‚Äôm blessed because if not I wouldn‚Äôt ...,2022-09-17T20:20:24Z,21,34,1
4,Allison Zar,<b>I recommend a professional broker to you g...,2022-09-05T09:19:30Z,19,27,1
...,...,...,...,...,...,...
4995,Anjan Das,She is so beautiful!,2020-06-05T04:18:26Z,5,0,0
4996,Humza Navaid,3 seconds in and I want to marry her. I am goi...,2020-06-04T21:03:14Z,0,0,0
4997,Aadil Ranesh,She talks a lot like Tanmay Bakshi,2020-06-03T17:29:04Z,0,0,0
4998,Fuzail Ahmad,Why does her face look like a bad deepfake?,2020-06-03T11:17:48Z,1,0,0


In [46]:
comments_5k_df.count()

Name           5000
Comment        5000
Time           5000
Likes          5000
Reply Count    5000
Spam           5000
dtype: int64

In [47]:
comments_5k_df['Comment'] = comments_5k_df['Comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

ps = PorterStemmer() 
for w in comments_5k_df['Comment']: 
    comments_5k_df['Comment'] = comments_5k_df['Comment'].apply(ps.stem) 
    comments_5k_df['Comment'] = comments_5k_df['Comment'].str.lower()
    

In [48]:
x=comments_5k_df["Comment"]
y=comments_5k_df["Spam"]

vectorizer= TfidfVectorizer()



x_train, x_test, y_train, y_test  = train_test_split(x,y,random_state=42)
x_train=vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [49]:
spam_nb = MultinomialNB()
spam_nb.fit(x_train,y_train)

predictions=spam_nb.predict(x_train)
accuracy = accuracy_score(y_train, predictions)
print(f"Train Accuracy: {accuracy}")

class_report = classification_report(y_train, predictions)
print("Classification Report:")
print(class_report)

Train Accuracy: 0.936
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      1876
           1       0.92      0.96      0.94      1874

    accuracy                           0.94      3750
   macro avg       0.94      0.94      0.94      3750
weighted avg       0.94      0.94      0.94      3750



In [50]:
predictions=spam_nb.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.8848
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88       624
           1       0.85      0.94      0.89       626

    accuracy                           0.88      1250
   macro avg       0.89      0.88      0.88      1250
weighted avg       0.89      0.88      0.88      1250



In [51]:
translated_df = og_translated_df.copy()

In [52]:
translated_df["comment"] = translated_df["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in translated_df["comment"]: 
    translated_df["comment"] = translated_df["comment"].apply(ps.stem) 
    translated_df["comment"] = translated_df["comment"].str.lower()

In [53]:
transformed = vectorizer.transform(translated_df["comment"])

translated_df["spam"]=spam_nb.predict(transformed)
translated_df.head()

Unnamed: 0,video_id,comment,spam
0,aLZ85hb4wjE,loved calmness manilla.,0
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,1
2,aLZ85hb4wjE,manila looks beautifull less peopl,0
3,aLZ85hb4wjE,the lockdown makes city look like place i want...,0
4,aLZ85hb4wjE,india also,1


In [54]:
translated_df[translated_df["spam"]==1].count()

video_id    716
comment     716
spam        716
dtype: int64

In [55]:
translated_df[translated_df["spam"]==0].count()

video_id    500
comment     500
spam        500
dtype: int64

In [56]:
translated_df[translated_df["spam"]==1]

Unnamed: 0,video_id,comment,spam
1,aLZ85hb4wjE,after pandemic i think every country lockdown ...,1
4,aLZ85hb4wjE,india also,1
10,sYI97jv-pZg,cough cold season weather cold .... covid amg ...,1
11,sYI97jv-pZg,ala n covid tngina,1
12,3YFpjgIQqEo,this cold make worse make money,1
...,...,...,...
1207,5DvMPgoKZmM,"it's almost 2 years this, many people honestly...",1
1210,5DvMPgoKZmM,"please, option get vaccinated, it. it still po...",1
1211,5DvMPgoKZmM,this covid never-ending fuckery long media kee...,1
1212,5DvMPgoKZmM,a new variant inevitable.,1


try with israel palestine conflict dataset

In [57]:
is_pal_df["comment"] = is_pal_df["comment"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


ps = PorterStemmer() 
for w in is_pal_df["comment"]: 
    is_pal_df["comment"] = is_pal_df["comment"].apply(ps.stem) 
    is_pal_df["comment"] = is_pal_df["comment"].str.lower()

In [59]:
transformed = vectorizer.transform(is_pal_df["comment"])

is_pal_df["spam"]=spam_nb.predict(transformed)
is_pal_df.head()

Unnamed: 0,video_id,comment,spam
0,R0ftmf_Uv9A,matter many times information gets thrown eyes...,0
1,R0ftmf_Uv9A,"*to learn rules you, simply find allowed criti...",1
2,R0ftmf_Uv9A,"say part again: jewish , christian‚Äôs muslims l...",1
3,R0ftmf_Uv9A,sad. living peace suffering 7 decades üò¢,1
4,R0ftmf_Uv9A,start 1946?,1


In [60]:
is_pal_df[is_pal_df["spam"]==1].count()

video_id    901
comment     901
spam        901
dtype: int64

In [61]:
is_pal_df[is_pal_df["spam"]==0].count()

video_id    970
comment     970
spam        970
dtype: int64

In [None]:
is_pal_df[is_pal_df["spam"]==1]

In [None]:
is_pal_df[is_pal_df["spam"]==1].to_csv("check_spam.csv")

For this model israel-palestine comments had 45.48% of the dataset as spam. While translated covid has 55.1% tagged as spam

## SENTIMENT ANALYSIS

TEST RoBERTa

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from scipy.special import softmax

def roberta_sentiment_scores(list_comments):
    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)

    for comments in list_comments:
        comment_words = []
        comments = comments.replace("\n", " ")
        comments = comments.replace("\xa0", " ")
        comments = comments.replace("?", " ")
        comments = comments.replace(":", " ")
        comments = comments.replace(";", " ")
        comments = comments.replace(";", " ")
        comments = re.sub(r"\s+", ' ', comments) 
        print(comments)
        for word in comments.split(' '):
            if word.startswith('@') and len(word) > 1:
               word = '@user'
        
            elif word.startswith('http'):
                word = "http"
            comment_words.append(word)

        comment_procs = " ".join(comment_words)

        encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
        print(encoded)
        output = model(**encoded)

        scores = output[0][0].detach().numpy()

        scores = softmax(scores)

        for i in range(len(scores)):

            l = labels[i]
            s = scores[i]
            print(l, s)

In [None]:
roberta_sentiment_scores(translated_df["comment"][:50])

TEST TEXTBLOB

In [None]:
from textblob import TextBlob

def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [None]:
textblob_sentiment_scores(translated_df["comment"][:50])

TEST STANZA

In [None]:
import stanza
def stanza_sentiment_scores(list_comments):
    nlp = stanza.Pipeline('en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

    for comment in list_comments:
        doc = nlp(comment.replace("\n", " "))
        print(comment)
    #doc.sentences[0].print_dependencies()
        for i, sentence in enumerate(doc.sentences):
            print("%d -> %d" % (i, sentence.sentiment))

In [None]:
stanza_sentiment_scores(translated_df["comment"][:50])

TEST VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def vader_sentiment_scores (list_comments):
    for sentence in list_comments:
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict = sid_obj.polarity_scores(sentence)
        print(sentence)
        print("Overall sentiment dictionary is : ", sentiment_dict)
        print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
        print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
        print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
        print("Sentence Overall Rated As", end = " ")
 
        # decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")

    

In [None]:
vader_sentiment_scores (translated_df["comment"][:50])

## Get Sentiment sentiment for each video

#### TEXTBLOB APPROACH

In [None]:
def textblob_sentiment_scores(list_comments):
    list_sentiment={}
    for comment in list_comments:
        testimonial = TextBlob(comment)
        if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.polarity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.polarity)
        else:
            print("neutral", testimonial.sentiment.polarity)

In [None]:
translated_df["video_id"].unique()

In [None]:
def per_vid_textblob_polarity (df):
    video_polarity={}
    for video in df["video_id"].unique():
        sum_polarity=0
        for comment in df.loc[df["video_id"]==video]["comment"]:
            sum_polarity+=(TextBlob(comment)).sentiment.polarity
        video_polarity[video] = sum_polarity/df.loc[df["video_id"]==video]["comment"].count()
    ''' if (testimonial.sentiment.polarity > 0):
            print("positive", testimonial.sentiment.subjectivity)
        elif (testimonial.sentiment.polarity < 0):
            print("negative", testimonial.sentiment.subjectivity)
        else:
            print("neutral", testimonial.sentiment.subjectivity)'''
    return video_polarity


In [None]:
per_vid_textblob_polarity(translated_df)

#### VADER APPROACH

In [None]:
def per_vid_vader_polarity (df):
    polarity_scores={}
    sid_obj = SentimentIntensityAnalyzer()

    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0
        sum_compound= 0
        scores={}
        for sentence in df.loc[df["video_id"]==video]["comment"]:

            sentiment_dict = sid_obj.polarity_scores(sentence)
            sum_neg += sentiment_dict['neg']
            sum_pos += sentiment_dict['pos']
            sum_neu += sentiment_dict['neu']
            sum_compound += sentiment_dict['compound']
            
        scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()
        scores["Overall"] = sum_compound/df.loc[df["video_id"]==video]["comment"].count()
        
        polarity_scores[video]=scores
        '''# decide sentiment as positive, negative and neutral
        if sentiment_dict['compound'] >= 0.05 :
            print("Positive")
 
        elif sentiment_dict['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")'''
    return polarity_scores
        

In [None]:
per_vid_vader_polarity(translated_df)

### RoBERTa APPROACH

In [None]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import re
#from scipy.special import softmax

def roberta_per_vid_scores(df):
    polarity_scores={}

    roberta = "cardiffnlp/twitter-roberta-base-sentiment"
    labels = ['Negative', 'Neutral', 'Positive']

    model = AutoModelForSequenceClassification.from_pretrained(roberta)
    tokenizer = AutoTokenizer.from_pretrained(roberta)
    for video in df["video_id"].unique():
        sum_neg =0
        sum_pos = 0
        sum_neu =0 
        scores={}
        for comments in df.loc[df["video_id"]==video]["comment"]:
            comment_words = []
            comments = comments.replace("\n", " ")
            comments = comments.replace("\xa0", " ")
            comments = comments.replace("?", " ")
            comments = comments.replace(":", " ")
            comments = comments.replace(";", " ")
            comments = comments.replace(";", " ")
            comments = re.sub(r"\s+", ' ', comments) 
     #   print(comments)
            for word in comments.split(' '):
                if word.startswith('@') and len(word) > 1:
                    word = '@user'
        
                elif word.startswith('http'):
                    word = "http"
                comment_words.append(word)

            comment_procs = " ".join(comment_words)

            encoded = tokenizer(comment_procs, return_tensors='pt', max_length=512, truncation=True, padding=True)
           # print(encoded)
            output = model(**encoded)

            scores = output[0][0].detach().numpy()

            scores = softmax(scores)

            for i in range(len(scores)):

                l = labels[i]
                s = scores[i]
                #    labels = ['Negative', 'Neutral', 'Positive']

                if l=="Negative":
                    sum_neg+= s
                elif l == "Neutral":
                    sum_neu+=s
                else : 
                    sum_pos+=s
       # scores["Neg"] = sum_neg/df.loc[df["video_id"]==video]["comment"].count()
        scores =dict([("Neg", sum_neg/df.loc[df["video_id"]==video]["comment"].count()),("Pos",sum_pos/df.loc[df["video_id"]==video]["comment"].count()),("Neu",sum_neu/df.loc[df["video_id"]==video]["comment"].count()
)])
        #scores["Pos"] = sum_pos/df.loc[df["video_id"]==video]["comment"].count()
        #scores["Neu"] = sum_neu/df.loc[df["video_id"]==video]["comment"].count()

        polarity_scores[video] = scores
    return polarity_scores

In [None]:
roberta_per_vid_scores(translated_df)