In [1]:
import json
import pickle
import datasets
import pandas as pd
import numpy as np
from pprint import pprint
from tqdm.notebook import tqdm
from collections import Counter
from datasets import load_dataset
from time import strftime, localtime

pd.set_option('display.max_colwidth', 200)

In [2]:
# ! pip install fsspec=2023.4.0

In [3]:
# dataset = load_dataset('alsubari/Israel-palestine-war')

In [4]:
# print(dataset)

# Exploratory Data Analysis

### Stats about videos

In [5]:
df_metadata = pd.read_csv('data/alsubari_videos_metadata.csv')

In [6]:
df_metadata.head()

Unnamed: 0.1,Unnamed: 0,id,publish_channel,title,start_duration,end_duration,lengthSeconds,total_comments,total_unique_users
0,0,FUzLx1l37RE,NBCNews,Blinken reiterates Israel support but urges restraint as Gaza casualties rise,2023-11-03 19:38:34,2023-11-04 17:38:34,241,25,21
1,1,PcQaG4sC9BM,NBCNews,Full Special Report: Israel declares war after surprise Hamas attack | NBC News,2023-10-07 19:42:03,2023-11-04 12:38:36,1484,21905,14860
2,2,X2t6NyGP1-w,NBCNews,Hamas official claims it's their 'legal right' to fight against Israeli occupation,2023-11-03 19:49:36,2023-11-04 19:46:34,335,1853,1170
3,3,cd24N0tF-jY,NBCNews,FBI director warns of growing domestic threat amid Israel-Hamas war,2023-11-01 19:50:24,2023-11-04 18:50:41,143,987,670
4,4,ICR5ml2YPkI,NBCNews,Israel’s president: 23-year-old Israeli hostage Shani Louk was beheaded by Hamas,2023-10-31 19:51:23,2023-11-04 19:44:35,162,5275,3475


In [7]:
df_metadata['start_duration'] = pd.to_datetime(df_metadata['start_duration'])
df_metadata['end_duration'] = pd.to_datetime(df_metadata['end_duration'])

In [8]:
print("Number of videos:", len(df_metadata))
print("Number of channels:", len(df_metadata.publish_channel.unique()))
print("Total duration:", sum(df_metadata.lengthSeconds), "seconds")
print("Total number of comments:", sum(df_metadata.total_comments))
print("Total number of unique users:", sum(df_metadata.total_unique_users))

first_post = min(df_metadata.start_duration)
last_post = max(df_metadata.start_duration)
print('\n', "Time interval:", first_post, 'to', last_post, '(totaling', (last_post - first_post).days, 'days)')

Number of videos: 310
Number of channels: 6
Total duration: 134468 seconds
Total number of comments: 1028346
Total number of unique users: 689753

 Time interval: 2023-10-04 22:01:46 to 2023-11-06 19:42:46 (totaling 32 days)


### Stats about comments

In [9]:
df_comments = pd.read_csv('data/alsubari_videos_comments.csv')

In [10]:
df_comments.head()

Unnamed: 0.1,Unnamed: 0,author,channel,cid,heart,reply,text,time,time_parsed,votes,video_id
0,0,@LMLewis,UCNUa3f8nwYg-Kn1v4u5wrtQ,UgxbXce9xz7a27nVOyl4AaABAg,False,False,"Blinken deserves to be known as the Butcher of Foggy Bottom, using empty words to try to deflect criticism of the US government's participation in genocide.",2 hours ago,1699120000.0,0,FUzLx1l37RE
1,1,@lovellewilliams5531,UCm9ztdJUAMpUNipdNxAHN_g,Ugx7aiPfji7zYkmV1k54AaABAg,False,False,The United States are supporting the killing of civilians,4 hours ago,1699112000.0,0,FUzLx1l37RE
2,2,@noryaminmatariffin3029,UCCyvX_gd5zANsFYXnDBpuXw,UgywKu6FT8TWbssFsnJ4AaABAg,False,False,"So why hospitals, ambulance, civiliance, refuge camp are bombing is IDF blind! or you just a clown",16 hours ago,1699069000.0,0,FUzLx1l37RE
3,3,@Not_Today___,UCmicKai8OhhPUpOfG3VTFvA,UgwXQG7Qt7HVKsSTC7F4AaABAg,False,False,"I'm an American. I think this war started on October 7th and that before that Israel never did anything wrong. I have the right to free speech and I'm gonna use it, even though I'm this: 🤣",1 day ago,1699040000.0,0,FUzLx1l37RE
4,4,@benzo2632,UCA0boZvrgR4mbnQ2Bi65QKQ,UgxnJo7jgnD3Qcg6aAB4AaABAg,False,False,"WAR CRIMES SUPORTED BY USA ,,,,",1 day ago,1699040000.0,0,FUzLx1l37RE


In [11]:
epoch_to_date = lambda d: strftime('%Y-%m-%d %H:%M:%S', localtime(d))
df_comments['date'] = pd.to_datetime(df_comments['time_parsed'].apply(epoch_to_date))

In [12]:
print("Number of comments:", len(df_comments))
print("Total number of unique posters:", len(df_comments.author.unique()))
print("Average number of comments per video:", int(df_comments['video_id'].value_counts().mean()))

first_post = min(df_comments.date)
last_post = max(df_comments.date)
print('\n', "Time interval:", first_post, 'to', last_post, '(totaling', (last_post - first_post).days, 'days)')

Number of comments: 1028346
Total number of unique posters: 457373
Average number of comments per video: 3317

 Time interval: 2023-10-04 23:01:46 to 2023-11-07 01:41:49 (totaling 33 days)


# Sentiment Analysis

In [13]:
from transformers import pipeline

In [14]:
# No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english 
sentiment_pipeline = pipeline("sentiment-analysis", device="cuda")
data = ["I love you", "I hate you"]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [15]:
df_comments['video_id'].value_counts()

video_id
4idQbwsvtUo    128093
6FVUxvp6Ah0     26252
svOc4Ki68_U     24391
FXrVtabZggI     23255
PcQaG4sC9BM     21905
                ...  
ktAc39_WaTI        71
ivXS0Fnzpns        44
KwWz3wQtV5Y        42
zZGtLXxQhjQ        25
FUzLx1l37RE        25
Name: count, Length: 310, dtype: int64

In [16]:
video_title = df_metadata[df_metadata.id == 'svOc4Ki68_U']['title'].values[0]
print(video_title)
print("It's a debate" if "debate" in video_title.lower() else "It's not a debate")

HEATED DEBATE: Cornel West, Alan Dershowitz spar over Israel-Hamas war
It's a debate


In [17]:
video_id = 'svOc4Ki68_U'
video_comments =  df_comments[df_comments['video_id'] == video_id]

In [18]:
video_comments['text'].tolist()[8797]

'ليسن فاكس نيوز اذا كنت تفهم الصحافه واذا كنت تفهم القراءه والكتابه هل تعلم ان اسرائيل محتله الاراضي الفلسطينيه اكثر من 75 عام تعلم ان اسرائيل تقتل الفلسطينيين يوميا وتقول ان حماس هي ارهابيه حماس نينجا جو هما فلسطينيين يدافعون عن عرضهم وعن كرامتهم وعن حقوقهم من حق كل فلسطينيه حماس اي انسان فلسطيني ان يدافع على حقوقه في الدول العربيه والاسلاميه هي ليست بالخمسينات احنا فهمنا الكثير منكم وعلى عبر قناتكم فاكس نيوز تعلمنا الكثير العنصريه حقكم من الاضطهاد من الظلم من الكهنوت انت من حق اي فلسطيني ان يدافع وان يقتل الجيش الاسرائيلي كله وفي مثل بيقول بالعربي اي شيء يؤخذ منك بالقوه ما يعاد الا بالقوه نحن نعرف الكثير ان اسرائيل هي ولايه من الولايات المتحده الامريكيه تدعمها بالمال بالاسلحه وتقتلون شعبنا العزيز فجاء الحق وزهق الباطل ان الباطل كان زهوقا والنصر والعزه لابناء فلسطين ونحن واقفين معهم بالمال والسلاح والرجال وفي كل شيء رغم انفكم'

# Sentiment Extraction

In [21]:
comment_sentiments = pickle.load(open('comment_sentiments.pickle', 'rb'))

In [22]:
len(df_comments), len(comment_sentiments)

(1028346, 1028346)

In [23]:
df_comments['sentiment_label'] = [cs[0]['label'] for cs in comment_sentiments]
df_comments['sentiment_score'] = [cs[0]['score'] for cs in comment_sentiments]

In [24]:
df_comments.head()

Unnamed: 0.1,Unnamed: 0,author,channel,cid,heart,reply,text,time,time_parsed,votes,video_id,date,sentiment_label,sentiment_score
0,0,@LMLewis,UCNUa3f8nwYg-Kn1v4u5wrtQ,UgxbXce9xz7a27nVOyl4AaABAg,False,False,"Blinken deserves to be known as the Butcher of Foggy Bottom, using empty words to try to deflect criticism of the US government's participation in genocide.",2 hours ago,1699120000.0,0,FUzLx1l37RE,2023-11-04 18:38:34,NEGATIVE,0.998373
1,1,@lovellewilliams5531,UCm9ztdJUAMpUNipdNxAHN_g,Ugx7aiPfji7zYkmV1k54AaABAg,False,False,The United States are supporting the killing of civilians,4 hours ago,1699112000.0,0,FUzLx1l37RE,2023-11-04 16:38:34,NEGATIVE,0.877355
2,2,@noryaminmatariffin3029,UCCyvX_gd5zANsFYXnDBpuXw,UgywKu6FT8TWbssFsnJ4AaABAg,False,False,"So why hospitals, ambulance, civiliance, refuge camp are bombing is IDF blind! or you just a clown",16 hours ago,1699069000.0,0,FUzLx1l37RE,2023-11-04 04:38:34,NEGATIVE,0.999448
3,3,@Not_Today___,UCmicKai8OhhPUpOfG3VTFvA,UgwXQG7Qt7HVKsSTC7F4AaABAg,False,False,"I'm an American. I think this war started on October 7th and that before that Israel never did anything wrong. I have the right to free speech and I'm gonna use it, even though I'm this: 🤣",1 day ago,1699040000.0,0,FUzLx1l37RE,2023-11-03 20:38:34,POSITIVE,0.989705
4,4,@benzo2632,UCA0boZvrgR4mbnQ2Bi65QKQ,UgxnJo7jgnD3Qcg6aAB4AaABAg,False,False,"WAR CRIMES SUPORTED BY USA ,,,,",1 day ago,1699040000.0,0,FUzLx1l37RE,2023-11-03 20:38:34,NEGATIVE,0.986656
